master_lexer.cc 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613
  1. // Copyright (C) 2012 Internet Systems Consortium, Inc. ("ISC")
  2. //
  3. // Permission to use, copy, modify, and/or distribute this software for any
  4. // purpose with or without fee is hereby granted, provided that the above
  5. // copyright notice and this permission notice appear in all copies.
  6. //
  7. // THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  8. // REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  9. // AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  10. // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  11. // LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  12. // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  13. // PERFORMANCE OF THIS SOFTWARE.
  14. #include <exceptions/exceptions.h>
  15. #include <dns/master_lexer.h>
  16. #include <dns/master_lexer_inputsource.h>
  17. #include <dns/master_lexer_state.h>
  18. #include <boost/foreach.hpp>
  19. #include <boost/shared_ptr.hpp>
  20. #include <boost/lexical_cast.hpp>
  21. #include <bitset>
  22. #include <cassert>
  23. #include <limits>
  24. #include <string>
  25. #include <vector>
  26. namespace isc {
  27. namespace dns {
  28. // The definition of SOURCE_SIZE_UNKNOWN. Note that we initialize it using
  29. // a method of another library. Technically, this could trigger a static
  30. // initialization fiasco. But in this particular usage it's very unlikely
  31. // to happen because this value is expected to be used only as a return
  32. // value of a MasterLexer's method, and its constructor needs definitions
  33. // here.
  34. const size_t MasterLexer::SOURCE_SIZE_UNKNOWN =
  35. std::numeric_limits<size_t>::max();
  36. namespace {
  37. typedef boost::shared_ptr<master_lexer_internal::InputSource> InputSourcePtr;
  38. } // end unnamed namespace
  39. using namespace master_lexer_internal;
  40. struct MasterLexer::MasterLexerImpl {
  41. MasterLexerImpl() : source_(NULL), token_(MasterToken::NOT_STARTED),
  42. total_size_(0), popped_size_(0),
  43. paren_count_(0), last_was_eol_(true),
  44. has_previous_(false),
  45. previous_paren_count_(0),
  46. previous_was_eol_(false)
  47. {
  48. separators_.set('\r');
  49. separators_.set('\n');
  50. separators_.set(' ');
  51. separators_.set('\t');
  52. separators_.set('(');
  53. separators_.set(')');
  54. esc_separators_.set('\r');
  55. esc_separators_.set('\n');
  56. }
  57. // A helper method to skip possible comments toward the end of EOL or EOF.
  58. // commonly used by state classes. It returns the corresponding "end-of"
  59. // character in case it's a comment; otherwise it simply returns the
  60. // current character.
  61. int skipComment(int c, bool escaped = false) {
  62. if (c == ';' && !escaped) {
  63. while (true) {
  64. c = source_->getChar();
  65. if (c == '\n' || c == InputSource::END_OF_STREAM) {
  66. return (c);
  67. }
  68. }
  69. }
  70. return (c);
  71. }
  72. bool isTokenEnd(int c, bool escaped) {
  73. // Special case of EOF (end of stream); this is not in the bitmaps
  74. if (c == InputSource::END_OF_STREAM) {
  75. return (true);
  76. }
  77. // In this implementation we only ensure the behavior for unsigned
  78. // range of characters, so we restrict the range of the values up to
  79. // 0x7f = 127
  80. return (escaped ? esc_separators_.test(c & 0x7f) :
  81. separators_.test(c & 0x7f));
  82. }
  83. void setTotalSize() {
  84. assert(source_ != NULL);
  85. if (total_size_ != SOURCE_SIZE_UNKNOWN) {
  86. const size_t current_size = source_->getSize();
  87. if (current_size != SOURCE_SIZE_UNKNOWN) {
  88. total_size_ += current_size;
  89. } else {
  90. total_size_ = SOURCE_SIZE_UNKNOWN;
  91. }
  92. }
  93. }
  94. std::vector<InputSourcePtr> sources_;
  95. InputSource* source_; // current source (NULL if sources_ is empty)
  96. MasterToken token_; // currently recognized token (set by a state)
  97. std::vector<char> data_; // placeholder for string data
  98. // Keep track of the total size of all sources and characters that have
  99. // been read from sources already popped.
  100. size_t total_size_; // accumulated size (# of chars) of sources
  101. size_t popped_size_; // total size of sources that have been popped
  102. // These are used in states, and defined here only as a placeholder.
  103. // The main lexer class does not need these members.
  104. size_t paren_count_; // nest count of the parentheses
  105. bool last_was_eol_; // whether the lexer just passed an end-of-line
  106. // Bitmaps that gives whether a given (positive) character should be
  107. // considered a separator of a string/number token. The esc_ version
  108. // is a subset of the other, excluding characters that can be ignored
  109. // if escaped by a backslash. See isTokenEnd() for the bitmap size.
  110. std::bitset<128> separators_;
  111. std::bitset<128> esc_separators_;
  112. // These are to allow restoring state before previous token.
  113. bool has_previous_;
  114. size_t previous_paren_count_;
  115. bool previous_was_eol_;
  116. };
  117. MasterLexer::MasterLexer() : impl_(new MasterLexerImpl) {
  118. }
  119. MasterLexer::~MasterLexer() {
  120. delete impl_;
  121. }
  122. bool
  123. MasterLexer::pushSource(const char* filename, std::string* error) {
  124. if (filename == NULL) {
  125. isc_throw(InvalidParameter,
  126. "NULL filename for MasterLexer::pushSource");
  127. }
  128. try {
  129. impl_->sources_.push_back(InputSourcePtr(new InputSource(filename)));
  130. } catch (const InputSource::OpenError& ex) {
  131. if (error != NULL) {
  132. *error = ex.what();
  133. }
  134. return (false);
  135. }
  136. impl_->source_ = impl_->sources_.back().get();
  137. impl_->has_previous_ = false;
  138. impl_->last_was_eol_ = true;
  139. impl_->setTotalSize();
  140. return (true);
  141. }
  142. void
  143. MasterLexer::pushSource(std::istream& input) {
  144. try {
  145. impl_->sources_.push_back(InputSourcePtr(new InputSource(input)));
  146. } catch (const InputSource::OpenError& ex) {
  147. // Convert the "internal" exception to public one.
  148. isc_throw(Unexpected, "Failed to push a stream to lexer: " <<
  149. ex.what());
  150. }
  151. impl_->source_ = impl_->sources_.back().get();
  152. impl_->has_previous_ = false;
  153. impl_->last_was_eol_ = true;
  154. impl_->setTotalSize();
  155. }
  156. void
  157. MasterLexer::popSource() {
  158. if (impl_->sources_.empty()) {
  159. isc_throw(InvalidOperation,
  160. "MasterLexer::popSource on an empty source");
  161. }
  162. impl_->popped_size_ += impl_->source_->getPosition();
  163. impl_->sources_.pop_back();
  164. impl_->source_ = impl_->sources_.empty() ? NULL :
  165. impl_->sources_.back().get();
  166. impl_->has_previous_ = false;
  167. }
  168. size_t
  169. MasterLexer::getSourceCount() const {
  170. return (impl_->sources_.size());
  171. }
  172. std::string
  173. MasterLexer::getSourceName() const {
  174. if (impl_->sources_.empty()) {
  175. return (std::string());
  176. }
  177. return (impl_->sources_.back()->getName());
  178. }
  179. size_t
  180. MasterLexer::getSourceLine() const {
  181. if (impl_->sources_.empty()) {
  182. return (0);
  183. }
  184. return (impl_->sources_.back()->getCurrentLine());
  185. }
  186. size_t
  187. MasterLexer::getTotalSourceSize() const {
  188. return (impl_->total_size_);
  189. }
  190. size_t
  191. MasterLexer::getPosition() const {
  192. size_t position = impl_->popped_size_;
  193. BOOST_FOREACH(InputSourcePtr& src, impl_->sources_) {
  194. position += src->getPosition();
  195. }
  196. return (position);
  197. }
  198. const MasterToken&
  199. MasterLexer::getNextToken(Options options) {
  200. if (impl_->source_ == NULL) {
  201. isc_throw(isc::InvalidOperation, "No source to read tokens from");
  202. }
  203. // Store the current state so we can restore it in ungetToken
  204. impl_->previous_paren_count_ = impl_->paren_count_;
  205. impl_->previous_was_eol_ = impl_->last_was_eol_;
  206. impl_->source_->mark();
  207. impl_->has_previous_ = true;
  208. // Reset the token now. This is to check a token was actually produced.
  209. // This is debugging aid.
  210. impl_->token_ = MasterToken(MasterToken::NO_TOKEN_PRODUCED);
  211. // And get the token
  212. // This actually handles EOF internally too.
  213. const State* state = State::start(*this, options);
  214. if (state != NULL) {
  215. state->handle(*this);
  216. }
  217. // Make sure a token was produced. Since this Can Not Happen, we assert
  218. // here instead of throwing.
  219. assert(impl_->token_.getType() != MasterToken::ERROR ||
  220. impl_->token_.getErrorCode() != MasterToken::NO_TOKEN_PRODUCED);
  221. return (impl_->token_);
  222. }
  223. namespace {
  224. inline MasterLexer::Options
  225. optionsForTokenType(MasterToken::Type expect) {
  226. switch (expect) {
  227. case MasterToken::STRING:
  228. return (MasterLexer::NONE);
  229. case MasterToken::QSTRING:
  230. return (MasterLexer::QSTRING);
  231. case MasterToken::NUMBER:
  232. return (MasterLexer::NUMBER);
  233. default:
  234. isc_throw(InvalidParameter,
  235. "expected type for getNextToken not supported: " << expect);
  236. }
  237. }
  238. }
  239. const MasterToken&
  240. MasterLexer::getNextToken(MasterToken::Type expect, bool eol_ok) {
  241. // Get the next token, specifying an appropriate option corresponding to
  242. // the expected type. The result should be set in impl_->token_.
  243. getNextToken(optionsForTokenType(expect));
  244. if (impl_->token_.getType() == MasterToken::ERROR) {
  245. if (impl_->token_.getErrorCode() == MasterToken::NUMBER_OUT_OF_RANGE) {
  246. ungetToken();
  247. }
  248. throw LexerError(__FILE__, __LINE__, impl_->token_);
  249. }
  250. const bool is_eol_like =
  251. (impl_->token_.getType() == MasterToken::END_OF_LINE ||
  252. impl_->token_.getType() == MasterToken::END_OF_FILE);
  253. if (eol_ok && is_eol_like) {
  254. return (impl_->token_);
  255. }
  256. if (impl_->token_.getType() == MasterToken::STRING &&
  257. expect == MasterToken::QSTRING) {
  258. return (impl_->token_);
  259. }
  260. if (impl_->token_.getType() != expect) {
  261. ungetToken();
  262. if (is_eol_like) {
  263. throw LexerError(__FILE__, __LINE__,
  264. MasterToken(MasterToken::UNEXPECTED_END));
  265. }
  266. assert(expect == MasterToken::NUMBER);
  267. throw LexerError(__FILE__, __LINE__,
  268. MasterToken(MasterToken::BAD_NUMBER));
  269. }
  270. return (impl_->token_);
  271. }
  272. void
  273. MasterLexer::ungetToken() {
  274. if (impl_->has_previous_) {
  275. impl_->has_previous_ = false;
  276. impl_->source_->ungetAll();
  277. impl_->last_was_eol_ = impl_->previous_was_eol_;
  278. impl_->paren_count_ = impl_->previous_paren_count_;
  279. } else {
  280. isc_throw(isc::InvalidOperation, "No token to unget ready");
  281. }
  282. }
  283. namespace {
  284. const char* const error_text[] = {
  285. "lexer not started", // NOT_STARTED
  286. "unbalanced parentheses", // UNBALANCED_PAREN
  287. "unexpected end of input", // UNEXPECTED_END
  288. "unbalanced quotes", // UNBALANCED_QUOTES
  289. "no token produced", // NO_TOKEN_PRODUCED
  290. "number out of range", // NUMBER_OUT_OF_RANGE
  291. "not a valid number" // BAD_NUMBER
  292. };
  293. const size_t error_text_max_count = sizeof(error_text) / sizeof(error_text[0]);
  294. } // end unnamed namespace
  295. std::string
  296. MasterToken::getErrorText() const {
  297. if (type_ != ERROR) {
  298. isc_throw(InvalidOperation,
  299. "MasterToken::getErrorText() for non error type");
  300. }
  301. // The class integrity ensures the following:
  302. assert(val_.error_code_ < error_text_max_count);
  303. return (error_text[val_.error_code_]);
  304. }
  305. namespace master_lexer_internal {
  306. // Below we implement state classes for state transitions of MasterLexer.
  307. // Note that these need to be defined here so that they can refer to
  308. // the details of MasterLexerImpl.
  309. bool
  310. State::wasLastEOL(const MasterLexer& lexer) const {
  311. return (lexer.impl_->last_was_eol_);
  312. }
  313. const MasterToken&
  314. State::getToken(const MasterLexer& lexer) const {
  315. return (lexer.impl_->token_);
  316. }
  317. size_t
  318. State::getParenCount(const MasterLexer& lexer) const {
  319. return (lexer.impl_->paren_count_);
  320. }
  321. namespace {
  322. class CRLF : public State {
  323. public:
  324. CRLF() {}
  325. virtual ~CRLF() {} // see the base class for the destructor
  326. virtual void handle(MasterLexer& lexer) const {
  327. // We've just seen '\r'. If this is part of a sequence of '\r\n',
  328. // we combine them as a single END-OF-LINE. Otherwise we treat the
  329. // single '\r' as an EOL and continue tokeniziation from the character
  330. // immediately after '\r'. One tricky case is that there's a comment
  331. // between '\r' and '\n'. This implementation combines these
  332. // characters and treats them as a single EOL (the behavior derived
  333. // from BIND 9). Technically this may not be correct, but in practice
  334. // the caller wouldn't distinguish this case from the case it has
  335. // two EOLs, so we simplify the process.
  336. const int c = getLexerImpl(lexer)->skipComment(
  337. getLexerImpl(lexer)->source_->getChar());
  338. if (c != '\n') {
  339. getLexerImpl(lexer)->source_->ungetChar();
  340. }
  341. getLexerImpl(lexer)->token_ = MasterToken(MasterToken::END_OF_LINE);
  342. getLexerImpl(lexer)->last_was_eol_ = true;
  343. }
  344. };
  345. class String : public State {
  346. public:
  347. String() {}
  348. virtual ~String() {} // see the base class for the destructor
  349. virtual void handle(MasterLexer& lexer) const;
  350. };
  351. class QString : public State {
  352. public:
  353. QString() {}
  354. virtual ~QString() {} // see the base class for the destructor
  355. virtual void handle(MasterLexer& lexer) const;
  356. };
  357. class Number : public State {
  358. public:
  359. Number() {}
  360. virtual ~Number() {}
  361. virtual void handle(MasterLexer& lexer) const;
  362. };
  363. // We use a common instance of a each state in a singleton-like way to save
  364. // construction overhead. They are not singletons in its strict sense as
  365. // we don't prohibit direct construction of these objects. But that doesn't
  366. // matter much anyway, because the definitions are completely hidden within
  367. // this file.
  368. const CRLF CRLF_STATE;
  369. const String STRING_STATE;
  370. const QString QSTRING_STATE;
  371. const Number NUMBER_STATE;
  372. } // end unnamed namespace
  373. const State&
  374. State::getInstance(ID state_id) {
  375. switch (state_id) {
  376. case CRLF:
  377. return (CRLF_STATE);
  378. case String:
  379. return (STRING_STATE);
  380. case QString:
  381. return (QSTRING_STATE);
  382. case Number:
  383. return (NUMBER_STATE);
  384. }
  385. // This is a bug of the caller, and this method is only expected to be
  386. // used by tests, so we just forcefully make it fail by asserting the
  387. // condition.
  388. assert(false);
  389. return (STRING_STATE); // a dummy return, to silence some compilers.
  390. }
  391. const State*
  392. State::start(MasterLexer& lexer, MasterLexer::Options options) {
  393. // define some shortcuts
  394. MasterLexer::MasterLexerImpl& lexerimpl = *lexer.impl_;
  395. size_t& paren_count = lexerimpl.paren_count_;
  396. // Note: the if-else in the loop is getting complicated. When we complete
  397. // #2374, revisit the organization to see if we need a fundamental
  398. // refactoring.
  399. while (true) {
  400. const int c = lexerimpl.skipComment(lexerimpl.source_->getChar());
  401. if (c == InputSource::END_OF_STREAM) {
  402. lexerimpl.last_was_eol_ = false;
  403. if (paren_count != 0) {
  404. lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
  405. paren_count = 0; // reset to 0; this helps in lenient mode.
  406. return (NULL);
  407. }
  408. lexerimpl.token_ = MasterToken(MasterToken::END_OF_FILE);
  409. return (NULL);
  410. } else if (c == ' ' || c == '\t') {
  411. // If requested and we are not in (), recognize the initial space.
  412. if (lexerimpl.last_was_eol_ && paren_count == 0 &&
  413. (options & MasterLexer::INITIAL_WS) != 0) {
  414. lexerimpl.last_was_eol_ = false;
  415. lexerimpl.token_ = MasterToken(MasterToken::INITIAL_WS);
  416. return (NULL);
  417. }
  418. } else if (c == '\n') {
  419. lexerimpl.last_was_eol_ = true;
  420. if (paren_count == 0) { // we don't recognize EOL if we are in ()
  421. lexerimpl.token_ = MasterToken(MasterToken::END_OF_LINE);
  422. return (NULL);
  423. }
  424. } else if (c == '\r') {
  425. if (paren_count == 0) { // check if we are in () (see above)
  426. return (&CRLF_STATE);
  427. }
  428. } else if (c == '"' && (options & MasterLexer::QSTRING) != 0) {
  429. lexerimpl.last_was_eol_ = false;
  430. return (&QSTRING_STATE);
  431. } else if (c == '(') {
  432. lexerimpl.last_was_eol_ = false;
  433. ++paren_count;
  434. } else if (c == ')') {
  435. lexerimpl.last_was_eol_ = false;
  436. if (paren_count == 0) {
  437. lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
  438. return (NULL);
  439. }
  440. --paren_count;
  441. } else if ((options & MasterLexer::NUMBER) != 0 &&isdigit(c)) {
  442. lexerimpl.last_was_eol_ = false;
  443. // this character will be handled in the number state
  444. lexerimpl.source_->ungetChar();
  445. return (&NUMBER_STATE);
  446. } else {
  447. // this character will be handled in the string state
  448. lexerimpl.source_->ungetChar();
  449. lexerimpl.last_was_eol_ = false;
  450. return (&STRING_STATE);
  451. }
  452. // no code should be here; we just continue the loop.
  453. }
  454. }
  455. void
  456. String::handle(MasterLexer& lexer) const {
  457. std::vector<char>& data = getLexerImpl(lexer)->data_;
  458. data.clear();
  459. bool escaped = false;
  460. while (true) {
  461. const int c = getLexerImpl(lexer)->skipComment(
  462. getLexerImpl(lexer)->source_->getChar(), escaped);
  463. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  464. getLexerImpl(lexer)->source_->ungetChar();
  465. // make sure it nul-terminated as a c-str (excluded from token
  466. // data).
  467. data.push_back('\0');
  468. getLexerImpl(lexer)->token_ =
  469. MasterToken(&data.at(0), data.size() - 1);
  470. return;
  471. }
  472. escaped = (c == '\\' && !escaped);
  473. data.push_back(c);
  474. }
  475. }
  476. void
  477. QString::handle(MasterLexer& lexer) const {
  478. MasterToken& token = getLexerImpl(lexer)->token_;
  479. std::vector<char>& data = getLexerImpl(lexer)->data_;
  480. data.clear();
  481. bool escaped = false;
  482. while (true) {
  483. const int c = getLexerImpl(lexer)->source_->getChar();
  484. if (c == InputSource::END_OF_STREAM) {
  485. token = MasterToken(MasterToken::UNEXPECTED_END);
  486. return;
  487. } else if (c == '"') {
  488. if (escaped) {
  489. // found escaped '"'. overwrite the preceding backslash.
  490. assert(!data.empty());
  491. escaped = false;
  492. data.back() = '"';
  493. } else {
  494. // make sure it nul-terminated as a c-str (excluded from token
  495. // data). This also simplifies the case of an empty string.
  496. data.push_back('\0');
  497. token = MasterToken(&data.at(0), data.size() - 1, true);
  498. return;
  499. }
  500. } else if (c == '\n' && !escaped) {
  501. getLexerImpl(lexer)->source_->ungetChar();
  502. token = MasterToken(MasterToken::UNBALANCED_QUOTES);
  503. return;
  504. } else {
  505. escaped = (c == '\\' && !escaped);
  506. data.push_back(c);
  507. }
  508. }
  509. }
  510. void
  511. Number::handle(MasterLexer& lexer) const {
  512. MasterToken& token = getLexerImpl(lexer)->token_;
  513. // It may yet turn out to be a string, so we first
  514. // collect all the data
  515. bool digits_only = true;
  516. std::vector<char>& data = getLexerImpl(lexer)->data_;
  517. data.clear();
  518. bool escaped = false;
  519. while (true) {
  520. const int c = getLexerImpl(lexer)->skipComment(
  521. getLexerImpl(lexer)->source_->getChar(), escaped);
  522. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  523. getLexerImpl(lexer)->source_->ungetChar();
  524. // We need to close the string whether it's digits-only (for
  525. // lexical_cast) or not (see String::handle()).
  526. data.push_back('\0');
  527. if (digits_only) {
  528. try {
  529. const uint32_t number32 =
  530. boost::lexical_cast<uint32_t, const char*>(&data[0]);
  531. token = MasterToken(number32);
  532. } catch (const boost::bad_lexical_cast&) {
  533. // Since we already know we have only digits,
  534. // range should be the only possible problem.
  535. token = MasterToken(MasterToken::NUMBER_OUT_OF_RANGE);
  536. }
  537. } else {
  538. token = MasterToken(&data.at(0), data.size() - 1);
  539. }
  540. return;
  541. }
  542. if (!isdigit(c)) {
  543. digits_only = false;
  544. }
  545. escaped = (c == '\\' && !escaped);
  546. data.push_back(c);
  547. }
  548. }
  549. } // namespace master_lexer_internal
  550. } // end of namespace dns
  551. } // end of namespace isc