master_lexer.cc 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. // Copyright (C) 2012 Internet Systems Consortium, Inc. ("ISC")
  2. //
  3. // Permission to use, copy, modify, and/or distribute this software for any
  4. // purpose with or without fee is hereby granted, provided that the above
  5. // copyright notice and this permission notice appear in all copies.
  6. //
  7. // THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  8. // REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  9. // AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  10. // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  11. // LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  12. // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  13. // PERFORMANCE OF THIS SOFTWARE.
  14. #include <exceptions/exceptions.h>
  15. #include <dns/master_lexer.h>
  16. #include <dns/master_lexer_inputsource.h>
  17. #include <dns/master_lexer_state.h>
  18. #include <boost/foreach.hpp>
  19. #include <boost/shared_ptr.hpp>
  20. #include <boost/lexical_cast.hpp>
  21. #include <bitset>
  22. #include <cassert>
  23. #include <limits>
  24. #include <string>
  25. #include <vector>
  26. namespace isc {
  27. namespace dns {
  28. // The definition of SOURCE_SIZE_UNKNOWN. Note that we initialize it using
  29. // a method of another library. Technically, this could trigger a static
  30. // initialization fiasco. But in this particular usage it's very unlikely
  31. // to happen because this value is expected to be used only as a return
  32. // value of a MasterLexer's method, and its constructor needs definitions
  33. // here.
  34. const size_t MasterLexer::SOURCE_SIZE_UNKNOWN =
  35. std::numeric_limits<size_t>::max();
  36. namespace {
  37. typedef boost::shared_ptr<master_lexer_internal::InputSource> InputSourcePtr;
  38. } // end unnamed namespace
  39. using namespace master_lexer_internal;
  40. struct MasterLexer::MasterLexerImpl {
  41. MasterLexerImpl() : source_(NULL), token_(MasterToken::NOT_STARTED),
  42. total_size_(0), popped_size_(0),
  43. paren_count_(0), last_was_eol_(true),
  44. has_previous_(false),
  45. previous_paren_count_(0),
  46. previous_was_eol_(false)
  47. {
  48. separators_.set('\r');
  49. separators_.set('\n');
  50. separators_.set(' ');
  51. separators_.set('\t');
  52. separators_.set('(');
  53. separators_.set(')');
  54. esc_separators_.set('\r');
  55. esc_separators_.set('\n');
  56. }
  57. // A helper method to skip possible comments toward the end of EOL or EOF.
  58. // commonly used by state classes. It returns the corresponding "end-of"
  59. // character in case it's a comment; otherwise it simply returns the
  60. // current character.
  61. int skipComment(int c, bool escaped = false) {
  62. if (c == ';' && !escaped) {
  63. while (true) {
  64. c = source_->getChar();
  65. if (c == '\n' || c == InputSource::END_OF_STREAM) {
  66. return (c);
  67. }
  68. }
  69. }
  70. return (c);
  71. }
  72. bool isTokenEnd(int c, bool escaped) {
  73. // Special case of EOF (end of stream); this is not in the bitmaps
  74. if (c == InputSource::END_OF_STREAM) {
  75. return (true);
  76. }
  77. // In this implementation we only ensure the behavior for unsigned
  78. // range of characters, so we restrict the range of the values up to
  79. // 0x7f = 127
  80. return (escaped ? esc_separators_.test(c & 0x7f) :
  81. separators_.test(c & 0x7f));
  82. }
  83. void setTotalSize() {
  84. assert(source_ != NULL);
  85. if (total_size_ != SOURCE_SIZE_UNKNOWN) {
  86. const size_t current_size = source_->getSize();
  87. if (current_size != SOURCE_SIZE_UNKNOWN) {
  88. total_size_ += current_size;
  89. } else {
  90. total_size_ = SOURCE_SIZE_UNKNOWN;
  91. }
  92. }
  93. }
  94. std::vector<InputSourcePtr> sources_;
  95. InputSource* source_; // current source (NULL if sources_ is empty)
  96. MasterToken token_; // currently recognized token (set by a state)
  97. std::vector<char> data_; // placeholder for string data
  98. // Keep track of the total size of all sources and characters that have
  99. // been read from sources already popped.
  100. size_t total_size_; // accumulated size (# of chars) of sources
  101. size_t popped_size_; // total size of sources that have been popped
  102. // These are used in states, and defined here only as a placeholder.
  103. // The main lexer class does not need these members.
  104. size_t paren_count_; // nest count of the parentheses
  105. bool last_was_eol_; // whether the lexer just passed an end-of-line
  106. // Bitmaps that gives whether a given (positive) character should be
  107. // considered a separator of a string/number token. The esc_ version
  108. // is a subset of the other, excluding characters that can be ignored
  109. // if escaped by a backslash. See isTokenEnd() for the bitmap size.
  110. std::bitset<128> separators_;
  111. std::bitset<128> esc_separators_;
  112. // These are to allow restoring state before previous token.
  113. bool has_previous_;
  114. size_t previous_paren_count_;
  115. bool previous_was_eol_;
  116. };
  117. MasterLexer::MasterLexer() : impl_(new MasterLexerImpl) {
  118. }
  119. MasterLexer::~MasterLexer() {
  120. delete impl_;
  121. }
  122. bool
  123. MasterLexer::pushSource(const char* filename, std::string* error) {
  124. if (filename == NULL) {
  125. isc_throw(InvalidParameter,
  126. "NULL filename for MasterLexer::pushSource");
  127. }
  128. try {
  129. impl_->sources_.push_back(InputSourcePtr(new InputSource(filename)));
  130. } catch (const InputSource::OpenError& ex) {
  131. if (error != NULL) {
  132. *error = ex.what();
  133. }
  134. return (false);
  135. }
  136. impl_->source_ = impl_->sources_.back().get();
  137. impl_->has_previous_ = false;
  138. impl_->last_was_eol_ = true;
  139. impl_->setTotalSize();
  140. return (true);
  141. }
  142. void
  143. MasterLexer::pushSource(std::istream& input) {
  144. impl_->sources_.push_back(InputSourcePtr(new InputSource(input)));
  145. impl_->source_ = impl_->sources_.back().get();
  146. impl_->has_previous_ = false;
  147. impl_->last_was_eol_ = true;
  148. impl_->setTotalSize();
  149. }
  150. void
  151. MasterLexer::popSource() {
  152. if (impl_->sources_.empty()) {
  153. isc_throw(InvalidOperation,
  154. "MasterLexer::popSource on an empty source");
  155. }
  156. impl_->popped_size_ += impl_->source_->getPosition();
  157. impl_->sources_.pop_back();
  158. impl_->source_ = impl_->sources_.empty() ? NULL :
  159. impl_->sources_.back().get();
  160. impl_->has_previous_ = false;
  161. }
  162. size_t
  163. MasterLexer::getSourceCount() const {
  164. return (impl_->sources_.size());
  165. }
  166. std::string
  167. MasterLexer::getSourceName() const {
  168. if (impl_->sources_.empty()) {
  169. return (std::string());
  170. }
  171. return (impl_->sources_.back()->getName());
  172. }
  173. size_t
  174. MasterLexer::getSourceLine() const {
  175. if (impl_->sources_.empty()) {
  176. return (0);
  177. }
  178. return (impl_->sources_.back()->getCurrentLine());
  179. }
  180. size_t
  181. MasterLexer::getTotalSourceSize() const {
  182. return (impl_->total_size_);
  183. }
  184. size_t
  185. MasterLexer::getPosition() const {
  186. size_t position = impl_->popped_size_;
  187. BOOST_FOREACH(InputSourcePtr& src, impl_->sources_) {
  188. position += src->getPosition();
  189. }
  190. return (position);
  191. }
  192. const MasterToken&
  193. MasterLexer::getNextToken(Options options) {
  194. if (impl_->source_ == NULL) {
  195. isc_throw(isc::InvalidOperation, "No source to read tokens from");
  196. }
  197. // Store the current state so we can restore it in ungetToken
  198. impl_->previous_paren_count_ = impl_->paren_count_;
  199. impl_->previous_was_eol_ = impl_->last_was_eol_;
  200. impl_->source_->mark();
  201. impl_->has_previous_ = true;
  202. // Reset the token now. This is to check a token was actually produced.
  203. // This is debugging aid.
  204. impl_->token_ = MasterToken(MasterToken::NO_TOKEN_PRODUCED);
  205. // And get the token
  206. // This actually handles EOF internally too.
  207. const State* state = State::start(*this, options);
  208. if (state != NULL) {
  209. state->handle(*this);
  210. }
  211. // Make sure a token was produced. Since this Can Not Happen, we assert
  212. // here instead of throwing.
  213. assert(impl_->token_.getType() != MasterToken::ERROR ||
  214. impl_->token_.getErrorCode() != MasterToken::NO_TOKEN_PRODUCED);
  215. return (impl_->token_);
  216. }
  217. namespace {
  218. inline MasterLexer::Options
  219. optionsForTokenType(MasterToken::Type expect) {
  220. switch (expect) {
  221. case MasterToken::STRING:
  222. return (MasterLexer::NONE);
  223. case MasterToken::QSTRING:
  224. return (MasterLexer::QSTRING);
  225. case MasterToken::NUMBER:
  226. return (MasterLexer::NUMBER);
  227. default:
  228. isc_throw(InvalidParameter,
  229. "expected type for getNextToken not supported: " << expect);
  230. }
  231. }
  232. }
  233. const MasterToken&
  234. MasterLexer::getNextToken(MasterToken::Type expect, bool eol_ok) {
  235. // Get the next token, specifying an appropriate option corresponding to
  236. // the expected type. The result should be set in impl_->token_.
  237. getNextToken(optionsForTokenType(expect));
  238. if (impl_->token_.getType() == MasterToken::ERROR) {
  239. if (impl_->token_.getErrorCode() == MasterToken::NUMBER_OUT_OF_RANGE) {
  240. ungetToken();
  241. }
  242. throw LexerError(__FILE__, __LINE__, impl_->token_);
  243. }
  244. const bool is_eol_like =
  245. (impl_->token_.getType() == MasterToken::END_OF_LINE ||
  246. impl_->token_.getType() == MasterToken::END_OF_FILE);
  247. if (eol_ok && is_eol_like) {
  248. return (impl_->token_);
  249. }
  250. if (impl_->token_.getType() == MasterToken::STRING &&
  251. expect == MasterToken::QSTRING) {
  252. return (impl_->token_);
  253. }
  254. if (impl_->token_.getType() != expect) {
  255. ungetToken();
  256. if (is_eol_like) {
  257. throw LexerError(__FILE__, __LINE__,
  258. MasterToken(MasterToken::UNEXPECTED_END));
  259. }
  260. assert(expect == MasterToken::NUMBER);
  261. throw LexerError(__FILE__, __LINE__,
  262. MasterToken(MasterToken::BAD_NUMBER));
  263. }
  264. return (impl_->token_);
  265. }
  266. void
  267. MasterLexer::ungetToken() {
  268. if (impl_->has_previous_) {
  269. impl_->has_previous_ = false;
  270. impl_->source_->ungetAll();
  271. impl_->last_was_eol_ = impl_->previous_was_eol_;
  272. impl_->paren_count_ = impl_->previous_paren_count_;
  273. } else {
  274. isc_throw(isc::InvalidOperation, "No token to unget ready");
  275. }
  276. }
  namespace {
  // Human-readable descriptions of the lexer error codes.  The table is
  // indexed directly by the numeric error code, so the entries must stay in
  // the order of the codes named in the trailing comments.
  const char* const error_text[] = {
      "lexer not started",        // NOT_STARTED
      "unbalanced parentheses",   // UNBALANCED_PAREN
      "unexpected end of input",  // UNEXPECTED_END
      "unbalanced quotes",        // UNBALANCED_QUOTES
      "no token produced",        // NO_TOKEN_PRODUCED
      "number out of range",      // NUMBER_OUT_OF_RANGE
      "not a valid number"        // BAD_NUMBER
  };

  // Number of entries in error_text, derived from the table itself.
  const size_t error_text_max_count = sizeof(error_text) / sizeof(*error_text);
  } // end unnamed namespace
  289. std::string
  290. MasterToken::getErrorText() const {
  291. if (type_ != ERROR) {
  292. isc_throw(InvalidOperation,
  293. "MasterToken::getErrorText() for non error type");
  294. }
  295. // The class integrity ensures the following:
  296. assert(val_.error_code_ < error_text_max_count);
  297. return (error_text[val_.error_code_]);
  298. }
  299. namespace master_lexer_internal {
  300. // Below we implement state classes for state transitions of MasterLexer.
  301. // Note that these need to be defined here so that they can refer to
  302. // the details of MasterLexerImpl.
  303. bool
  304. State::wasLastEOL(const MasterLexer& lexer) const {
  305. return (lexer.impl_->last_was_eol_);
  306. }
  307. const MasterToken&
  308. State::getToken(const MasterLexer& lexer) const {
  309. return (lexer.impl_->token_);
  310. }
  311. size_t
  312. State::getParenCount(const MasterLexer& lexer) const {
  313. return (lexer.impl_->paren_count_);
  314. }
  315. namespace {
  316. class CRLF : public State {
  317. public:
  318. CRLF() {}
  319. virtual ~CRLF() {} // see the base class for the destructor
  320. virtual void handle(MasterLexer& lexer) const {
  321. // We've just seen '\r'. If this is part of a sequence of '\r\n',
  322. // we combine them as a single END-OF-LINE. Otherwise we treat the
  323. // single '\r' as an EOL and continue tokeniziation from the character
  324. // immediately after '\r'. One tricky case is that there's a comment
  325. // between '\r' and '\n'. This implementation combines these
  326. // characters and treats them as a single EOL (the behavior derived
  327. // from BIND 9). Technically this may not be correct, but in practice
  328. // the caller wouldn't distinguish this case from the case it has
  329. // two EOLs, so we simplify the process.
  330. const int c = getLexerImpl(lexer)->skipComment(
  331. getLexerImpl(lexer)->source_->getChar());
  332. if (c != '\n') {
  333. getLexerImpl(lexer)->source_->ungetChar();
  334. }
  335. getLexerImpl(lexer)->token_ = MasterToken(MasterToken::END_OF_LINE);
  336. getLexerImpl(lexer)->last_was_eol_ = true;
  337. }
  338. };
  339. class String : public State {
  340. public:
  341. String() {}
  342. virtual ~String() {} // see the base class for the destructor
  343. virtual void handle(MasterLexer& lexer) const;
  344. };
  345. class QString : public State {
  346. public:
  347. QString() {}
  348. virtual ~QString() {} // see the base class for the destructor
  349. virtual void handle(MasterLexer& lexer) const;
  350. };
  351. class Number : public State {
  352. public:
  353. Number() {}
  354. virtual ~Number() {}
  355. virtual void handle(MasterLexer& lexer) const;
  356. };
  357. // We use a common instance of a each state in a singleton-like way to save
  358. // construction overhead. They are not singletons in its strict sense as
  359. // we don't prohibit direct construction of these objects. But that doesn't
  360. // matter much anyway, because the definitions are completely hidden within
  361. // this file.
  362. const CRLF CRLF_STATE;
  363. const String STRING_STATE;
  364. const QString QSTRING_STATE;
  365. const Number NUMBER_STATE;
  366. } // end unnamed namespace
  367. const State&
  368. State::getInstance(ID state_id) {
  369. switch (state_id) {
  370. case CRLF:
  371. return (CRLF_STATE);
  372. case String:
  373. return (STRING_STATE);
  374. case QString:
  375. return (QSTRING_STATE);
  376. case Number:
  377. return (NUMBER_STATE);
  378. }
  379. // This is a bug of the caller, and this method is only expected to be
  380. // used by tests, so we just forcefully make it fail by asserting the
  381. // condition.
  382. assert(false);
  383. return (STRING_STATE); // a dummy return, to silence some compilers.
  384. }
  385. const State*
  386. State::start(MasterLexer& lexer, MasterLexer::Options options) {
  387. // define some shortcuts
  388. MasterLexer::MasterLexerImpl& lexerimpl = *lexer.impl_;
  389. size_t& paren_count = lexerimpl.paren_count_;
  390. // Note: the if-else in the loop is getting complicated. When we complete
  391. // #2374, revisit the organization to see if we need a fundamental
  392. // refactoring.
  393. while (true) {
  394. const int c = lexerimpl.skipComment(lexerimpl.source_->getChar());
  395. if (c == InputSource::END_OF_STREAM) {
  396. lexerimpl.last_was_eol_ = false;
  397. if (paren_count != 0) {
  398. lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
  399. paren_count = 0; // reset to 0; this helps in lenient mode.
  400. return (NULL);
  401. }
  402. lexerimpl.token_ = MasterToken(MasterToken::END_OF_FILE);
  403. return (NULL);
  404. } else if (c == ' ' || c == '\t') {
  405. // If requested and we are not in (), recognize the initial space.
  406. if (lexerimpl.last_was_eol_ && paren_count == 0 &&
  407. (options & MasterLexer::INITIAL_WS) != 0) {
  408. lexerimpl.last_was_eol_ = false;
  409. lexerimpl.token_ = MasterToken(MasterToken::INITIAL_WS);
  410. return (NULL);
  411. }
  412. } else if (c == '\n') {
  413. lexerimpl.last_was_eol_ = true;
  414. if (paren_count == 0) { // we don't recognize EOL if we are in ()
  415. lexerimpl.token_ = MasterToken(MasterToken::END_OF_LINE);
  416. return (NULL);
  417. }
  418. } else if (c == '\r') {
  419. if (paren_count == 0) { // check if we are in () (see above)
  420. return (&CRLF_STATE);
  421. }
  422. } else if (c == '"' && (options & MasterLexer::QSTRING) != 0) {
  423. lexerimpl.last_was_eol_ = false;
  424. return (&QSTRING_STATE);
  425. } else if (c == '(') {
  426. lexerimpl.last_was_eol_ = false;
  427. ++paren_count;
  428. } else if (c == ')') {
  429. lexerimpl.last_was_eol_ = false;
  430. if (paren_count == 0) {
  431. lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
  432. return (NULL);
  433. }
  434. --paren_count;
  435. } else if ((options & MasterLexer::NUMBER) != 0 &&isdigit(c)) {
  436. lexerimpl.last_was_eol_ = false;
  437. // this character will be handled in the number state
  438. lexerimpl.source_->ungetChar();
  439. return (&NUMBER_STATE);
  440. } else {
  441. // this character will be handled in the string state
  442. lexerimpl.source_->ungetChar();
  443. lexerimpl.last_was_eol_ = false;
  444. return (&STRING_STATE);
  445. }
  446. // no code should be here; we just continue the loop.
  447. }
  448. }
  449. void
  450. String::handle(MasterLexer& lexer) const {
  451. std::vector<char>& data = getLexerImpl(lexer)->data_;
  452. data.clear();
  453. bool escaped = false;
  454. while (true) {
  455. const int c = getLexerImpl(lexer)->skipComment(
  456. getLexerImpl(lexer)->source_->getChar(), escaped);
  457. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  458. getLexerImpl(lexer)->source_->ungetChar();
  459. // make sure it nul-terminated as a c-str (excluded from token
  460. // data).
  461. data.push_back('\0');
  462. getLexerImpl(lexer)->token_ =
  463. MasterToken(&data.at(0), data.size() - 1);
  464. return;
  465. }
  466. escaped = (c == '\\' && !escaped);
  467. data.push_back(c);
  468. }
  469. }
  470. void
  471. QString::handle(MasterLexer& lexer) const {
  472. MasterToken& token = getLexerImpl(lexer)->token_;
  473. std::vector<char>& data = getLexerImpl(lexer)->data_;
  474. data.clear();
  475. bool escaped = false;
  476. while (true) {
  477. const int c = getLexerImpl(lexer)->source_->getChar();
  478. if (c == InputSource::END_OF_STREAM) {
  479. token = MasterToken(MasterToken::UNEXPECTED_END);
  480. return;
  481. } else if (c == '"') {
  482. if (escaped) {
  483. // found escaped '"'. overwrite the preceding backslash.
  484. assert(!data.empty());
  485. escaped = false;
  486. data.back() = '"';
  487. } else {
  488. // make sure it nul-terminated as a c-str (excluded from token
  489. // data). This also simplifies the case of an empty string.
  490. data.push_back('\0');
  491. token = MasterToken(&data.at(0), data.size() - 1, true);
  492. return;
  493. }
  494. } else if (c == '\n' && !escaped) {
  495. getLexerImpl(lexer)->source_->ungetChar();
  496. token = MasterToken(MasterToken::UNBALANCED_QUOTES);
  497. return;
  498. } else {
  499. escaped = (c == '\\' && !escaped);
  500. data.push_back(c);
  501. }
  502. }
  503. }
  504. void
  505. Number::handle(MasterLexer& lexer) const {
  506. MasterToken& token = getLexerImpl(lexer)->token_;
  507. // It may yet turn out to be a string, so we first
  508. // collect all the data
  509. bool digits_only = true;
  510. std::vector<char>& data = getLexerImpl(lexer)->data_;
  511. data.clear();
  512. bool escaped = false;
  513. while (true) {
  514. const int c = getLexerImpl(lexer)->skipComment(
  515. getLexerImpl(lexer)->source_->getChar(), escaped);
  516. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  517. getLexerImpl(lexer)->source_->ungetChar();
  518. // We need to close the string whether it's digits-only (for
  519. // lexical_cast) or not (see String::handle()).
  520. data.push_back('\0');
  521. if (digits_only) {
  522. try {
  523. const uint32_t number32 =
  524. boost::lexical_cast<uint32_t, const char*>(&data[0]);
  525. token = MasterToken(number32);
  526. } catch (const boost::bad_lexical_cast&) {
  527. // Since we already know we have only digits,
  528. // range should be the only possible problem.
  529. token = MasterToken(MasterToken::NUMBER_OUT_OF_RANGE);
  530. }
  531. } else {
  532. token = MasterToken(&data.at(0), data.size() - 1);
  533. }
  534. return;
  535. }
  536. if (!isdigit(c)) {
  537. digits_only = false;
  538. }
  539. escaped = (c == '\\' && !escaped);
  540. data.push_back(c);
  541. }
  542. }
  543. } // namespace master_lexer_internal
  544. } // end of namespace dns
  545. } // end of namespace isc