master_lexer.cc 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. // Copyright (C) 2012 Internet Systems Consortium, Inc. ("ISC")
  2. //
  3. // Permission to use, copy, modify, and/or distribute this software for any
  4. // purpose with or without fee is hereby granted, provided that the above
  5. // copyright notice and this permission notice appear in all copies.
  6. //
  7. // THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  8. // REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  9. // AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  10. // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  11. // LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  12. // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  13. // PERFORMANCE OF THIS SOFTWARE.
  14. #include <exceptions/exceptions.h>
  15. #include <dns/master_lexer.h>
  16. #include <dns/master_lexer_inputsource.h>
  17. #include <dns/master_lexer_state.h>
  18. #include <boost/foreach.hpp>
  19. #include <boost/shared_ptr.hpp>
  20. #include <boost/lexical_cast.hpp>
  21. #include <bitset>
  22. #include <cassert>
  23. #include <limits>
  24. #include <string>
  25. #include <vector>
  26. namespace isc {
  27. namespace dns {
  28. // The definition of SOURCE_SIZE_UNKNOWN. Note that we initialize it using
  29. // a method of another library. Technically, this could trigger a static
  30. // initialization fiasco. But in this particular usage it's very unlikely
  31. // to happen because this value is expected to be used only as a return
  32. // value of a MasterLexer's method, and its constructor needs definitions
  33. // here.
  34. const size_t MasterLexer::SOURCE_SIZE_UNKNOWN =
  35. std::numeric_limits<size_t>::max();
  36. namespace {
  37. typedef boost::shared_ptr<master_lexer_internal::InputSource> InputSourcePtr;
  38. } // end unnamed namespace
  39. using namespace master_lexer_internal;
  40. struct MasterLexer::MasterLexerImpl {
  41. MasterLexerImpl() : source_(NULL), token_(MasterToken::NOT_STARTED),
  42. paren_count_(0), last_was_eol_(true),
  43. has_previous_(false),
  44. previous_paren_count_(0),
  45. previous_was_eol_(false)
  46. {
  47. separators_.set('\r');
  48. separators_.set('\n');
  49. separators_.set(' ');
  50. separators_.set('\t');
  51. separators_.set('(');
  52. separators_.set(')');
  53. esc_separators_.set('\r');
  54. esc_separators_.set('\n');
  55. }
  56. // A helper method to skip possible comments toward the end of EOL or EOF.
  57. // commonly used by state classes. It returns the corresponding "end-of"
  58. // character in case it's a comment; otherwise it simply returns the
  59. // current character.
  60. int skipComment(int c, bool escaped = false) {
  61. if (c == ';' && !escaped) {
  62. while (true) {
  63. c = source_->getChar();
  64. if (c == '\n' || c == InputSource::END_OF_STREAM) {
  65. return (c);
  66. }
  67. }
  68. }
  69. return (c);
  70. }
  71. bool isTokenEnd(int c, bool escaped) {
  72. // Special case of EOF (end of stream); this is not in the bitmaps
  73. if (c == InputSource::END_OF_STREAM) {
  74. return (true);
  75. }
  76. // In this implementation we only ensure the behavior for unsigned
  77. // range of characters, so we restrict the range of the values up to
  78. // 0x7f = 127
  79. return (escaped ? esc_separators_.test(c & 0x7f) :
  80. separators_.test(c & 0x7f));
  81. }
  82. std::vector<InputSourcePtr> sources_;
  83. InputSource* source_; // current source (NULL if sources_ is empty)
  84. MasterToken token_; // currently recognized token (set by a state)
  85. std::vector<char> data_; // placeholder for string data
  86. // These are used in states, and defined here only as a placeholder.
  87. // The main lexer class does not need these members.
  88. size_t paren_count_; // nest count of the parentheses
  89. bool last_was_eol_; // whether the lexer just passed an end-of-line
  90. // Bitmaps that gives whether a given (positive) character should be
  91. // considered a separator of a string/number token. The esc_ version
  92. // is a subset of the other, excluding characters that can be ignored
  93. // if escaped by a backslash. See isTokenEnd() for the bitmap size.
  94. std::bitset<128> separators_;
  95. std::bitset<128> esc_separators_;
  96. // These are to allow restoring state before previous token.
  97. bool has_previous_;
  98. size_t previous_paren_count_;
  99. bool previous_was_eol_;
  100. };
  101. MasterLexer::MasterLexer() : impl_(new MasterLexerImpl) {
  102. }
  103. MasterLexer::~MasterLexer() {
  104. delete impl_;
  105. }
  106. bool
  107. MasterLexer::pushSource(const char* filename, std::string* error) {
  108. if (filename == NULL) {
  109. isc_throw(InvalidParameter,
  110. "NULL filename for MasterLexer::pushSource");
  111. }
  112. try {
  113. impl_->sources_.push_back(InputSourcePtr(new InputSource(filename)));
  114. } catch (const InputSource::OpenError& ex) {
  115. if (error != NULL) {
  116. *error = ex.what();
  117. }
  118. return (false);
  119. }
  120. impl_->source_ = impl_->sources_.back().get();
  121. impl_->has_previous_ = false;
  122. impl_->last_was_eol_ = true;
  123. return (true);
  124. }
  125. void
  126. MasterLexer::pushSource(std::istream& input) {
  127. impl_->sources_.push_back(InputSourcePtr(new InputSource(input)));
  128. impl_->source_ = impl_->sources_.back().get();
  129. impl_->has_previous_ = false;
  130. impl_->last_was_eol_ = true;
  131. }
  132. void
  133. MasterLexer::popSource() {
  134. if (impl_->sources_.empty()) {
  135. isc_throw(InvalidOperation,
  136. "MasterLexer::popSource on an empty source");
  137. }
  138. impl_->sources_.pop_back();
  139. impl_->source_ = impl_->sources_.empty() ? NULL :
  140. impl_->sources_.back().get();
  141. impl_->has_previous_ = false;
  142. }
  143. size_t
  144. MasterLexer::getSourceCount() const {
  145. return (impl_->sources_.size());
  146. }
  147. std::string
  148. MasterLexer::getSourceName() const {
  149. if (impl_->sources_.empty()) {
  150. return (std::string());
  151. }
  152. return (impl_->sources_.back()->getName());
  153. }
  154. size_t
  155. MasterLexer::getSourceLine() const {
  156. if (impl_->sources_.empty()) {
  157. return (0);
  158. }
  159. return (impl_->sources_.back()->getCurrentLine());
  160. }
  161. size_t
  162. MasterLexer::getTotalSourceSize() const {
  163. size_t total_size = 0;
  164. BOOST_FOREACH(InputSourcePtr& src, impl_->sources_) {
  165. total_size += src->getSize();
  166. }
  167. return (total_size);
  168. }
  169. size_t
  170. MasterLexer::getPosition() const {
  171. size_t position = 0;
  172. BOOST_FOREACH(InputSourcePtr& src, impl_->sources_) {
  173. position += src->getPosition();
  174. }
  175. return (position);
  176. }
  177. const MasterToken&
  178. MasterLexer::getNextToken(Options options) {
  179. if (impl_->source_ == NULL) {
  180. isc_throw(isc::InvalidOperation, "No source to read tokens from");
  181. }
  182. // Store the current state so we can restore it in ungetToken
  183. impl_->previous_paren_count_ = impl_->paren_count_;
  184. impl_->previous_was_eol_ = impl_->last_was_eol_;
  185. impl_->source_->mark();
  186. impl_->has_previous_ = true;
  187. // Reset the token now. This is to check a token was actually produced.
  188. // This is debugging aid.
  189. impl_->token_ = MasterToken(MasterToken::NO_TOKEN_PRODUCED);
  190. // And get the token
  191. // This actually handles EOF internally too.
  192. const State* state = State::start(*this, options);
  193. if (state != NULL) {
  194. state->handle(*this);
  195. }
  196. // Make sure a token was produced. Since this Can Not Happen, we assert
  197. // here instead of throwing.
  198. assert(impl_->token_.getType() != MasterToken::ERROR ||
  199. impl_->token_.getErrorCode() != MasterToken::NO_TOKEN_PRODUCED);
  200. return (impl_->token_);
  201. }
  202. namespace {
  203. inline MasterLexer::Options
  204. optionsForTokenType(MasterToken::Type expect) {
  205. switch (expect) {
  206. case MasterToken::STRING:
  207. return (MasterLexer::NONE);
  208. case MasterToken::QSTRING:
  209. return (MasterLexer::QSTRING);
  210. case MasterToken::NUMBER:
  211. return (MasterLexer::NUMBER);
  212. default:
  213. isc_throw(InvalidParameter,
  214. "expected type for getNextToken not supported: " << expect);
  215. }
  216. }
  217. }
  218. const MasterToken&
  219. MasterLexer::getNextToken(MasterToken::Type expect, bool eol_ok) {
  220. // Get the next token, specifying an appropriate option corresponding to
  221. // the expected type. The result should be set in impl_->token_.
  222. getNextToken(optionsForTokenType(expect));
  223. if (impl_->token_.getType() == MasterToken::ERROR) {
  224. if (impl_->token_.getErrorCode() == MasterToken::NUMBER_OUT_OF_RANGE) {
  225. ungetToken();
  226. }
  227. throw LexerError(__FILE__, __LINE__, impl_->token_);
  228. }
  229. const bool is_eol_like =
  230. (impl_->token_.getType() == MasterToken::END_OF_LINE ||
  231. impl_->token_.getType() == MasterToken::END_OF_FILE);
  232. if (eol_ok && is_eol_like) {
  233. return (impl_->token_);
  234. }
  235. if (impl_->token_.getType() == MasterToken::STRING &&
  236. expect == MasterToken::QSTRING) {
  237. return (impl_->token_);
  238. }
  239. if (impl_->token_.getType() != expect) {
  240. ungetToken();
  241. if (is_eol_like) {
  242. throw LexerError(__FILE__, __LINE__,
  243. MasterToken(MasterToken::UNEXPECTED_END));
  244. }
  245. assert(expect == MasterToken::NUMBER);
  246. throw LexerError(__FILE__, __LINE__,
  247. MasterToken(MasterToken::BAD_NUMBER));
  248. }
  249. return (impl_->token_);
  250. }
  251. void
  252. MasterLexer::ungetToken() {
  253. if (impl_->has_previous_) {
  254. impl_->has_previous_ = false;
  255. impl_->source_->ungetAll();
  256. impl_->last_was_eol_ = impl_->previous_was_eol_;
  257. impl_->paren_count_ = impl_->previous_paren_count_;
  258. } else {
  259. isc_throw(isc::InvalidOperation, "No token to unget ready");
  260. }
  261. }
  262. namespace {
  263. const char* const error_text[] = {
  264. "lexer not started", // NOT_STARTED
  265. "unbalanced parentheses", // UNBALANCED_PAREN
  266. "unexpected end of input", // UNEXPECTED_END
  267. "unbalanced quotes", // UNBALANCED_QUOTES
  268. "no token produced", // NO_TOKEN_PRODUCED
  269. "number out of range", // NUMBER_OUT_OF_RANGE
  270. "not a valid number" // BAD_NUMBER
  271. };
  272. const size_t error_text_max_count = sizeof(error_text) / sizeof(error_text[0]);
  273. } // end unnamed namespace
  274. std::string
  275. MasterToken::getErrorText() const {
  276. if (type_ != ERROR) {
  277. isc_throw(InvalidOperation,
  278. "MasterToken::getErrorText() for non error type");
  279. }
  280. // The class integrity ensures the following:
  281. assert(val_.error_code_ < error_text_max_count);
  282. return (error_text[val_.error_code_]);
  283. }
  284. namespace master_lexer_internal {
  285. // Below we implement state classes for state transitions of MasterLexer.
  286. // Note that these need to be defined here so that they can refer to
  287. // the details of MasterLexerImpl.
  288. bool
  289. State::wasLastEOL(const MasterLexer& lexer) const {
  290. return (lexer.impl_->last_was_eol_);
  291. }
  292. const MasterToken&
  293. State::getToken(const MasterLexer& lexer) const {
  294. return (lexer.impl_->token_);
  295. }
  296. size_t
  297. State::getParenCount(const MasterLexer& lexer) const {
  298. return (lexer.impl_->paren_count_);
  299. }
  300. namespace {
  301. class CRLF : public State {
  302. public:
  303. CRLF() {}
  304. virtual ~CRLF() {} // see the base class for the destructor
  305. virtual void handle(MasterLexer& lexer) const {
  306. // We've just seen '\r'. If this is part of a sequence of '\r\n',
  307. // we combine them as a single END-OF-LINE. Otherwise we treat the
  308. // single '\r' as an EOL and continue tokeniziation from the character
  309. // immediately after '\r'. One tricky case is that there's a comment
  310. // between '\r' and '\n'. This implementation combines these
  311. // characters and treats them as a single EOL (the behavior derived
  312. // from BIND 9). Technically this may not be correct, but in practice
  313. // the caller wouldn't distinguish this case from the case it has
  314. // two EOLs, so we simplify the process.
  315. const int c = getLexerImpl(lexer)->skipComment(
  316. getLexerImpl(lexer)->source_->getChar());
  317. if (c != '\n') {
  318. getLexerImpl(lexer)->source_->ungetChar();
  319. }
  320. getLexerImpl(lexer)->token_ = MasterToken(MasterToken::END_OF_LINE);
  321. getLexerImpl(lexer)->last_was_eol_ = true;
  322. }
  323. };
  324. class String : public State {
  325. public:
  326. String() {}
  327. virtual ~String() {} // see the base class for the destructor
  328. virtual void handle(MasterLexer& lexer) const;
  329. };
  330. class QString : public State {
  331. public:
  332. QString() {}
  333. virtual ~QString() {} // see the base class for the destructor
  334. virtual void handle(MasterLexer& lexer) const;
  335. };
  336. class Number : public State {
  337. public:
  338. Number() {}
  339. virtual ~Number() {}
  340. virtual void handle(MasterLexer& lexer) const;
  341. };
  342. // We use a common instance of a each state in a singleton-like way to save
  343. // construction overhead. They are not singletons in its strict sense as
  344. // we don't prohibit direct construction of these objects. But that doesn't
  345. // matter much anyway, because the definitions are completely hidden within
  346. // this file.
  347. const CRLF CRLF_STATE;
  348. const String STRING_STATE;
  349. const QString QSTRING_STATE;
  350. const Number NUMBER_STATE;
  351. } // end unnamed namespace
  352. const State&
  353. State::getInstance(ID state_id) {
  354. switch (state_id) {
  355. case CRLF:
  356. return (CRLF_STATE);
  357. case String:
  358. return (STRING_STATE);
  359. case QString:
  360. return (QSTRING_STATE);
  361. case Number:
  362. return (NUMBER_STATE);
  363. }
  364. // This is a bug of the caller, and this method is only expected to be
  365. // used by tests, so we just forcefully make it fail by asserting the
  366. // condition.
  367. assert(false);
  368. return (STRING_STATE); // a dummy return, to silence some compilers.
  369. }
  370. const State*
  371. State::start(MasterLexer& lexer, MasterLexer::Options options) {
  372. // define some shortcuts
  373. MasterLexer::MasterLexerImpl& lexerimpl = *lexer.impl_;
  374. size_t& paren_count = lexerimpl.paren_count_;
  375. // Note: the if-else in the loop is getting complicated. When we complete
  376. // #2374, revisit the organization to see if we need a fundamental
  377. // refactoring.
  378. while (true) {
  379. const int c = lexerimpl.skipComment(lexerimpl.source_->getChar());
  380. if (c == InputSource::END_OF_STREAM) {
  381. lexerimpl.last_was_eol_ = false;
  382. if (paren_count != 0) {
  383. lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
  384. paren_count = 0; // reset to 0; this helps in lenient mode.
  385. return (NULL);
  386. }
  387. lexerimpl.token_ = MasterToken(MasterToken::END_OF_FILE);
  388. return (NULL);
  389. } else if (c == ' ' || c == '\t') {
  390. // If requested and we are not in (), recognize the initial space.
  391. if (lexerimpl.last_was_eol_ && paren_count == 0 &&
  392. (options & MasterLexer::INITIAL_WS) != 0) {
  393. lexerimpl.last_was_eol_ = false;
  394. lexerimpl.token_ = MasterToken(MasterToken::INITIAL_WS);
  395. return (NULL);
  396. }
  397. } else if (c == '\n') {
  398. lexerimpl.last_was_eol_ = true;
  399. if (paren_count == 0) { // we don't recognize EOL if we are in ()
  400. lexerimpl.token_ = MasterToken(MasterToken::END_OF_LINE);
  401. return (NULL);
  402. }
  403. } else if (c == '\r') {
  404. if (paren_count == 0) { // check if we are in () (see above)
  405. return (&CRLF_STATE);
  406. }
  407. } else if (c == '"' && (options & MasterLexer::QSTRING) != 0) {
  408. lexerimpl.last_was_eol_ = false;
  409. return (&QSTRING_STATE);
  410. } else if (c == '(') {
  411. lexerimpl.last_was_eol_ = false;
  412. ++paren_count;
  413. } else if (c == ')') {
  414. lexerimpl.last_was_eol_ = false;
  415. if (paren_count == 0) {
  416. lexerimpl.token_ = MasterToken(MasterToken::UNBALANCED_PAREN);
  417. return (NULL);
  418. }
  419. --paren_count;
  420. } else if ((options & MasterLexer::NUMBER) != 0 &&isdigit(c)) {
  421. lexerimpl.last_was_eol_ = false;
  422. // this character will be handled in the number state
  423. lexerimpl.source_->ungetChar();
  424. return (&NUMBER_STATE);
  425. } else {
  426. // this character will be handled in the string state
  427. lexerimpl.source_->ungetChar();
  428. lexerimpl.last_was_eol_ = false;
  429. return (&STRING_STATE);
  430. }
  431. // no code should be here; we just continue the loop.
  432. }
  433. }
  434. void
  435. String::handle(MasterLexer& lexer) const {
  436. std::vector<char>& data = getLexerImpl(lexer)->data_;
  437. data.clear();
  438. bool escaped = false;
  439. while (true) {
  440. const int c = getLexerImpl(lexer)->skipComment(
  441. getLexerImpl(lexer)->source_->getChar(), escaped);
  442. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  443. getLexerImpl(lexer)->source_->ungetChar();
  444. // make sure it nul-terminated as a c-str (excluded from token
  445. // data).
  446. data.push_back('\0');
  447. getLexerImpl(lexer)->token_ =
  448. MasterToken(&data.at(0), data.size() - 1);
  449. return;
  450. }
  451. escaped = (c == '\\' && !escaped);
  452. data.push_back(c);
  453. }
  454. }
  455. void
  456. QString::handle(MasterLexer& lexer) const {
  457. MasterToken& token = getLexerImpl(lexer)->token_;
  458. std::vector<char>& data = getLexerImpl(lexer)->data_;
  459. data.clear();
  460. bool escaped = false;
  461. while (true) {
  462. const int c = getLexerImpl(lexer)->source_->getChar();
  463. if (c == InputSource::END_OF_STREAM) {
  464. token = MasterToken(MasterToken::UNEXPECTED_END);
  465. return;
  466. } else if (c == '"') {
  467. if (escaped) {
  468. // found escaped '"'. overwrite the preceding backslash.
  469. assert(!data.empty());
  470. escaped = false;
  471. data.back() = '"';
  472. } else {
  473. // make sure it nul-terminated as a c-str (excluded from token
  474. // data). This also simplifies the case of an empty string.
  475. data.push_back('\0');
  476. token = MasterToken(&data.at(0), data.size() - 1, true);
  477. return;
  478. }
  479. } else if (c == '\n' && !escaped) {
  480. getLexerImpl(lexer)->source_->ungetChar();
  481. token = MasterToken(MasterToken::UNBALANCED_QUOTES);
  482. return;
  483. } else {
  484. escaped = (c == '\\' && !escaped);
  485. data.push_back(c);
  486. }
  487. }
  488. }
  489. void
  490. Number::handle(MasterLexer& lexer) const {
  491. MasterToken& token = getLexerImpl(lexer)->token_;
  492. // It may yet turn out to be a string, so we first
  493. // collect all the data
  494. bool digits_only = true;
  495. std::vector<char>& data = getLexerImpl(lexer)->data_;
  496. data.clear();
  497. bool escaped = false;
  498. while (true) {
  499. const int c = getLexerImpl(lexer)->skipComment(
  500. getLexerImpl(lexer)->source_->getChar(), escaped);
  501. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  502. getLexerImpl(lexer)->source_->ungetChar();
  503. // We need to close the string whether it's digits-only (for
  504. // lexical_cast) or not (see String::handle()).
  505. data.push_back('\0');
  506. if (digits_only) {
  507. try {
  508. const uint32_t number32 =
  509. boost::lexical_cast<uint32_t, const char*>(&data[0]);
  510. token = MasterToken(number32);
  511. } catch (const boost::bad_lexical_cast&) {
  512. // Since we already know we have only digits,
  513. // range should be the only possible problem.
  514. token = MasterToken(MasterToken::NUMBER_OUT_OF_RANGE);
  515. }
  516. } else {
  517. token = MasterToken(&data.at(0), data.size() - 1);
  518. }
  519. return;
  520. }
  521. if (!isdigit(c)) {
  522. digits_only = false;
  523. }
  524. escaped = (c == '\\' && !escaped);
  525. data.push_back(c);
  526. }
  527. }
  528. } // namespace master_lexer_internal
  529. } // end of namespace dns
  530. } // end of namespace isc