// master_lexer.cc
  1. // Copyright (C) 2012 Internet Systems Consortium, Inc. ("ISC")
  2. //
  3. // Permission to use, copy, modify, and/or distribute this software for any
  4. // purpose with or without fee is hereby granted, provided that the above
  5. // copyright notice and this permission notice appear in all copies.
  6. //
  7. // THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  8. // REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  9. // AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  10. // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  11. // LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  12. // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  13. // PERFORMANCE OF THIS SOFTWARE.
  14. #include <exceptions/exceptions.h>
  15. #include <dns/master_lexer.h>
  16. #include <dns/master_lexer_inputsource.h>
  17. #include <dns/master_lexer_state.h>
  18. #include <boost/shared_ptr.hpp>
  19. #include <bitset>
  20. #include <cassert>
  21. #include <string>
  22. #include <vector>
  23. namespace isc {
  24. namespace dns {
  25. namespace {
  26. typedef boost::shared_ptr<master_lexer_internal::InputSource> InputSourcePtr;
  27. }
  28. using namespace master_lexer_internal;
  29. struct MasterLexer::MasterLexerImpl {
  30. MasterLexerImpl() : source_(NULL), token_(Token::NOT_STARTED),
  31. paren_count_(0), last_was_eol_(false)
  32. {
  33. separators_.set('\r');
  34. separators_.set('\n');
  35. separators_.set(' ');
  36. separators_.set('\t');
  37. separators_.set('(');
  38. separators_.set(')');
  39. esc_separators_.set('\r');
  40. esc_separators_.set('\n');
  41. }
  42. // A helper method to skip possible comments toward the end of EOL or EOF.
  43. // commonly used by state classes. It returns the corresponding "end-of"
  44. // character in case it's a comment; otherwise it simply returns the
  45. // current character.
  46. int skipComment(int c, bool escaped = false) {
  47. if (c == ';' && !escaped) {
  48. while (true) {
  49. c = source_->getChar();
  50. if (c == '\n' || c == InputSource::END_OF_STREAM) {
  51. return (c);
  52. }
  53. }
  54. }
  55. return (c);
  56. }
  57. bool isTokenEnd(int c, bool escaped) {
  58. // Special case of EOF (end of stream); this is not in the bitmaps
  59. if (c == InputSource::END_OF_STREAM) {
  60. return (true);
  61. }
  62. // In this implementation we only ensure the behavior for unsigned
  63. // range of characters, so we restrict the range of the values up to
  64. // 0x7f = 127
  65. return (escaped ? esc_separators_.test(c & 0x7f) :
  66. separators_.test(c & 0x7f));
  67. }
  68. std::vector<InputSourcePtr> sources_;
  69. InputSource* source_; // current source (NULL if sources_ is empty)
  70. Token token_; // currently recognized token (set by a state)
  71. std::vector<char> data_; // placeholder for string data
  72. // These are used in states, and defined here only as a placeholder.
  73. // The main lexer class does not need these members.
  74. size_t paren_count_; // nest count of the parentheses
  75. bool last_was_eol_; // whether the lexer just passed an end-of-line
  76. // Bitmaps that gives whether a given (positive) character should be
  77. // considered a separator of a string/number token. The esc_ version
  78. // is a subset of the other, excluding characters that can be ignored
  79. // if escaped by a backslash. See isTokenEnd() for the bitmap size.
  80. std::bitset<128> separators_;
  81. std::bitset<128> esc_separators_;
  82. };
  83. MasterLexer::MasterLexer() : impl_(new MasterLexerImpl) {
  84. }
  85. MasterLexer::~MasterLexer() {
  86. delete impl_;
  87. }
  88. bool
  89. MasterLexer::pushSource(const char* filename, std::string* error) {
  90. if (filename == NULL) {
  91. isc_throw(InvalidParameter,
  92. "NULL filename for MasterLexer::pushSource");
  93. }
  94. try {
  95. impl_->sources_.push_back(InputSourcePtr(new InputSource(filename)));
  96. } catch (const InputSource::OpenError& ex) {
  97. if (error != NULL) {
  98. *error = ex.what();
  99. }
  100. return (false);
  101. }
  102. impl_->source_ = impl_->sources_.back().get();
  103. return (true);
  104. }
  105. void
  106. MasterLexer::pushSource(std::istream& input) {
  107. impl_->sources_.push_back(InputSourcePtr(new InputSource(input)));
  108. impl_->source_ = impl_->sources_.back().get();
  109. }
  110. void
  111. MasterLexer::popSource() {
  112. if (impl_->sources_.empty()) {
  113. isc_throw(InvalidOperation,
  114. "MasterLexer::popSource on an empty source");
  115. }
  116. impl_->sources_.pop_back();
  117. impl_->source_ = impl_->sources_.empty() ? NULL :
  118. impl_->sources_.back().get();
  119. }
  120. std::string
  121. MasterLexer::getSourceName() const {
  122. if (impl_->sources_.empty()) {
  123. return (std::string());
  124. }
  125. return (impl_->sources_.back()->getName());
  126. }
  127. size_t
  128. MasterLexer::getSourceLine() const {
  129. if (impl_->sources_.empty()) {
  130. return (0);
  131. }
  132. return (impl_->sources_.back()->getCurrentLine());
  133. }
  134. namespace {
  135. const char* const error_text[] = {
  136. "lexer not started", // NOT_STARTED
  137. "unbalanced parentheses", // UNBALANCED_PAREN
  138. "unexpected end of input", // UNEXPECTED_END
  139. "unbalanced quotes" // UNBALANCED_QUOTES
  140. };
  141. const size_t error_text_max_count = sizeof(error_text) / sizeof(error_text[0]);
  142. }
  143. std::string
  144. MasterLexer::Token::getErrorText() const {
  145. if (type_ != ERROR) {
  146. isc_throw(InvalidOperation,
  147. "Token::getErrorText() for non error type");
  148. }
  149. // The class integrity ensures the following:
  150. assert(val_.error_code_ < error_text_max_count);
  151. return (error_text[val_.error_code_]);
  152. }
  153. namespace master_lexer_internal {
  154. // Below we implement state classes for state transitions of MasterLexer.
  155. // Note that these need to be defined here so that they can refer to
  156. // the details of MasterLexerImpl.
  157. typedef MasterLexer::Token Token; // convenience shortcut
  158. bool
  159. State::wasLastEOL(const MasterLexer& lexer) const {
  160. return (lexer.impl_->last_was_eol_);
  161. }
  162. const MasterLexer::Token&
  163. State::getToken(const MasterLexer& lexer) const {
  164. return (lexer.impl_->token_);
  165. }
  166. size_t
  167. State::getParenCount(const MasterLexer& lexer) const {
  168. return (lexer.impl_->paren_count_);
  169. }
  170. namespace {
  171. class CRLF : public State {
  172. public:
  173. CRLF() {}
  174. virtual ~CRLF() {} // see the base class for the destructor
  175. virtual const State* handle(MasterLexer& lexer) const {
  176. // We've just seen '\r'. If this is part of a sequence of '\r\n',
  177. // we combine them as a single END-OF-LINE. Otherwise we treat the
  178. // single '\r' as an EOL and continue tokeniziation from the character
  179. // immediately after '\r'. One tricky case is that there's a comment
  180. // between '\r' and '\n'. This implementation combines these
  181. // characters and treats them as a single EOL (the behavior derived
  182. // from BIND 9). Technically this may not be correct, but in practice
  183. // the caller wouldn't distinguish this case from the case it has
  184. // two EOLs, so we simplify the process.
  185. const int c = getLexerImpl(lexer)->skipComment(
  186. getLexerImpl(lexer)->source_->getChar());
  187. if (c != '\n') {
  188. getLexerImpl(lexer)->source_->ungetChar();
  189. }
  190. getLexerImpl(lexer)->token_ = Token(Token::END_OF_LINE);
  191. getLexerImpl(lexer)->last_was_eol_ = true;
  192. return (NULL);
  193. }
  194. };
  195. class String : public State {
  196. public:
  197. String() {}
  198. virtual ~String() {} // see the base class for the destructor
  199. virtual const State* handle(MasterLexer& lexer) const;
  200. };
  201. class QString : public State {
  202. public:
  203. QString() {}
  204. virtual ~QString() {} // see the base class for the destructor
  205. virtual const State* handle(MasterLexer& lexer) const;
  206. };
  207. // We use a common instance of a each state in a singleton-like way to save
  208. // construction overhead. They are not singletons in its strict sense as
  209. // we don't prohibit direct construction of these objects. But that doesn't
  210. // matter much anyway, because the definitions are completely hidden within
  211. // this file.
  212. const CRLF CRLF_STATE;
  213. const String STRING_STATE;
  214. const QString QSTRING_STATE;
  215. }
  216. const State&
  217. State::getInstance(ID state_id) {
  218. switch (state_id) {
  219. case CRLF:
  220. return (CRLF_STATE);
  221. case String:
  222. return (STRING_STATE);
  223. case QString:
  224. return (QSTRING_STATE);
  225. }
  226. // This is a bug of the caller, and this method is only expected to be
  227. // used by tests, so we just forcefully make it fail by asserting the
  228. // condition.
  229. assert(false);
  230. return (STRING_STATE); // a dummy return, to silence some compilers.
  231. }
  232. const State*
  233. State::start(MasterLexer& lexer, MasterLexer::Options options) {
  234. // define some shortcuts
  235. MasterLexer::MasterLexerImpl& lexerimpl = *lexer.impl_;
  236. size_t& paren_count = lexerimpl.paren_count_;
  237. while (true) {
  238. const int c = lexerimpl.skipComment(lexerimpl.source_->getChar());
  239. if (c == InputSource::END_OF_STREAM) {
  240. lexerimpl.last_was_eol_ = false;
  241. if (paren_count != 0) {
  242. lexerimpl.token_ = Token(Token::UNBALANCED_PAREN);
  243. paren_count = 0; // reset to 0; this helps in lenient mode.
  244. return (NULL);
  245. }
  246. lexerimpl.token_ = Token(Token::END_OF_FILE);
  247. return (NULL);
  248. } else if (c == ' ' || c == '\t') {
  249. // If requested and we are not in (), recognize the initial space.
  250. if (lexerimpl.last_was_eol_ && paren_count == 0 &&
  251. (options & MasterLexer::INITIAL_WS) != 0) {
  252. lexerimpl.last_was_eol_ = false;
  253. lexerimpl.token_ = Token(Token::INITIAL_WS);
  254. return (NULL);
  255. }
  256. } else if (c == '\n') {
  257. lexerimpl.last_was_eol_ = true;
  258. if (paren_count == 0) { // we don't recognize EOL if we are in ()
  259. lexerimpl.token_ = Token(Token::END_OF_LINE);
  260. return (NULL);
  261. }
  262. } else if (c == '\r') {
  263. if (paren_count == 0) { // check if we are in () (see above)
  264. return (&CRLF_STATE);
  265. }
  266. } else if (c == '"' && (options & MasterLexer::QSTRING) != 0) {
  267. lexerimpl.last_was_eol_ = false;
  268. return (&QSTRING_STATE);
  269. } else if (c == '(') {
  270. lexerimpl.last_was_eol_ = false;
  271. ++paren_count;
  272. } else if (c == ')') {
  273. lexerimpl.last_was_eol_ = false;
  274. if (paren_count == 0) {
  275. lexerimpl.token_ = Token(Token::UNBALANCED_PAREN);
  276. return (NULL);
  277. }
  278. --paren_count;
  279. } else {
  280. // this character will be handled in the string state
  281. lexerimpl.source_->ungetChar();
  282. lexerimpl.last_was_eol_ = false;
  283. return (&STRING_STATE);
  284. }
  285. // no code should be here; we just continue the loop.
  286. }
  287. }
  288. const State*
  289. String::handle(MasterLexer& lexer) const {
  290. std::vector<char>& data = getLexerImpl(lexer)->data_;
  291. data.clear();
  292. bool escaped = false;
  293. while (true) {
  294. const int c = getLexerImpl(lexer)->skipComment(
  295. getLexerImpl(lexer)->source_->getChar(), escaped);
  296. if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
  297. getLexerImpl(lexer)->source_->ungetChar();
  298. getLexerImpl(lexer)->token_ =
  299. MasterLexer::Token(&data.at(0), data.size());
  300. return (NULL);
  301. }
  302. escaped = (c == '\\' && !escaped);
  303. data.push_back(c);
  304. }
  305. }
  306. const State*
  307. QString::handle(MasterLexer& lexer) const {
  308. MasterLexer::Token& token = getLexerImpl(lexer)->token_;
  309. std::vector<char>& data = getLexerImpl(lexer)->data_;
  310. data.clear();
  311. bool escaped = false;
  312. while (true) {
  313. const int c = getLexerImpl(lexer)->source_->getChar();
  314. if (c == InputSource::END_OF_STREAM) {
  315. token = Token(Token::UNEXPECTED_END);
  316. return (NULL);
  317. } else if (c == '"') {
  318. if (escaped) {
  319. // found escaped '"'. overwrite the preceding backslash.
  320. assert(!data.empty());
  321. escaped = false;
  322. data.back() = '"';
  323. } else {
  324. token = MasterLexer::Token(&data.at(0), data.size(), true);
  325. return (NULL);
  326. }
  327. } else if (c == '\n' && !escaped) {
  328. getLexerImpl(lexer)->source_->ungetChar();
  329. token = Token(Token::UNBALANCED_QUOTES);
  330. return (NULL);
  331. } else {
  332. escaped = (c == '\\' && !escaped);
  333. data.push_back(c);
  334. }
  335. }
  336. }
  337. } // namespace master_lexer_internal
  338. } // end of namespace dns
  339. } // end of namespace isc