master_lexer.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. // Copyright (C) 2012 Internet Systems Consortium, Inc. ("ISC")
  2. //
  3. // Permission to use, copy, modify, and/or distribute this software for any
  4. // purpose with or without fee is hereby granted, provided that the above
  5. // copyright notice and this permission notice appear in all copies.
  6. //
  7. // THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  8. // REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  9. // AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  10. // INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  11. // LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  12. // OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  13. // PERFORMANCE OF THIS SOFTWARE.
  14. #ifndef MASTER_LEXER_H
  15. #define MASTER_LEXER_H 1
  16. #include <exceptions/exceptions.h>
  17. #include <istream>
  18. #include <string>
  19. #include <stdint.h>
  20. namespace isc {
  21. namespace dns {
  /// \brief Tokenizer for parsing DNS master files.
  ///
  /// The \c MasterLexer class provides tokenizing interfaces for parsing DNS
  /// master files.  It understands some special rules of master files as
  /// defined in RFC 1035, such as comments, character escaping, and multi-line
  /// data, and provides the user application with the actual data in a
  /// more convenient form such as a std::string object.
  ///
  /// In order to support the $INCLUDE notation, this class is designed to be
  /// able to operate on multiple files or input streams in a nested way.
  /// The \c open() and \c close() methods correspond to the push and pop
  /// operations.
  ///
  /// While this class is public, it is less likely to be used by normal
  /// applications; it's mainly expected to be used within this library,
  /// specifically by the \c MasterLoader class and \c Rdata implementation
  /// classes.
  class MasterLexer {
  public:
      class Token;       // we define it separately for better readability

      /// \brief The constructor.
      ///
      /// \throw std::bad_alloc Internal resource allocation fails (rare case).
      MasterLexer();

      /// \brief The destructor.
      ///
      /// It internally closes any remaining input sources.
      ~MasterLexer();

      /// \brief Open a file and make it the current input source of MasterLexer.
      ///
      /// The opened file can be explicitly closed by the \c close() method;
      /// if \c close() is not called within the lifetime of the \c MasterLexer,
      /// it will be closed in the destructor.
      ///
      /// \throw InvalidParameter filename is NULL
      /// \throw some_other The specified file cannot be opened
      /// \param filename A non NULL string specifying a master file
      void open(const char* filename);

      /// \brief Make the given stream the current input source of MasterLexer.
      ///
      /// The caller still holds the ownership of the passed stream; it's the
      /// caller's responsibility to keep it valid as long as it's used in
      /// \c MasterLexer or to release any resource for the stream after that.
      /// The caller can explicitly tell \c MasterLexer to stop using the
      /// stream by calling the \c close() method.
      ///
      /// \param input An input stream object that produces textual
      /// representation of DNS RRs.
      void open(std::istream& input);

      /// \brief Close the most recently opened input source (file or stream).
      ///
      /// If it's a file, the opened file will be literally closed.
      /// If it's a stream, \c MasterLexer will simply stop using
      /// the stream; the caller can assume it will be never used in
      /// \c MasterLexer thereafter.
      ///
      /// This method must not be called when there is no opened source for
      /// \c MasterLexer.  This method is otherwise exception free.
      ///
      /// \throw isc::InvalidOperation Called with no opened source.
      void close();

      /// \brief Return the name of the current input source.
      ///
      /// If it's a file, it will be the C string given at the corresponding
      /// \c open() call, that is, its filename.  If it's a stream, it will
      /// be formatted as \c "stream-%p" where \c %p is hex representation
      /// of the address of the stream object.
      ///
      /// If there is no opened source at the time of the call, this method
      /// returns an empty string.
      ///
      /// \throw std::bad_alloc Resource allocation failed for string
      /// construction (rare case)
      ///
      /// \return A string representation of the current source (see the
      /// description)
      std::string getSourceName() const;

      /// \brief Return the input source line number.
      ///
      /// If there is an opened source, the return value will be a non-0
      /// integer indicating the line number of the current source where
      /// the \c MasterLexer is currently working.  The expected usage of
      /// this value is to print a helpful error message when parsing fails
      /// by specifically identifying the position of the error.
      ///
      /// If there is no opened source at the time of the call, this method
      /// returns 0.
      ///
      /// \throw None
      ///
      /// \return The line number currently being processed in the current
      /// source, or 0 if no source is opened (see the description)
      size_t getSourceLine() const;

  private:
      // Copying is disabled: both members are declared private and are
      // intentionally left undefined.  The object refers to its
      // implementation through the raw pointer below, so a compiler-generated
      // copy would leave two lexers sharing one implementation object.
      // NOTE(review): this assumes the destructor (defined elsewhere)
      // releases impl_, in which case such a copy would end in a double
      // release -- confirm against the .cc file.
      MasterLexer(const MasterLexer& source);
      MasterLexer& operator=(const MasterLexer& source);

      struct MasterLexerImpl;
      MasterLexerImpl* impl_;   // opaque implementation object (pimpl)
  };
  /// \brief Tokens for \c MasterLexer
  ///
  /// This is a simple value-class encapsulating a type of a lexer token and
  /// (if it has a value) its value.  Essentially, the class provides
  /// constructors corresponding to different types of tokens, and corresponding
  /// getter methods.  The type and value are fixed at the time of construction
  /// and will never be modified throughout the lifetime of the object.
  /// The getter methods are still provided to maximize the safety; an
  /// application cannot refer to a value that is invalid for the type of token.
  ///
  /// This class is intentionally implemented as copyable and assignable
  /// (using the default version of copy constructor and assignment operator),
  /// but it's mainly for internal implementation convenience.  Applications
  /// will simply refer to a Token object by reference via the \c MasterLexer
  /// class.
  133. class MasterLexer::Token {
  134. public:
  135. /// \brief Enumeration for token types
  136. ///
  137. /// \note At the time of initial implementation, all numeric tokens
  138. /// that would be extracted from \c MasterLexer should be represented
  139. /// as an unsigned 32-bit integer. If we see the need for larger integers
  140. /// or negative numbers, we can then extend the token types.
  141. enum Type {
  142. END_OF_LINE, ///< End of line detected (if asked for detecting it)
  143. END_OF_FILE, ///< End of file detected (if asked for detecting it)
  144. INITIAL_WS, ///< White spaces at the beginning of a line
  145. NOVALUE_TYPE_MAX = INITIAL_WS, ///< Max integer corresponding to
  146. /// no-value (type only) types.
  147. /// Mainly for internal use.
  148. STRING, ///< A single string
  149. QSTRING, ///< A single string quoted by double-quotes (").
  150. NUMBER, ///< A decimal number (unsigned 32-bit)
  151. ERROR ///< Error detected in getting a token
  152. };
  153. /// \brief Enumeration for lexer error codes
  154. enum ErrorCode {
  155. NOT_STARTED, ///< The lexer is just initialized and has no token
  156. UNBALANCED_PAREN, ///< Unbalanced parentheses detected
  157. UNEXPECTED_END, ///< The lexer reaches the end of line or file
  158. /// unexpectedly
  159. UNBALANCED_QUOTES, ///< Unbalanced quotations detected
  160. MAX_ERROR_CODE ///< Max integer corresponding to valid error codes.
  161. /// (excluding this one). Mainly for internal use.
  162. };
  163. /// \brief A simple representation of a range of a string.
  164. ///
  165. /// This is a straightforward pair of the start pointer of a string
  166. /// and its length. The \c STRING and \c QSTRING types of tokens
  167. /// will be primarily represented in this form.
  168. ///
  169. /// Any character can be stored in the valid range of the region.
  170. /// In particular, there can be a nul character (\0) in the middle of
  171. /// the region. On the other hand, it is not ensured that the string
  172. /// is nul-terminated. So the usual string manipulation API may not work
  173. /// as expected.
  174. struct StringRegion {
  175. const char* beg; ///< The start address of the string
  176. size_t len; ///< The length of the string in bytes
  177. };
  178. /// \brief Constructor for non-value type of token.
  179. ///
  180. /// \throw InvalidParameter A value type token is specified.
  181. /// \param type The type of the token. It must indicate a non-value
  182. /// type (not larger than \c NOVALUE_TYPE_MAX).
  183. explicit Token(Type type) : type_(type) {
  184. if (type > NOVALUE_TYPE_MAX) {
  185. isc_throw(InvalidParameter, "Token per-type constructor "
  186. "called with invalid type: " << type);
  187. }
  188. }
  189. /// \brief Constructor for string and quoted-string types of token.
  190. ///
  191. /// The optional \c quoted parameter specifies whether it's a quoted or
  192. /// non quoted string.
  193. ///
  194. /// The string is specified as a pair of a pointer to the start address
  195. /// and its length. Any character can be contained in any position of
  196. /// the valid range (see \c StringRegion).
  197. ///
  198. /// When it's a quoted string, the quotation marks must be excluded
  199. /// from the specified range.
  200. ///
  201. /// \param str_beg The start address of the string
  202. /// \param str_len The size of the string in bytes
  203. /// \param quoted true if it's a quoted string; false otherwise.
  204. Token(const char* str_beg, size_t str_len, bool quoted = false) :
  205. type_(quoted ? QSTRING : STRING)
  206. {
  207. val_.str_region_.beg = str_beg;
  208. val_.str_region_.len = str_len;
  209. }
  210. /// \brief Constructor for number type of token.
  211. ///
  212. /// \brief number An unsigned 32-bit integer corresponding to the token
  213. /// value.
  214. explicit Token(uint32_t number) : type_(NUMBER) {
  215. val_.number_ = number;
  216. }
  217. /// \brief Constructor for error type of token.
  218. ///
  219. /// \throw InvalidParameter Invalid error code value is specified.
  220. /// \brief error_code A pre-defined constant of \c ErrorCode.
  221. explicit Token(ErrorCode error_code) : type_(ERROR) {
  222. if (!(error_code < MAX_ERROR_CODE)) {
  223. isc_throw(InvalidParameter, "Invalid master lexer error code: "
  224. << error_code);
  225. }
  226. val_.error_code_ = error_code;
  227. }
  228. /// \brief Return the token type.
  229. ///
  230. /// \throw none
  231. Type getType() const { return (type_); }
  232. /// \brief Return the value of a string-variant token.
  233. ///
  234. /// \throw InvalidOperation Called on a non string-variant types of token.
  235. /// \return A reference to \c StringRegion corresponding to the string
  236. /// token value.
  237. const StringRegion& getStringRegion() const {
  238. if (type_ != STRING && type_ != QSTRING) {
  239. isc_throw(InvalidOperation,
  240. "Token::getStringRegion() for non string-variant type");
  241. }
  242. return (val_.str_region_);
  243. }
  244. /// \brief Return the value of a string-variant token as a string object.
  245. ///
  246. /// Note that the underlying string may contain a nul (\0) character
  247. /// in the middle. The returned string object will contain all characters
  248. /// of the valid range of the underlying string. So some string
  249. /// operations such as c_str() may not work as expected.
  250. ///
  251. /// \throw InvalidOperation Called on a non string-variant types of token.
  252. /// \throw std::bad_alloc Resource allocation failure in constructing the
  253. /// string object.
  254. /// \return A std::string object corresponding to the string token value.
  255. std::string getString() const {
  256. if (type_ != STRING && type_ != QSTRING) {
  257. isc_throw(InvalidOperation,
  258. "Token::getString() for non string-variant type");
  259. }
  260. return (std::string(val_.str_region_.beg,
  261. val_.str_region_.beg + val_.str_region_.len));
  262. }
  263. /// \brief Return the value of a string-variant token as a string object.
  264. ///
  265. /// \throw InvalidOperation Called on a non number type of token.
  266. /// \return The integer corresponding to the number token value.
  267. uint32_t getNumber() const {
  268. if (type_ != NUMBER) {
  269. isc_throw(InvalidOperation,
  270. "Token::getNumber() for non number type");
  271. }
  272. return (val_.number_);
  273. }
  274. /// \brief Return the error code of a error type token.
  275. ///
  276. /// \throw InvalidOperation Called on a non error type of token.
  277. /// \return The error code of the token.
  278. ErrorCode getErrorCode() const {
  279. if (type_ != ERROR) {
  280. isc_throw(InvalidOperation,
  281. "Token::getErrorCode() for non error type");
  282. }
  283. return (val_.error_code_);
  284. };
  285. /// \brief Return a textual description of the error of a error type token.
  286. ///
  287. /// The returned string would be useful to produce a log message when
  288. /// a zone file parser encounters an error.
  289. ///
  290. /// \throw InvalidOperation Called on a non error type of token.
  291. /// \throw std::bad_alloc Resource allocation failure in constructing the
  292. /// string object.
  293. /// \return A string object that describes the meaning of the error.
  294. std::string getErrorText() const;
  295. private:
  296. Type type_; // this is not const so the class can be assignable
  297. // We use a union to represent different types of token values via the
  298. // unified Token class. The class integrity should ensure valid operation
  299. // on the union; getter methods should only refer to the member set at
  300. // the construction.
  301. union {
  302. StringRegion str_region_;
  303. uint32_t number_;
  304. ErrorCode error_code_;
  305. } val_;
  306. };
  307. } // namespace dns
  308. } // namespace isc
  309. #endif // MASTER_LEXER_H
  310. // Local Variables:
  311. // mode: c++
  312. // End: