token_functions.hpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621
  1. // Boost token_functions.hpp ------------------------------------------------//
  2. // Copyright John R. Bandela 2001.
  3. // Distributed under the Boost Software License, Version 1.0. (See
  4. // accompanying file LICENSE_1_0.txt or copy at
  5. // http://www.boost.org/LICENSE_1_0.txt)
  6. // See http://www.boost.org/libs/tokenizer/ for documentation.
  7. // Revision History:
  8. // 01 Oct 2004 Joaquin M Lopez Munoz
  9. // Workaround for a problem with string::assign in msvc-stlport
  10. // 06 Apr 2004 John Bandela
  11. // Fixed a bug involving using char_delimiter with a true input iterator
  12. // 28 Nov 2003 Robert Zeh and John Bandela
  13. // Converted into "fast" functions that avoid using += when
  14. // the supplied iterator isn't an input_iterator; based on
  15. // some work done at Archelon and a version that was checked into
  16. // the boost CVS for a short period of time.
  17. // 20 Feb 2002 John Maddock
  18. // Removed using namespace std declarations and added
  19. // workaround for BOOST_NO_STDC_NAMESPACE (the library
  20. // can be safely mixed with regex).
  21. // 06 Feb 2002 Jeremy Siek
  22. // Added char_separator.
  23. // 02 Feb 2002 Jeremy Siek
  24. // Removed tabs and a little cleanup.
  25. #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
  26. #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
  #include <algorithm> // for find_if
  #include <cctype>
  #include <iterator>  // for iterator_traits and the iterator category tags
  #include <stdexcept>
  #include <string>
  #include <vector>
  #include <boost/config.hpp>
  #include <boost/assert.hpp>
  #include <boost/detail/workaround.hpp>
  #include <boost/mpl/if.hpp>
  //
  // ispunct and isspace are called with an explicit std:: prefix further
  // down in this header, which only works if they are real functions.
  // Drop any macro versions a C library may have supplied (they should
  // not have been macros in the first place).
  //
  #if defined(ispunct)
  #  undef ispunct
  #endif
  #if defined(isspace)
  #  undef isspace
  #endif

  //
  // Namespace fix: when the C library only declares these functions in the
  // global namespace, import them into std so the qualified calls resolve.
  //
  #if defined(BOOST_NO_STDC_NAMESPACE)
  namespace std {
    using ::isspace;
    using ::ispunct;
  }
  #endif
  55. namespace boost{
  56. //===========================================================================
  57. // The escaped_list_separator class. Which is a model of TokenizerFunction
  58. // An escaped list is a super-set of what is commonly known as a comma
  59. // separated value (csv) list. It is separated into fields by a comma or
  60. // other character. If the delimiting character is inside quotes, then it is
  61. // counted as a regular character. To allow for embedded quotes in a field,
  62. // there can be escape sequences using the \ much like C.
  63. // The role of the comma, the quotation mark, and the escape
  64. // character (backslash \), can be assigned to other characters.
  // Exception thrown by escaped_list_separator when an escape character is
  // the last character of the input, or is followed by a character that does
  // not form a recognised escape sequence.
  struct escaped_list_error : public std::runtime_error {
    escaped_list_error(const std::string& what_arg)
      : std::runtime_error(what_arg) { }
  };

  // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  // MSVC does not like the following typename
  #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  template <class Char,
            class Traits = typename std::basic_string<Char>::traits_type >
  #else
  template <class Char,
            class Traits = std::basic_string<Char>::traits_type >
  #endif
  class escaped_list_separator {

  private:
    typedef std::basic_string<Char,Traits> string_type;

    string_type escape_;   // characters that start an escape sequence
    string_type c_;        // characters that separate fields
    string_type quote_;    // characters that open/close a quoted section
    bool last_;            // true when the previous token ended on a separator

    // Returns true when ch compares equal (using Traits::eq) to at least
    // one character stored in set.
    static bool contains(const string_type& set, Char ch) {
      typename string_type::const_iterator pos = set.begin();
      const typename string_type::const_iterator stop = set.end();
      for (; pos != stop; ++pos) {
        if (Traits::eq(ch, *pos))
          return true;
      }
      return false;
    }

    bool is_escape(Char e) const { return contains(escape_, e); }
    bool is_c(Char e) const      { return contains(c_, e); }
    bool is_quote(Char e) const  { return contains(quote_, e); }

    // Called with next positioned on an escape character.  Advances next to
    // the escaped character and appends its translation to tok: 'n' becomes
    // a newline; a quote, separator or escape character is appended as is.
    // Throws escaped_list_error if the input ends right after the escape
    // character or if the escaped character is none of the above.
    template <typename iterator, typename Token>
    void do_escape(iterator& next, iterator end, Token& tok) {
      ++next;
      if (next == end)
        throw escaped_list_error(std::string("cannot end with escape"));

      if (Traits::eq(*next,'n')) {
        tok += '\n';
        return;
      }
      if (is_quote(*next) || is_c(*next) || is_escape(*next)) {
        tok += *next;
        return;
      }
      throw escaped_list_error(std::string("unknown escape sequence"));
    }

  public:

    // Single-character roles: e = escape, c = field separator, q = quote.
    explicit escaped_list_separator(Char e = '\\',
                                    Char c = ',', Char q = '\"')
      : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }

    // Every character of e / c / q acts as an escape / separator / quote.
    escaped_list_separator(string_type e, string_type c, string_type q)
      : escape_(e), c_(c), quote_(q), last_(false) { }

    // Forget that the previous token ended on a separator.
    void reset() { last_ = false; }

    // Extracts the next field from [next, end) into tok and advances next
    // past it.  Returns false once there is nothing left to report.  A
    // separator that is the very last character of the input produces one
    // additional empty field on the following call.
    template <typename InputIterator, typename Token>
    bool operator()(InputIterator& next, InputIterator end, Token& tok) {
      tok = Token();

      if (next == end) {
        // Report the blank field that follows a trailing separator, once.
        const bool blank_field_pending = last_;
        last_ = false;
        return blank_field_pending;
      }

      last_ = false;
      bool in_quotes = false;
      while (next != end) {
        if (is_escape(*next)) {
          do_escape(next, end, tok);
        }
        else if (is_c(*next)) {
          if (in_quotes) {
            // Separators inside quotes are ordinary characters.
            tok += *next;
          }
          else {
            // An unquoted separator ends the field.  Step over it and
            // remember that at least one more (possibly blank) field follows.
            ++next;
            last_ = true;
            return true;
          }
        }
        else if (is_quote(*next)) {
          in_quotes = !in_quotes;
        }
        else {
          tok += *next;
        }
        ++next;
      }
      return true;
    }
  };
  171. //===========================================================================
  172. // The classes here are used by offset_separator and char_separator to implement
  173. // faster assigning of tokens using assign instead of +=
  namespace tokenizer_detail {

    // The assign_or_plus_equal struct contains functions that implement
    // assign, +=, and clearing based on the iterator type.  The
    // generic case does nothing for plus_equal and clearing, while
    // passing through the call for assign.
    //
    // When an input iterator is being used, the situation is reversed.
    // The assign method does nothing, plus_equal invokes operator +=,
    // and the clearing method sets the supplied token to the default
    // token constructor's result.
    //
    // Callers invoke all three unconditionally; exactly one strategy
    // (range assign, or clear followed by repeated +=) has an effect.
    template<class IteratorTag>
    struct assign_or_plus_equal {

      // Replace the contents of t with the range [b, e).
      template<class Iterator, class Token>
      static void assign(Iterator b, Iterator e, Token &t) {
        // The outer test keeps this block preprocessable even when
        // BOOST_WORKAROUND has not been defined; when it is defined the
        // selected branch is the same as before.
  #if defined(BOOST_WORKAROUND)
  # if BOOST_WORKAROUND(BOOST_MSVC, < 1300) &&\
       BOOST_WORKAROUND(__SGI_STL_PORT, < 0x500) &&\
       defined(_STLP_DEBUG) &&\
       (defined(_STLP_USE_DYNAMIC_LIB) || defined(_DLL))
        // Problem with string::assign for msvc-stlport in debug mode: the
        // linker tries to import the templatized version of this memfun,
        // which is obviously not exported.
        // See http://www.stlport.com/dcforum/DCForumID6/1763.html for details.
        t = Token();
        while(b != e) t += *b++;
  # else
        t.assign(b, e);
  # endif
  #else
        t.assign(b, e);
  #endif
      }

      // Nothing to do: the token is produced by assign() in one go.
      template<class Token, class Value>
      static void plus_equal(Token &, const Value &) {
      }

      // If we are doing an assign, there is no need for the clear.
      //
      template<class Token>
      static void clear(Token &) {
      }
    };

    // Input iterators: the token has to be built one element at a time
    // with +=, so assign() is a no-op and clear() empties the token.
    template <>
    struct assign_or_plus_equal<std::input_iterator_tag> {

      template<class Iterator, class Token>
      static void assign(Iterator, Iterator, Token &) {
      }

      template<class Token, class Value>
      static void plus_equal(Token &t, const Value &v) {
        t += v;
      }

      template<class Token>
      static void clear(Token &t) {
        t = Token();
      }
    };

    // Category helper for raw pointers.  No longer used by
    // get_iterator_category; retained so existing references still compile.
    template<class Iterator>
    struct pointer_iterator_category{
      typedef std::random_access_iterator_tag type;
    };

    // Category helper for class-type iterators exposing a nested
    // iterator_category.  No longer used by get_iterator_category; retained
    // so existing references still compile.
    template<class Iterator>
    struct class_iterator_category{
      typedef typename Iterator::iterator_category type;
    };

    // Yields the iterator category tag of Iterator as the nested typedef
    // iterator_category.  std::iterator_traits covers raw pointers as well
    // as class-type iterators, including iterators that publish their
    // category only through an iterator_traits specialisation, and needs
    // nothing beyond <iterator>.
    template<class Iterator>
    struct get_iterator_category{
      typedef typename std::iterator_traits<Iterator>::iterator_category
        iterator_category;
    };

  }
  245. //===========================================================================
  246. // The offset_separator class, which is a model of TokenizerFunction.
  247. // Offset breaks a string into tokens based on a range of offsets
  248. class offset_separator {
  249. private:
  250. std::vector<int> offsets_;
  251. unsigned int current_offset_;
  252. bool wrap_offsets_;
  253. bool return_partial_last_;
  254. public:
  255. template <typename Iter>
  256. offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
  257. bool return_partial_last = true)
  258. : offsets_(begin,end), current_offset_(0),
  259. wrap_offsets_(wrap_offsets),
  260. return_partial_last_(return_partial_last) { }
  261. offset_separator()
  262. : offsets_(1,1), current_offset_(),
  263. wrap_offsets_(true), return_partial_last_(true) { }
  264. void reset() {
  265. current_offset_ = 0;
  266. }
  267. template <typename InputIterator, typename Token>
  268. bool operator()(InputIterator& next, InputIterator end, Token& tok)
  269. {
  270. typedef tokenizer_detail::assign_or_plus_equal<
  271. #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  272. typename
  273. #endif
  274. tokenizer_detail::get_iterator_category<
  275. InputIterator>::iterator_category> assigner;
  276. BOOST_ASSERT(!offsets_.empty());
  277. assigner::clear(tok);
  278. InputIterator start(next);
  279. if (next == end)
  280. return false;
  281. if (current_offset_ == offsets_.size())
  282. {
  283. if (wrap_offsets_)
  284. current_offset_=0;
  285. else
  286. return false;
  287. }
  288. int c = offsets_[current_offset_];
  289. int i = 0;
  290. for (; i < c; ++i) {
  291. if (next == end)break;
  292. assigner::plus_equal(tok,*next++);
  293. }
  294. assigner::assign(start,next,tok);
  295. if (!return_partial_last_)
  296. if (i < (c-1) )
  297. return false;
  298. ++current_offset_;
  299. return true;
  300. }
  301. };
  302. //===========================================================================
  303. // The char_separator class breaks a sequence of characters into
  304. // tokens based on the character delimiters (very much like bad old
  305. // strtok). A delimiter character can either be kept or dropped. A
  306. // kept delimiter shows up as an output token, whereas a dropped
  307. // delimiter does not.
  308. // This class replaces the char_delimiters_separator class. The
  309. // constructor for the char_delimiters_separator class was too
  310. // confusing and needed to be deprecated. However, because of the
  311. // default arguments to the constructor, adding the new constructor
  312. // would cause ambiguity, so instead I deprecated the whole class.
  313. // The implementation of the class was also simplified considerably.
  314. enum empty_token_policy { drop_empty_tokens, keep_empty_tokens };
  315. // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  316. #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  317. template <typename Char,
  318. typename Traits = typename std::basic_string<Char>::traits_type >
  319. #else
  320. template <typename Char,
  321. typename Traits = std::basic_string<Char>::traits_type >
  322. #endif
  323. class char_separator
  324. {
  325. typedef std::basic_string<Char,Traits> string_type;
  326. public:
  327. explicit
  328. char_separator(const Char* dropped_delims,
  329. const Char* kept_delims = 0,
  330. empty_token_policy empty_tokens = drop_empty_tokens)
  331. : m_dropped_delims(dropped_delims),
  332. m_use_ispunct(false),
  333. m_use_isspace(false),
  334. m_empty_tokens(empty_tokens),
  335. m_output_done(false)
  336. {
  337. // Borland workaround
  338. if (kept_delims)
  339. m_kept_delims = kept_delims;
  340. }
  341. // use ispunct() for kept delimiters and isspace for dropped.
  342. explicit
  343. char_separator()
  344. : m_use_ispunct(true),
  345. m_use_isspace(true),
  346. m_empty_tokens(drop_empty_tokens) { }
  347. void reset() { }
  348. template <typename InputIterator, typename Token>
  349. bool operator()(InputIterator& next, InputIterator end, Token& tok)
  350. {
  351. typedef tokenizer_detail::assign_or_plus_equal<
  352. #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  353. typename
  354. #endif
  355. tokenizer_detail::get_iterator_category<
  356. InputIterator>::iterator_category> assigner;
  357. assigner::clear(tok);
  358. // skip past all dropped_delims
  359. if (m_empty_tokens == drop_empty_tokens)
  360. for (; next != end && is_dropped(*next); ++next)
  361. { }
  362. InputIterator start(next);
  363. if (m_empty_tokens == drop_empty_tokens) {
  364. if (next == end)
  365. return false;
  366. // if we are on a kept_delims move past it and stop
  367. if (is_kept(*next)) {
  368. assigner::plus_equal(tok,*next);
  369. ++next;
  370. } else
  371. // append all the non delim characters
  372. for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
  373. assigner::plus_equal(tok,*next);
  374. }
  375. else { // m_empty_tokens == keep_empty_tokens
  376. // Handle empty token at the end
  377. if (next == end)
  378. {
  379. if (m_output_done == false)
  380. {
  381. m_output_done = true;
  382. assigner::assign(start,next,tok);
  383. return true;
  384. }
  385. else
  386. return false;
  387. }
  388. if (is_kept(*next)) {
  389. if (m_output_done == false)
  390. m_output_done = true;
  391. else {
  392. assigner::plus_equal(tok,*next);
  393. ++next;
  394. m_output_done = false;
  395. }
  396. }
  397. else if (m_output_done == false && is_dropped(*next)) {
  398. m_output_done = true;
  399. }
  400. else {
  401. if (is_dropped(*next))
  402. start=++next;
  403. for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
  404. assigner::plus_equal(tok,*next);
  405. m_output_done = true;
  406. }
  407. }
  408. assigner::assign(start,next,tok);
  409. return true;
  410. }
  411. private:
  412. string_type m_kept_delims;
  413. string_type m_dropped_delims;
  414. bool m_use_ispunct;
  415. bool m_use_isspace;
  416. empty_token_policy m_empty_tokens;
  417. bool m_output_done;
  418. bool is_kept(Char E) const
  419. {
  420. if (m_kept_delims.length())
  421. return m_kept_delims.find(E) != string_type::npos;
  422. else if (m_use_ispunct) {
  423. return std::ispunct(E) != 0;
  424. } else
  425. return false;
  426. }
  427. bool is_dropped(Char E) const
  428. {
  429. if (m_dropped_delims.length())
  430. return m_dropped_delims.find(E) != string_type::npos;
  431. else if (m_use_isspace) {
  432. return std::isspace(E) != 0;
  433. } else
  434. return false;
  435. }
  436. };
  437. //===========================================================================
  438. // The following class is DEPRECATED, use class char_separator instead.
  439. //
  440. // The char_delimiters_separator class, which is a model of
  441. // TokenizerFunction. char_delimiters_separator breaks a string
  442. // into tokens based on character delimiters. There are 2 types of
  443. // delimiters. returnable delimiters can be returned as
  444. // tokens. These are often punctuation. nonreturnable delimiters
  445. // cannot be returned as tokens. These are often whitespace
  446. // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  template <class Char,
            class Traits = typename std::basic_string<Char>::traits_type >
  #else
  template <class Char,
            class Traits = std::basic_string<Char>::traits_type >
  #endif
  class char_delimiters_separator {
  private:

    typedef std::basic_string<Char,Traits> string_type;
    string_type returnable_;      // delimiters that may be returned as tokens
    string_type nonreturnable_;   // delimiters that are always discarded
    bool return_delims_;          // whether returnable delimiters are returned
    bool no_ispunct_;             // true: never fall back to std::ispunct
    bool no_isspace_;             // true: never fall back to std::isspace

    // True if E is a returnable delimiter: a member of returnable_ when
    // that set is non-empty, otherwise decided by std::ispunct unless the
    // fallback has been disabled.
    bool is_ret(Char E)const
    {
      if (returnable_.length())
        return returnable_.find(E) != string_type::npos;
      if (no_ispunct_)
        return false;
      // The <cctype> classifiers require a value representable as
      // unsigned char; a negative plain char must be converted first.
      if (sizeof(Char) == 1)
        return std::ispunct(static_cast<unsigned char>(E)) != 0;
      return std::ispunct(E) != 0;
    }

    // True if E is a non-returnable delimiter: a member of nonreturnable_
    // when that set is non-empty, otherwise decided by std::isspace unless
    // the fallback has been disabled.
    bool is_nonret(Char E)const
    {
      if (nonreturnable_.length())
        return nonreturnable_.find(E) != string_type::npos;
      if (no_isspace_)
        return false;
      // See is_ret() for the reason behind the conversion.
      if (sizeof(Char) == 1)
        return std::isspace(static_cast<unsigned char>(E)) != 0;
      return std::isspace(E) != 0;
    }

  public:
    // return_delims - return returnable delimiters as tokens instead of
    //                 skipping them.
    // returnable    - returnable delimiter characters; a null pointer
    //                 selects std::ispunct.
    // nonreturnable - discarded delimiter characters; a null pointer
    //                 selects std::isspace.
    explicit char_delimiters_separator(bool return_delims = false,
                                       const Char* returnable = 0,
                                       const Char* nonreturnable = 0)
      : returnable_(returnable ? returnable : string_type().c_str()),
        nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
        return_delims_(return_delims), no_ispunct_(returnable!=0),
        no_isspace_(nonreturnable!=0) { }

    // No state is carried between calls.
    void reset() { }

  public:

    // Extracts the next token from [next, end) into tok and advances next
    // past it.  Returns false when only delimiters (or nothing) remain.
    template <typename InputIterator, typename Token>
    bool operator()(InputIterator& next, InputIterator end,Token& tok) {
      tok = Token();

      // skip past all nonreturnable delims
      // skip past the returnable only if we are not returning delims
      for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
        && !return_delims_ ) );++next) { }

      if (next == end) {
        return false;
      }

      // if we are to return delims and we are on a returnable one
      // move past it and stop
      if (is_ret(*next) && return_delims_) {
        tok+=*next;
        ++next;
      }
      else
        // append all the non delim characters
        for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
          tok+=*next;

      return true;
    }
  };
  519. } //namespace boost
  520. #endif