text.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. from __future__ import absolute_import, unicode_literals, print_function
  2. import sys
  3. import re
  4. import inspect
  5. import itertools
  6. import textwrap
  7. import functools
  8. import six
  9. import jaraco.collections
  10. from jaraco.functools import compose
  11. def substitution(old, new):
  12. """
  13. Return a function that will perform a substitution on a string
  14. """
  15. return lambda s: s.replace(old, new)
  16. def multi_substitution(*substitutions):
  17. """
  18. Take a sequence of pairs specifying substitutions, and create
  19. a function that performs those substitutions.
  20. >>> multi_substitution(('foo', 'bar'), ('bar', 'baz'))('foo')
  21. 'baz'
  22. """
  23. substitutions = itertools.starmap(substitution, substitutions)
  24. # compose function applies last function first, so reverse the
  25. # substitutions to get the expected order.
  26. substitutions = reversed(tuple(substitutions))
  27. return compose(*substitutions)
  28. class FoldedCase(six.text_type):
  29. """
  30. A case insensitive string class; behaves just like str
  31. except compares equal when the only variation is case.
  32. >>> s = FoldedCase('hello world')
  33. >>> s == 'Hello World'
  34. True
  35. >>> 'Hello World' == s
  36. True
  37. >>> s.index('O')
  38. 4
  39. >>> s.split('O')
  40. ['hell', ' w', 'rld']
  41. >>> sorted(map(FoldedCase, ['GAMMA', 'alpha', 'Beta']))
  42. ['alpha', 'Beta', 'GAMMA']
  43. """
  44. def __lt__(self, other):
  45. return self.lower() < other.lower()
  46. def __gt__(self, other):
  47. return self.lower() > other.lower()
  48. def __eq__(self, other):
  49. return self.lower() == other.lower()
  50. def __hash__(self):
  51. return hash(self.lower())
  52. # cache lower since it's likely to be called frequently.
  53. def lower(self):
  54. self._lower = super(FoldedCase, self).lower()
  55. self.lower = lambda: self._lower
  56. return self._lower
  57. def index(self, sub):
  58. return self.lower().index(sub.lower())
  59. def split(self, splitter=' ', maxsplit=0):
  60. pattern = re.compile(re.escape(splitter), re.I)
  61. return pattern.split(self, maxsplit)
  62. def local_format(string):
  63. """
  64. format the string using variables in the caller's local namespace.
  65. >>> a = 3
  66. >>> local_format("{a:5}")
  67. ' 3'
  68. """
  69. context = inspect.currentframe().f_back.f_locals
  70. if sys.version_info < (3, 2):
  71. return string.format(**context)
  72. return string.format_map(context)
  73. def global_format(string):
  74. """
  75. format the string using variables in the caller's global namespace.
  76. >>> a = 3
  77. >>> fmt = "The func name: {global_format.__name__}"
  78. >>> global_format(fmt)
  79. 'The func name: global_format'
  80. """
  81. context = inspect.currentframe().f_back.f_globals
  82. if sys.version_info < (3, 2):
  83. return string.format(**context)
  84. return string.format_map(context)
  85. def namespace_format(string):
  86. """
  87. Format the string using variable in the caller's scope (locals + globals).
  88. >>> a = 3
  89. >>> fmt = "A is {a} and this func is {namespace_format.__name__}"
  90. >>> namespace_format(fmt)
  91. 'A is 3 and this func is namespace_format'
  92. """
  93. context = jaraco.collections.DictStack()
  94. context.push(inspect.currentframe().f_back.f_globals)
  95. context.push(inspect.currentframe().f_back.f_locals)
  96. if sys.version_info < (3, 2):
  97. return string.format(**context)
  98. return string.format_map(context)
  99. def is_decodable(value):
  100. r"""
  101. Return True if the supplied value is decodable (using the default
  102. encoding).
  103. >>> is_decodable(b'\xff')
  104. False
  105. >>> is_decodable(b'\x32')
  106. True
  107. """
  108. # TODO: This code could be expressed more consisely and directly
  109. # with a jaraco.context.ExceptionTrap, but that adds an unfortunate
  110. # long dependency tree, so for now, use boolean literals.
  111. try:
  112. value.decode()
  113. except UnicodeDecodeError:
  114. return False
  115. return True
  116. def is_binary(value):
  117. """
  118. Return True if the value appears to be binary (that is, it's a byte
  119. string and isn't decodable).
  120. """
  121. return isinstance(value, bytes) and not is_decodable(value)
  122. def trim(s):
  123. r"""
  124. Trim something like a docstring to remove the whitespace that
  125. is common due to indentation and formatting.
  126. >>> trim("\n\tfoo = bar\n\t\tbar = baz\n")
  127. 'foo = bar\n\tbar = baz'
  128. """
  129. return textwrap.dedent(s).strip()
  130. class Splitter(object):
  131. """object that will split a string with the given arguments for each call
  132. >>> s = Splitter(',')
  133. >>> s('hello, world, this is your, master calling')
  134. ['hello', ' world', ' this is your', ' master calling']
  135. """
  136. def __init__(self, *args):
  137. self.args = args
  138. def __call__(self, s):
  139. return s.split(*self.args)
  140. def indent(string, prefix=' ' * 4):
  141. return prefix + string
  142. class WordSet(tuple):
  143. """
  144. Given a Python identifier, return the words that identifier represents,
  145. whether in camel case, underscore-separated, etc.
  146. >>> WordSet.parse("camelCase")
  147. ('camel', 'Case')
  148. >>> WordSet.parse("under_sep")
  149. ('under', 'sep')
  150. Acronyms should be retained
  151. >>> WordSet.parse("firstSNL")
  152. ('first', 'SNL')
  153. >>> WordSet.parse("you_and_I")
  154. ('you', 'and', 'I')
  155. >>> WordSet.parse("A simple test")
  156. ('A', 'simple', 'test')
  157. Multiple caps should not interfere with the first cap of another word.
  158. >>> WordSet.parse("myABCClass")
  159. ('my', 'ABC', 'Class')
  160. The result is a WordSet, so you can get the form you need.
  161. >>> WordSet.parse("myABCClass").underscore_separated()
  162. 'my_ABC_Class'
  163. >>> WordSet.parse('a-command').camel_case()
  164. 'ACommand'
  165. >>> WordSet.parse('someIdentifier').lowered().space_separated()
  166. 'some identifier'
  167. Slices of the result should return another WordSet.
  168. >>> WordSet.parse('taken-out-of-context')[1:].underscore_separated()
  169. 'out_of_context'
  170. >>> WordSet.from_class_name(WordSet()).lowered().space_separated()
  171. 'word set'
  172. """
  173. _pattern = re.compile('([A-Z]?[a-z]+)|([A-Z]+(?![a-z]))')
  174. def capitalized(self):
  175. return WordSet(word.capitalize() for word in self)
  176. def lowered(self):
  177. return WordSet(word.lower() for word in self)
  178. def camel_case(self):
  179. return ''.join(self.capitalized())
  180. def headless_camel_case(self):
  181. words = iter(self)
  182. first = next(words).lower()
  183. return itertools.chain((first,), WordSet(words).camel_case())
  184. def underscore_separated(self):
  185. return '_'.join(self)
  186. def dash_separated(self):
  187. return '-'.join(self)
  188. def space_separated(self):
  189. return ' '.join(self)
  190. def __getitem__(self, item):
  191. result = super(WordSet, self).__getitem__(item)
  192. if isinstance(item, slice):
  193. result = WordSet(result)
  194. return result
  195. # for compatibility with Python 2
  196. def __getslice__(self, i, j):
  197. return self.__getitem__(slice(i, j))
  198. @classmethod
  199. def parse(cls, identifier):
  200. matches = cls._pattern.finditer(identifier)
  201. return WordSet(match.group(0) for match in matches)
  202. @classmethod
  203. def from_class_name(cls, subject):
  204. return cls.parse(subject.__class__.__name__)
# Module-level alias for WordSet.parse, kept for backward compatibility.
words = WordSet.parse
  207. def simple_html_strip(s):
  208. r"""
  209. Remove HTML from the string `s`.
  210. >>> str(simple_html_strip(''))
  211. ''
  212. >>> print(simple_html_strip('A <bold>stormy</bold> day in paradise'))
  213. A stormy day in paradise
  214. >>> print(simple_html_strip('Somebody <!-- do not --> tell the truth.'))
  215. Somebody tell the truth.
  216. >>> print(simple_html_strip('What about<br/>\nmultiple lines?'))
  217. What about
  218. multiple lines?
  219. """
  220. html_stripper = re.compile('(<!--.*?-->)|(<[^>]*>)|([^<]+)', re.DOTALL)
  221. texts = (
  222. match.group(3) or ''
  223. for match
  224. in html_stripper.finditer(s)
  225. )
  226. return ''.join(texts)
  227. class SeparatedValues(six.text_type):
  228. """
  229. A string separated by a separator. Overrides __iter__ for getting
  230. the values.
  231. >>> list(SeparatedValues('a,b,c'))
  232. ['a', 'b', 'c']
  233. Whitespace is stripped and empty values are discarded.
  234. >>> list(SeparatedValues(' a, b , c, '))
  235. ['a', 'b', 'c']
  236. """
  237. separator = ','
  238. def __iter__(self):
  239. parts = self.split(self.separator)
  240. return six.moves.filter(None, (part.strip() for part in parts))
  241. class Stripper:
  242. r"""
  243. Given a series of lines, find the common prefix and strip it from them.
  244. >>> lines = [
  245. ... 'abcdefg\n',
  246. ... 'abc\n',
  247. ... 'abcde\n',
  248. ... ]
  249. >>> res = Stripper.strip_prefix(lines)
  250. >>> res.prefix
  251. 'abc'
  252. >>> list(res.lines)
  253. ['defg\n', '\n', 'de\n']
  254. """
  255. def __init__(self, prefix, lines):
  256. self.prefix = prefix
  257. self.lines = map(self, lines)
  258. @classmethod
  259. def strip_prefix(cls, lines):
  260. prefix_lines, lines = itertools.tee(lines)
  261. prefix = functools.reduce(cls.common_prefix, prefix_lines)
  262. return cls(prefix, lines)
  263. def __call__(self, line):
  264. null, prefix, rest = line.partition(self.prefix)
  265. return rest
  266. @staticmethod
  267. def common_prefix(s1, s2):
  268. """
  269. Return the common prefix of two lines.
  270. """
  271. index = min(len(s1), len(s2))
  272. while s1[:index] != s2[:index]:
  273. index -= 1
  274. return s1[:index]