'''
inflect.py: correctly generate plurals, ordinals, indefinite articles;
            convert numbers to words

Copyright (C) 2010 Paul Dyson

Based upon the Perl module Lingua::EN::Inflect by Damian Conway.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

The original Perl module Lingua::EN::Inflect by Damian Conway is
available from http://search.cpan.org/~dconway/

This module can be downloaded at http://pypi.python.org/pypi/inflect

methods:
    classical inflect
    plural plural_noun plural_verb plural_adj singular_noun no num a an
    compare compare_nouns compare_verbs compare_adjs
    present_participle
    ordinal
    number_to_words
    join
    defnoun defverb defadj defa defan

INFLECTIONS: classical inflect
    plural plural_noun plural_verb plural_adj singular_noun compare
    no num a an present_participle

PLURALS: classical inflect
    plural plural_noun plural_verb plural_adj singular_noun no num
    compare compare_nouns compare_verbs compare_adjs

COMPARISONS: classical
    compare compare_nouns compare_verbs compare_adjs

ARTICLES: classical inflect num a an

NUMERICAL: ordinal number_to_words

USER_DEFINED: defnoun defverb defadj defa defan

Exceptions:
    UnknownClassicalModeError
    BadNumValueError
    BadChunkingOptionError
    NumOutOfRangeError
    BadUserDefinedPatternError
    BadRcFileError
    BadGenderError
'''

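# A minimal usage sketch (added for illustration, not part of the original
# file; the results shown follow the behaviour documented above):
#
#     import inflect
#     p = inflect.engine()
#     p.plural("cat")         # -> 'cats'
#     p.a("apple")            # -> 'an apple'
#     p.ordinal(3)            # -> '3rd'
#     p.number_to_words(42)   # -> 'forty-two'
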
from re import match, search, subn, IGNORECASE, VERBOSE
from re import split as splitre
from re import error as reerror
from re import sub as resub


class UnknownClassicalModeError(Exception):
    pass


class BadNumValueError(Exception):
    pass


class BadChunkingOptionError(Exception):
    pass


class NumOutOfRangeError(Exception):
    pass


class BadUserDefinedPatternError(Exception):
    pass


class BadRcFileError(Exception):
    pass


class BadGenderError(Exception):
    pass


__ver_major__ = 0
__ver_minor__ = 2
__ver_patch__ = 5
__ver_sub__ = ""
__version__ = "%d.%d.%d%s" % (__ver_major__, __ver_minor__,
                              __ver_patch__, __ver_sub__)

STDOUT_ON = False


def print3(txt):
    if STDOUT_ON:
        print(txt)


def enclose(s):
    return "(?:%s)" % s


def joinstem(cutpoint=0, words=''):
    '''
    join stem of each word in words into a string for regex
    each word is truncated at cutpoint
    cutpoint is usually negative indicating the number of letters to remove
    from the end of each word
    e.g.
    joinstem(-2, ["ephemeris", "iris", ".*itis"]) returns
    (?:ephemer|ir|.*it)
    '''
    return enclose('|'.join(w[:cutpoint] for w in words))


def bysize(words):
    '''
    take a list of words and return a dict of sets sorted by word length
    e.g.
    ret[3]=set(['ant', 'cat', 'dog', 'pig'])
    ret[4]=set(['frog', 'goat'])
    ret[5]=set(['horse'])
    ret[8]=set(['elephant'])
    '''
    ret = {}
    for w in words:
        if len(w) not in ret:
            ret[len(w)] = set()
        ret[len(w)].add(w)
    return ret

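# Informal illustration, mirroring the docstring above:
#   bysize(['ant', 'cat', 'dog', 'pig', 'frog', 'goat', 'horse', 'elephant'])
# would return
#   {3: {'ant', 'cat', 'dog', 'pig'}, 4: {'frog', 'goat'},
#    5: {'horse'}, 8: {'elephant'}}
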
def make_pl_si_lists(lst, plending, siendingsize, dojoinstem=True):
    '''
    given a list of singular words: lst
    an ending to append to make the plural: plending
    the number of characters to remove from the singular
    before appending plending: siendingsize
    a flag whether to create a joinstem: dojoinstem

    return:
    a list of pluralised words: si_list (called si because this is what you
    need to look for to make the singular)
    the pluralised words as a dict of sets sorted by word length: si_bysize
    the singular words as a dict of sets sorted by word length: pl_bysize
    if dojoinstem is True: a regular expression that matches any of the stems: stem
    '''
    if siendingsize is not None:
        siendingsize = -siendingsize
    si_list = [w[:siendingsize] + plending for w in lst]
    pl_bysize = bysize(lst)
    si_bysize = bysize(si_list)
    if dojoinstem:
        stem = joinstem(siendingsize, lst)
        return si_list, si_bysize, pl_bysize, stem
    else:
        return si_list, si_bysize, pl_bysize

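# A rough usage sketch (not from the original source): for the "..um" -> "..a"
# nouns defined later in this file, make_pl_si_lists(['bacterium'], 'a', 2)
# would yield si_list == ['bacteria'], si_bysize == {8: {'bacteria'}},
# pl_bysize == {9: {'bacterium'}} and stem == '(?:bacteri)'.
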
# 1. PLURALS

pl_sb_irregular_s = {
    "corpus": "corpuses|corpora",
    "opus": "opuses|opera",
    "genus": "genera",
    "mythos": "mythoi",
    "penis": "penises|penes",
    "testis": "testes",
    "atlas": "atlases|atlantes",
    "yes": "yeses",
}

pl_sb_irregular = {
    "child": "children",
    "brother": "brothers|brethren",
    "loaf": "loaves",
    "hoof": "hoofs|hooves",
    "beef": "beefs|beeves",
    "thief": "thiefs|thieves",
    "money": "monies",
    "mongoose": "mongooses",
    "ox": "oxen",
    "cow": "cows|kine",
    "graffito": "graffiti",
    "octopus": "octopuses|octopodes",
    "genie": "genies|genii",
    "ganglion": "ganglions|ganglia",
    "trilby": "trilbys",
    "turf": "turfs|turves",
    "numen": "numina",
    "atman": "atmas",
    "occiput": "occiputs|occipita",
    "sabretooth": "sabretooths",
    "sabertooth": "sabertooths",
    "lowlife": "lowlifes",
    "flatfoot": "flatfoots",
    "tenderfoot": "tenderfoots",
    "romany": "romanies",
    "jerry": "jerries",
    "mary": "maries",
    "talouse": "talouses",
    "blouse": "blouses",
    "rom": "roma",
    "carmen": "carmina",
}

pl_sb_irregular.update(pl_sb_irregular_s)
# pl_sb_irregular_keys = enclose('|'.join(pl_sb_irregular.keys()))

pl_sb_irregular_caps = {
    'Romany': 'Romanies',
    'Jerry': 'Jerrys',
    'Mary': 'Marys',
    'Rom': 'Roma',
}

pl_sb_irregular_compound = {
    "prima donna": "prima donnas|prime donne",
}

si_sb_irregular = dict([(v, k) for (k, v) in pl_sb_irregular.items()])
keys = list(si_sb_irregular.keys())
for k in keys:
    if '|' in k:
        k1, k2 = k.split('|')
        si_sb_irregular[k1] = si_sb_irregular[k2] = si_sb_irregular[k]
        del si_sb_irregular[k]

si_sb_irregular_caps = dict([(v, k) for (k, v) in pl_sb_irregular_caps.items()])
si_sb_irregular_compound = dict([(v, k) for (k, v) in pl_sb_irregular_compound.items()])
keys = list(si_sb_irregular_compound.keys())
for k in keys:
    if '|' in k:
        k1, k2 = k.split('|')
        si_sb_irregular_compound[k1] = si_sb_irregular_compound[k2] = si_sb_irregular_compound[k]
        del si_sb_irregular_compound[k]

# si_sb_irregular_keys = enclose('|'.join(si_sb_irregular.keys()))

# Z's that don't double

pl_sb_z_zes_list = (
    "quartz", "topaz",
)
pl_sb_z_zes_bysize = bysize(pl_sb_z_zes_list)

pl_sb_ze_zes_list = ('snooze',)
pl_sb_ze_zes_bysize = bysize(pl_sb_ze_zes_list)

# CLASSICAL "..is" -> "..ides"

pl_sb_C_is_ides_complete = [
    # GENERAL WORDS...
    "ephemeris", "iris", "clitoris",
    "chrysalis", "epididymis",
]

pl_sb_C_is_ides_endings = [
    # INFLAMMATIONS...
    "itis",
]

pl_sb_C_is_ides = joinstem(-2, pl_sb_C_is_ides_complete + ['.*%s' % w for w in pl_sb_C_is_ides_endings])

pl_sb_C_is_ides_list = pl_sb_C_is_ides_complete + pl_sb_C_is_ides_endings

(si_sb_C_is_ides_list, si_sb_C_is_ides_bysize,
 pl_sb_C_is_ides_bysize) = make_pl_si_lists(pl_sb_C_is_ides_list, 'ides', 2, dojoinstem=False)

# CLASSICAL "..a" -> "..ata"

pl_sb_C_a_ata_list = (
    "anathema", "bema", "carcinoma", "charisma", "diploma",
    "dogma", "drama", "edema", "enema", "enigma", "lemma",
    "lymphoma", "magma", "melisma", "miasma", "oedema",
    "sarcoma", "schema", "soma", "stigma", "stoma", "trauma",
    "gumma", "pragma",
)
(si_sb_C_a_ata_list, si_sb_C_a_ata_bysize,
 pl_sb_C_a_ata_bysize, pl_sb_C_a_ata) = make_pl_si_lists(pl_sb_C_a_ata_list, 'ata', 1)

# UNCONDITIONAL "..a" -> "..ae"

pl_sb_U_a_ae_list = (
    "alumna", "alga", "vertebra", "persona"
)
(si_sb_U_a_ae_list, si_sb_U_a_ae_bysize,
 pl_sb_U_a_ae_bysize, pl_sb_U_a_ae) = make_pl_si_lists(pl_sb_U_a_ae_list, 'e', None)

# CLASSICAL "..a" -> "..ae"

pl_sb_C_a_ae_list = (
    "amoeba", "antenna", "formula", "hyperbola",
    "medusa", "nebula", "parabola", "abscissa",
    "hydra", "nova", "lacuna", "aurora", "umbra",
    "flora", "fauna",
)
(si_sb_C_a_ae_list, si_sb_C_a_ae_bysize,
 pl_sb_C_a_ae_bysize, pl_sb_C_a_ae) = make_pl_si_lists(pl_sb_C_a_ae_list, 'e', None)

# CLASSICAL "..en" -> "..ina"

pl_sb_C_en_ina_list = (
    "stamen", "foramen", "lumen",
)
(si_sb_C_en_ina_list, si_sb_C_en_ina_bysize,
 pl_sb_C_en_ina_bysize, pl_sb_C_en_ina) = make_pl_si_lists(pl_sb_C_en_ina_list, 'ina', 2)

# UNCONDITIONAL "..um" -> "..a"

pl_sb_U_um_a_list = (
    "bacterium", "agendum", "desideratum", "erratum",
    "stratum", "datum", "ovum", "extremum",
    "candelabrum",
)
(si_sb_U_um_a_list, si_sb_U_um_a_bysize,
 pl_sb_U_um_a_bysize, pl_sb_U_um_a) = make_pl_si_lists(pl_sb_U_um_a_list, 'a', 2)

# CLASSICAL "..um" -> "..a"

pl_sb_C_um_a_list = (
    "maximum", "minimum", "momentum", "optimum",
    "quantum", "cranium", "curriculum", "dictum",
    "phylum", "aquarium", "compendium", "emporium",
    "enconium", "gymnasium", "honorarium", "interregnum",
    "lustrum", "memorandum", "millennium", "rostrum",
    "spectrum", "speculum", "stadium", "trapezium",
    "ultimatum", "medium", "vacuum", "velum",
    "consortium", "arboretum",
)
(si_sb_C_um_a_list, si_sb_C_um_a_bysize,
 pl_sb_C_um_a_bysize, pl_sb_C_um_a) = make_pl_si_lists(pl_sb_C_um_a_list, 'a', 2)

# UNCONDITIONAL "..us" -> "i"

pl_sb_U_us_i_list = (
    "alumnus", "alveolus", "bacillus", "bronchus",
    "locus", "nucleus", "stimulus", "meniscus",
    "sarcophagus",
)
(si_sb_U_us_i_list, si_sb_U_us_i_bysize,
 pl_sb_U_us_i_bysize, pl_sb_U_us_i) = make_pl_si_lists(pl_sb_U_us_i_list, 'i', 2)

# CLASSICAL "..us" -> "..i"

pl_sb_C_us_i_list = (
    "focus", "radius", "genius",
    "incubus", "succubus", "nimbus",
    "fungus", "nucleolus", "stylus",
    "torus", "umbilicus", "uterus",
    "hippopotamus", "cactus",
)
(si_sb_C_us_i_list, si_sb_C_us_i_bysize,
 pl_sb_C_us_i_bysize, pl_sb_C_us_i) = make_pl_si_lists(pl_sb_C_us_i_list, 'i', 2)

# CLASSICAL "..us" -> "..us"  (ASSIMILATED 4TH DECLENSION LATIN NOUNS)

pl_sb_C_us_us = (
    "status", "apparatus", "prospectus", "sinus",
    "hiatus", "impetus", "plexus",
)
pl_sb_C_us_us_bysize = bysize(pl_sb_C_us_us)

# UNCONDITIONAL "..on" -> "a"

pl_sb_U_on_a_list = (
    "criterion", "perihelion", "aphelion",
    "phenomenon", "prolegomenon", "noumenon",
    "organon", "asyndeton", "hyperbaton",
)
(si_sb_U_on_a_list, si_sb_U_on_a_bysize,
 pl_sb_U_on_a_bysize, pl_sb_U_on_a) = make_pl_si_lists(pl_sb_U_on_a_list, 'a', 2)

# CLASSICAL "..on" -> "..a"

pl_sb_C_on_a_list = (
    "oxymoron",
)
(si_sb_C_on_a_list, si_sb_C_on_a_bysize,
 pl_sb_C_on_a_bysize, pl_sb_C_on_a) = make_pl_si_lists(pl_sb_C_on_a_list, 'a', 2)

# CLASSICAL "..o" -> "..i"  (BUT NORMALLY -> "..os")

pl_sb_C_o_i = [
    "solo", "soprano", "basso", "alto",
    "contralto", "tempo", "piano", "virtuoso",
]  # list not tuple so can concat for pl_sb_U_o_os

pl_sb_C_o_i_bysize = bysize(pl_sb_C_o_i)
si_sb_C_o_i_bysize = bysize(['%si' % w[:-1] for w in pl_sb_C_o_i])

pl_sb_C_o_i_stems = joinstem(-1, pl_sb_C_o_i)

# ALWAYS "..o" -> "..os"

pl_sb_U_o_os_complete = set((
    "ado", "ISO", "NATO", "NCO", "NGO", "oto",
))
si_sb_U_o_os_complete = set('%ss' % w for w in pl_sb_U_o_os_complete)

pl_sb_U_o_os_endings = [
    "aficionado", "aggro",
    "albino", "allegro", "ammo",
    "Antananarivo", "archipelago", "armadillo",
    "auto", "avocado", "Bamako",
    "Barquisimeto", "bimbo", "bingo",
    "Biro", "bolero", "Bolzano",
    "bongo", "Boto", "burro",
    "Cairo", "canto", "cappuccino",
    "casino", "cello", "Chicago",
    "Chimango", "cilantro", "cochito",
    "coco", "Colombo", "Colorado",
    "commando", "concertino", "contango",
    "credo", "crescendo", "cyano",
    "demo", "ditto", "Draco",
    "dynamo", "embryo", "Esperanto",
    "espresso", "euro", "falsetto",
    "Faro", "fiasco", "Filipino",
    "flamenco", "furioso", "generalissimo",
    "Gestapo", "ghetto", "gigolo",
    "gizmo", "Greensboro", "gringo",
    "Guaiabero", "guano", "gumbo",
    "gyro", "hairdo", "hippo",
    "Idaho", "impetigo", "inferno",
    "info", "intermezzo", "intertrigo",
    "Iquico", "jumbo",
    "junto", "Kakapo", "kilo",
    "Kinkimavo", "Kokako", "Kosovo",
    "Lesotho", "libero", "libido",
    "libretto", "lido", "Lilo",
    "limbo", "limo", "lineno",
    "lingo", "lino", "livedo",
    "loco", "logo", "lumbago",
    "macho", "macro", "mafioso",
    "magneto", "magnifico", "Majuro",
    "Malabo", "manifesto", "Maputo",
    "Maracaibo", "medico", "memo",
    "metro", "Mexico", "micro",
    "Milano", "Monaco", "mono",
    "Montenegro", "Morocco", "Muqdisho",
    "myo",
    "neutrino", "Ningbo",
    "octavo", "oregano", "Orinoco",
    "Orlando", "Oslo",
    "panto", "Paramaribo", "Pardusco",
    "pedalo", "photo", "pimento",
    "pinto", "pleco", "Pluto",
    "pogo", "polo", "poncho",
    "Porto-Novo", "Porto", "pro",
    "psycho", "pueblo", "quarto",
    "Quito", "rhino", "risotto",
    "rococo", "rondo", "Sacramento",
    "saddo", "sago", "salvo",
    "Santiago", "Sapporo", "Sarajevo",
    "scherzando", "scherzo", "silo",
    "sirocco", "sombrero", "staccato",
    "sterno", "stucco", "stylo",
    "sumo", "Taiko", "techno",
    "terrazzo", "testudo", "timpano",
    "tiro", "tobacco", "Togo",
    "Tokyo", "torero", "Torino",
    "Toronto", "torso", "tremolo",
    "typo", "tyro", "ufo",
    "UNESCO", "vaquero", "vermicello",
    "verso", "vibrato", "violoncello",
    "Virgo", "weirdo", "WHO",
    "WTO", "Yamoussoukro", "yo-yo",
    "zero", "Zibo",
] + pl_sb_C_o_i

pl_sb_U_o_os_bysize = bysize(pl_sb_U_o_os_endings)
si_sb_U_o_os_bysize = bysize(['%ss' % w for w in pl_sb_U_o_os_endings])

# UNCONDITIONAL "..ch" -> "..chs"

pl_sb_U_ch_chs_list = (
    "czech", "eunuch", "stomach"
)
(si_sb_U_ch_chs_list, si_sb_U_ch_chs_bysize,
 pl_sb_U_ch_chs_bysize, pl_sb_U_ch_chs) = make_pl_si_lists(pl_sb_U_ch_chs_list, 's', None)

# UNCONDITIONAL "..[ei]x" -> "..ices"

pl_sb_U_ex_ices_list = (
    "codex", "murex", "silex",
)
(si_sb_U_ex_ices_list, si_sb_U_ex_ices_bysize,
 pl_sb_U_ex_ices_bysize, pl_sb_U_ex_ices) = make_pl_si_lists(pl_sb_U_ex_ices_list, 'ices', 2)

pl_sb_U_ix_ices_list = (
    "radix", "helix",
)
(si_sb_U_ix_ices_list, si_sb_U_ix_ices_bysize,
 pl_sb_U_ix_ices_bysize, pl_sb_U_ix_ices) = make_pl_si_lists(pl_sb_U_ix_ices_list, 'ices', 2)

# CLASSICAL "..[ei]x" -> "..ices"

pl_sb_C_ex_ices_list = (
    "vortex", "vertex", "cortex", "latex",
    "pontifex", "apex", "index", "simplex",
)
(si_sb_C_ex_ices_list, si_sb_C_ex_ices_bysize,
 pl_sb_C_ex_ices_bysize, pl_sb_C_ex_ices) = make_pl_si_lists(pl_sb_C_ex_ices_list, 'ices', 2)

pl_sb_C_ix_ices_list = (
    "appendix",
)
(si_sb_C_ix_ices_list, si_sb_C_ix_ices_bysize,
 pl_sb_C_ix_ices_bysize, pl_sb_C_ix_ices) = make_pl_si_lists(pl_sb_C_ix_ices_list, 'ices', 2)

# ARABIC: ".." -> "..i"

pl_sb_C_i_list = (
    "afrit", "afreet", "efreet",
)
(si_sb_C_i_list, si_sb_C_i_bysize,
 pl_sb_C_i_bysize, pl_sb_C_i) = make_pl_si_lists(pl_sb_C_i_list, 'i', None)

# HEBREW: ".." -> "..im"

pl_sb_C_im_list = (
    "goy", "seraph", "cherub",
)
(si_sb_C_im_list, si_sb_C_im_bysize,
 pl_sb_C_im_bysize, pl_sb_C_im) = make_pl_si_lists(pl_sb_C_im_list, 'im', None)

# UNCONDITIONAL "..man" -> "..mans"

pl_sb_U_man_mans_list = """
    ataman caiman cayman ceriman
    desman dolman farman harman hetman
    human leman ottoman shaman talisman
""".split()
pl_sb_U_man_mans_caps_list = """
    Alabaman Bahaman Burman German
    Hiroshiman Liman Nakayaman Norman Oklahoman
    Panaman Roman Selman Sonaman Tacoman Yakiman
    Yokohaman Yuman
""".split()

(si_sb_U_man_mans_list, si_sb_U_man_mans_bysize,
 pl_sb_U_man_mans_bysize) = make_pl_si_lists(pl_sb_U_man_mans_list, 's', None, dojoinstem=False)
(si_sb_U_man_mans_caps_list, si_sb_U_man_mans_caps_bysize,
 pl_sb_U_man_mans_caps_bysize) = make_pl_si_lists(pl_sb_U_man_mans_caps_list, 's', None, dojoinstem=False)

pl_sb_uninflected_s_complete = [
    # PAIRS OR GROUPS SUBSUMED TO A SINGULAR...
    "breeches", "britches", "pajamas", "pyjamas", "clippers", "gallows",
    "hijinks", "headquarters", "pliers", "scissors", "testes", "herpes",
    "pincers", "shears", "proceedings", "trousers",

    # UNASSIMILATED LATIN 4th DECLENSION
    "cantus", "coitus", "nexus",

    # RECENT IMPORTS...
    "contretemps", "corps", "debris",
    "siemens",

    # DISEASES
    "mumps",

    # MISCELLANEOUS OTHERS...
    "diabetes", "jackanapes", "series", "species", "subspecies", "rabies",
    "chassis", "innings", "news", "mews", "haggis",
]

pl_sb_uninflected_s_endings = [
    # RECENT IMPORTS...
    "ois",

    # DISEASES
    "measles",
]

pl_sb_uninflected_s = pl_sb_uninflected_s_complete + ['.*%s' % w for w in pl_sb_uninflected_s_endings]

pl_sb_uninflected_herd = (
    # DON'T INFLECT IN CLASSICAL MODE, OTHERWISE NORMAL INFLECTION
    "wildebeest", "swine", "eland", "bison", "buffalo",
    "elk", "rhinoceros", 'zucchini',
    'caribou', 'dace', 'grouse', 'guinea fowl', 'guinea-fowl',
    'haddock', 'hake', 'halibut', 'herring', 'mackerel',
    'pickerel', 'pike', 'roe', 'seed', 'shad',
    'snipe', 'teal', 'turbot', 'water fowl', 'water-fowl',
)

pl_sb_uninflected_complete = [
    # SOME FISH AND HERD ANIMALS
    "tuna", "salmon", "mackerel", "trout",
    "bream", "sea-bass", "sea bass", "carp", "cod", "flounder", "whiting",
    "moose",

    # OTHER ODDITIES
    "graffiti", "djinn", 'samuri',
    'offspring', 'pence', 'quid', 'hertz',
] + pl_sb_uninflected_s_complete

# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE)

pl_sb_uninflected_caps = [
    # ALL NATIONALS ENDING IN -ese
    "Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese",
    "Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese",
    "Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese",
    "Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese",
    "Shavese", "Vermontese", "Wenchowese", "Yengeese",
]

pl_sb_uninflected_endings = [
    # SOME FISH AND HERD ANIMALS
    "fish",
    "deer", "sheep",

    # ALL NATIONALS ENDING IN -ese
    "nese", "rese", "lese", "mese",

    # DISEASES
    "pox",

    # OTHER ODDITIES
    'craft',
] + pl_sb_uninflected_s_endings

# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE)

pl_sb_uninflected_bysize = bysize(pl_sb_uninflected_endings)

# SINGULAR WORDS ENDING IN ...s (ALL INFLECT WITH ...es)

pl_sb_singular_s_complete = [
    "acropolis", "aegis", "alias", "asbestos", "bathos", "bias",
    "bronchitis", "bursitis", "caddis", "cannabis",
    "canvas", "chaos", "cosmos", "dais", "digitalis",
    "epidermis", "ethos", "eyas", "gas", "glottis",
    "hubris", "ibis", "lens", "mantis", "marquis", "metropolis",
    "pathos", "pelvis", "polis", "rhinoceros",
    "sassafras", "trellis",
] + pl_sb_C_is_ides_complete

pl_sb_singular_s_endings = [
    "ss", "us",
] + pl_sb_C_is_ides_endings
pl_sb_singular_s_bysize = bysize(pl_sb_singular_s_endings)

si_sb_singular_s_complete = ['%ses' % w for w in pl_sb_singular_s_complete]
si_sb_singular_s_endings = ['%ses' % w for w in pl_sb_singular_s_endings]
si_sb_singular_s_bysize = bysize(si_sb_singular_s_endings)

pl_sb_singular_s_es = [
    "[A-Z].*es",
]

pl_sb_singular_s = enclose('|'.join(pl_sb_singular_s_complete +
                                    ['.*%s' % w for w in pl_sb_singular_s_endings] +
                                    pl_sb_singular_s_es))

# PLURALS ENDING IN uses -> use

si_sb_ois_oi_case = (
    'Bolshois', 'Hanois'
)

si_sb_uses_use_case = (
    'Betelgeuses', 'Duses', 'Meuses', 'Syracuses', 'Toulouses',
)
si_sb_uses_use = (
    'abuses', 'applauses', 'blouses',
    'carouses', 'causes', 'chartreuses', 'clauses',
    'contuses', 'douses', 'excuses', 'fuses',
    'grouses', 'hypotenuses', 'masseuses',
    'menopauses', 'misuses', 'muses', 'overuses', 'pauses',
    'peruses', 'profuses', 'recluses', 'reuses',
    'ruses', 'souses', 'spouses', 'suffuses', 'transfuses', 'uses',
)

si_sb_ies_ie_case = (
    'Addies', 'Aggies', 'Allies', 'Amies', 'Angies', 'Annies',
    'Annmaries', 'Archies', 'Arties', 'Aussies', 'Barbies',
    'Barries', 'Basies', 'Bennies', 'Bernies', 'Berties', 'Bessies',
    'Betties', 'Billies', 'Blondies', 'Bobbies', 'Bonnies',
    'Bowies', 'Brandies', 'Bries', 'Brownies', 'Callies',
    'Carnegies', 'Carries', 'Cassies', 'Charlies', 'Cheries',
    'Christies', 'Connies', 'Curies', 'Dannies', 'Debbies', 'Dixies',
    'Dollies', 'Donnies', 'Drambuies', 'Eddies', 'Effies', 'Ellies',
    'Elsies', 'Eries', 'Ernies', 'Essies', 'Eugenies', 'Fannies',
    'Flossies', 'Frankies', 'Freddies', 'Gillespies', 'Goldies',
    'Gracies', 'Guthries', 'Hallies', 'Hatties', 'Hetties',
    'Hollies', 'Jackies', 'Jamies', 'Janies', 'Jannies', 'Jeanies',
    'Jeannies', 'Jennies', 'Jessies', 'Jimmies', 'Jodies', 'Johnies',
    'Johnnies', 'Josies', 'Julies', 'Kalgoorlies', 'Kathies', 'Katies',
    'Kellies', 'Kewpies', 'Kristies', 'Laramies', 'Lassies', 'Lauries',
    'Leslies', 'Lessies', 'Lillies', 'Lizzies', 'Lonnies', 'Lories',
    'Lorries', 'Lotties', 'Louies', 'Mackenzies', 'Maggies', 'Maisies',
    'Mamies', 'Marcies', 'Margies', 'Maries', 'Marjories', 'Matties',
    'McKenzies', 'Melanies', 'Mickies', 'Millies', 'Minnies', 'Mollies',
    'Mounties', 'Nannies', 'Natalies', 'Nellies', 'Netties', 'Ollies',
    'Ozzies', 'Pearlies', 'Pottawatomies', 'Reggies', 'Richies', 'Rickies',
    'Robbies', 'Ronnies', 'Rosalies', 'Rosemaries', 'Rosies', 'Roxies',
    'Rushdies', 'Ruthies', 'Sadies', 'Sallies', 'Sammies', 'Scotties',
    'Selassies', 'Sherries', 'Sophies', 'Stacies', 'Stefanies', 'Stephanies',
    'Stevies', 'Susies', 'Sylvies', 'Tammies', 'Terries', 'Tessies',
    'Tommies', 'Tracies', 'Trekkies', 'Valaries', 'Valeries', 'Valkyries',
    'Vickies', 'Virgies', 'Willies', 'Winnies', 'Wylies', 'Yorkies',
)

si_sb_ies_ie = (
    'aeries', 'baggies', 'belies', 'biggies', 'birdies', 'bogies',
    'bonnies', 'boogies', 'bookies', 'bourgeoisies', 'brownies',
    'budgies', 'caddies', 'calories', 'camaraderies', 'cockamamies',
    'collies', 'cookies', 'coolies', 'cooties', 'coteries', 'crappies',
    'curies', 'cutesies', 'dogies', 'eyrie', 'floozies', 'footsies',
    'freebies', 'genies', 'goalies', 'groupies',
    'hies', 'jalousies', 'junkies',
    'kiddies', 'laddies', 'lassies', 'lies',
    'lingeries', 'magpies', 'menageries', 'mommies', 'movies', 'neckties',
    'newbies', 'nighties', 'oldies', 'organdies', 'overlies',
    'pies', 'pinkies', 'pixies', 'potpies', 'prairies',
    'quickies', 'reveries', 'rookies', 'rotisseries', 'softies', 'sorties',
    'species', 'stymies', 'sweeties', 'ties', 'underlies', 'unties',
    'veggies', 'vies', 'yuppies', 'zombies',
)

si_sb_oes_oe_case = (
    'Chloes', 'Crusoes', 'Defoes', 'Faeroes', 'Ivanhoes', 'Joes',
    'McEnroes', 'Moes', 'Monroes', 'Noes', 'Poes', 'Roscoes',
    'Tahoes', 'Tippecanoes', 'Zoes',
)
si_sb_oes_oe = (
    'aloes', 'backhoes', 'canoes',
    'does', 'floes', 'foes', 'hoes', 'mistletoes',
    'oboes', 'pekoes', 'roes', 'sloes',
    'throes', 'tiptoes', 'toes', 'woes',
)

si_sb_z_zes = (
    "quartzes", "topazes",
)

si_sb_zzes_zz = (
    'buzzes', 'fizzes', 'frizzes', 'razzes'
)

si_sb_ches_che_case = (
    'Andromaches', 'Apaches', 'Blanches', 'Comanches',
    'Nietzsches', 'Porsches', 'Roches',
)
si_sb_ches_che = (
    'aches', 'avalanches', 'backaches', 'bellyaches', 'caches',
    'cloches', 'creches', 'douches', 'earaches', 'fiches',
    'headaches', 'heartaches', 'microfiches',
    'niches', 'pastiches', 'psyches', 'quiches',
    'stomachaches', 'toothaches',
)

si_sb_xes_xe = (
    'annexes', 'axes', 'deluxes', 'pickaxes',
)

si_sb_sses_sse_case = (
    'Hesses', 'Jesses', 'Larousses', 'Matisses',
)
si_sb_sses_sse = (
    'bouillabaisses', 'crevasses', 'demitasses', 'impasses',
    'mousses', 'posses',
)

si_sb_ves_ve_case = (
    # *[nwl]ives -> [nwl]live
    'Clives', 'Palmolives',
)
si_sb_ves_ve = (
    # *[^d]eaves -> eave
    'interweaves', 'weaves',

    # *[nwl]ives -> [nwl]live
    'olives',

    # *[eoa]lves -> [eoa]lve
    'bivalves', 'dissolves', 'resolves', 'salves', 'twelves', 'valves',
)

plverb_special_s = enclose('|'.join(
    [pl_sb_singular_s] +
    pl_sb_uninflected_s +
    list(pl_sb_irregular_s.keys()) + [
        '(.*[csx])is',
        '(.*)ceps',
        '[A-Z].*s',
    ]
))

pl_sb_postfix_adj = {
    'general': [r'(?!major|lieutenant|brigadier|adjutant|.*star)\S+'],
    'martial': ['court'],
}

for k in list(pl_sb_postfix_adj.keys()):
    pl_sb_postfix_adj[k] = enclose(
        enclose('|'.join(pl_sb_postfix_adj[k])) +
        "(?=(?:-|\\s+)%s)" % k)

pl_sb_postfix_adj_stems = '(' + '|'.join(list(pl_sb_postfix_adj.values())) + ')(.*)'

# PLURAL WORDS ENDING IN es GO TO SINGULAR is

si_sb_es_is = (
    'amanuenses', 'amniocenteses', 'analyses', 'antitheses',
    'apotheoses', 'arterioscleroses', 'atheroscleroses', 'axes',
    # 'bases', # bases -> basis
    'catalyses', 'catharses', 'chasses', 'cirrhoses',
    'cocces', 'crises', 'diagnoses', 'dialyses', 'diereses',
    'electrolyses', 'emphases', 'exegeses', 'geneses',
    'halitoses', 'hydrolyses', 'hypnoses', 'hypotheses', 'hystereses',
    'metamorphoses', 'metastases', 'misdiagnoses', 'mitoses',
    'mononucleoses', 'narcoses', 'necroses', 'nemeses', 'neuroses',
    'oases', 'osmoses', 'osteoporoses', 'paralyses', 'parentheses',
    'parthenogeneses', 'periphrases', 'photosyntheses', 'probosces',
    'prognoses', 'prophylaxes', 'prostheses', 'preces', 'psoriases',
    'psychoanalyses', 'psychokineses', 'psychoses', 'scleroses',
    'scolioses', 'sepses', 'silicoses', 'symbioses', 'synopses',
    'syntheses', 'taxes', 'telekineses', 'theses', 'thromboses',
    'tuberculoses', 'urinalyses',
)

pl_prep_list = """
    about above across after among around at athwart before behind
    below beneath beside besides between betwixt beyond but by
    during except for from in into near of off on onto out over
    since till to under until unto upon with""".split()

pl_prep_list_da = pl_prep_list + ['de', 'du', 'da']

pl_prep_bysize = bysize(pl_prep_list_da)

pl_prep = enclose('|'.join(pl_prep_list_da))

pl_sb_prep_dual_compound = r'(.*?)((?:-|\s+)(?:' + pl_prep + r')(?:-|\s+))a(?:-|\s+)(.*)'

singular_pronoun_genders = set(['neuter',
                                'feminine',
                                'masculine',
                                'gender-neutral',
                                'feminine or masculine',
                                'masculine or feminine'])

pl_pron_nom = {
    # NOMINATIVE    REFLEXIVE
    "i": "we", "myself": "ourselves",
    "you": "you", "yourself": "yourselves",
    "she": "they", "herself": "themselves",
    "he": "they", "himself": "themselves",
    "it": "they", "itself": "themselves",
    "they": "they", "themself": "themselves",

    # POSSESSIVE
    "mine": "ours",
    "yours": "yours",
    "hers": "theirs",
    "his": "theirs",
    "its": "theirs",
    "theirs": "theirs",
}

si_pron = {}
si_pron['nom'] = dict([(v, k) for (k, v) in pl_pron_nom.items()])
si_pron['nom']['we'] = 'I'

pl_pron_acc = {
    # ACCUSATIVE    REFLEXIVE
    "me": "us", "myself": "ourselves",
    "you": "you", "yourself": "yourselves",
    "her": "them", "herself": "themselves",
    "him": "them", "himself": "themselves",
    "it": "them", "itself": "themselves",
    "them": "them", "themself": "themselves",
}

pl_pron_acc_keys = enclose('|'.join(list(pl_pron_acc.keys())))
pl_pron_acc_keys_bysize = bysize(list(pl_pron_acc.keys()))

si_pron['acc'] = dict([(v, k) for (k, v) in pl_pron_acc.items()])

for thecase, plur, gend, sing in (
    ('nom', 'they', 'neuter', 'it'),
    ('nom', 'they', 'feminine', 'she'),
    ('nom', 'they', 'masculine', 'he'),
    ('nom', 'they', 'gender-neutral', 'they'),
    ('nom', 'they', 'feminine or masculine', 'she or he'),
    ('nom', 'they', 'masculine or feminine', 'he or she'),
    ('nom', 'themselves', 'neuter', 'itself'),
    ('nom', 'themselves', 'feminine', 'herself'),
    ('nom', 'themselves', 'masculine', 'himself'),
    ('nom', 'themselves', 'gender-neutral', 'themself'),
    ('nom', 'themselves', 'feminine or masculine', 'herself or himself'),
    ('nom', 'themselves', 'masculine or feminine', 'himself or herself'),
    ('nom', 'theirs', 'neuter', 'its'),
    ('nom', 'theirs', 'feminine', 'hers'),
    ('nom', 'theirs', 'masculine', 'his'),
    ('nom', 'theirs', 'gender-neutral', 'theirs'),
    ('nom', 'theirs', 'feminine or masculine', 'hers or his'),
    ('nom', 'theirs', 'masculine or feminine', 'his or hers'),
    ('acc', 'them', 'neuter', 'it'),
    ('acc', 'them', 'feminine', 'her'),
    ('acc', 'them', 'masculine', 'him'),
    ('acc', 'them', 'gender-neutral', 'them'),
    ('acc', 'them', 'feminine or masculine', 'her or him'),
    ('acc', 'them', 'masculine or feminine', 'him or her'),
    ('acc', 'themselves', 'neuter', 'itself'),
    ('acc', 'themselves', 'feminine', 'herself'),
    ('acc', 'themselves', 'masculine', 'himself'),
    ('acc', 'themselves', 'gender-neutral', 'themself'),
    ('acc', 'themselves', 'feminine or masculine', 'herself or himself'),
    ('acc', 'themselves', 'masculine or feminine', 'himself or herself'),
):
    try:
        si_pron[thecase][plur][gend] = sing
    except TypeError:
        si_pron[thecase][plur] = {}
        si_pron[thecase][plur][gend] = sing

si_pron_acc_keys = enclose('|'.join(list(si_pron['acc'].keys())))
si_pron_acc_keys_bysize = bysize(list(si_pron['acc'].keys()))


def get_si_pron(thecase, word, gender):
    try:
        sing = si_pron[thecase][word]
    except KeyError:
        raise  # not a pronoun
    try:
        return sing[gender]  # has several types due to gender
    except TypeError:
        return sing  # answer independent of gender

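# For example (informal, derived from the si_pron table built above):
#   get_si_pron('nom', 'they', 'feminine')  # -> 'she'
#   get_si_pron('acc', 'them', 'neuter')    # -> 'it'
#   get_si_pron('acc', 'us', 'neuter')      # -> 'me' (gender-independent entry)
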
plverb_irregular_pres = {
    # 1st PERS. SING.   2ND PERS. SING.   3RD PERS. SINGULAR
    #                   3RD PERS. (INDET.)
    "am": "are", "are": "are", "is": "are",
    "was": "were", "were": "were", "was": "were",
    "have": "have", "have": "have", "has": "have",
    "do": "do", "do": "do", "does": "do",
}

plverb_ambiguous_pres = {
    # 1st PERS. SING.   2ND PERS. SING.   3RD PERS. SINGULAR
    #                   3RD PERS. (INDET.)
    "act": "act", "act": "act", "acts": "act",
    "blame": "blame", "blame": "blame", "blames": "blame",
    "can": "can", "can": "can", "can": "can",
    "must": "must", "must": "must", "must": "must",
    "fly": "fly", "fly": "fly", "flies": "fly",
    "copy": "copy", "copy": "copy", "copies": "copy",
    "drink": "drink", "drink": "drink", "drinks": "drink",
    "fight": "fight", "fight": "fight", "fights": "fight",
    "fire": "fire", "fire": "fire", "fires": "fire",
    "like": "like", "like": "like", "likes": "like",
    "look": "look", "look": "look", "looks": "look",
    "make": "make", "make": "make", "makes": "make",
    "reach": "reach", "reach": "reach", "reaches": "reach",
    "run": "run", "run": "run", "runs": "run",
    "sink": "sink", "sink": "sink", "sinks": "sink",
    "sleep": "sleep", "sleep": "sleep", "sleeps": "sleep",
    "view": "view", "view": "view", "views": "view",
}

plverb_ambiguous_pres_keys = enclose('|'.join(list(plverb_ambiguous_pres.keys())))

plverb_irregular_non_pres = (
    "did", "had", "ate", "made", "put",
    "spent", "fought", "sank", "gave", "sought",
    "shall", "could", "ought", "should",
)

plverb_ambiguous_non_pres = enclose('|'.join((
    "thought", "saw", "bent", "will", "might", "cut",
)))

# "..oes" -> "..oe" (the rest are "..oes" -> "o")

pl_v_oes_oe = ('canoes', 'floes', 'oboes', 'roes', 'throes', 'woes')
pl_v_oes_oe_endings_size4 = ('hoes', 'toes')
pl_v_oes_oe_endings_size5 = ('shoes')

pl_count_zero = (
    "0", "no", "zero", "nil"
)

pl_count_one = (
    "1", "a", "an", "one", "each", "every", "this", "that",
)

pl_adj_special = {
    "a": "some", "an": "some",
    "this": "these", "that": "those",
}

pl_adj_special_keys = enclose('|'.join(list(pl_adj_special.keys())))

pl_adj_poss = {
    "my": "our",
    "your": "your",
    "its": "their",
    "her": "their",
    "his": "their",
    "their": "their",
}

pl_adj_poss_keys = enclose('|'.join(list(pl_adj_poss.keys())))


# 2. INDEFINITE ARTICLES

# THIS PATTERN MATCHES STRINGS OF CAPITALS STARTING WITH A "VOWEL-SOUND"
# CONSONANT FOLLOWED BY ANOTHER CONSONANT, AND WHICH ARE NOT LIKELY
# TO BE REAL WORDS (OH, ALL RIGHT THEN, IT'S JUST MAGIC!)

A_abbrev = r"""
(?! FJO | [HLMNS]Y. | RY[EO] | SQU
  | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU])
[FHLMNRSX][A-Z]
"""

# THIS PATTERN CODES THE BEGINNINGS OF ALL ENGLISH WORDS BEGINNING WITH A
# 'y' FOLLOWED BY A CONSONANT. ANY OTHER Y-CONSONANT PREFIX THEREFORE
# IMPLIES AN ABBREVIATION.

A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)'

# EXCEPTIONS TO EXCEPTIONS

A_explicit_a = enclose('|'.join((
    "unabomber", "unanimous", "US",
)))

A_explicit_an = enclose('|'.join((
    "euler",
    "hour(?!i)", "heir", "honest", "hono[ur]",
    "mpeg",
)))

A_ordinal_an = enclose('|'.join((
    "[aefhilmnorsx]-?th",
)))

A_ordinal_a = enclose('|'.join((
    "[bcdgjkpqtuvwyz]-?th",
)))


# NUMERICAL INFLECTIONS

nth = {
    0: 'th',
    1: 'st',
    2: 'nd',
    3: 'rd',
    4: 'th',
    5: 'th',
    6: 'th',
    7: 'th',
    8: 'th',
    9: 'th',
    11: 'th',
    12: 'th',
    13: 'th',
}

ordinal = dict(ty='tieth',
               one='first',
               two='second',
               three='third',
               five='fifth',
               eight='eighth',
               nine='ninth',
               twelve='twelfth')

ordinal_suff = '|'.join(list(ordinal.keys()))

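# Informal illustration of the two tables above (the ordinal() method defined
# later in this file is assumed to consume them): nth maps a trailing digit
# (or the special teens 11-13) to a suffix, e.g. 1 -> 'st', 2 -> 'nd',
# 11 -> 'th'; ordinal maps word endings, e.g. 'twenty' with the 'ty' -> 'tieth'
# rule becomes 'twentieth', and 'nine' becomes 'ninth'.
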
# NUMBERS

unit = ['', 'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine']
teen = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen']
ten = ['', '', 'twenty', 'thirty', 'forty',
       'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
mill = [' ', ' thousand', ' million', ' billion', ' trillion', ' quadrillion',
        ' quintillion', ' sextillion', ' septillion', ' octillion',
        ' nonillion', ' decillion']


# SUPPORT CLASSICAL PLURALIZATIONS

def_classical = dict(
    all=False,
    zero=False,
    herd=False,
    names=True,
    persons=False,
    ancient=False,
)

all_classical = dict((k, True) for k in list(def_classical.keys()))
no_classical = dict((k, False) for k in list(def_classical.keys()))


# TODO: .inflectrc file does not work
# can't just execute methods from another file like this

# for rcfile in (pathjoin(dirname(__file__), '.inflectrc'),
#                expanduser(pathjoin(('~'), '.inflectrc'))):
#     if isfile(rcfile):
#         try:
#             execfile(rcfile)
#         except:
#             print3("\nBad .inflectrc file (%s):\n" % rcfile)
#             raise BadRcFileError

class engine:

    def __init__(self):
        self.classical_dict = def_classical.copy()
        self.persistent_count = None
        self.mill_count = 0
        self.pl_sb_user_defined = []
        self.pl_v_user_defined = []
        self.pl_adj_user_defined = []
        self.si_sb_user_defined = []
        self.A_a_user_defined = []
        self.thegender = 'neuter'

    deprecated_methods = dict(pl='plural',
                              plnoun='plural_noun',
                              plverb='plural_verb',
                              pladj='plural_adj',
                              sinoun='singular_noun',
                              prespart='present_participle',
                              numwords='number_to_words',
                              plequal='compare',
                              plnounequal='compare_nouns',
                              plverbequal='compare_verbs',
                              pladjequal='compare_adjs',
                              wordlist='join',
                              )

    def __getattr__(self, meth):
        if meth in self.deprecated_methods:
            print3('%s() deprecated, use %s()' % (meth, self.deprecated_methods[meth]))
            raise DeprecationWarning
        raise AttributeError

    def defnoun(self, singular, plural):
        '''
        Set the noun plural of singular to plural.
        '''
        self.checkpat(singular)
        self.checkpatplural(plural)
        self.pl_sb_user_defined.extend((singular, plural))
        self.si_sb_user_defined.extend((plural, singular))
        return 1

    def defverb(self, s1, p1, s2, p2, s3, p3):
        '''
        Set the verb plurals for s1, s2 and s3 to p1, p2 and p3 respectively.
        Where 1, 2 and 3 represent the 1st, 2nd and 3rd person forms of the verb.
        '''
        self.checkpat(s1)
        self.checkpat(s2)
        self.checkpat(s3)
        self.checkpatplural(p1)
        self.checkpatplural(p2)
        self.checkpatplural(p3)
        self.pl_v_user_defined.extend((s1, p1, s2, p2, s3, p3))
        return 1

    def defadj(self, singular, plural):
        '''
        Set the adjective plural of singular to plural.
        '''
        self.checkpat(singular)
        self.checkpatplural(plural)
        self.pl_adj_user_defined.extend((singular, plural))
        return 1

    def defa(self, pattern):
        '''
        Define the indefinite article as 'a' for words matching pattern.
        '''
        self.checkpat(pattern)
        self.A_a_user_defined.extend((pattern, 'a'))
        return 1

    def defan(self, pattern):
        '''
        Define the indefinite article as 'an' for words matching pattern.
        '''
        self.checkpat(pattern)
        self.A_a_user_defined.extend((pattern, 'an'))
        return 1

    def checkpat(self, pattern):
        '''
        check for errors in a regex pattern
        '''
        if pattern is None:
            return
        try:
            match(pattern, '')
        except reerror:
            print3("\nBad user-defined singular pattern:\n\t%s\n" % pattern)
            raise BadUserDefinedPatternError

    def checkpatplural(self, pattern):
        '''
        check for errors in a regex replace pattern
        '''
        return
        # can't find a pattern that doesn't pass the following test:
        # if pattern is None:
        #     return
        # try:
        #     resub('', pattern, '')
        # except reerror:
        #     print3("\nBad user-defined plural pattern:\n\t%s\n" % pattern)
        #     raise BadUserDefinedPatternError

    def ud_match(self, word, wordlist):
        for i in range(len(wordlist) - 2, -2, -2):  # backwards through even elements
            mo = search(r'^%s$' % wordlist[i], word, IGNORECASE)
            if mo:
                if wordlist[i + 1] is None:
                    return None
                pl = resub(r'\$(\d+)', r'\\1', wordlist[i + 1])  # change $n to \n for expand
                return mo.expand(pl)
        return None

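    # Informal sketch of how ud_match resolves user-defined patterns (both
    # methods are defined above; 'vax'/'vaxen' are made-up example words):
    #   defnoun('(.*)x', '$1xen') stores the pair in pl_sb_user_defined, and
    #   ud_match('vax', self.pl_sb_user_defined) then rewrites '$1xen' to
    #   r'\1xen' and expands it against the match, giving 'vaxen'.
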
    def classical(self, **kwargs):
        """
        turn classical mode on and off for various categories

        turn on all classical modes:
        classical()
        classical(all=True)

        turn on or off specific classical modes:
        e.g.
        classical(herd=True)
        classical(names=False)

        By default all classical modes are off except names.

        unknown value in args or key in kwargs raises
        exception: UnknownClassicalModeError
        """
        classical_mode = list(def_classical.keys())
        if not kwargs:
            self.classical_dict = all_classical.copy()
            return
        if 'all' in kwargs:
            if kwargs['all']:
                self.classical_dict = all_classical.copy()
            else:
                self.classical_dict = no_classical.copy()

        for k, v in list(kwargs.items()):
            if k in classical_mode:
                self.classical_dict[k] = v
            else:
                raise UnknownClassicalModeError

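    # Typical calls (a hedged sketch; p is an engine instance):
    #   p.classical()               # turn every classical mode on
    #   p.classical(all=False)      # turn them all off
    #   p.classical(herd=True)      # e.g. plural('wildebeest') -> 'wildebeest'
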
    def num(self, count=None, show=None):     # (;$count,$show)
        '''
        Set the number to be used in other method calls.

        Returns count.

        Set show to False to return '' instead.
        '''
        if count is not None:
            try:
                self.persistent_count = int(count)
            except ValueError:
                raise BadNumValueError
            if (show is None) or show:
                return str(count)
        else:
            self.persistent_count = None
        return ''

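    # Hedged usage sketch: p.num(2) returns '2' and makes subsequent calls such
    # as p.plural('cat') use a count of 2 (-> 'cats'); p.num() clears the
    # persistent count again.
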
    def gender(self, gender):
        '''
        set the gender for the singular of plural pronouns

        can be one of:
        'neuter'                  ('they' -> 'it')
        'feminine'                ('they' -> 'she')
        'masculine'               ('they' -> 'he')
        'gender-neutral'          ('they' -> 'they')
        'feminine or masculine'   ('they' -> 'she or he')
        'masculine or feminine'   ('they' -> 'he or she')
        '''
        if gender in singular_pronoun_genders:
            self.thegender = gender
        else:
            raise BadGenderError

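    # Hedged sketch: after p.gender('feminine'), singular_noun('they') comes
    # back as 'she', per the singular pronoun gender table defined above.
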
    def nummo(self, matchobject):
        '''
        num but take a matchobject
        use groups 1 and 2 in matchobject
        '''
        return self.num(matchobject.group(1), matchobject.group(2))

    def plmo(self, matchobject):
        '''
        plural but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        return self.plural(matchobject.group(1), matchobject.group(3))

    def plnounmo(self, matchobject):
        '''
        plural_noun but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        return self.plural_noun(matchobject.group(1), matchobject.group(3))

    def plverbmo(self, matchobject):
        '''
        plural_verb but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        return self.plural_verb(matchobject.group(1), matchobject.group(3))

    def pladjmo(self, matchobject):
        '''
        plural_adj but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        return self.plural_adj(matchobject.group(1), matchobject.group(3))

    def sinounmo(self, matchobject):
        '''
        singular_noun but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        return self.singular_noun(matchobject.group(1), matchobject.group(3))

    def amo(self, matchobject):
        '''
        A but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        if matchobject.group(3) is None:
            return self.a(matchobject.group(1))
        return self.a(matchobject.group(1), matchobject.group(3))

    def nomo(self, matchobject):
        '''
        NO but take a matchobject
        use groups 1 and 3 in matchobject
        '''
        return self.no(matchobject.group(1), matchobject.group(3))

    def ordinalmo(self, matchobject):
        '''
        ordinal but take a matchobject
        use group 1
        '''
        return self.ordinal(matchobject.group(1))

    def numwordsmo(self, matchobject):
        '''
        number_to_words but take a matchobject
        use group 1
        '''
        return self.number_to_words(matchobject.group(1))

    def prespartmo(self, matchobject):
        '''
        prespart but take a matchobject
        use group 1
        '''
        return self.present_participle(matchobject.group(1))

  1156. # 0. PERFORM GENERAL INFLECTIONS IN A STRING
  1157. def inflect(self, text):
  1158. '''
  1159. Perform inflections in a string.
  1160. e.g. inflect('The plural of cat is plural(cat)') returns
  1161. 'The plural of cat is cats'
  1162. can use plural, plural_noun, plural_verb, plural_adj, singular_noun, a, an, no, ordinal,
  1163. number_to_words and prespart
  1164. '''
  1165. save_persistent_count = self.persistent_count
  1166. sections = splitre(r"(num\([^)]*\))", text)
  1167. inflection = []
  1168. for section in sections:
  1169. (section, count) = subn(r"num\(\s*?(?:([^),]*)(?:,([^)]*))?)?\)", self.nummo, section)
  1170. if not count:
  1171. total = -1
  1172. while total:
  1173. (section, total) = subn(
  1174. r"(?x)\bplural \( ([^),]*) (, ([^)]*) )? \) ",
  1175. self.plmo, section)
  1176. (section, count) = subn(
  1177. r"(?x)\bplural_noun \( ([^),]*) (, ([^)]*) )? \) ",
  1178. self.plnounmo, section)
  1179. total += count
  1180. (section, count) = subn(
  1181. r"(?x)\bplural_verb \( ([^),]*) (, ([^)]*) )? \) ",
  1182. self.plverbmo, section)
  1183. total += count
  1184. (section, count) = subn(
  1185. r"(?x)\bplural_adj \( ([^),]*) (, ([^)]*) )? \) ",
  1186. self.pladjmo, section)
  1187. total += count
  1188. (section, count) = subn(
  1189. r"(?x)\bsingular_noun \( ([^),]*) (, ([^)]*) )? \) ",
  1190. self.sinounmo, section)
  1191. total += count
  1192. (section, count) = subn(
  1193. r"(?x)\ban? \( ([^),]*) (, ([^)]*) )? \) ",
  1194. self.amo, section)
  1195. total += count
  1196. (section, count) = subn(
  1197. r"(?x)\bno \( ([^),]*) (, ([^)]*) )? \) ",
  1198. self.nomo, section)
  1199. total += count
  1200. (section, count) = subn(
  1201. r"(?x)\bordinal \( ([^)]*) \) ",
  1202. self.ordinalmo, section)
  1203. total += count
  1204. (section, count) = subn(
  1205. r"(?x)\bnumber_to_words \( ([^)]*) \) ",
  1206. self.numwordsmo, section)
  1207. total += count
  1208. (section, count) = subn(
  1209. r"(?x)\bpresent_participle \( ([^)]*) \) ",
  1210. self.prespartmo, section)
  1211. total += count
  1212. inflection.append(section)
  1213. self.persistent_count = save_persistent_count
  1214. return "".join(inflection)
  1215. # ## PLURAL SUBROUTINES
  1216. def postprocess(self, orig, inflected):
  1217. """
  1218. FIX PEDANTRY AND CAPITALIZATION :-)
  1219. """
  1220. if '|' in inflected:
  1221. inflected = inflected.split('|')[self.classical_dict['all']]
  1222. if orig == "I":
  1223. return inflected
  1224. if orig == orig.upper():
  1225. return inflected.upper()
  1226. if orig[0] == orig[0].upper():
  1227. return '%s%s' % (inflected[0].upper(),
  1228. inflected[1:])
  1229. return inflected
  1230. def partition_word(self, text):
  1231. mo = search(r'\A(\s*)(.+?)(\s*)\Z', text)
  1232. try:
  1233. return mo.group(1), mo.group(2), mo.group(3)
  1234. except AttributeError: # empty string
  1235. return '', '', ''
  1236. # def pl(self, *args, **kwds):
  1237. # print 'pl() deprecated, use plural()'
  1238. # raise DeprecationWarning
  1239. # return self.plural(*args, **kwds)
  1240. #
  1241. # def plnoun(self, *args, **kwds):
  1242. # print 'plnoun() deprecated, use plural_noun()'
  1243. # raise DeprecationWarning
  1244. # return self.plural_noun(*args, **kwds)
  1245. #
  1246. # def plverb(self, *args, **kwds):
  1247. # print 'plverb() deprecated, use plural_verb()'
  1248. # raise DeprecationWarning
  1249. # return self.plural_verb(*args, **kwds)
  1250. #
  1251. # def pladj(self, *args, **kwds):
  1252. # print 'pladj() deprecated, use plural_adj()'
  1253. # raise DeprecationWarning
  1254. # return self.plural_adj(*args, **kwds)
  1255. #
  1256. # def sinoun(self, *args, **kwds):
  1257. # print 'sinoun() deprecated, use singular_noun()'
  1258. # raise DeprecationWarning
  1259. # return self.singular_noun(*args, **kwds)
  1260. #
  1261. # def prespart(self, *args, **kwds):
  1262. # print 'prespart() deprecated, use present_participle()'
  1263. # raise DeprecationWarning
  1264. # return self.present_participle(*args, **kwds)
  1265. #
  1266. # def numwords(self, *args, **kwds):
  1267. # print 'numwords() deprecated, use number_to_words()'
  1268. # raise DeprecationWarning
  1269. # return self.number_to_words(*args, **kwds)
  1270. def plural(self, text, count=None):
  1271. '''
  1272. Return the plural of text.
  1273. If count supplied, then return text if count is one of:
  1274. 1, a, an, one, each, every, this, that
  1275. otherwise return the plural.
  1276. Whitespace at the start and end is preserved.
  1277. '''
  1278. pre, word, post = self.partition_word(text)
  1279. if not word:
  1280. return text
  1281. plural = self.postprocess(
  1282. word,
  1283. self._pl_special_adjective(word, count) or
  1284. self._pl_special_verb(word, count) or
  1285. self._plnoun(word, count))
  1286. return "%s%s%s" % (pre, plural, post)
  1287. def plural_noun(self, text, count=None):
  1288. '''
  1289. Return the plural of text, where text is a noun.
  1290. If count supplied, then return text if count is one of:
  1291. 1, a, an, one, each, every, this, that
  1292. otherwise return the plural.
  1293. Whitespace at the start and end is preserved.
  1294. '''
  1295. pre, word, post = self.partition_word(text)
  1296. if not word:
  1297. return text
  1298. plural = self.postprocess(word, self._plnoun(word, count))
  1299. return "%s%s%s" % (pre, plural, post)
  1300. def plural_verb(self, text, count=None):
  1301. '''
  1302. Return the plural of text, where text is a verb.
  1303. If count supplied, then return text if count is one of:
  1304. 1, a, an, one, each, every, this, that
  1305. otherwise return the plural.
  1306. Whitespace at the start and end is preserved.
  1307. '''
  1308. pre, word, post = self.partition_word(text)
  1309. if not word:
  1310. return text
  1311. plural = self.postprocess(word, self._pl_special_verb(word, count) or
  1312. self._pl_general_verb(word, count))
  1313. return "%s%s%s" % (pre, plural, post)
  1314. def plural_adj(self, text, count=None):
  1315. '''
  1316. Return the plural of text, where text is an adjective.
  1317. If count supplied, then return text if count is one of:
  1318. 1, a, an, one, each, every, this, that
  1319. otherwise return the plural.
  1320. Whitespace at the start and end is preserved.
  1321. '''
  1322. pre, word, post = self.partition_word(text)
  1323. if not word:
  1324. return text
  1325. plural = self.postprocess(word, self._pl_special_adjective(word, count) or word)
  1326. return "%s%s%s" % (pre, plural, post)
  1327. def compare(self, word1, word2):
  1328. '''
  1329. compare word1 and word2 for equality regardless of plurality
  1330. return values:
  1331. eq - the strings are equal
  1332. p:s - word1 is the plural of word2
  1333. s:p - word2 is the plural of word1
1334. p:p - word1 and word2 are two different plural forms of the same word
  1335. False - otherwise
  1336. '''
  1337. return (
  1338. self._plequal(word1, word2, self.plural_noun) or
  1339. self._plequal(word1, word2, self.plural_verb) or
  1340. self._plequal(word1, word2, self.plural_adj))
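# Illustrative usage sketch (assumes `p = inflect.engine()`):
#
#     p.compare('cat', 'cats')        # -> 's:p'
#     p.compare('cats', 'cat')        # -> 'p:s'
#     p.compare('cat', 'cat')         # -> 'eq'
#     p.compare('cat', 'dog')         # -> False
#     p.compare_verbs('runs', 'run')  # -> 's:p'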
  1341. def compare_nouns(self, word1, word2):
  1342. '''
  1343. compare word1 and word2 for equality regardless of plurality
  1344. word1 and word2 are to be treated as nouns
  1345. return values:
  1346. eq - the strings are equal
  1347. p:s - word1 is the plural of word2
  1348. s:p - word2 is the plural of word1
1349. p:p - word1 and word2 are two different plural forms of the same word
  1350. False - otherwise
  1351. '''
  1352. return self._plequal(word1, word2, self.plural_noun)
  1353. def compare_verbs(self, word1, word2):
  1354. '''
  1355. compare word1 and word2 for equality regardless of plurality
  1356. word1 and word2 are to be treated as verbs
  1357. return values:
  1358. eq - the strings are equal
  1359. p:s - word1 is the plural of word2
  1360. s:p - word2 is the plural of word1
1361. p:p - word1 and word2 are two different plural forms of the same word
  1362. False - otherwise
  1363. '''
  1364. return self._plequal(word1, word2, self.plural_verb)
  1365. def compare_adjs(self, word1, word2):
  1366. '''
  1367. compare word1 and word2 for equality regardless of plurality
  1368. word1 and word2 are to be treated as adjectives
  1369. return values:
  1370. eq - the strings are equal
  1371. p:s - word1 is the plural of word2
  1372. s:p - word2 is the plural of word1
1373. p:p - word1 and word2 are two different plural forms of the same word
  1374. False - otherwise
  1375. '''
  1376. return self._plequal(word1, word2, self.plural_adj)
  1377. def singular_noun(self, text, count=None, gender=None):
  1378. '''
  1379. Return the singular of text, where text is a plural noun.
1380. If count is supplied, then return the singular if count is one of:
1381. 1, a, an, one, each, every, this, that, or if count is None;
1382. otherwise return text unchanged.
  1383. Whitespace at the start and end is preserved.
  1384. '''
  1385. pre, word, post = self.partition_word(text)
  1386. if not word:
  1387. return text
  1388. sing = self._sinoun(word, count=count, gender=gender)
  1389. if sing is not False:
  1390. plural = self.postprocess(word, self._sinoun(word, count=count, gender=gender))
  1391. return "%s%s%s" % (pre, plural, post)
  1392. return False
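# Illustrative usage sketch (assumes `p = inflect.engine()`); note that the
# return value is False, not the input, when text is not a recognised plural:
#
#     p.singular_noun('mice')    # -> 'mouse'
#     p.singular_noun('cats')    # -> 'cat'
#     p.singular_noun('cat')     # -> False
#     p.singular_noun('they', gender='feminine')    # -> 'she'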
  1393. def _plequal(self, word1, word2, pl):
  1394. classval = self.classical_dict.copy()
  1395. self.classical_dict = all_classical.copy()
  1396. if word1 == word2:
  1397. return "eq"
  1398. if word1 == pl(word2):
  1399. return "p:s"
  1400. if pl(word1) == word2:
  1401. return "s:p"
  1402. self.classical_dict = no_classical.copy()
  1403. if word1 == pl(word2):
  1404. return "p:s"
  1405. if pl(word1) == word2:
  1406. return "s:p"
  1407. self.classical_dict = classval.copy()
  1408. if pl == self.plural or pl == self.plural_noun:
  1409. if self._pl_check_plurals_N(word1, word2):
  1410. return "p:p"
  1411. if self._pl_check_plurals_N(word2, word1):
  1412. return "p:p"
  1413. if pl == self.plural or pl == self.plural_adj:
  1414. if self._pl_check_plurals_adj(word1, word2):
  1415. return "p:p"
  1416. return False
  1417. def _pl_reg_plurals(self, pair, stems, end1, end2):
  1418. if search(r"(%s)(%s\|\1%s|%s\|\1%s)" % (stems, end1, end2, end2, end1), pair):
  1419. return True
  1420. return False
  1421. def _pl_check_plurals_N(self, word1, word2):
  1422. pair = "%s|%s" % (word1, word2)
  1423. if pair in list(pl_sb_irregular_s.values()):
  1424. return True
  1425. if pair in list(pl_sb_irregular.values()):
  1426. return True
  1427. if pair in list(pl_sb_irregular_caps.values()):
  1428. return True
  1429. for (stems, end1, end2) in (
  1430. (pl_sb_C_a_ata, "as", "ata"),
  1431. (pl_sb_C_is_ides, "is", "ides"),
  1432. (pl_sb_C_a_ae, "s", "e"),
  1433. (pl_sb_C_en_ina, "ens", "ina"),
  1434. (pl_sb_C_um_a, "ums", "a"),
  1435. (pl_sb_C_us_i, "uses", "i"),
  1436. (pl_sb_C_on_a, "ons", "a"),
  1437. (pl_sb_C_o_i_stems, "os", "i"),
  1438. (pl_sb_C_ex_ices, "exes", "ices"),
  1439. (pl_sb_C_ix_ices, "ixes", "ices"),
  1440. (pl_sb_C_i, "s", "i"),
  1441. (pl_sb_C_im, "s", "im"),
  1442. ('.*eau', "s", "x"),
  1443. ('.*ieu', "s", "x"),
  1444. ('.*tri', "xes", "ces"),
  1445. ('.{2,}[yia]n', "xes", "ges")
  1446. ):
  1447. if self._pl_reg_plurals(pair, stems, end1, end2):
  1448. return True
  1449. return False
  1450. def _pl_check_plurals_adj(self, word1, word2):
  1451. # VERSION: tuple in endswith requires python 2.5
  1452. word1a = word1[:word1.rfind("'")] if word1.endswith(("'s", "'")) else ''
  1453. word2a = word2[:word2.rfind("'")] if word2.endswith(("'s", "'")) else ''
  1454. # TODO: BUG? report upstream. I don't think you should chop off the s'
  1455. # word1b = word1[:-2] if word1.endswith("s'") else ''
  1456. # word2b = word2[:-2] if word2.endswith("s'") else ''
  1457. # TODO: dresses', dresses's -> dresses, dresses when chop off letters
  1458. # then they return False because they are the same. Need to fix this.
  1459. if word1a:
  1460. if word2a and (self._pl_check_plurals_N(word1a, word2a)
  1461. or self._pl_check_plurals_N(word2a, word1a)):
  1462. return True
  1463. # if word2b and ( self._pl_check_plurals_N(word1a, word2b)
  1464. # or self._pl_check_plurals_N(word2b, word1a) ):
  1465. # return True
  1466. # if word1b:
  1467. # if word2a and ( self._pl_check_plurals_N(word1b, word2a)
  1468. # or self._pl_check_plurals_N(word2a, word1b) ):
  1469. # return True
  1470. # if word2b and ( self._pl_check_plurals_N(word1b, word2b)
  1471. # or self._pl_check_plurals_N(word2b, word1b) ):
  1472. # return True
  1473. return False
  1474. def get_count(self, count=None):
  1475. if count is None and self.persistent_count is not None:
  1476. count = self.persistent_count
  1477. if count is not None:
  1478. count = 1 if ((str(count) in pl_count_one) or
  1479. (self.classical_dict['zero'] and str(count).lower() in pl_count_zero)) else 2
  1480. else:
  1481. count = ''
  1482. return count
  1483. # @profile
  1484. def _plnoun(self, word, count=None):
  1485. count = self.get_count(count)
  1486. # DEFAULT TO PLURAL
  1487. if count == 1:
  1488. return word
  1489. # HANDLE USER-DEFINED NOUNS
  1490. value = self.ud_match(word, self.pl_sb_user_defined)
  1491. if value is not None:
  1492. return value
  1493. # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS
  1494. if word == '':
  1495. return word
  1496. lowerword = word.lower()
  1497. if lowerword in pl_sb_uninflected_complete:
  1498. return word
  1499. if word in pl_sb_uninflected_caps:
  1500. return word
  1501. for k, v in pl_sb_uninflected_bysize.items():
  1502. if lowerword[-k:] in v:
  1503. return word
  1504. if (self.classical_dict['herd'] and lowerword in pl_sb_uninflected_herd):
  1505. return word
  1506. # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.)
  1507. mo = search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, IGNORECASE)
  1508. if mo and mo.group(2) != '':
  1509. return "%s%s" % (self._plnoun(mo.group(1), 2), mo.group(2))
  1510. if ' a ' in lowerword or '-a-' in lowerword:
  1511. mo = search(r"^(?:%s)$" % pl_sb_prep_dual_compound, word, IGNORECASE)
  1512. if mo and mo.group(2) != '' and mo.group(3) != '':
  1513. return "%s%s%s" % (self._plnoun(mo.group(1), 2),
  1514. mo.group(2),
  1515. self._plnoun(mo.group(3)))
  1516. lowersplit = lowerword.split(' ')
  1517. if len(lowersplit) >= 3:
  1518. for numword in range(1, len(lowersplit) - 1):
  1519. if lowersplit[numword] in pl_prep_list_da:
  1520. return ' '.join(
  1521. lowersplit[:numword - 1] +
  1522. [self._plnoun(lowersplit[numword - 1], 2)] + lowersplit[numword:])
  1523. lowersplit = lowerword.split('-')
  1524. if len(lowersplit) >= 3:
  1525. for numword in range(1, len(lowersplit) - 1):
  1526. if lowersplit[numword] in pl_prep_list_da:
  1527. return ' '.join(
  1528. lowersplit[:numword - 1] +
  1529. [self._plnoun(lowersplit[numword - 1], 2) +
  1530. '-' + lowersplit[numword] + '-']) + ' '.join(lowersplit[(numword + 1):])
  1531. # HANDLE PRONOUNS
  1532. for k, v in pl_pron_acc_keys_bysize.items():
1533. if lowerword[-k:] in v: # ends with accusative pronoun
  1534. for pk, pv in pl_prep_bysize.items():
  1535. if lowerword[:pk] in pv: # starts with a prep
  1536. if lowerword.split() == [lowerword[:pk], lowerword[-k:]]: # only whitespace in between
  1537. return lowerword[:-k] + pl_pron_acc[lowerword[-k:]]
  1538. try:
  1539. return pl_pron_nom[word.lower()]
  1540. except KeyError:
  1541. pass
  1542. try:
  1543. return pl_pron_acc[word.lower()]
  1544. except KeyError:
  1545. pass
  1546. # HANDLE ISOLATED IRREGULAR PLURALS
  1547. wordsplit = word.split()
  1548. wordlast = wordsplit[-1]
  1549. lowerwordlast = wordlast.lower()
  1550. if wordlast in list(pl_sb_irregular_caps.keys()):
  1551. llen = len(wordlast)
  1552. return '%s%s' % (word[:-llen],
  1553. pl_sb_irregular_caps[wordlast])
  1554. if lowerwordlast in list(pl_sb_irregular.keys()):
  1555. llen = len(lowerwordlast)
  1556. return '%s%s' % (word[:-llen],
  1557. pl_sb_irregular[lowerwordlast])
  1558. if (' '.join(wordsplit[-2:])).lower() in list(pl_sb_irregular_compound.keys()):
  1559. llen = len(' '.join(wordsplit[-2:])) # TODO: what if 2 spaces between these words?
  1560. return '%s%s' % (word[:-llen],
  1561. pl_sb_irregular_compound[(' '.join(wordsplit[-2:])).lower()])
  1562. if lowerword[-3:] == 'quy':
  1563. return word[:-1] + 'ies'
  1564. if lowerword[-6:] == 'person':
  1565. if self.classical_dict['persons']:
  1566. return word + 's'
  1567. else:
  1568. return word[:-4] + 'ople'
  1569. # HANDLE FAMILIES OF IRREGULAR PLURALS
  1570. if lowerword[-3:] == 'man':
  1571. for k, v in pl_sb_U_man_mans_bysize.items():
  1572. if lowerword[-k:] in v:
  1573. return word + 's'
  1574. for k, v in pl_sb_U_man_mans_caps_bysize.items():
  1575. if word[-k:] in v:
  1576. return word + 's'
  1577. return word[:-3] + 'men'
  1578. if lowerword[-5:] == 'mouse':
  1579. return word[:-5] + 'mice'
  1580. if lowerword[-5:] == 'louse':
  1581. return word[:-5] + 'lice'
  1582. if lowerword[-5:] == 'goose':
  1583. return word[:-5] + 'geese'
  1584. if lowerword[-5:] == 'tooth':
  1585. return word[:-5] + 'teeth'
  1586. if lowerword[-4:] == 'foot':
  1587. return word[:-4] + 'feet'
  1588. if lowerword == 'die':
  1589. return 'dice'
  1590. # HANDLE UNASSIMILATED IMPORTS
  1591. if lowerword[-4:] == 'ceps':
  1592. return word
  1593. if lowerword[-4:] == 'zoon':
  1594. return word[:-2] + 'a'
  1595. if lowerword[-3:] in ('cis', 'sis', 'xis'):
  1596. return word[:-2] + 'es'
  1597. for lastlet, d, numend, post in (
  1598. ('h', pl_sb_U_ch_chs_bysize, None, 's'),
  1599. ('x', pl_sb_U_ex_ices_bysize, -2, 'ices'),
  1600. ('x', pl_sb_U_ix_ices_bysize, -2, 'ices'),
  1601. ('m', pl_sb_U_um_a_bysize, -2, 'a'),
  1602. ('s', pl_sb_U_us_i_bysize, -2, 'i'),
  1603. ('n', pl_sb_U_on_a_bysize, -2, 'a'),
  1604. ('a', pl_sb_U_a_ae_bysize, None, 'e'),
  1605. ):
  1606. if lowerword[-1] == lastlet: # this test to add speed
  1607. for k, v in d.items():
  1608. if lowerword[-k:] in v:
  1609. return word[:numend] + post
  1610. # HANDLE INCOMPLETELY ASSIMILATED IMPORTS
  1611. if (self.classical_dict['ancient']):
  1612. if lowerword[-4:] == 'trix':
  1613. return word[:-1] + 'ces'
  1614. if lowerword[-3:] in ('eau', 'ieu'):
  1615. return word + 'x'
  1616. if lowerword[-3:] in ('ynx', 'inx', 'anx') and len(word) > 4:
  1617. return word[:-1] + 'ges'
  1618. for lastlet, d, numend, post in (
  1619. ('n', pl_sb_C_en_ina_bysize, -2, 'ina'),
  1620. ('x', pl_sb_C_ex_ices_bysize, -2, 'ices'),
  1621. ('x', pl_sb_C_ix_ices_bysize, -2, 'ices'),
  1622. ('m', pl_sb_C_um_a_bysize, -2, 'a'),
  1623. ('s', pl_sb_C_us_i_bysize, -2, 'i'),
  1624. ('s', pl_sb_C_us_us_bysize, None, ''),
  1625. ('a', pl_sb_C_a_ae_bysize, None, 'e'),
  1626. ('a', pl_sb_C_a_ata_bysize, None, 'ta'),
  1627. ('s', pl_sb_C_is_ides_bysize, -1, 'des'),
  1628. ('o', pl_sb_C_o_i_bysize, -1, 'i'),
  1629. ('n', pl_sb_C_on_a_bysize, -2, 'a'),
  1630. ):
  1631. if lowerword[-1] == lastlet: # this test to add speed
  1632. for k, v in d.items():
  1633. if lowerword[-k:] in v:
  1634. return word[:numend] + post
  1635. for d, numend, post in (
  1636. (pl_sb_C_i_bysize, None, 'i'),
  1637. (pl_sb_C_im_bysize, None, 'im'),
  1638. ):
  1639. for k, v in d.items():
  1640. if lowerword[-k:] in v:
  1641. return word[:numend] + post
1642. # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SIBILANTS
  1643. if lowerword in pl_sb_singular_s_complete:
  1644. return word + 'es'
  1645. for k, v in pl_sb_singular_s_bysize.items():
  1646. if lowerword[-k:] in v:
  1647. return word + 'es'
  1648. if lowerword[-2:] == 'es' and word[0] == word[0].upper():
  1649. return word + 'es'
  1650. # Wouldn't special words
  1651. # ending with 's' always have been caught, regardless of them starting
  1652. # with a capital letter (i.e. being names)
  1653. # It makes sense below to do this for words ending in 'y' so that
  1654. # Sally -> Sallys. But not sure it makes sense here. Where is the case
  1655. # of a word ending in s that is caught here and would otherwise have been
  1656. # caught below?
  1657. #
  1658. # removing it as I can't find a case that executes it
  1659. # TODO: check this again
  1660. #
  1661. # if (self.classical_dict['names']):
  1662. # mo = search(r"([A-Z].*s)$", word)
  1663. # if mo:
  1664. # return "%ses" % mo.group(1)
  1665. if lowerword[-1] == 'z':
  1666. for k, v in pl_sb_z_zes_bysize.items():
  1667. if lowerword[-k:] in v:
  1668. return word + 'es'
  1669. if lowerword[-2:-1] != 'z':
  1670. return word + 'zes'
  1671. if lowerword[-2:] == 'ze':
  1672. for k, v in pl_sb_ze_zes_bysize.items():
  1673. if lowerword[-k:] in v:
  1674. return word + 's'
  1675. if lowerword[-2:] in ('ch', 'sh', 'zz', 'ss') or lowerword[-1] == 'x':
  1676. return word + 'es'
  1677. # ## (r"(.*)(us)$", "%s%ses"), TODO: why is this commented?
  1678. # HANDLE ...f -> ...ves
  1679. if lowerword[-3:] in ('elf', 'alf', 'olf'):
  1680. return word[:-1] + 'ves'
  1681. if lowerword[-3:] == 'eaf' and lowerword[-4:-3] != 'd':
  1682. return word[:-1] + 'ves'
  1683. if lowerword[-4:] in ('nife', 'life', 'wife'):
  1684. return word[:-2] + 'ves'
  1685. if lowerword[-3:] == 'arf':
  1686. return word[:-1] + 'ves'
  1687. # HANDLE ...y
  1688. if lowerword[-1] == 'y':
  1689. if lowerword[-2:-1] in 'aeiou' or len(word) == 1:
  1690. return word + 's'
  1691. if (self.classical_dict['names']):
  1692. if lowerword[-1] == 'y' and word[0] == word[0].upper():
  1693. return word + 's'
  1694. return word[:-1] + 'ies'
  1695. # HANDLE ...o
  1696. if lowerword in pl_sb_U_o_os_complete:
  1697. return word + 's'
  1698. for k, v in pl_sb_U_o_os_bysize.items():
  1699. if lowerword[-k:] in v:
  1700. return word + 's'
  1701. if lowerword[-2:] in ('ao', 'eo', 'io', 'oo', 'uo'):
  1702. return word + 's'
  1703. if lowerword[-1] == 'o':
  1704. return word + 'es'
  1705. # OTHERWISE JUST ADD ...s
  1706. return "%ss" % word
  1707. def _pl_special_verb(self, word, count=None):
  1708. if (self.classical_dict['zero'] and
  1709. str(count).lower() in pl_count_zero):
  1710. return False
  1711. count = self.get_count(count)
  1712. if count == 1:
  1713. return word
  1714. # HANDLE USER-DEFINED VERBS
  1715. value = self.ud_match(word, self.pl_v_user_defined)
  1716. if value is not None:
  1717. return value
  1718. # HANDLE IRREGULAR PRESENT TENSE (SIMPLE AND COMPOUND)
  1719. lowerword = word.lower()
  1720. try:
  1721. firstword = lowerword.split()[0]
  1722. except IndexError:
  1723. return False # word is ''
  1724. if firstword in list(plverb_irregular_pres.keys()):
  1725. return "%s%s" % (plverb_irregular_pres[firstword], word[len(firstword):])
  1726. # HANDLE IRREGULAR FUTURE, PRETERITE AND PERFECT TENSES
  1727. if firstword in plverb_irregular_non_pres:
  1728. return word
  1729. # HANDLE PRESENT NEGATIONS (SIMPLE AND COMPOUND)
  1730. if firstword.endswith("n't") and firstword[:-3] in list(plverb_irregular_pres.keys()):
  1731. return "%sn't%s" % (plverb_irregular_pres[firstword[:-3]], word[len(firstword):])
  1732. if firstword.endswith("n't"):
  1733. return word
  1734. # HANDLE SPECIAL CASES
  1735. mo = search(r"^(%s)$" % plverb_special_s, word)
  1736. if mo:
  1737. return False
  1738. if search(r"\s", word):
  1739. return False
  1740. if lowerword == 'quizzes':
  1741. return 'quiz'
  1742. # HANDLE STANDARD 3RD PERSON (CHOP THE ...(e)s OFF SINGLE WORDS)
  1743. if lowerword[-4:] in ('ches', 'shes', 'zzes', 'sses') or \
  1744. lowerword[-3:] == 'xes':
  1745. return word[:-2]
  1746. # # mo = search(r"^(.*)([cs]h|[x]|zz|ss)es$",
  1747. # # word, IGNORECASE)
  1748. # # if mo:
  1749. # # return "%s%s" % (mo.group(1), mo.group(2))
  1750. if lowerword[-3:] == 'ies' and len(word) > 3:
  1751. return lowerword[:-3] + 'y'
  1752. if (lowerword in pl_v_oes_oe or
  1753. lowerword[-4:] in pl_v_oes_oe_endings_size4 or
  1754. lowerword[-5:] in pl_v_oes_oe_endings_size5):
  1755. return word[:-1]
  1756. if lowerword.endswith('oes') and len(word) > 3:
  1757. return lowerword[:-2]
  1758. mo = search(r"^(.*[^s])s$", word, IGNORECASE)
  1759. if mo:
  1760. return mo.group(1)
  1761. # OTHERWISE, A REGULAR VERB (HANDLE ELSEWHERE)
  1762. return False
  1763. def _pl_general_verb(self, word, count=None):
  1764. count = self.get_count(count)
  1765. if count == 1:
  1766. return word
  1767. # HANDLE AMBIGUOUS PRESENT TENSES (SIMPLE AND COMPOUND)
  1768. mo = search(r"^(%s)((\s.*)?)$" % plverb_ambiguous_pres_keys, word, IGNORECASE)
  1769. if mo:
  1770. return "%s%s" % (plverb_ambiguous_pres[mo.group(1).lower()], mo.group(2))
  1771. # HANDLE AMBIGUOUS PRETERITE AND PERFECT TENSES
  1772. mo = search(r"^(%s)((\s.*)?)$" % plverb_ambiguous_non_pres, word, IGNORECASE)
  1773. if mo:
  1774. return word
  1775. # OTHERWISE, 1st OR 2ND PERSON IS UNINFLECTED
  1776. return word
  1777. def _pl_special_adjective(self, word, count=None):
  1778. count = self.get_count(count)
  1779. if count == 1:
  1780. return word
  1781. # HANDLE USER-DEFINED ADJECTIVES
  1782. value = self.ud_match(word, self.pl_adj_user_defined)
  1783. if value is not None:
  1784. return value
  1785. # HANDLE KNOWN CASES
  1786. mo = search(r"^(%s)$" % pl_adj_special_keys,
  1787. word, IGNORECASE)
  1788. if mo:
  1789. return "%s" % (pl_adj_special[mo.group(1).lower()])
  1790. # HANDLE POSSESSIVES
  1791. mo = search(r"^(%s)$" % pl_adj_poss_keys,
  1792. word, IGNORECASE)
  1793. if mo:
  1794. return "%s" % (pl_adj_poss[mo.group(1).lower()])
  1795. mo = search(r"^(.*)'s?$",
  1796. word)
  1797. if mo:
  1798. pl = self.plural_noun(mo.group(1))
  1799. trailing_s = "" if pl[-1] == 's' else "s"
  1800. return "%s'%s" % (pl, trailing_s)
  1801. # OTHERWISE, NO IDEA
  1802. return False
  1803. # @profile
  1804. def _sinoun(self, word, count=None, gender=None):
  1805. count = self.get_count(count)
  1806. # DEFAULT TO PLURAL
  1807. if count == 2:
  1808. return word
  1809. # SET THE GENDER
  1810. try:
  1811. if gender is None:
  1812. gender = self.thegender
  1813. elif gender not in singular_pronoun_genders:
  1814. raise BadGenderError
  1815. except (TypeError, IndexError):
  1816. raise BadGenderError
  1817. # HANDLE USER-DEFINED NOUNS
  1818. value = self.ud_match(word, self.si_sb_user_defined)
  1819. if value is not None:
  1820. return value
  1821. # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS
  1822. if word == '':
  1823. return word
  1824. lowerword = word.lower()
  1825. if word in si_sb_ois_oi_case:
  1826. return word[:-1]
  1827. if lowerword in pl_sb_uninflected_complete:
  1828. return word
  1829. if word in pl_sb_uninflected_caps:
  1830. return word
  1831. for k, v in pl_sb_uninflected_bysize.items():
  1832. if lowerword[-k:] in v:
  1833. return word
  1834. if (self.classical_dict['herd'] and lowerword in pl_sb_uninflected_herd):
  1835. return word
  1836. # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.)
  1837. mo = search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, IGNORECASE)
  1838. if mo and mo.group(2) != '':
  1839. return "%s%s" % (self._sinoun(mo.group(1), 1, gender=gender), mo.group(2))
  1840. # how to reverse this one?
  1841. # mo = search(r"^(?:%s)$" % pl_sb_prep_dual_compound, word, IGNORECASE)
  1842. # if mo and mo.group(2) != '' and mo.group(3) != '':
  1843. # return "%s%s%s" % (self._sinoun(mo.group(1), 1),
  1844. # mo.group(2),
  1845. # self._sinoun(mo.group(3), 1))
  1846. lowersplit = lowerword.split(' ')
  1847. if len(lowersplit) >= 3:
  1848. for numword in range(1, len(lowersplit) - 1):
  1849. if lowersplit[numword] in pl_prep_list_da:
  1850. return ' '.join(lowersplit[:numword - 1] +
  1851. [self._sinoun(lowersplit[numword - 1], 1, gender=gender) or
  1852. lowersplit[numword - 1]] + lowersplit[numword:])
  1853. lowersplit = lowerword.split('-')
  1854. if len(lowersplit) >= 3:
  1855. for numword in range(1, len(lowersplit) - 1):
  1856. if lowersplit[numword] in pl_prep_list_da:
  1857. return ' '.join(
  1858. lowersplit[:numword - 1] +
  1859. [(self._sinoun(lowersplit[numword - 1], 1, gender=gender) or lowersplit[numword - 1]) +
  1860. '-' + lowersplit[numword] + '-']) + ' '.join(lowersplit[(numword + 1):])
  1861. # HANDLE PRONOUNS
  1862. for k, v in si_pron_acc_keys_bysize.items():
1863. if lowerword[-k:] in v: # ends with accusative pronoun
  1864. for pk, pv in pl_prep_bysize.items():
  1865. if lowerword[:pk] in pv: # starts with a prep
  1866. if lowerword.split() == [lowerword[:pk], lowerword[-k:]]: # only whitespace in between
  1867. return lowerword[:-k] + get_si_pron('acc', lowerword[-k:], gender)
  1868. try:
  1869. return get_si_pron('nom', word.lower(), gender)
  1870. except KeyError:
  1871. pass
  1872. try:
  1873. return get_si_pron('acc', word.lower(), gender)
  1874. except KeyError:
  1875. pass
  1876. # HANDLE ISOLATED IRREGULAR PLURALS
  1877. wordsplit = word.split()
  1878. wordlast = wordsplit[-1]
  1879. lowerwordlast = wordlast.lower()
  1880. if wordlast in list(si_sb_irregular_caps.keys()):
  1881. llen = len(wordlast)
  1882. return '%s%s' % (word[:-llen],
  1883. si_sb_irregular_caps[wordlast])
  1884. if lowerwordlast in list(si_sb_irregular.keys()):
  1885. llen = len(lowerwordlast)
  1886. return '%s%s' % (word[:-llen],
  1887. si_sb_irregular[lowerwordlast])
  1888. if (' '.join(wordsplit[-2:])).lower() in list(si_sb_irregular_compound.keys()):
  1889. llen = len(' '.join(wordsplit[-2:])) # TODO: what if 2 spaces between these words?
  1890. return '%s%s' % (word[:-llen],
  1891. si_sb_irregular_compound[(' '.join(wordsplit[-2:])).lower()])
  1892. if lowerword[-5:] == 'quies':
  1893. return word[:-3] + 'y'
  1894. if lowerword[-7:] == 'persons':
  1895. return word[:-1]
  1896. if lowerword[-6:] == 'people':
  1897. return word[:-4] + 'rson'
  1898. # HANDLE FAMILIES OF IRREGULAR PLURALS
  1899. if lowerword[-4:] == 'mans':
  1900. for k, v in si_sb_U_man_mans_bysize.items():
  1901. if lowerword[-k:] in v:
  1902. return word[:-1]
  1903. for k, v in si_sb_U_man_mans_caps_bysize.items():
  1904. if word[-k:] in v:
  1905. return word[:-1]
  1906. if lowerword[-3:] == 'men':
  1907. return word[:-3] + 'man'
  1908. if lowerword[-4:] == 'mice':
  1909. return word[:-4] + 'mouse'
  1910. if lowerword[-4:] == 'lice':
  1911. return word[:-4] + 'louse'
  1912. if lowerword[-5:] == 'geese':
  1913. return word[:-5] + 'goose'
  1914. if lowerword[-5:] == 'teeth':
  1915. return word[:-5] + 'tooth'
  1916. if lowerword[-4:] == 'feet':
  1917. return word[:-4] + 'foot'
  1918. if lowerword == 'dice':
  1919. return 'die'
  1920. # HANDLE UNASSIMILATED IMPORTS
  1921. if lowerword[-4:] == 'ceps':
  1922. return word
  1923. if lowerword[-3:] == 'zoa':
  1924. return word[:-1] + 'on'
  1925. for lastlet, d, numend, post in (
  1926. ('s', si_sb_U_ch_chs_bysize, -1, ''),
  1927. ('s', si_sb_U_ex_ices_bysize, -4, 'ex'),
  1928. ('s', si_sb_U_ix_ices_bysize, -4, 'ix'),
  1929. ('a', si_sb_U_um_a_bysize, -1, 'um'),
  1930. ('i', si_sb_U_us_i_bysize, -1, 'us'),
  1931. ('a', si_sb_U_on_a_bysize, -1, 'on'),
  1932. ('e', si_sb_U_a_ae_bysize, -1, ''),
  1933. ):
  1934. if lowerword[-1] == lastlet: # this test to add speed
  1935. for k, v in d.items():
  1936. if lowerword[-k:] in v:
  1937. return word[:numend] + post
  1938. # HANDLE INCOMPLETELY ASSIMILATED IMPORTS
  1939. if (self.classical_dict['ancient']):
  1940. if lowerword[-6:] == 'trices':
  1941. return word[:-3] + 'x'
  1942. if lowerword[-4:] in ('eaux', 'ieux'):
  1943. return word[:-1]
  1944. if lowerword[-5:] in ('ynges', 'inges', 'anges') and len(word) > 6:
  1945. return word[:-3] + 'x'
  1946. for lastlet, d, numend, post in (
  1947. ('a', si_sb_C_en_ina_bysize, -3, 'en'),
  1948. ('s', si_sb_C_ex_ices_bysize, -4, 'ex'),
  1949. ('s', si_sb_C_ix_ices_bysize, -4, 'ix'),
  1950. ('a', si_sb_C_um_a_bysize, -1, 'um'),
  1951. ('i', si_sb_C_us_i_bysize, -1, 'us'),
  1952. ('s', pl_sb_C_us_us_bysize, None, ''),
  1953. ('e', si_sb_C_a_ae_bysize, -1, ''),
  1954. ('a', si_sb_C_a_ata_bysize, -2, ''),
  1955. ('s', si_sb_C_is_ides_bysize, -3, 's'),
  1956. ('i', si_sb_C_o_i_bysize, -1, 'o'),
  1957. ('a', si_sb_C_on_a_bysize, -1, 'on'),
  1958. ('m', si_sb_C_im_bysize, -2, ''),
  1959. ('i', si_sb_C_i_bysize, -1, ''),
  1960. ):
  1961. if lowerword[-1] == lastlet: # this test to add speed
  1962. for k, v in d.items():
  1963. if lowerword[-k:] in v:
  1964. return word[:numend] + post
1965. # HANDLE PLURALS ENDING IN uses -> use
  1966. if (lowerword[-6:] == 'houses' or
  1967. word in si_sb_uses_use_case or
  1968. lowerword in si_sb_uses_use):
  1969. return word[:-1]
1970. # HANDLE PLURALS ENDING IN ies -> ie
  1971. if word in si_sb_ies_ie_case or lowerword in si_sb_ies_ie:
  1972. return word[:-1]
1973. # HANDLE PLURALS ENDING IN oes -> oe
  1974. if (lowerword[-5:] == 'shoes' or
  1975. word in si_sb_oes_oe_case or
  1976. lowerword in si_sb_oes_oe):
  1977. return word[:-1]
1978. # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SIBILANTS
  1979. if (word in si_sb_sses_sse_case or
  1980. lowerword in si_sb_sses_sse):
  1981. return word[:-1]
  1982. if lowerword in si_sb_singular_s_complete:
  1983. return word[:-2]
  1984. for k, v in si_sb_singular_s_bysize.items():
  1985. if lowerword[-k:] in v:
  1986. return word[:-2]
  1987. if lowerword[-4:] == 'eses' and word[0] == word[0].upper():
  1988. return word[:-2]
  1989. # Wouldn't special words
  1990. # ending with 's' always have been caught, regardless of them starting
  1991. # with a capital letter (i.e. being names)
  1992. # It makes sense below to do this for words ending in 'y' so that
  1993. # Sally -> Sallys. But not sure it makes sense here. Where is the case
  1994. # of a word ending in s that is caught here and would otherwise have been
  1995. # caught below?
  1996. #
  1997. # removing it as I can't find a case that executes it
  1998. # TODO: check this again
  1999. #
  2000. # if (self.classical_dict['names']):
  2001. # mo = search(r"([A-Z].*ses)$", word)
  2002. # if mo:
  2003. # return "%s" % mo.group(1)
  2004. if lowerword in si_sb_z_zes:
  2005. return word[:-2]
  2006. if lowerword in si_sb_zzes_zz:
  2007. return word[:-2]
  2008. if lowerword[-4:] == 'zzes':
  2009. return word[:-3]
  2010. if (word in si_sb_ches_che_case or
  2011. lowerword in si_sb_ches_che):
  2012. return word[:-1]
  2013. if lowerword[-4:] in ('ches', 'shes'):
  2014. return word[:-2]
  2015. if lowerword in si_sb_xes_xe:
  2016. return word[:-1]
  2017. if lowerword[-3:] == 'xes':
  2018. return word[:-2]
  2019. # (r"(.*)(us)es$", "%s%s"), TODO: why is this commented?
  2020. # HANDLE ...f -> ...ves
  2021. if (word in si_sb_ves_ve_case or
  2022. lowerword in si_sb_ves_ve):
  2023. return word[:-1]
  2024. if lowerword[-3:] == 'ves':
  2025. if lowerword[-5:-3] in ('el', 'al', 'ol'):
  2026. return word[:-3] + 'f'
  2027. if lowerword[-5:-3] == 'ea' and word[-6:-5] != 'd':
  2028. return word[:-3] + 'f'
  2029. if lowerword[-5:-3] in ('ni', 'li', 'wi'):
  2030. return word[:-3] + 'fe'
  2031. if lowerword[-5:-3] == 'ar':
  2032. return word[:-3] + 'f'
  2033. # HANDLE ...y
  2034. if lowerword[-2:] == 'ys':
  2035. if len(lowerword) > 2 and lowerword[-3] in 'aeiou':
  2036. return word[:-1]
  2037. if (self.classical_dict['names']):
  2038. if lowerword[-2:] == 'ys' and word[0] == word[0].upper():
  2039. return word[:-1]
  2040. if lowerword[-3:] == 'ies':
  2041. return word[:-3] + 'y'
  2042. # HANDLE ...o
  2043. if lowerword[-2:] == 'os':
  2044. if lowerword in si_sb_U_o_os_complete:
  2045. return word[:-1]
  2046. for k, v in si_sb_U_o_os_bysize.items():
  2047. if lowerword[-k:] in v:
  2048. return word[:-1]
  2049. if lowerword[-3:] in ('aos', 'eos', 'ios', 'oos', 'uos'):
  2050. return word[:-1]
  2051. if lowerword[-3:] == 'oes':
  2052. return word[:-2]
  2053. # UNASSIMILATED IMPORTS FINAL RULE
  2054. if word in si_sb_es_is:
  2055. return word[:-2] + 'is'
  2056. # OTHERWISE JUST REMOVE ...s
  2057. if lowerword[-1] == 's':
  2058. return word[:-1]
  2059. # COULD NOT FIND SINGULAR
  2060. return False
  2061. # ADJECTIVES
  2062. def a(self, text, count=1):
  2063. '''
  2064. Return the appropriate indefinite article followed by text.
  2065. The indefinite article is either 'a' or 'an'.
  2066. If count is not one, then return count followed by text
  2067. instead of 'a' or 'an'.
  2068. Whitespace at the start and end is preserved.
  2069. '''
  2070. mo = search(r"\A(\s*)(?:an?\s+)?(.+?)(\s*)\Z",
  2071. text, IGNORECASE)
  2072. if mo:
  2073. word = mo.group(2)
  2074. if not word:
  2075. return text
  2076. pre = mo.group(1)
  2077. post = mo.group(3)
  2078. result = self._indef_article(word, count)
  2079. return "%s%s%s" % (pre, result, post)
  2080. return ''
  2081. an = a
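# Illustrative usage sketch (assumes `p = inflect.engine()`); an() is simply an
# alias for a():
#
#     p.a('apple')      # -> 'an apple'
#     p.a('cat')        # -> 'a cat'
#     p.an('hour')      # -> 'an hour'
#     p.a('cat', 3)     # -> '3 cat'  (a count other than one is just prefixed)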
  2082. def _indef_article(self, word, count):
  2083. mycount = self.get_count(count)
  2084. if mycount != 1:
  2085. return "%s %s" % (count, word)
  2086. # HANDLE USER-DEFINED VARIANTS
  2087. value = self.ud_match(word, self.A_a_user_defined)
  2088. if value is not None:
  2089. return "%s %s" % (value, word)
  2090. # HANDLE ORDINAL FORMS
  2091. for a in (
  2092. (r"^(%s)" % A_ordinal_a, "a"),
  2093. (r"^(%s)" % A_ordinal_an, "an"),
  2094. ):
  2095. mo = search(a[0], word, IGNORECASE)
  2096. if mo:
  2097. return "%s %s" % (a[1], word)
  2098. # HANDLE SPECIAL CASES
  2099. for a in (
  2100. (r"^(%s)" % A_explicit_an, "an"),
  2101. (r"^[aefhilmnorsx]$", "an"),
  2102. (r"^[bcdgjkpqtuvwyz]$", "a"),
  2103. ):
  2104. mo = search(a[0], word, IGNORECASE)
  2105. if mo:
  2106. return "%s %s" % (a[1], word)
  2107. # HANDLE ABBREVIATIONS
  2108. for a in (
  2109. (r"(%s)" % A_abbrev, "an", VERBOSE),
  2110. (r"^[aefhilmnorsx][.-]", "an", IGNORECASE),
  2111. (r"^[a-z][.-]", "a", IGNORECASE),
  2112. ):
  2113. mo = search(a[0], word, a[2])
  2114. if mo:
  2115. return "%s %s" % (a[1], word)
  2116. # HANDLE CONSONANTS
  2117. mo = search(r"^[^aeiouy]", word, IGNORECASE)
  2118. if mo:
  2119. return "a %s" % word
  2120. # HANDLE SPECIAL VOWEL-FORMS
  2121. for a in (
  2122. (r"^e[uw]", "a"),
  2123. (r"^onc?e\b", "a"),
  2124. (r"^onetime\b", "a"),
  2125. (r"^uni([^nmd]|mo)", "a"),
  2126. (r"^u[bcfghjkqrst][aeiou]", "a"),
  2127. (r"^ukr", "a"),
  2128. (r"^(%s)" % A_explicit_a, "a"),
  2129. ):
  2130. mo = search(a[0], word, IGNORECASE)
  2131. if mo:
  2132. return "%s %s" % (a[1], word)
  2133. # HANDLE SPECIAL CAPITALS
  2134. mo = search(r"^U[NK][AIEO]?", word)
  2135. if mo:
  2136. return "a %s" % word
  2137. # HANDLE VOWELS
  2138. mo = search(r"^[aeiou]", word, IGNORECASE)
  2139. if mo:
  2140. return "an %s" % word
  2141. # HANDLE y... (BEFORE CERTAIN CONSONANTS IMPLIES (UNNATURALIZED) "i.." SOUND)
  2142. mo = search(r"^(%s)" % A_y_cons, word, IGNORECASE)
  2143. if mo:
  2144. return "an %s" % word
  2145. # OTHERWISE, GUESS "a"
  2146. return "a %s" % word
  2147. # 2. TRANSLATE ZERO-QUANTIFIED $word TO "no plural($word)"
  2148. def no(self, text, count=None):
  2149. '''
  2150. If count is 0, no, zero or nil, return 'no' followed by the plural
  2151. of text.
  2152. If count is one of:
  2153. 1, a, an, one, each, every, this, that
  2154. return count followed by text.
2155. Otherwise return count followed by the plural of text.
  2156. In the return value count is always followed by a space.
  2157. Whitespace at the start and end is preserved.
  2158. '''
  2159. if count is None and self.persistent_count is not None:
  2160. count = self.persistent_count
  2161. if count is None:
  2162. count = 0
  2163. mo = search(r"\A(\s*)(.+?)(\s*)\Z", text)
  2164. pre = mo.group(1)
  2165. word = mo.group(2)
  2166. post = mo.group(3)
  2167. if str(count).lower() in pl_count_zero:
  2168. return "%sno %s%s" % (pre, self.plural(word, 0), post)
  2169. else:
  2170. return "%s%s %s%s" % (pre, count, self.plural(word, count), post)
  2171. # PARTICIPLES
  2172. def present_participle(self, word):
  2173. '''
  2174. Return the present participle for word.
  2175. word is the 3rd person singular verb.
  2176. '''
  2177. plv = self.plural_verb(word, 2)
  2178. for pat, repl in (
  2179. (r"ie$", r"y"),
  2180. (r"ue$", r"u"), # TODO: isn't ue$ -> u encompassed in the following rule?
  2181. (r"([auy])e$", r"\g<1>"),
  2182. (r"ski$", r"ski"),
  2183. (r"[^b]i$", r""),
  2184. (r"^(are|were)$", r"be"),
  2185. (r"^(had)$", r"hav"),
  2186. (r"^(hoe)$", r"\g<1>"),
  2187. (r"([^e])e$", r"\g<1>"),
  2188. (r"er$", r"er"),
  2189. (r"([^aeiou][aeiouy]([bdgmnprst]))$", "\g<1>\g<2>"),
  2190. ):
  2191. (ans, num) = subn(pat, repl, plv)
  2192. if num:
  2193. return "%sing" % ans
  2194. return "%sing" % ans
  2195. # NUMERICAL INFLECTIONS
  2196. def ordinal(self, num):
  2197. '''
  2198. Return the ordinal of num.
  2199. num can be an integer or text
  2200. e.g. ordinal(1) returns '1st'
  2201. ordinal('one') returns 'first'
  2202. '''
  2203. if match(r"\d", str(num)):
  2204. try:
  2205. num % 2
  2206. n = num
  2207. except TypeError:
  2208. if '.' in str(num):
  2209. try:
  2210. n = int(num[-1]) # numbers after decimal, so only need last one for ordinal
  2211. except ValueError: # ends with '.', so need to use whole string
  2212. n = int(num[:-1])
  2213. else:
  2214. n = int(num)
  2215. try:
  2216. post = nth[n % 100]
  2217. except KeyError:
  2218. post = nth[n % 10]
  2219. return "%s%s" % (num, post)
  2220. else:
  2221. mo = search(r"(%s)\Z" % ordinal_suff, num)
  2222. try:
  2223. post = ordinal[mo.group(1)]
  2224. return resub(r"(%s)\Z" % ordinal_suff, post, num)
  2225. except AttributeError:
  2226. return "%sth" % num
  2227. def millfn(self, ind=0):
  2228. if ind > len(mill) - 1:
  2229. print3("number out of range")
  2230. raise NumOutOfRangeError
  2231. return mill[ind]
  2232. def unitfn(self, units, mindex=0):
  2233. return "%s%s" % (unit[units], self.millfn(mindex))
  2234. def tenfn(self, tens, units, mindex=0):
  2235. if tens != 1:
  2236. return "%s%s%s%s" % (ten[tens],
  2237. '-' if tens and units else '',
  2238. unit[units],
  2239. self.millfn(mindex))
  2240. return "%s%s" % (teen[units], mill[mindex])
  2241. def hundfn(self, hundreds, tens, units, mindex):
  2242. if hundreds:
  2243. return "%s hundred%s%s%s, " % (unit[hundreds], # use unit not unitfn as simpler
  2244. " %s " % self.number_args['andword'] if tens or units else '',
  2245. self.tenfn(tens, units),
  2246. self.millfn(mindex))
  2247. if tens or units:
  2248. return "%s%s, " % (self.tenfn(tens, units), self.millfn(mindex))
  2249. return ''
  2250. def group1sub(self, mo):
  2251. units = int(mo.group(1))
  2252. if units == 1:
  2253. return " %s, " % self.number_args['one']
  2254. elif units:
  2255. # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl
  2256. return "%s, " % unit[units]
  2257. else:
  2258. return " %s, " % self.number_args['zero']
  2259. def group1bsub(self, mo):
  2260. units = int(mo.group(1))
  2261. if units:
  2262. # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl
  2263. return "%s, " % unit[units]
  2264. else:
  2265. return " %s, " % self.number_args['zero']
  2266. def group2sub(self, mo):
  2267. tens = int(mo.group(1))
  2268. units = int(mo.group(2))
  2269. if tens:
  2270. return "%s, " % self.tenfn(tens, units)
  2271. if units:
  2272. return " %s %s, " % (self.number_args['zero'], unit[units])
  2273. return " %s %s, " % (self.number_args['zero'], self.number_args['zero'])
  2274. def group3sub(self, mo):
  2275. hundreds = int(mo.group(1))
  2276. tens = int(mo.group(2))
  2277. units = int(mo.group(3))
  2278. if hundreds == 1:
  2279. hunword = " %s" % self.number_args['one']
  2280. elif hundreds:
  2281. hunword = "%s" % unit[hundreds]
  2282. # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl
  2283. else:
  2284. hunword = " %s" % self.number_args['zero']
  2285. if tens:
  2286. tenword = self.tenfn(tens, units)
  2287. elif units:
  2288. tenword = " %s %s" % (self.number_args['zero'], unit[units])
  2289. else:
  2290. tenword = " %s %s" % (self.number_args['zero'], self.number_args['zero'])
  2291. return "%s %s, " % (hunword, tenword)
  2292. def hundsub(self, mo):
  2293. ret = self.hundfn(int(mo.group(1)), int(mo.group(2)), int(mo.group(3)), self.mill_count)
  2294. self.mill_count += 1
  2295. return ret
  2296. def tensub(self, mo):
  2297. return "%s, " % self.tenfn(int(mo.group(1)), int(mo.group(2)), self.mill_count)
  2298. def unitsub(self, mo):
  2299. return "%s, " % self.unitfn(int(mo.group(1)), self.mill_count)
  2300. def enword(self, num, group):
  2301. # import pdb
  2302. # pdb.set_trace()
  2303. if group == 1:
  2304. num = resub(r"(\d)", self.group1sub, num)
  2305. elif group == 2:
  2306. num = resub(r"(\d)(\d)", self.group2sub, num)
  2307. num = resub(r"(\d)", self.group1bsub, num, 1)
  2308. # group1bsub same as
  2309. # group1sub except it doesn't use the default word for one.
2310. # Is this required? i.e. is the default word not to be used when
  2311. # grouping in pairs?
  2312. #
  2313. # No. This is a bug. Fixed. TODO: report upstream.
  2314. elif group == 3:
  2315. num = resub(r"(\d)(\d)(\d)", self.group3sub, num)
  2316. num = resub(r"(\d)(\d)", self.group2sub, num, 1)
  2317. num = resub(r"(\d)", self.group1sub, num, 1)
  2318. elif int(num) == 0:
  2319. num = self.number_args['zero']
  2320. elif int(num) == 1:
  2321. num = self.number_args['one']
  2322. else:
  2323. num = num.lstrip().lstrip('0')
  2324. self.mill_count = 0
  2325. # surely there's a better way to do the next bit
  2326. mo = search(r"(\d)(\d)(\d)(?=\D*\Z)", num)
  2327. while mo:
  2328. num = resub(r"(\d)(\d)(\d)(?=\D*\Z)", self.hundsub, num, 1)
  2329. mo = search(r"(\d)(\d)(\d)(?=\D*\Z)", num)
  2330. num = resub(r"(\d)(\d)(?=\D*\Z)", self.tensub, num, 1)
  2331. num = resub(r"(\d)(?=\D*\Z)", self.unitsub, num, 1)
  2332. return num
  2333. def blankfn(self, mo):
  2334. ''' do a global blank replace
  2335. TODO: surely this can be done with an option to resub
  2336. rather than this fn
  2337. '''
  2338. return ''
  2339. def commafn(self, mo):
  2340. ''' do a global ',' replace
  2341. TODO: surely this can be done with an option to resub
  2342. rather than this fn
  2343. '''
  2344. return ','
  2345. def spacefn(self, mo):
  2346. ''' do a global ' ' replace
  2347. TODO: surely this can be done with an option to resub
  2348. rather than this fn
  2349. '''
  2350. return ' '
  2351. def number_to_words(self, num, wantlist=False,
  2352. group=0, comma=',', andword='and',
  2353. zero='zero', one='one', decimal='point',
  2354. threshold=None):
  2355. '''
  2356. Return a number in words.
  2357. group = 1, 2 or 3 to group numbers before turning into words
  2358. comma: define comma
  2359. andword: word for 'and'. Can be set to ''.
  2360. e.g. "one hundred and one" vs "one hundred one"
  2361. zero: word for '0'
  2362. one: word for '1'
  2363. decimal: word for decimal point
  2364. threshold: numbers above threshold not turned into words
  2365. parameters not remembered from last call. Departure from Perl version.
  2366. '''
  2367. self.number_args = dict(andword=andword, zero=zero, one=one)
  2368. num = '%s' % num
  2369. # Handle "stylistic" conversions (up to a given threshold)...
  2370. if (threshold is not None and float(num) > threshold):
  2371. spnum = num.split('.', 1)
  2372. while (comma):
  2373. (spnum[0], n) = subn(r"(\d)(\d{3}(?:,|\Z))", r"\1,\2", spnum[0])
  2374. if n == 0:
  2375. break
  2376. try:
  2377. return "%s.%s" % (spnum[0], spnum[1])
  2378. except IndexError:
  2379. return "%s" % spnum[0]
  2380. if group < 0 or group > 3:
  2381. raise BadChunkingOptionError
  2382. nowhite = num.lstrip()
  2383. if nowhite[0] == '+':
  2384. sign = "plus"
  2385. elif nowhite[0] == '-':
  2386. sign = "minus"
  2387. else:
  2388. sign = ""
  2389. myord = (num[-2:] in ('st', 'nd', 'rd', 'th'))
  2390. if myord:
  2391. num = num[:-2]
  2392. finalpoint = False
  2393. if decimal:
  2394. if group != 0:
  2395. chunks = num.split('.')
  2396. else:
  2397. chunks = num.split('.', 1)
  2398. if chunks[-1] == '': # remove blank string if nothing after decimal
  2399. chunks = chunks[:-1]
  2400. finalpoint = True # add 'point' to end of output
  2401. else:
  2402. chunks = [num]
  2403. first = 1
  2404. loopstart = 0
  2405. if chunks[0] == '':
  2406. first = 0
  2407. if len(chunks) > 1:
  2408. loopstart = 1
  2409. for i in range(loopstart, len(chunks)):
  2410. chunk = chunks[i]
  2411. # remove all non numeric \D
  2412. chunk = resub(r"\D", self.blankfn, chunk)
  2413. if chunk == "":
  2414. chunk = "0"
  2415. if group == 0 and (first == 0 or first == ''):
  2416. chunk = self.enword(chunk, 1)
  2417. else:
  2418. chunk = self.enword(chunk, group)
  2419. if chunk[-2:] == ', ':
  2420. chunk = chunk[:-2]
  2421. chunk = resub(r"\s+,", self.commafn, chunk)
  2422. if group == 0 and first:
  2423. chunk = resub(r", (\S+)\s+\Z", " %s \\1" % andword, chunk)
  2424. chunk = resub(r"\s+", self.spacefn, chunk)
  2425. # chunk = resub(r"(\A\s|\s\Z)", self.blankfn, chunk)
  2426. chunk = chunk.strip()
  2427. if first:
  2428. first = ''
  2429. chunks[i] = chunk
  2430. numchunks = []
  2431. if first != 0:
  2432. numchunks = chunks[0].split("%s " % comma)
  2433. if myord and numchunks:
  2434. # TODO: can this be just one re as it is in perl?
  2435. mo = search(r"(%s)\Z" % ordinal_suff, numchunks[-1])
  2436. if mo:
  2437. numchunks[-1] = resub(r"(%s)\Z" % ordinal_suff, ordinal[mo.group(1)],
  2438. numchunks[-1])
  2439. else:
  2440. numchunks[-1] += 'th'
  2441. for chunk in chunks[1:]:
  2442. numchunks.append(decimal)
  2443. numchunks.extend(chunk.split("%s " % comma))
  2444. if finalpoint:
  2445. numchunks.append(decimal)
2446. # wantlist: Perl list context. can explicitly specify in Python
  2447. if wantlist:
  2448. if sign:
  2449. numchunks = [sign] + numchunks
  2450. return numchunks
  2451. elif group:
  2452. signout = "%s " % sign if sign else ''
  2453. return "%s%s" % (signout, ", ".join(numchunks))
  2454. else:
  2455. signout = "%s " % sign if sign else ''
  2456. num = "%s%s" % (signout, numchunks.pop(0))
  2457. if decimal is None:
  2458. first = True
  2459. else:
  2460. first = not num.endswith(decimal)
  2461. for nc in numchunks:
  2462. if nc == decimal:
  2463. num += " %s" % nc
  2464. first = 0
  2465. elif first:
  2466. num += "%s %s" % (comma, nc)
  2467. else:
  2468. num += " %s" % nc
  2469. return num
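# Illustrative usage sketch (assumes `p = inflect.engine()`); results are
# indicative only:
#
#     p.number_to_words(12)        # -> 'twelve'
#     p.number_to_words(1234)      # -> 'one thousand, two hundred and thirty-four'
#     p.number_to_words('21st')    # -> 'twenty-first'
#     p.number_to_words(9, threshold=5)    # -> '9' (left as digits above threshold)
#     p.number_to_words(1234, wantlist=True)
#         # -> ['one thousand', 'two hundred and thirty-four']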
  2470. # Join words with commas and a trailing 'and' (when appropriate)...
  2471. def join(self, words, sep=None, sep_spaced=True,
  2472. final_sep=None, conj='and', conj_spaced=True):
  2473. '''
2474. Join a list of words into a single string.
  2475. e.g. join(['ant', 'bee', 'fly']) returns 'ant, bee, and fly'
  2476. options:
  2477. conj: replacement for 'and'
  2478. sep: separator. default ',', unless ',' is in the list then ';'
  2479. final_sep: final separator. default ',', unless ',' is in the list then ';'
  2480. conj_spaced: boolean. Should conj have spaces around it
  2481. '''
  2482. if not words:
  2483. return ""
  2484. if len(words) == 1:
  2485. return words[0]
  2486. if conj_spaced:
  2487. if conj == '':
  2488. conj = ' '
  2489. else:
  2490. conj = ' %s ' % conj
  2491. if len(words) == 2:
  2492. return "%s%s%s" % (words[0], conj, words[1])
  2493. if sep is None:
  2494. if ',' in ''.join(words):
  2495. sep = ';'
  2496. else:
  2497. sep = ','
  2498. if final_sep is None:
  2499. final_sep = sep
  2500. final_sep = "%s%s" % (final_sep, conj)
  2501. if sep_spaced:
  2502. sep += ' '
  2503. return "%s%s%s" % (sep.join(words[0:-1]), final_sep, words[-1])