crawler.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. import io
  2. import cgi
  3. import pytz
  4. from datetime import datetime, timedelta
  5. from werkzeug.http import parse_date
  6. from flask import escape, json
  7. import requests
  8. from ispformat.validator import validate_isp
  9. from .models import ISP
  10. from . import app
  11. def get_encoding(content_type):
  12. content_type, params = cgi.parse_header(content_type)
  13. if 'charset' in params:
  14. return params['charset'].strip("'\"")
  15. class Crawler(object):
  16. MAX_JSON_SIZE=1*1024*1024
  17. escape=staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x)
  18. def __init__(self):
  19. self.success=False
  20. self.jdict={}
  21. def m(self, msg, evt=None):
  22. if not evt:
  23. return u'%s\n'%msg
  24. else:
  25. return u''
  26. def err(self, msg, *args):
  27. return self.m(u'! %s'%msg, *args)
  28. def warn(self, msg):
  29. return self.m(u'@ %s'%msg)
  30. def info(self, msg):
  31. return self.m(u'\u2013 %s'%msg)
  32. def abort(self, msg):
  33. raise NotImplemented
  34. def color(self, color, msg):
  35. return msg
  36. def bold(self, msg):
  37. return msg
  38. def italics(self, msg):
  39. return msg
  40. def nl(self):
  41. return self.m('')
  42. def format_validation_errors(self, errs):
  43. r=[]
  44. for e in errs:
  45. r.append(u' %s: %s'%('.'.join(list(e.schema_path)[1:]), e.message))
  46. return u'\n'.join(r)
  47. def pre_done_cb(self, *args):
  48. pass
  49. def done_cb(self):
  50. pass
  51. def config(self, name):
  52. return app.config.get('CRAWLER_'+name)
  53. def parse_cache_control(self, _cachectl):
  54. cachectl={}
  55. for cc in _cachectl.split(','):
  56. cc=cc.strip()
  57. if not cc:
  58. continue
  59. cc=cc.split('=')
  60. if cc[0] not in ('max-age', 's-maxage'):
  61. continue
  62. try:
  63. cachectl[cc[0]]=cc[1]
  64. except IndexError:
  65. cachectl[cc[0]]=True
  66. return cachectl
  67. def __call__(self, url):
  68. esc=self.escape
  69. yield self.m('Starting the validation process...')
  70. r=None
  71. try:
  72. yield self.m('* Attempting to retreive %s'%self.bold(url))
  73. r=requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
  74. headers={'User-Agent': 'FFDN DB validator'},
  75. stream=True, timeout=10)
  76. except requests.exceptions.SSLError as e:
  77. yield self.err('Unable to connect, SSL Error: '+self.color('#dd1144', esc(e)))
  78. except requests.exceptions.ConnectionError as e:
  79. yield self.err('Unable to connect: '+self.color('#dd1144', esc(e)))
  80. except requests.exceptions.Timeout as e:
  81. yield self.err('Connection timeout')
  82. except requests.exceptions.TooManyRedirects as e:
  83. yield self.err('Too many redirects')
  84. except requests.exceptions.RequestException as e:
  85. yield self.err('Internal request exception')
  86. # except Exception as e:
  87. # yield self.err('Unexpected request exception')
  88. if r is None:
  89. yield self.abort('Connection could not be established, aborting')
  90. return
  91. yield self.info('Connection established')
  92. yield self.info('Response code: '+self.bold(str(r.status_code)+' '+esc(r.reason)))
  93. try:
  94. r.raise_for_status()
  95. except requests.exceptions.HTTPError as e:
  96. yield self.err('Response code indicates an error')
  97. yield self.abort('Invalid response code')
  98. return
  99. yield self.info('Content type: '+self.bold(esc(r.headers.get('content-type', 'not defined'))))
  100. if not r.headers.get('content-type'):
  101. yield self.error('Content-type '+self.bold('MUST')+' be defined')
  102. yield self.abort('The file must have a proper content-type to continue')
  103. elif r.headers.get('content-type').lower() != 'application/json':
  104. yield self.warn('Content-type '+self.italics('SHOULD')+' be application/json')
  105. encoding=get_encoding(r.headers.get('content-type'))
  106. if not encoding:
  107. yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
  108. yield self.info('Content length: %s'%(self.bold(esc(r.headers.get('content-length', 'not set')))))
  109. cl=r.headers.get('content-length')
  110. if not cl:
  111. yield self.warn('No content-length. Note that we will not process a file whose size exceed 1MiB')
  112. elif int(cl) > self.MAX_JSON_SIZE:
  113. yield self.abort('File too big ! File size must be less then 1MiB')
  114. _cachecontrol=r.headers.get('cache-control')
  115. cachecontrol=self.parse_cache_control(_cachecontrol) if _cachecontrol else None
  116. max_age=None
  117. if cachecontrol:
  118. try:
  119. _maxage=cachecontrol.get('max-age')
  120. _maxage=cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence
  121. max_age=int(_maxage)
  122. except ValueError:
  123. yield self.warn('Invalid max-age '+esc(_maxage))
  124. yield self.info('Cache control: '+self.bold(esc(
  125. ', '.join([k+'='+v if type(v) != bool else k for k, v in cachecontrol.iteritems()]))
  126. ))
  127. _expires=r.headers.get('expires')
  128. expires=parse_date(_expires)
  129. if expires:
  130. _now=r.headers.get('date')
  131. if _now: # use server date when possible
  132. now=parse_date(_now)
  133. else:
  134. now=datetime.utcnow()
  135. if expires > now:
  136. expires=(expires-now).total_seconds()
  137. yield self.info('Expires: '+self.bold(esc(_expires)))
  138. else:
  139. yield self.warn('Invalid Expires header. Expiry date must be in the future.')
  140. expires=None
  141. else:
  142. yield self.warn('Invalid Expires header %r'%esc(_expires))
  143. if not max_age and not expires:
  144. yield self.warn('No valid expiration time provided ! Please provide it either '
  145. 'with a Cache-Control or Expires header.')
  146. max_age=self.config('DEFAULT_CACHE_TIME')
  147. yield self.info('Using default expiration time of %d seconds'%(max_age))
  148. self.jdict_max_age = max_age if max_age else expires
  149. self.jdict_max_age = min(
  150. self.config('MAX_CACHE_TIME'),
  151. max(self.config('MIN_CACHE_TIME'), self.jdict_max_age)
  152. )
  153. yield self.info('Next update will be in %s'%(timedelta(seconds=self.jdict_max_age)))
  154. yield self.info('Reading response into memory...')
  155. b=io.BytesIO()
  156. for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
  157. b.write(d)
  158. if b.tell() > self.MAX_JSON_SIZE:
  159. yield self.abort('File too big ! File size must be less then 1MiB')
  160. return
  161. r._content=b.getvalue()
  162. del b
  163. yield self.info('Successfully read %d bytes'%len(r.content))
  164. yield self.nl()+self.m('* Parsing the JSON file')
  165. if not encoding:
  166. charset=requests.utils.guess_json_utf(r.content)
  167. if not charset:
  168. yield self.err('Unable to guess unicode charset')
  169. yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type')
  170. return
  171. yield self.info('Guessed charset: '+self.bold(charset))
  172. try:
  173. txt=r.content.decode(encoding or charset)
  174. yield self.info('Successfully decoded file as %s'%esc(encoding or charset))
  175. except LookupError as e:
  176. yield self.err('Invalid/unknown charset: %s'%esc(e))
  177. yield self.abort('Charset error, Cannot continue')
  178. return
  179. except UnicodeDecodeError as e:
  180. yield self.err('Unicode decode error: %s'%e)
  181. yield self.abort('Charset error, cannot continue')
  182. return
  183. except Exception:
  184. yield self.abort('Unexpected charset error')
  185. return
  186. jdict=None
  187. try:
  188. jdict=json.loads(txt)
  189. except ValueError as e:
  190. yield self.err('Error while parsing JSON: %s'%esc(e))
  191. except Exception as e:
  192. yield self.err('Unexpected error while parsing JSON: %s'%esc(e))
  193. if not jdict:
  194. yield self.abort('Could not parse JSON')
  195. return
  196. yield self.info('JSON parsed successfully')
  197. yield self.nl()+self.m('* Validating the JSON against the schema')
  198. v=list(validate_isp(jdict))
  199. if v:
  200. yield self.err('Validation errors:')+self.format_validation_errors(v)
  201. yield self.abort('Your JSON file does not follow the schema, please fix it')
  202. return
  203. else:
  204. yield self.info('Done. No errors encountered \o')
  205. ret=self.pre_done_cb(jdict)
  206. if ret:
  207. yield ret
  208. return
  209. yield (self.nl()+self.m('== '+self.color('forestgreen', 'All good ! You can click on Confirm now'))+
  210. self.m(json.dumps({'passed': 1}), 'control'))
  211. self.jdict=jdict
  212. self.success=True
  213. self.done_cb()
  214. class PrettyValidator(Crawler):
  215. def __init__(self, session=None, *args, **kwargs):
  216. super(PrettyValidator, self).__init__(*args, **kwargs)
  217. self.session=session
  218. self.escape=lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x)
  219. def m(self, msg, evt=None):
  220. return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)
  221. def err(self, msg, *args):
  222. return self.m(u'<strong style="color: crimson">!</strong> %s'%msg, *args)
  223. def warn(self, msg):
  224. return self.m(u'<strong style="color: dodgerblue">@</strong> %s'%msg)
  225. def info(self, msg):
  226. return self.m(u'&ndash; %s'%msg)
  227. def abort(self, msg):
  228. return (self.m(u'<br />== <span style="color: crimson">%s</span>'%msg)+
  229. self.m(json.dumps({'closed': 1}), 'control'))
  230. def bold(self, msg):
  231. return u'<strong>%s</strong>'%msg
  232. def italics(self, msg):
  233. return u'<em>%s</em>'%msg
  234. def color(self, color, msg):
  235. return u'<span style="color: %s">%s</span>'%(color, msg)
  236. def format_validation_errors(self, errs):
  237. lns=super(PrettyValidator, self).format_validation_errors(errs)
  238. buf=u''
  239. for l in lns.split('\n'):
  240. buf+=self.m(self.escape(l))
  241. return buf
  242. def done_cb(self):
  243. self.session['form_json']['validated']=True
  244. self.session['form_json']['jdict']=self.jdict
  245. self.session.save()
  246. class WebValidator(PrettyValidator):
  247. def pre_done_cb(self, jdict):
  248. # check name uniqueness
  249. where = (ISP.name == jdict['name'])
  250. if 'shortname' in jdict and jdict['shortname']:
  251. where |= (ISP.shortname == jdict.get('shortname'))
  252. if ISP.query.filter(where).count() > 0:
  253. ret = self.nl()
  254. ret += self.err('An ISP named "%s" already exist in our database'%self.escape(
  255. jdict['name']+(' ('+jdict['shortname']+')' if jdict.get('shortname') else '')
  256. ))
  257. ret += self.abort('The name of your ISP must be unique')
  258. return ret
  259. class TextValidator(Crawler):
  260. def abort(self, msg):
  261. res=u'FATAL ERROR: %s\n'%msg
  262. pad=u'='*(len(res)-1)+'\n'
  263. return self.m(pad+res+pad)