# crawler.py
import cgi
import io
import sys
from datetime import datetime, timedelta
from email.message import Message

import pytz
import requests
from flask import current_app, escape, json
from ispformat.validator import validate_isp
from werkzeug.http import parse_date

from . import db
from .models import ISP
from .utils import check_geojson_spatialite, filesize_fmt, utcnow
  13. def get_encoding(content_type):
  14. """
  15. >>> get_encoding('wat/ever; charset=hey')
  16. 'hey'
  17. """
  18. content_type, params = cgi.parse_header(content_type)
  19. if 'charset' in params:
  20. return params['charset'].strip("'\"")
  21. class Crawler(object):
  22. MAX_JSON_SIZE=1*1024*1024
  23. escape=staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x)
  24. def __init__(self):
  25. self.success=False
  26. self.modified=True
  27. self.jdict={}
  28. self.cache_info=None
  29. self.jdict_max_age=self.config('DEFAULT_CACHE_TIME')
  30. def m(self, msg, evt=None):
  31. if not evt:
  32. return u'%s\n'%msg
  33. else:
  34. return u''
  35. def err(self, msg, *args):
  36. return self.m(u'! %s'%msg, *args)
  37. def warn(self, msg):
  38. return self.m(u'@ %s'%msg)
  39. def info(self, msg):
  40. return self.m(u'\u2013 %s'%msg)
  41. def abort(self, msg):
  42. raise NotImplemented
  43. def color(self, color, msg):
  44. return msg
  45. def bold(self, msg):
  46. return msg
  47. def italics(self, msg):
  48. return msg
  49. def nl(self):
  50. return self.m('')
  51. def format_validation_errors(self, errs):
  52. r=[]
  53. for e in errs:
  54. r.append(u' %s: %s'%('.'.join(list(e.schema_path)[1:]), e.message))
  55. return u'\n'.join(r)+'\n'
  56. def pre_done_cb(self, *args):
  57. pass
  58. def done_cb(self):
  59. pass
  60. def config(self, name):
  61. return current_app.config.get('CRAWLER_'+name)
  62. def parse_cache_control(self, _cachectl):
  63. cachectl={}
  64. for cc in _cachectl.split(','):
  65. cc=cc.strip()
  66. if not cc:
  67. continue
  68. cc=cc.split('=')
  69. if cc[0] not in ('max-age', 's-maxage'):
  70. continue
  71. try:
  72. cachectl[cc[0]]=cc[1]
  73. except IndexError:
  74. cachectl[cc[0]]=True
  75. return cachectl
  76. def __call__(self, url, cache_info={}):
  77. esc=self.escape
  78. yield self.m('Starting the validation process...')
  79. r=None
  80. try:
  81. yield self.m('* Attempting to retreive %s'%self.bold(url))
  82. headers={'User-Agent': 'FFDN DB validator'}
  83. if cache_info.get('etag'):
  84. headers['If-None-Match'] = cache_info['etag']
  85. if cache_info.get('last-modified'):
  86. headers['If-Modified-Since'] = cache_info['last-modified']
  87. r=requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
  88. headers=headers, stream=True, timeout=10)
  89. except requests.exceptions.SSLError as e:
  90. yield self.err('Unable to connect, SSL Error: '+self.color('#dd1144', esc(e)))
  91. except requests.exceptions.ConnectionError as e:
  92. yield self.err('Unable to connect: '+self.color('#dd1144', esc(e)))
  93. except requests.exceptions.Timeout as e:
  94. yield self.err('Connection timeout')
  95. except requests.exceptions.TooManyRedirects as e:
  96. yield self.err('Too many redirects')
  97. except requests.exceptions.RequestException as e:
  98. yield self.err('Internal request exception')
  99. except Exception as e:
  100. # Unexpected exception: abort the validation, then re-raise it
  101. # so that it's logged.
  102. tb = sys.exc_info()[2]
  103. yield self.abort('Unexpected request exception')
  104. raise e, None, tb
  105. if r is None:
  106. yield self.abort('Connection could not be established, aborting')
  107. return
  108. yield self.info('Connection established')
  109. yield self.info('Response code: '+self.bold(str(r.status_code)+' '+esc(r.reason)))
  110. try:
  111. r.raise_for_status()
  112. except requests.exceptions.HTTPError as e:
  113. yield self.err('Response code indicates an error')
  114. yield self.abort('Invalid response code')
  115. return
  116. _cachecontrol=r.headers.get('cache-control')
  117. cachecontrol=self.parse_cache_control(_cachecontrol) if _cachecontrol else None
  118. max_age=None
  119. if cachecontrol:
  120. try:
  121. _maxage=cachecontrol.get('max-age')
  122. _maxage=cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence
  123. max_age=int(_maxage)
  124. except ValueError:
  125. yield self.warn('Invalid max-age '+esc(_maxage))
  126. yield self.info('Cache control: '+self.bold(esc(
  127. ', '.join([k+'='+v if type(v) != bool else k for k, v in cachecontrol.iteritems()]))
  128. ))
  129. _expires=r.headers.get('expires')
  130. expires=parse_date(_expires)
  131. if expires:
  132. _now=r.headers.get('date')
  133. if _now: # use server date when possible
  134. now=parse_date(_now)
  135. else:
  136. now=datetime.utcnow()
  137. if expires > now:
  138. expires=(expires-now).total_seconds()
  139. yield self.info('Expires: '+self.bold(esc(_expires)))
  140. else:
  141. yield self.warn('Invalid Expires header. Expiry date must be in the future.')
  142. expires=None
  143. elif _expires and not expires:
  144. yield self.warn('Invalid Expires header %r'%esc(_expires))
  145. if not max_age and not expires:
  146. yield self.warn('No valid expiration time provided ! Please provide it either '
  147. 'with a Cache-Control or Expires header.')
  148. max_age=self.config('DEFAULT_CACHE_TIME')
  149. yield self.info('Using default expiration time of %d seconds'%(max_age))
  150. self.jdict_max_age = max_age if max_age else expires
  151. self.jdict_max_age = min(
  152. self.config('MAX_CACHE_TIME'),
  153. max(self.config('MIN_CACHE_TIME'), self.jdict_max_age)
  154. )
  155. yield self.info('Next update will be in %s'%(timedelta(seconds=self.jdict_max_age)))
  156. etag=r.headers.get('etag')
  157. last_modified=r.headers.get('last-modified')
  158. if not etag and not last_modified:
  159. yield self.warn('Please, provide at an ETag or Last-Modified header for '
  160. 'conditional requests')
  161. self.cache_info={}
  162. if etag:
  163. self.cache_info['etag']=etag
  164. if last_modified:
  165. self.cache_info['last-modified']=last_modified
  166. if cache_info and r.status_code == 304: # not modified
  167. self.m('== '+self.color('forestgreen', 'Response not modified. All good !'))
  168. self.modified=False
  169. self.success=True
  170. self.done_cb()
  171. return
  172. yield self.info('Content type: '+self.bold(esc(r.headers.get('content-type', 'not defined'))))
  173. if not r.headers.get('content-type'):
  174. yield self.err('Content-type '+self.bold('MUST')+' be defined')
  175. yield self.abort('The file must have a proper content-type to continue')
  176. return
  177. elif r.headers.get('content-type').lower() != 'application/json':
  178. yield self.warn('Content-type '+self.italics('SHOULD')+' be application/json')
  179. encoding=get_encoding(r.headers.get('content-type'))
  180. if not encoding:
  181. yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
  182. yield self.info('Content length: %s'%(self.bold(esc(r.headers.get('content-length', 'not set')))))
  183. cl=r.headers.get('content-length')
  184. if not cl:
  185. yield self.warn('No content-length. Note that we will not process a file whose size exceed %s'
  186. % (filesize_fmt(self.MAX_JSON_SIZE)))
  187. elif int(cl) > self.MAX_JSON_SIZE:
  188. yield self.abort('File too big ! File size must be less then %s' % (filesize_fmt(self.MAX_JSON_SIZE)))
  189. return
  190. yield self.info('Reading response into memory...')
  191. b=io.BytesIO()
  192. for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
  193. b.write(d)
  194. if b.tell() > self.MAX_JSON_SIZE:
  195. yield self.abort('File too big ! File size must be less then %s'
  196. % (filesize_fmt(self.MAX_JSON_SIZE)))
  197. return
  198. r._content=b.getvalue()
  199. del b
  200. yield self.info('Successfully read %d bytes'%len(r.content))
  201. yield self.nl()+self.m('* Parsing the JSON file')
  202. if not encoding:
  203. charset=requests.utils.guess_json_utf(r.content)
  204. if not charset:
  205. yield self.err('Unable to guess unicode charset')
  206. yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type')
  207. return
  208. yield self.info('Guessed charset: '+self.bold(charset))
  209. try:
  210. txt=r.content.decode(encoding or charset)
  211. yield self.info('Successfully decoded file as %s'%esc(encoding or charset))
  212. except LookupError as e:
  213. yield self.err('Invalid/unknown charset: %s'%esc(e))
  214. yield self.abort('Charset error, Cannot continue')
  215. return
  216. except UnicodeDecodeError as e:
  217. yield self.err('Unicode decode error: %s'%e)
  218. yield self.abort('Charset error, cannot continue')
  219. return
  220. except Exception:
  221. yield self.abort('Unexpected charset error')
  222. return
  223. jdict=None
  224. try:
  225. jdict=json.loads(txt)
  226. except ValueError as e:
  227. yield self.err('Error while parsing JSON: %s'%esc(e))
  228. except Exception as e:
  229. yield self.err('Unexpected error while parsing JSON: %s'%esc(e))
  230. if not jdict:
  231. yield self.abort('Could not parse JSON')
  232. return
  233. yield self.info('JSON parsed successfully')
  234. yield self.nl()+self.m('* Validating the JSON against the schema')
  235. v=list(validate_isp(jdict))
  236. if v:
  237. yield self.err('Validation errors:')+self.format_validation_errors(v)
  238. yield self.abort('Your JSON file does not follow the schema, please fix it')
  239. return
  240. else:
  241. yield self.info('Done. No errors encountered \o')
  242. for ca in jdict.get('coveredAreas', []):
  243. if not 'area' in ca:
  244. continue
  245. if not check_geojson_spatialite(ca['area']):
  246. yield self.err('GeoJSON data for covered area "%s" cannot '
  247. 'be handled by our database'%esc(ca['name']))
  248. yield self.abort('Please fix your GeoJSON')
  249. return
  250. ret=self.pre_done_cb(jdict)
  251. if ret:
  252. yield ret
  253. return
  254. yield (self.nl()+self.m('== '+self.color('forestgreen', 'All good ! You can click on Confirm now'))+
  255. self.m(json.dumps({'passed': 1}), 'control'))
  256. self.jdict=jdict
  257. self.success=True
  258. self.done_cb()
  259. class PrettyValidator(Crawler):
  260. def __init__(self, session=None, sesskey=None, *args, **kwargs):
  261. super(PrettyValidator, self).__init__(*args, **kwargs)
  262. self.session=session
  263. self.sesskey=sesskey
  264. self.escape=lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x)
  265. def m(self, msg, evt=None):
  266. return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)
  267. def err(self, msg, *args):
  268. return self.m(u'<strong style="color: crimson">!</strong> %s'%msg, *args)
  269. def warn(self, msg):
  270. return self.m(u'<strong style="color: dodgerblue">@</strong> %s'%msg)
  271. def info(self, msg):
  272. return self.m(u'&ndash; %s'%msg)
  273. def abort(self, msg):
  274. return (self.m(u'<br />== <span style="color: crimson">%s</span>'%msg)+
  275. self.m(json.dumps({'closed': 1}), 'control'))
  276. def bold(self, msg):
  277. return u'<strong>%s</strong>'%msg
  278. def italics(self, msg):
  279. return u'<em>%s</em>'%msg
  280. def color(self, color, msg):
  281. return u'<span style="color: %s">%s</span>'%(color, msg)
  282. def format_validation_errors(self, errs):
  283. lns=super(PrettyValidator, self).format_validation_errors(errs)
  284. buf=u''
  285. for l in lns.split('\n'):
  286. buf+=self.m(self.escape(l))
  287. return buf
  288. def done_cb(self):
  289. self.session[self.sesskey]['validated']=True
  290. self.session[self.sesskey]['jdict']=self.jdict
  291. self.session[self.sesskey]['cache_info']=self.cache_info
  292. self.session[self.sesskey]['last_update']=utcnow()
  293. self.session[self.sesskey]['next_update']=utcnow()+timedelta(seconds=self.jdict_max_age)
  294. self.session.save()
  295. class WebValidator(PrettyValidator):
  296. def pre_done_cb(self, jdict):
  297. # check name uniqueness
  298. where = (ISP.name == jdict['name'])
  299. if 'shortname' in jdict and jdict['shortname']:
  300. where |= (ISP.shortname == jdict.get('shortname'))
  301. if ISP.query.filter(where).count() > 0:
  302. ret = self.nl()
  303. ret += self.err('An ISP named "%s" already exist in our database'%self.escape(
  304. jdict['name']+(' ('+jdict['shortname']+')' if jdict.get('shortname') else '')
  305. ))
  306. ret += self.abort('The name of your ISP must be unique')
  307. return ret
  308. class TextValidator(Crawler):
  309. def abort(self, msg):
  310. res=u'FATAL ERROR: %s\n'%msg
  311. pad=u'='*(len(res)-1)+'\n'
  312. return self.m(pad+res+pad)