crawler.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. import io
  2. import cgi
  3. import sys
  4. from datetime import datetime, timedelta
  5. from werkzeug.http import parse_date
  6. from flask import escape, json, current_app
  7. import requests
  8. from ispformat.validator import validate_isp
  9. from .models import ISP
  10. from .utils import check_geojson_spatialite, utcnow, filesize_fmt
  11. def get_encoding(content_type):
  12. """
  13. >>> get_encoding('wat/ever; charset=hey')
  14. 'hey'
  15. """
  16. content_type, params = cgi.parse_header(content_type)
  17. if 'charset' in params:
  18. return params['charset'].strip("'\"")
  19. class Crawler(object):
  20. MAX_JSON_SIZE = 1 * 1024 * 1024
  21. escape = staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x)
  22. def __init__(self):
  23. self.success = False
  24. self.modified = True
  25. self.jdict = {}
  26. self.cache_info = None
  27. self.jdict_max_age = self.config('DEFAULT_CACHE_TIME')
  28. def m(self, msg, evt=None):
  29. if not evt:
  30. return u'%s\n' % msg
  31. else:
  32. return u''
  33. def err(self, msg, *args):
  34. return self.m(u'! %s' % msg, *args)
  35. def warn(self, msg):
  36. return self.m(u'@ %s' % msg)
  37. def info(self, msg):
  38. return self.m(u'\u2013 %s' % msg)
  39. def abort(self, msg):
  40. raise NotImplemented
  41. def color(self, color, msg):
  42. return msg
  43. def bold(self, msg):
  44. return msg
  45. def italics(self, msg):
  46. return msg
  47. def nl(self):
  48. return self.m('')
  49. def format_validation_errors(self, errs):
  50. r = []
  51. for e in errs:
  52. r.append(u' %s: %s' % ('.'.join(list(e.schema_path)[1:]), e.message))
  53. return u'\n'.join(r) + '\n'
  54. def pre_done_cb(self, *args):
  55. pass
  56. def done_cb(self):
  57. pass
  58. def config(self, name):
  59. return current_app.config.get('CRAWLER_' + name)
  60. def parse_cache_control(self, _cachectl):
  61. cachectl = {}
  62. for cc in _cachectl.split(','):
  63. cc = cc.strip()
  64. if not cc:
  65. continue
  66. cc = cc.split('=')
  67. if cc[0] not in ('max-age', 's-maxage'):
  68. continue
  69. try:
  70. cachectl[cc[0]] = cc[1]
  71. except IndexError:
  72. cachectl[cc[0]] = True
  73. return cachectl
  74. def __call__(self, url, cache_info={}):
  75. esc = self.escape
  76. yield self.m('Starting the validation process...')
  77. r = None
  78. try:
  79. yield self.m('* Attempting to retreive %s' % self.bold(url))
  80. headers = {'User-Agent': 'FFDN DB validator'}
  81. if cache_info.get('etag'):
  82. headers['If-None-Match'] = cache_info['etag']
  83. if cache_info.get('last-modified'):
  84. headers['If-Modified-Since'] = cache_info['last-modified']
  85. r = requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
  86. headers=headers, stream=True, timeout=10)
  87. except requests.exceptions.SSLError as e:
  88. yield self.err('Unable to connect, SSL Error: ' + self.color('#dd1144', esc(e)))
  89. except requests.exceptions.ConnectionError as e:
  90. yield self.err('Unable to connect: ' + self.color('#dd1144', esc(e)))
  91. except requests.exceptions.Timeout as e:
  92. yield self.err('Connection timeout')
  93. except requests.exceptions.TooManyRedirects as e:
  94. yield self.err('Too many redirects')
  95. except requests.exceptions.RequestException as e:
  96. yield self.err('Internal request exception')
  97. except Exception as e:
  98. # Unexpected exception: abort the validation, then re-raise it
  99. # so that it's logged.
  100. tb = sys.exc_info()[2]
  101. yield self.abort('Unexpected request exception')
  102. raise e, None, tb
  103. if r is None:
  104. yield self.abort('Connection could not be established, aborting')
  105. return
  106. yield self.info('Connection established')
  107. yield self.info('Response code: ' + self.bold(str(r.status_code) + ' ' + esc(r.reason)))
  108. try:
  109. r.raise_for_status()
  110. except requests.exceptions.HTTPError as e:
  111. yield self.err('Response code indicates an error')
  112. yield self.abort('Invalid response code')
  113. return
  114. _cachecontrol = r.headers.get('cache-control')
  115. cachecontrol = self.parse_cache_control(_cachecontrol) if _cachecontrol else None
  116. max_age = None
  117. if cachecontrol:
  118. try:
  119. _maxage = cachecontrol.get('max-age')
  120. _maxage = cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence
  121. max_age = int(_maxage)
  122. except ValueError:
  123. yield self.warn('Invalid max-age ' + esc(_maxage))
  124. yield self.info('Cache control: ' + self.bold(esc(
  125. ', '.join([k + '=' + v if type(v) != bool else k for k, v in cachecontrol.iteritems()]))
  126. ))
  127. _expires = r.headers.get('expires')
  128. expires = parse_date(_expires)
  129. if expires:
  130. _now = r.headers.get('date')
  131. if _now: # use server date when possible
  132. now = parse_date(_now)
  133. else:
  134. now = datetime.utcnow()
  135. if expires > now:
  136. expires = (expires - now).total_seconds()
  137. yield self.info('Expires: ' + self.bold(esc(_expires)))
  138. else:
  139. yield self.warn('Invalid Expires header. Expiry date must be in the future.')
  140. expires = None
  141. elif _expires and not expires:
  142. yield self.warn('Invalid Expires header %r' % esc(_expires))
  143. if not max_age and not expires:
  144. yield self.warn('No valid expiration time provided ! Please provide it either '
  145. 'with a Cache-Control or Expires header.')
  146. max_age = self.config('DEFAULT_CACHE_TIME')
  147. yield self.info('Using default expiration time of %d seconds' % (max_age))
  148. self.jdict_max_age = max_age if max_age else expires
  149. self.jdict_max_age = min(
  150. self.config('MAX_CACHE_TIME'),
  151. max(self.config('MIN_CACHE_TIME'), self.jdict_max_age)
  152. )
  153. yield self.info('Next update will be in %s' % (timedelta(seconds=self.jdict_max_age)))
  154. etag = r.headers.get('etag')
  155. last_modified = r.headers.get('last-modified')
  156. if not etag and not last_modified:
  157. yield self.warn('Please, provide at an ETag or Last-Modified header for '
  158. 'conditional requests')
  159. self.cache_info = {}
  160. if etag:
  161. self.cache_info['etag'] = etag
  162. if last_modified:
  163. self.cache_info['last-modified'] = last_modified
  164. if cache_info and r.status_code == 304: # not modified
  165. self.m('== ' + self.color('forestgreen', 'Response not modified. All good !'))
  166. self.modified = False
  167. self.success = True
  168. self.done_cb()
  169. return
  170. yield self.info('Content type: ' + self.bold(esc(r.headers.get('content-type', 'not defined'))))
  171. if not r.headers.get('content-type'):
  172. yield self.err('Content-type ' + self.bold('MUST') + ' be defined')
  173. yield self.abort('The file must have a proper content-type to continue')
  174. return
  175. elif r.headers.get('content-type').lower() != 'application/json':
  176. yield self.warn('Content-type ' + self.italics('SHOULD') + ' be application/json')
  177. encoding = get_encoding(r.headers.get('content-type'))
  178. if not encoding:
  179. yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
  180. yield self.info('Content length: %s' % (self.bold(esc(r.headers.get('content-length', 'not set')))))
  181. cl = r.headers.get('content-length')
  182. if not cl:
  183. yield self.warn('No content-length. Note that we will not process a file whose size exceed %s'
  184. % (filesize_fmt(self.MAX_JSON_SIZE)))
  185. elif int(cl) > self.MAX_JSON_SIZE:
  186. yield self.abort('File too big ! File size must be less then %s' % (filesize_fmt(self.MAX_JSON_SIZE)))
  187. return
  188. yield self.info('Reading response into memory...')
  189. b = io.BytesIO()
  190. for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
  191. b.write(d)
  192. if b.tell() > self.MAX_JSON_SIZE:
  193. yield self.abort('File too big ! File size must be less then %s'
  194. % (filesize_fmt(self.MAX_JSON_SIZE)))
  195. return
  196. r._content = b.getvalue()
  197. del b
  198. yield self.info('Successfully read %d bytes' % len(r.content))
  199. yield self.nl() + self.m('* Parsing the JSON file')
  200. if not encoding:
  201. charset = requests.utils.guess_json_utf(r.content)
  202. if not charset:
  203. yield self.err('Unable to guess unicode charset')
  204. yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type')
  205. return
  206. yield self.info('Guessed charset: ' + self.bold(charset))
  207. try:
  208. txt = r.content.decode(encoding or charset)
  209. yield self.info('Successfully decoded file as %s' % esc(encoding or charset))
  210. except LookupError as e:
  211. yield self.err('Invalid/unknown charset: %s' % esc(e))
  212. yield self.abort('Charset error, Cannot continue')
  213. return
  214. except UnicodeDecodeError as e:
  215. yield self.err('Unicode decode error: %s' % e)
  216. yield self.abort('Charset error, cannot continue')
  217. return
  218. except Exception:
  219. yield self.abort('Unexpected charset error')
  220. return
  221. jdict = None
  222. try:
  223. jdict = json.loads(txt)
  224. except ValueError as e:
  225. yield self.err('Error while parsing JSON: %s' % esc(e))
  226. except Exception as e:
  227. yield self.err('Unexpected error while parsing JSON: %s' % esc(e))
  228. if not jdict:
  229. yield self.abort('Could not parse JSON')
  230. return
  231. yield self.info('JSON parsed successfully')
  232. yield self.nl() + self.m('* Validating the JSON against the schema')
  233. v = list(validate_isp(jdict))
  234. if v:
  235. yield self.err('Validation errors:') + self.format_validation_errors(v)
  236. yield self.abort('Your JSON file does not follow the schema, please fix it')
  237. return
  238. else:
  239. yield self.info('Done. No errors encountered \o')
  240. for ca in jdict.get('coveredAreas', []):
  241. if not 'area' in ca:
  242. continue
  243. if not check_geojson_spatialite(ca['area']):
  244. yield self.err('GeoJSON data for covered area "%s" cannot '
  245. 'be handled by our database' % esc(ca['name']))
  246. yield self.abort('Please fix your GeoJSON')
  247. return
  248. ret = self.pre_done_cb(jdict)
  249. if ret:
  250. yield ret
  251. return
  252. yield (self.nl() + self.m('== ' + self.color('forestgreen', 'All good ! You can click on Confirm now')) +
  253. self.m(json.dumps({'passed': 1}), 'control'))
  254. self.jdict = jdict
  255. self.success = True
  256. self.done_cb()
  257. class PrettyValidator(Crawler):
  258. def __init__(self, session=None, sesskey=None, *args, **kwargs):
  259. super(PrettyValidator, self).__init__(*args, **kwargs)
  260. self.session = session
  261. self.sesskey = sesskey
  262. self.escape = lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x)
  263. def m(self, msg, evt=None):
  264. return u'%sdata: %s\n\n' % (u'event: %s\n' % evt if evt else '', msg)
  265. def err(self, msg, *args):
  266. return self.m(u'<strong style="color: crimson">!</strong> %s' % msg, *args)
  267. def warn(self, msg):
  268. return self.m(u'<strong style="color: dodgerblue">@</strong> %s' % msg)
  269. def info(self, msg):
  270. return self.m(u'&ndash; %s' % msg)
  271. def abort(self, msg):
  272. return (self.m(u'<br />== <span style="color: crimson">%s</span>' % msg) +
  273. self.m(json.dumps({'closed': 1}), 'control'))
  274. def bold(self, msg):
  275. return u'<strong>%s</strong>' % msg
  276. def italics(self, msg):
  277. return u'<em>%s</em>' % msg
  278. def color(self, color, msg):
  279. return u'<span style="color: %s">%s</span>' % (color, msg)
  280. def format_validation_errors(self, errs):
  281. lns = super(PrettyValidator, self).format_validation_errors(errs)
  282. buf = u''
  283. for l in lns.split('\n'):
  284. buf += self.m(self.escape(l))
  285. return buf
  286. def done_cb(self):
  287. self.session[self.sesskey]['validated'] = True
  288. self.session[self.sesskey]['jdict'] = self.jdict
  289. self.session[self.sesskey]['cache_info'] = self.cache_info
  290. self.session[self.sesskey]['last_update'] = utcnow()
  291. self.session[self.sesskey]['next_update'] = utcnow() + timedelta(seconds=self.jdict_max_age)
  292. self.session.save()
  293. class WebValidator(PrettyValidator):
  294. def pre_done_cb(self, jdict):
  295. # check name uniqueness
  296. where = (ISP.name == jdict['name'])
  297. if 'shortname' in jdict and jdict['shortname']:
  298. where |= (ISP.shortname == jdict.get('shortname'))
  299. if ISP.query.filter(where).count() > 0:
  300. ret = self.nl()
  301. ret += self.err('An ISP named "%s" already exist in our database' % self.escape(
  302. jdict['name'] + (' (' + jdict['shortname'] + ')' if jdict.get('shortname') else '')
  303. ))
  304. ret += self.abort('The name of your ISP must be unique')
  305. return ret
  306. class TextValidator(Crawler):
  307. def abort(self, msg):
  308. res = u'FATAL ERROR: %s\n' % msg
  309. pad = u'=' * (len(res) - 1) + '\n'
  310. return self.m(pad + res + pad)