import io import cgi import sys from datetime import datetime, timedelta from werkzeug.http import parse_date from flask import escape, json, current_app import requests from ispformat.validator import validate_isp from .models import ISP from .utils import check_geojson_spatialite, utcnow, filesize_fmt def get_encoding(content_type): """ >>> get_encoding('wat/ever; charset=hey') 'hey' """ content_type, params = cgi.parse_header(content_type) if 'charset' in params: return params['charset'].strip("'\"") class Crawler(object): MAX_JSON_SIZE = 1 * 1024 * 1024 escape = staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x) def __init__(self): self.success = False self.modified = True self.jdict = {} self.cache_info = None self.jdict_max_age = self.config('DEFAULT_CACHE_TIME') def m(self, msg, evt=None): if not evt: return u'%s\n' % msg else: return u'' def err(self, msg, *args): return self.m(u'! %s' % msg, *args) def warn(self, msg): return self.m(u'@ %s' % msg) def info(self, msg): return self.m(u'\u2013 %s' % msg) def abort(self, msg): raise NotImplemented def color(self, color, msg): return msg def bold(self, msg): return msg def italics(self, msg): return msg def nl(self): return self.m('') def format_validation_errors(self, errs): r = [] for e in errs: r.append(u' %s: %s' % ('.'.join(list(e.schema_path)[1:]), e.message)) return u'\n'.join(r) + '\n' def pre_done_cb(self, *args): pass def done_cb(self): pass def config(self, name): return current_app.config.get('CRAWLER_' + name) def parse_cache_control(self, _cachectl): cachectl = {} for cc in _cachectl.split(','): cc = cc.strip() if not cc: continue cc = cc.split('=') if cc[0] not in ('max-age', 's-maxage'): continue try: cachectl[cc[0]] = cc[1] except IndexError: cachectl[cc[0]] = True return cachectl def __call__(self, url, cache_info={}): esc = self.escape yield self.m('Starting the validation process...') r = None try: yield self.m('* Attempting to retreive %s' % self.bold(url)) headers = {'User-Agent': 'FFDN DB validator'} if cache_info.get('etag'): headers['If-None-Match'] = cache_info['etag'] if cache_info.get('last-modified'): headers['If-Modified-Since'] = cache_info['last-modified'] r = requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt', headers=headers, stream=True, timeout=10) except requests.exceptions.SSLError as e: yield self.err('Unable to connect, SSL Error: ' + self.color('#dd1144', esc(e))) except requests.exceptions.ConnectionError as e: yield self.err('Unable to connect: ' + self.color('#dd1144', esc(e))) except requests.exceptions.Timeout as e: yield self.err('Connection timeout') except requests.exceptions.TooManyRedirects as e: yield self.err('Too many redirects') except requests.exceptions.RequestException as e: yield self.err('Internal request exception') except Exception as e: # Unexpected exception: abort the validation, then re-raise it # so that it's logged. tb = sys.exc_info()[2] yield self.abort('Unexpected request exception') raise e, None, tb if r is None: yield self.abort('Connection could not be established, aborting') return yield self.info('Connection established') yield self.info('Response code: ' + self.bold(str(r.status_code) + ' ' + esc(r.reason))) try: r.raise_for_status() except requests.exceptions.HTTPError as e: yield self.err('Response code indicates an error') yield self.abort('Invalid response code') return _cachecontrol = r.headers.get('cache-control') cachecontrol = self.parse_cache_control(_cachecontrol) if _cachecontrol else None max_age = None if cachecontrol: try: _maxage = cachecontrol.get('max-age') _maxage = cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence max_age = int(_maxage) except ValueError: yield self.warn('Invalid max-age ' + esc(_maxage)) yield self.info('Cache control: ' + self.bold(esc( ', '.join([k + '=' + v if type(v) != bool else k for k, v in cachecontrol.iteritems()])) )) _expires = r.headers.get('expires') expires = parse_date(_expires) if expires: _now = r.headers.get('date') if _now: # use server date when possible now = parse_date(_now) else: now = datetime.utcnow() if expires > now: expires = (expires - now).total_seconds() yield self.info('Expires: ' + self.bold(esc(_expires))) else: yield self.warn('Invalid Expires header. Expiry date must be in the future.') expires = None elif _expires and not expires: yield self.warn('Invalid Expires header %r' % esc(_expires)) if not max_age and not expires: yield self.warn('No valid expiration time provided ! Please provide it either ' 'with a Cache-Control or Expires header.') max_age = self.config('DEFAULT_CACHE_TIME') yield self.info('Using default expiration time of %d seconds' % (max_age)) self.jdict_max_age = max_age if max_age else expires self.jdict_max_age = min( self.config('MAX_CACHE_TIME'), max(self.config('MIN_CACHE_TIME'), self.jdict_max_age) ) yield self.info('Next update will be in %s' % (timedelta(seconds=self.jdict_max_age))) etag = r.headers.get('etag') last_modified = r.headers.get('last-modified') if not etag and not last_modified: yield self.warn('Please, provide at an ETag or Last-Modified header for ' 'conditional requests') self.cache_info = {} if etag: self.cache_info['etag'] = etag if last_modified: self.cache_info['last-modified'] = last_modified if cache_info and r.status_code == 304: # not modified self.m('== ' + self.color('forestgreen', 'Response not modified. All good !')) self.modified = False self.success = True self.done_cb() return yield self.info('Content type: ' + self.bold(esc(r.headers.get('content-type', 'not defined')))) if not r.headers.get('content-type'): yield self.err('Content-type ' + self.bold('MUST') + ' be defined') yield self.abort('The file must have a proper content-type to continue') return elif r.headers.get('content-type').lower() != 'application/json': yield self.warn('Content-type ' + self.italics('SHOULD') + ' be application/json') encoding = get_encoding(r.headers.get('content-type')) if not encoding: yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3') yield self.info('Content length: %s' % (self.bold(esc(r.headers.get('content-length', 'not set'))))) cl = r.headers.get('content-length') if not cl: yield self.warn('No content-length. Note that we will not process a file whose size exceed %s' % (filesize_fmt(self.MAX_JSON_SIZE))) elif int(cl) > self.MAX_JSON_SIZE: yield self.abort('File too big ! File size must be less then %s' % (filesize_fmt(self.MAX_JSON_SIZE))) return yield self.info('Reading response into memory...') b = io.BytesIO() for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE): b.write(d) if b.tell() > self.MAX_JSON_SIZE: yield self.abort('File too big ! File size must be less then %s' % (filesize_fmt(self.MAX_JSON_SIZE))) return r._content = b.getvalue() del b yield self.info('Successfully read %d bytes' % len(r.content)) yield self.nl() + self.m('* Parsing the JSON file') if not encoding: charset = requests.utils.guess_json_utf(r.content) if not charset: yield self.err('Unable to guess unicode charset') yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type') return yield self.info('Guessed charset: ' + self.bold(charset)) try: txt = r.content.decode(encoding or charset) yield self.info('Successfully decoded file as %s' % esc(encoding or charset)) except LookupError as e: yield self.err('Invalid/unknown charset: %s' % esc(e)) yield self.abort('Charset error, Cannot continue') return except UnicodeDecodeError as e: yield self.err('Unicode decode error: %s' % e) yield self.abort('Charset error, cannot continue') return except Exception: yield self.abort('Unexpected charset error') return jdict = None try: jdict = json.loads(txt) except ValueError as e: yield self.err('Error while parsing JSON: %s' % esc(e)) except Exception as e: yield self.err('Unexpected error while parsing JSON: %s' % esc(e)) if not jdict: yield self.abort('Could not parse JSON') return yield self.info('JSON parsed successfully') yield self.nl() + self.m('* Validating the JSON against the schema') v = list(validate_isp(jdict)) if v: yield self.err('Validation errors:') + self.format_validation_errors(v) yield self.abort('Your JSON file does not follow the schema, please fix it') return else: yield self.info('Done. No errors encountered \o') for ca in jdict.get('coveredAreas', []): if not 'area' in ca: continue if not check_geojson_spatialite(ca['area']): yield self.err('GeoJSON data for covered area "%s" cannot ' 'be handled by our database' % esc(ca['name'])) yield self.abort('Please fix your GeoJSON') return ret = self.pre_done_cb(jdict) if ret: yield ret return yield (self.nl() + self.m('== ' + self.color('forestgreen', 'All good ! You can click on Confirm now')) + self.m(json.dumps({'passed': 1}), 'control')) self.jdict = jdict self.success = True self.done_cb() class PrettyValidator(Crawler): def __init__(self, session=None, sesskey=None, *args, **kwargs): super(PrettyValidator, self).__init__(*args, **kwargs) self.session = session self.sesskey = sesskey self.escape = lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x) def m(self, msg, evt=None): return u'%sdata: %s\n\n' % (u'event: %s\n' % evt if evt else '', msg) def err(self, msg, *args): return self.m(u'! %s' % msg, *args) def warn(self, msg): return self.m(u'@ %s' % msg) def info(self, msg): return self.m(u'– %s' % msg) def abort(self, msg): return (self.m(u'
== %s' % msg) + self.m(json.dumps({'closed': 1}), 'control')) def bold(self, msg): return u'%s' % msg def italics(self, msg): return u'%s' % msg def color(self, color, msg): return u'%s' % (color, msg) def format_validation_errors(self, errs): lns = super(PrettyValidator, self).format_validation_errors(errs) buf = u'' for l in lns.split('\n'): buf += self.m(self.escape(l)) return buf def done_cb(self): self.session[self.sesskey]['validated'] = True self.session[self.sesskey]['jdict'] = self.jdict self.session[self.sesskey]['cache_info'] = self.cache_info self.session[self.sesskey]['last_update'] = utcnow() self.session[self.sesskey]['next_update'] = utcnow() + timedelta(seconds=self.jdict_max_age) self.session.save() class WebValidator(PrettyValidator): def pre_done_cb(self, jdict): # check name uniqueness where = (ISP.name == jdict['name']) if 'shortname' in jdict and jdict['shortname']: where |= (ISP.shortname == jdict.get('shortname')) if ISP.query.filter(where).count() > 0: ret = self.nl() ret += self.err('An ISP named "%s" already exist in our database' % self.escape( jdict['name'] + (' (' + jdict['shortname'] + ')' if jdict.get('shortname') else '') )) ret += self.abort('The name of your ISP must be unique') return ret class TextValidator(Crawler): def abort(self, msg): res = u'FATAL ERROR: %s\n' % msg pad = u'=' * (len(res) - 1) + '\n' return self.m(pad + res + pad)