crawler.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. import io
  2. import cgi
  3. import pytz
  4. from datetime import datetime, timedelta
  5. from werkzeug.http import parse_date
  6. from flask import escape, json
  7. import requests
  8. from ispformat.validator import validate_isp
  9. from .models import ISP
  10. def get_encoding(content_type):
  11. content_type, params = cgi.parse_header(content_type)
  12. if 'charset' in params:
  13. return params['charset'].strip("'\"")
  14. class Crawler(object):
  15. MAX_JSON_SIZE=1*1024*1024
  16. escape=staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x)
  17. def __init__(self):
  18. self.success=False
  19. self.jdict={}
  20. def m(self, msg, evt=None):
  21. if not evt:
  22. return u'%s\n'%msg
  23. else:
  24. return u''
  25. def err(self, msg, *args):
  26. return self.m(u'! %s'%msg, *args)
  27. def warn(self, msg):
  28. return self.m(u'@ %s'%msg)
  29. def info(self, msg):
  30. return self.m(u'\u2013 %s'%msg)
  31. def abort(self, msg):
  32. raise NotImplemented
  33. def color(self, color, msg):
  34. return msg
  35. def bold(self, msg):
  36. return msg
  37. def italics(self, msg):
  38. return msg
  39. def nl(self):
  40. return self.m('')
  41. def format_validation_errors(self, errs):
  42. r=[]
  43. for e in errs:
  44. r.append(u' %s: %s'%('.'.join(list(e.schema_path)[1:]), e.message))
  45. return u'\n'.join(r)
  46. def pre_done_cb(self, *args):
  47. pass
  48. def done_cb(self):
  49. pass
  50. def config(self, name):
  51. return app.config.get('CRAWLER_'+name)
  52. def parse_cache_control(self, _cachectl):
  53. cachectl={}
  54. for cc in _cachectl.split(','):
  55. cc=cc.strip()
  56. if not cc:
  57. continue
  58. cc=cc.split('=')
  59. if cc[0] not in ('max-age', 's-maxage'):
  60. continue
  61. try:
  62. cachectl[cc[0]]=cc[1]
  63. except IndexError:
  64. cachectl[cc[0]]=True
  65. return cachectl
  66. def __call__(self, url):
  67. esc=self.escape
  68. yield self.m('Starting the validation process...')
  69. r=None
  70. try:
  71. yield self.m('* Attempting to retreive %s'%self.bold(url))
  72. r=requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
  73. headers={'User-Agent': 'FFDN DB validator'},
  74. stream=True, timeout=10)
  75. except requests.exceptions.SSLError as e:
  76. yield self.err('Unable to connect, SSL Error: '+self.color('#dd1144', esc(e)))
  77. except requests.exceptions.ConnectionError as e:
  78. yield self.err('Unable to connect: '+self.color('#dd1144', esc(e)))
  79. except requests.exceptions.Timeout as e:
  80. yield self.err('Connection timeout')
  81. except requests.exceptions.TooManyRedirects as e:
  82. yield self.err('Too many redirects')
  83. except requests.exceptions.RequestException as e:
  84. yield self.err('Internal request exception')
  85. # except Exception as e:
  86. # yield self.err('Unexpected request exception')
  87. if r is None:
  88. yield self.abort('Connection could not be established, aborting')
  89. return
  90. yield self.info('Connection established')
  91. yield self.info('Response code: '+self.bold(str(r.status_code)+' '+esc(r.reason)))
  92. try:
  93. r.raise_for_status()
  94. except requests.exceptions.HTTPError as e:
  95. yield self.err('Response code indicates an error')
  96. yield self.abort('Invalid response code')
  97. return
  98. yield self.info('Content type: '+self.bold(esc(r.headers.get('content-type', 'not defined'))))
  99. if not r.headers.get('content-type'):
  100. yield self.error('Content-type '+self.bold('MUST')+' be defined')
  101. yield self.abort('The file must have a proper content-type to continue')
  102. elif r.headers.get('content-type').lower() != 'application/json':
  103. yield self.warn('Content-type '+self.italics('SHOULD')+' be application/json')
  104. encoding=get_encoding(r.headers.get('content-type'))
  105. if not encoding:
  106. yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
  107. yield self.info('Content length: %s'%(self.bold(esc(r.headers.get('content-length', 'not set')))))
  108. cl=r.headers.get('content-length')
  109. if not cl:
  110. yield self.warn('No content-length. Note that we will not process a file whose size exceed 1MiB')
  111. elif int(cl) > self.MAX_JSON_SIZE:
  112. yield self.abort('File too big ! File size must be less then 1MiB')
  113. _cachecontrol=r.headers.get('cache-control')
  114. cachecontrol=self.parse_cache_control(_cachecontrol) if _cachecontrol else None
  115. max_age=None
  116. if cachecontrol:
  117. try:
  118. _maxage=cachecontrol.get('max-age')
  119. _maxage=cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence
  120. max_age=int(_maxage)
  121. except ValueError:
  122. yield self.warn('Invalid max-age '+esc(_maxage))
  123. yield self.info('Cache control: '+self.bold(esc(
  124. ', '.join([k+'='+v if type(v) != bool else k for k, v in cachecontrol.iteritems()]))
  125. ))
  126. _expires=r.headers.get('expires')
  127. expires=parse_date(_expires)
  128. if expires:
  129. _now=r.headers.get('date')
  130. if _now: # use server date when possible
  131. now=parse_date(_now)
  132. else:
  133. now=datetime.utcnow()
  134. if expires > now:
  135. expires=(expires-now).total_seconds()
  136. yield self.info('Expires: '+self.bold(esc(_expires)))
  137. else:
  138. yield self.warn('Invalid Expires header. Expiry date must be in the future.')
  139. expires=None
  140. else:
  141. yield self.warn('Invalid Expires header %r'%esc(_expires))
  142. if not max_age and not expires:
  143. yield self.warn('No valid expiration time provided ! Please provide it either '
  144. 'with a Cache-Control or Expires header.')
  145. max_age=self.config('DEFAULT_CACHE_TIME')
  146. yield self.info('Using default expiration time of %d seconds'%(max_age))
  147. self.jdict_max_age = max_age if max_age else expires
  148. self.jdict_max_age = min(
  149. self.config('MAX_CACHE_TIME'),
  150. max(self.config('MIN_CACHE_TIME'), self.jdict_max_age)
  151. )
  152. yield self.info('Next update will be in %s'%(timedelta(seconds=self.jdict_max_age)))
  153. yield self.info('Reading response into memory...')
  154. b=io.BytesIO()
  155. for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
  156. b.write(d)
  157. if b.tell() > self.MAX_JSON_SIZE:
  158. yield self.abort('File too big ! File size must be less then 1MiB')
  159. return
  160. r._content=b.getvalue()
  161. del b
  162. yield self.info('Successfully read %d bytes'%len(r.content))
  163. yield self.nl()+self.m('* Parsing the JSON file')
  164. if not encoding:
  165. charset=requests.utils.guess_json_utf(r.content)
  166. if not charset:
  167. yield self.err('Unable to guess unicode charset')
  168. yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type')
  169. return
  170. yield self.info('Guessed charset: '+self.bold(charset))
  171. try:
  172. txt=r.content.decode(encoding or charset)
  173. yield self.info('Successfully decoded file as %s'%esc(encoding or charset))
  174. except LookupError as e:
  175. yield self.err('Invalid/unknown charset: %s'%esc(e))
  176. yield self.abort('Charset error, Cannot continue')
  177. return
  178. except UnicodeDecodeError as e:
  179. yield self.err('Unicode decode error: %s'%e)
  180. yield self.abort('Charset error, cannot continue')
  181. return
  182. except Exception:
  183. yield self.abort('Unexpected charset error')
  184. return
  185. jdict=None
  186. try:
  187. jdict=json.loads(txt)
  188. except ValueError as e:
  189. yield self.err('Error while parsing JSON: %s'%esc(e))
  190. except Exception as e:
  191. yield self.err('Unexpected error while parsing JSON: %s'%esc(e))
  192. if not jdict:
  193. yield self.abort('Could not parse JSON')
  194. return
  195. yield self.info('JSON parsed successfully')
  196. yield self.nl()+self.m('* Validating the JSON against the schema')
  197. v=list(validate_isp(jdict))
  198. if v:
  199. yield self.err('Validation errors:')+self.format_validation_errors(v)
  200. yield self.abort('Your JSON file does not follow the schema, please fix it')
  201. return
  202. else:
  203. yield self.info('Done. No errors encountered \o')
  204. ret=self.pre_done_cb(jdict)
  205. if ret:
  206. yield ret
  207. return
  208. yield (self.nl()+self.m('== '+self.color('forestgreen', 'All good ! You can click on Confirm now'))+
  209. self.m(json.dumps({'passed': 1}), 'control'))
  210. self.jdict=jdict
  211. self.success=True
  212. self.done_cb()
  213. class PrettyValidator(Crawler):
  214. def __init__(self, session=None, *args, **kwargs):
  215. super(PrettyValidator, self).__init__(*args, **kwargs)
  216. self.session=session
  217. self.escape=lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x)
  218. def m(self, msg, evt=None):
  219. return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)
  220. def err(self, msg, *args):
  221. return self.m(u'<strong style="color: crimson">!</strong> %s'%msg, *args)
  222. def warn(self, msg):
  223. return self.m(u'<strong style="color: dodgerblue">@</strong> %s'%msg)
  224. def info(self, msg):
  225. return self.m(u'&ndash; %s'%msg)
  226. def abort(self, msg):
  227. return (self.m(u'<br />== <span style="color: crimson">%s</span>'%msg)+
  228. self.m(json.dumps({'closed': 1}), 'control'))
  229. def bold(self, msg):
  230. return u'<strong>%s</strong>'%msg
  231. def italics(self, msg):
  232. return u'<em>%s</em>'%msg
  233. def color(self, color, msg):
  234. return u'<span style="color: %s">%s</span>'%(color, msg)
  235. def format_validation_errors(self, errs):
  236. lns=super(PrettyValidator, self).format_validation_errors(errs)
  237. buf=u''
  238. for l in lns.split('\n'):
  239. buf+=self.m(self.escape(l))
  240. return buf
  241. def done_cb(self):
  242. self.session['form_json']['validated']=True
  243. self.session['form_json']['jdict']=self.jdict
  244. self.session.save()
  245. class WebValidator(PrettyValidator):
  246. def pre_done_cb(self, jdict):
  247. # check name uniqueness
  248. where = (ISP.name == jdict['name'])
  249. if 'shortname' in jdict and jdict['shortname']:
  250. where |= (ISP.shortname == jdict.get('shortname'))
  251. if ISP.query.filter(where).count() > 0:
  252. ret = self.nl()
  253. ret += self.err('An ISP named "%s" already exist in our database'%self.escape(
  254. jdict['name']+(' ('+jdict['shortname']+')' if jdict.get('shortname') else '')
  255. ))
  256. ret += self.abort('The name of your ISP must be unique')
  257. return ret
  258. class TextValidator(Crawler):
  259. def abort(self, msg):
  260. res=u'FATAL ERROR: %s\n'%msg
  261. pad=u'='*(len(res)-1)+'\n'
  262. return self.m(pad+res+pad)