crawler.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. from flask import escape, json
  2. import requests
  3. import io
  4. import cgi
  5. from ispformat.validator import validate_isp
  6. from .models import ISP
  7. def get_encoding(content_type):
  8. content_type, params = cgi.parse_header(content_type)
  9. if 'charset' in params:
  10. return params['charset'].strip("'\"")
  11. class Crawler(object):
  12. MAX_JSON_SIZE=1*1024*1024
  13. format_validation_errors=unicode
  14. escape=lambda x:x
  15. def m(self, msg, evt=None):
  16. return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)
  17. def err(self, msg, *args):
  18. return self.m(u'! %s'%msg, *args)
  19. def warn(self, msg):
  20. return self.m(u'@ %s'%msg)
  21. def info(self, msg):
  22. return self.m(u'\u2013 %s'%msg)
  23. def abort(self, msg):
  24. return (self.m('<br />== <span style="color: crimson">%s</span>'%msg)+
  25. self.m(json.dumps({'closed': 1}), 'control'))
  26. def done_cb(self):
  27. pass
  28. def __call__(self, url):
  29. esc=self.escape
  30. yield self.m('Starting the validation process...')
  31. r=None
  32. try:
  33. yield self.m('* Attempting to retreive <strong>%s</strong>'%url)
  34. r=requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
  35. headers={'User-Agent': 'FFDN DB validator'},
  36. stream=True, timeout=10)
  37. except requests.exceptions.SSLError as e:
  38. yield self.err('Unable to connect, SSL Error: <code style="color: #dd1144;">%s</code>'%esc(e))
  39. except requests.exceptions.ConnectionError as e:
  40. yield self.err('Unable to connect: <code style="color: #dd1144;">%s</code>'%e)
  41. except requests.exceptions.Timeout as e:
  42. yield self.err('Connection timeout')
  43. except requests.exceptions.TooManyRedirects as e:
  44. yield self.err('Too many redirects')
  45. except requests.exceptions.RequestException as e:
  46. yield self.err('Internal request exception')
  47. except Exception as e:
  48. yield self.err('Unexpected request exception')
  49. if r is None:
  50. yield self.abort('Connection could not be established, aborting')
  51. return
  52. yield self.info('Connection established')
  53. yield self.info('Response code: <strong>%s %s</strong>'%(esc(r.status_code), esc(r.reason)))
  54. try:
  55. r.raise_for_status()
  56. except requests.exceptions.HTTPError as e:
  57. yield self.err('Response code indicates an error')
  58. yield self.abort('Invalid response code')
  59. return
  60. yield self.info('Content type: <strong>%s</strong>'%(esc(r.headers.get('content-type', 'not defined'))))
  61. if not r.headers.get('content-type'):
  62. yield self.error('Content-type <strong>MUST</strong> be defined')
  63. yield self.abort('The file must have a proper content-type to continue')
  64. elif r.headers.get('content-type').lower() != 'application/json':
  65. yield self.warn('Content-type <em>SHOULD</em> be application/json')
  66. encoding=get_encoding(r.headers.get('content-type'))
  67. if not encoding:
  68. yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
  69. yield self.info('Content length: <strong>%s</strong>'%(esc(r.headers.get('content-length', 'not set'))))
  70. cl=r.headers.get('content-length')
  71. if not cl:
  72. yield self.warn('No content-length. Note that we will not process a file whose size exceed 1MiB')
  73. elif int(cl) > self.MAX_JSON_SIZE:
  74. yield self.abort('File too big ! File size must be less then 1MiB')
  75. yield self.info('Reading response into memory...')
  76. b=io.BytesIO()
  77. for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
  78. b.write(d)
  79. if b.tell() > self.MAX_JSON_SIZE:
  80. yield self.abort('File too big ! File size must be less then 1MiB')
  81. return
  82. r._content=b.getvalue()
  83. del b
  84. yield self.info('Successfully read %d bytes'%len(r.content))
  85. yield self.m('<br>* Parsing the JSON file')
  86. if not encoding:
  87. charset=requests.utils.guess_json_utf(r.content)
  88. if not charset:
  89. yield self.err('Unable to guess unicode charset')
  90. yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type')
  91. return
  92. yield self.info('Guessed charset: <strong>%s</strong>'%charset)
  93. try:
  94. txt=r.content.decode(encoding or charset)
  95. yield self.info('Successfully decoded file as %s'%esc(encoding or charset))
  96. except LookupError as e:
  97. yield self.err('Invalid/unknown charset: %s'%esc(e))
  98. yield self.abort('Charset error, Cannot continue')
  99. return
  100. except UnicodeDecodeError as e:
  101. yield self.err('Unicode decode error: %s'%e)
  102. yield self.abort('Charset error, cannot continue')
  103. return
  104. except Exception:
  105. yield self.abort('Unexpected charset error')
  106. return
  107. jdict=None
  108. try:
  109. jdict=json.loads(txt)
  110. except ValueError as e:
  111. yield self.err('Error while parsing JSON: %s'%esc(e))
  112. except Exception as e:
  113. yield self.err('Unexpected error while parsing JSON: %s'%esc(e))
  114. if not jdict:
  115. yield self.abort('Could not parse JSON')
  116. return
  117. yield self.info('JSON parsed successfully')
  118. yield self.m('<br />* Validating the JSON against the schema')
  119. v=list(validate_isp(jdict))
  120. if v:
  121. yield self.err('Validation errors:<br />%s'%esc(self.format_validation_errors(v)))
  122. yield self.abort('Your JSON file does not follow the schema, please fix it')
  123. return
  124. else:
  125. yield self.info('Done. No errors encountered \o')
  126. # check name uniqueness
  127. where = (ISP.name == jdict['name'])
  128. if 'shortname' in jdict and jdict['shortname']:
  129. where |= (ISP.shortname == jdict.get('shortname'))
  130. if ISP.query.filter(where).count() > 0:
  131. yield self.err('An ISP named "%s" already exist'%esc(
  132. jdict['name']+(' ('+jdict['shortname']+')' if jdict.get('shortname') else '')
  133. ))
  134. yield self.abort('The name of your ISP must be unique')
  135. return
  136. yield (self.m('<br />== <span style="color: forestgreen">All good ! You can click on Confirm now</span>')+
  137. self.m(json.dumps({'passed': 1}), 'control'))
  138. self.jdict=jdict
  139. self.done_cb()
  140. class PrettyValidator(Crawler):
  141. def __init__(self, session=None, *args, **kwargs):
  142. super(PrettyValidator, self).__init__(*args, **kwargs)
  143. self.session=session
  144. self.escape=escape
  145. def err(self, msg, *args):
  146. return self.m(u'<strong style="color: crimson">!</strong> %s'%msg, *args)
  147. def warn(self, msg):
  148. return self.m(u'<strong style="color: dodgerblue">@</strong> %s'%msg)
  149. def info(self, msg):
  150. return self.m(u'&ndash; %s'%msg)
  151. def abort(self, msg):
  152. return (self.m(u'<br />== <span style="color: crimson">%s</span>'%msg)+
  153. self.m(json.dumps({'closed': 1}), 'control'))
  154. def format_validation_errors(self, errs):
  155. r=[]
  156. for e in errs:
  157. r.append(u' %s: %s'%('.'.join(list(e.schema_path)[1:]), str(e)))
  158. return '\n'.join(r)
  159. def done_cb(self):
  160. self.session['form_json']['validated']=True
  161. self.session['form_json']['jdict']=self.jdict
  162. self.session.save()