crawler.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. from flask import escape, json
  2. import requests
  3. import io
  4. import cgi
  5. from ispformat.validator import validate_isp
  6. from .models import ISP
  7. def get_encoding(content_type):
  8. content_type, params = cgi.parse_header(content_type)
  9. if 'charset' in params:
  10. return params['charset'].strip("'\"")
  11. class Crawler(object):
  12. MAX_JSON_SIZE=1*1024*1024
  13. escape=staticmethod(lambda x: unicode(str(x), 'utf8'))
  14. def __init__(self):
  15. self.success=False
  16. self.jdict={}
  17. def m(self, msg, evt=None):
  18. if not evt:
  19. return u'%s\n'%msg
  20. else:
  21. return u''
  22. def err(self, msg, *args):
  23. return self.m(u'! %s'%msg, *args)
  24. def warn(self, msg):
  25. return self.m(u'@ %s'%msg)
  26. def info(self, msg):
  27. return self.m(u'\u2013 %s'%msg)
  28. def abort(self, msg):
  29. raise NotImplemented
  30. def color(self, color, msg):
  31. return msg
  32. def bold(self, msg):
  33. return msg
  34. def italics(self, msg):
  35. return msg
  36. def nl(self):
  37. return self.m('')
  38. def format_validation_errors(self, errs):
  39. r=[]
  40. for e in errs:
  41. r.append(u' %s: %s'%('.'.join(list(e.schema_path)[1:]), e.message))
  42. return u'\n'.join(r)
  43. def pre_done_cb(self, *args):
  44. pass
  45. def done_cb(self):
  46. pass
  47. def __call__(self, url):
  48. esc=self.escape
  49. yield self.m('Starting the validation process...')
  50. r=None
  51. try:
  52. yield self.m('* Attempting to retreive %s'%self.bold(url))
  53. r=requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
  54. headers={'User-Agent': 'FFDN DB validator'},
  55. stream=True, timeout=10)
  56. except requests.exceptions.SSLError as e:
  57. yield self.err('Unable to connect, SSL Error: '+self.color('#dd1144', esc(e)))
  58. except requests.exceptions.ConnectionError as e:
  59. yield self.err('Unable to connect: '+self.color('#dd1144', esc(e)))
  60. except requests.exceptions.Timeout as e:
  61. yield self.err('Connection timeout')
  62. except requests.exceptions.TooManyRedirects as e:
  63. yield self.err('Too many redirects')
  64. except requests.exceptions.RequestException as e:
  65. yield self.err('Internal request exception')
  66. # except Exception as e:
  67. # yield self.err('Unexpected request exception')
  68. if r is None:
  69. yield self.abort('Connection could not be established, aborting')
  70. return
  71. yield self.info('Connection established')
  72. yield self.info('Response code: '+self.bold(str(r.status_code)+' '+esc(r.reason)))
  73. try:
  74. r.raise_for_status()
  75. except requests.exceptions.HTTPError as e:
  76. yield self.err('Response code indicates an error')
  77. yield self.abort('Invalid response code')
  78. return
  79. yield self.info('Content type: '+self.bold(esc(r.headers.get('content-type', 'not defined'))))
  80. if not r.headers.get('content-type'):
  81. yield self.error('Content-type '+self.bold('MUST')+' be defined')
  82. yield self.abort('The file must have a proper content-type to continue')
  83. elif r.headers.get('content-type').lower() != 'application/json':
  84. yield self.warn('Content-type '+self.italics('SHOULD')+' be application/json')
  85. encoding=get_encoding(r.headers.get('content-type'))
  86. if not encoding:
  87. yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
  88. yield self.info('Content length: %s'%(self.bold(esc(r.headers.get('content-length', 'not set')))))
  89. cl=r.headers.get('content-length')
  90. if not cl:
  91. yield self.warn('No content-length. Note that we will not process a file whose size exceed 1MiB')
  92. elif int(cl) > self.MAX_JSON_SIZE:
  93. yield self.abort('File too big ! File size must be less then 1MiB')
  94. yield self.info('Reading response into memory...')
  95. b=io.BytesIO()
  96. for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
  97. b.write(d)
  98. if b.tell() > self.MAX_JSON_SIZE:
  99. yield self.abort('File too big ! File size must be less then 1MiB')
  100. return
  101. r._content=b.getvalue()
  102. del b
  103. yield self.info('Successfully read %d bytes'%len(r.content))
  104. yield self.nl()+self.m('* Parsing the JSON file')
  105. if not encoding:
  106. charset=requests.utils.guess_json_utf(r.content)
  107. if not charset:
  108. yield self.err('Unable to guess unicode charset')
  109. yield self.abort('The file MUST be unicode-encoded when no explicit charset is in the content-type')
  110. return
  111. yield self.info('Guessed charset: '+self.bold(charset))
  112. try:
  113. txt=r.content.decode(encoding or charset)
  114. yield self.info('Successfully decoded file as %s'%esc(encoding or charset))
  115. except LookupError as e:
  116. yield self.err('Invalid/unknown charset: %s'%esc(e))
  117. yield self.abort('Charset error, Cannot continue')
  118. return
  119. except UnicodeDecodeError as e:
  120. yield self.err('Unicode decode error: %s'%e)
  121. yield self.abort('Charset error, cannot continue')
  122. return
  123. except Exception:
  124. yield self.abort('Unexpected charset error')
  125. return
  126. jdict=None
  127. try:
  128. jdict=json.loads(txt)
  129. except ValueError as e:
  130. yield self.err('Error while parsing JSON: %s'%esc(e))
  131. except Exception as e:
  132. yield self.err('Unexpected error while parsing JSON: %s'%esc(e))
  133. if not jdict:
  134. yield self.abort('Could not parse JSON')
  135. return
  136. yield self.info('JSON parsed successfully')
  137. yield self.nl()+self.m('* Validating the JSON against the schema')
  138. v=list(validate_isp(jdict))
  139. if v:
  140. yield self.err('Validation errors:')+self.format_validation_errors(v)
  141. yield self.abort('Your JSON file does not follow the schema, please fix it')
  142. return
  143. else:
  144. yield self.info('Done. No errors encountered \o')
  145. ret=self.pre_done_cb(jdict)
  146. if ret:
  147. yield ret
  148. return
  149. yield (self.nl()+self.m('== '+self.color('forestgreen', 'All good ! You can click on Confirm now'))+
  150. self.m(json.dumps({'passed': 1}), 'control'))
  151. self.jdict=jdict
  152. self.success=True
  153. self.done_cb()
  154. class PrettyValidator(Crawler):
  155. def __init__(self, session=None, *args, **kwargs):
  156. super(PrettyValidator, self).__init__(*args, **kwargs)
  157. self.session=session
  158. self.escape=lambda x: escape(unicode(str(x), 'utf8'))
  159. def m(self, msg, evt=None):
  160. return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)
  161. def err(self, msg, *args):
  162. return self.m(u'<strong style="color: crimson">!</strong> %s'%msg, *args)
  163. def warn(self, msg):
  164. return self.m(u'<strong style="color: dodgerblue">@</strong> %s'%msg)
  165. def info(self, msg):
  166. return self.m(u'&ndash; %s'%msg)
  167. def abort(self, msg):
  168. return (self.m(u'<br />== <span style="color: crimson">%s</span>'%msg)+
  169. self.m(json.dumps({'closed': 1}), 'control'))
  170. def bold(self, msg):
  171. return u'<strong>%s</strong>'%msg
  172. def italics(self, msg):
  173. return u'<em>%s</em>'%msg
  174. def color(self, color, msg):
  175. return u'<span style="color: %s">%s</span>'%(color, msg)
  176. def format_validation_errors(self, errs):
  177. lns=super(PrettyValidator, self).format_validation_errors(errs)
  178. buf=u''
  179. for l in lns.split('\n'):
  180. buf+=self.m(self.escape(l))
  181. return buf
  182. def done_cb(self):
  183. self.session['form_json']['validated']=True
  184. self.session['form_json']['jdict']=self.jdict
  185. self.session.save()
  186. class WebValidator(PrettyValidator):
  187. def pre_done_cb(self, jdict):
  188. # check name uniqueness
  189. where = (ISP.name == jdict['name'])
  190. if 'shortname' in jdict and jdict['shortname']:
  191. where |= (ISP.shortname == jdict.get('shortname'))
  192. if ISP.query.filter(where).count() > 0:
  193. ret = self.nl()
  194. ret += self.err('An ISP named "%s" already exist in our database'%self.escape(
  195. jdict['name']+(' ('+jdict['shortname']+')' if jdict.get('shortname') else '')
  196. ))
  197. ret += self.abort('The name of your ISP must be unique')
  198. return ret
  199. class TextValidator(Crawler):
  200. def abort(self, msg):
  201. res=u'FATAL ERROR: %s\n'%msg
  202. pad=u'='*(len(res)-1)+'\n'
  203. return self.m(pad+res+pad)