Add HTTP Cache support to the crawler

Gu1 · 11 years ago · commit 73bc2f07db
3 changed files with 77 additions and 5 deletions:

  1. config.py (+3 -0)
  2. ffdnispdb/crawler.py (+73 -4)
  3. ffdnispdb/cron_task.py (+1 -1)

config.py · +3 -0

@@ -4,3 +4,6 @@ SQLALCHEMY_DATABASE_URI = 'sqlite:///../ffdn-db.sqlite'
 #PASSWD_SALT = 'change this value to some random chars!'
 SECRET_KEY = '{J@uRKO,xO-PK7B,jF?>iHbxLasF9s#zjOoy=+:'
 DEBUG = True
+CRAWLER_MIN_CACHE_TIME=60*60 # 1 hour
+CRAWLER_MAX_CACHE_TIME=60*60*24*14 # 2 weeks
+CRAWLER_DEFAULT_CACHE_TIME=60*60*12 # 12 hours

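The three settings above bound how long a crawled ISP JSON file is considered fresh before the cron task refetches it. A rough sketch of how they interact (the helper name effective_cache_time is made up for illustration; the real logic lives in crawler.py below):

    # Clamp a server-provided lifetime (in seconds) into [MIN, MAX];
    # fall back to the default when the server gave no usable value.
    MIN_CACHE_TIME = 60*60            # 1 hour
    MAX_CACHE_TIME = 60*60*24*14      # 2 weeks
    DEFAULT_CACHE_TIME = 60*60*12     # 12 hours

    def effective_cache_time(server_value):
        if not server_value:
            server_value = DEFAULT_CACHE_TIME
        return min(MAX_CACHE_TIME, max(MIN_CACHE_TIME, server_value))

    assert effective_cache_time(None) == 60*60*12       # default applies
    assert effective_cache_time(60) == 60*60            # raised to the floor
    assert effective_cache_time(10**9) == 60*60*24*14   # capped at the ceiling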
ffdnispdb/crawler.py · +73 -4

@@ -1,9 +1,12 @@
 
 
-from flask import escape, json
-import requests
 import io
 import cgi
+import pytz
+from datetime import datetime, timedelta
+from werkzeug.http import parse_date
+from flask import escape, json
+import requests
 
 from ispformat.validator import validate_isp
 from .models import ISP
@@ -20,7 +23,7 @@ class Crawler(object):
 
     MAX_JSON_SIZE=1*1024*1024
 
-    escape=staticmethod(lambda x: unicode(str(x), 'utf8'))
+    escape=staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x)
 
     def __init__(self):
         self.success=False
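The change to escape avoids a crash on values that are already unicode: under Python 2, str() implicitly encodes with the ASCII codec, so a unicode string containing non-ASCII characters would raise before it could be re-decoded. An illustration, with made-up values:

    u = u'r\xe9seau'             # already unicode, contains 'é'
    str(u)                       # raises UnicodeEncodeError under Python 2
    unicode(str('abc'), 'utf8')  # fine for plain byte strings
    # hence: decode only when the input is not already unicode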
@@ -69,6 +72,24 @@ class Crawler(object):
     def done_cb(self):
         pass
 
+    def config(self, name):
+        return app.config.get('CRAWLER_'+name)
+
+    def parse_cache_control(self, _cachectl):
+        cachectl={}
+        for cc in _cachectl.split(','):
+            cc=cc.strip()
+            if not cc:
+                continue
+            cc=cc.split('=')
+            if cc[0] not in ('max-age', 's-maxage'):
+                continue
+            try:
+                cachectl[cc[0]]=cc[1]
+            except IndexError:
+                cachectl[cc[0]]=True
+        return cachectl
+
     def __call__(self, url):
         esc=self.escape
         yield self.m('Starting the validation process...')
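parse_cache_control keeps only the max-age and s-maxage directives and leaves their values as strings (the int() conversion and error reporting happen at the call site); a directive present without a value is recorded as True. For example:

    crawler.parse_cache_control('no-cache, max-age=3600, s-maxage=7200')
    # -> {'max-age': '3600', 's-maxage': '7200'}
    crawler.parse_cache_control('max-age')   # value missing
    # -> {'max-age': True}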
@@ -124,6 +145,54 @@ class Crawler(object):
         elif int(cl) > self.MAX_JSON_SIZE:
             yield self.abort('File too big! File size must be less than 1MiB')
 
+
+        _cachecontrol=r.headers.get('cache-control')
+        cachecontrol=self.parse_cache_control(_cachecontrol) if _cachecontrol else None
+        max_age=None
+        if cachecontrol:
+            try:
+                _maxage=cachecontrol.get('max-age')
+                _maxage=cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence
+                max_age=int(_maxage)
+            except ValueError:
+                yield self.warn('Invalid max-age '+esc(_maxage))
+
+            yield self.info('Cache control: '+self.bold(esc(
+                ', '.join([k+'='+v if type(v) != bool else k for k, v in cachecontrol.iteritems()]))
+            ))
+
+        _expires=r.headers.get('expires')
+        expires=parse_date(_expires)
+        if expires:
+            _now=r.headers.get('date')
+            if _now: # use server date when possible
+                now=parse_date(_now)
+            else:
+                now=datetime.utcnow()
+
+            if expires > now:
+                expires=(expires-now).total_seconds()
+                yield self.info('Expires: '+self.bold(esc(_expires)))
+            else:
+                yield self.warn('Invalid Expires header. Expiry date must be in the future.')
+                expires=None
+        elif _expires:
+            yield self.warn('Invalid Expires header %r'%esc(_expires))
+
+        if not max_age and not expires:
+            yield self.warn('No valid expiration time provided! Please provide it either '
+                             'with a Cache-Control or Expires header.')
+            max_age=self.config('DEFAULT_CACHE_TIME')
+            yield self.info('Using default expiration time of %d seconds'%(max_age))
+
+        self.jdict_max_age = max_age if max_age else expires
+        self.jdict_max_age = min(
+            self.config('MAX_CACHE_TIME'),
+            max(self.config('MIN_CACHE_TIME'), self.jdict_max_age)
+        )
+        yield self.info('Next update will be in %s'%(timedelta(seconds=self.jdict_max_age)))
+
+
         yield self.info('Reading response into memory...')
         b=io.BytesIO()
         for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
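The resulting lifetime resolution: s-maxage wins over max-age, which wins over Expires; an Expires date is converted to a relative lifetime against the server's own Date header when available (avoiding clock skew between server and crawler), and the final value is clamped between CRAWLER_MIN_CACHE_TIME and CRAWLER_MAX_CACHE_TIME. A condensed sketch of the Expires conversion, with made-up header values:

    from datetime import datetime
    from werkzeug.http import parse_date

    expires = parse_date('Sat, 02 Nov 2013 12:00:00 GMT')
    # prefer the server's clock; fall back to ours
    now = parse_date('Sat, 02 Nov 2013 00:00:00 GMT') or datetime.utcnow()
    lifetime = (expires - now).total_seconds()   # 43200.0 seconds, i.e. 12 hours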
@@ -203,7 +272,7 @@ class PrettyValidator(Crawler):
     def __init__(self, session=None, *args, **kwargs):
         super(PrettyValidator, self).__init__(*args, **kwargs)
         self.session=session
-        self.escape=lambda x: escape(unicode(str(x), 'utf8'))
+        self.escape=lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x)
 
     def m(self, msg, evt=None):
         return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)

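The m() helper visible as context in the last hunk frames every message as a Server-Sent Events chunk, which is how PrettyValidator streams progress to the browser. Its output looks like this (the event name here is illustrative):

    m('Validating JSON...')
    # -> u'data: Validating JSON...\n\n'
    m('finished', evt='done')
    # -> u'event: done\ndata: finished\n\n'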
ffdnispdb/cron_task.py · +1 -1

@@ -93,7 +93,7 @@ try:
             db.session.commit()
 
             validator=TextValidator()
-            log=''.join(validator(isp.json_url+'ab'))
+            log=''.join(validator(isp.json_url))
             if not validator.success: # handle error
                 isp.update_error_strike += 1
                 #isp.next_update = bla