Add HTTP Cache support to the crawler

Gu1 · 11 years ago · commit 73bc2f07db
3 changed files with 77 additions and 5 deletions:

  1. config.py (+3 -0)
  2. ffdnispdb/crawler.py (+73 -4)
  3. ffdnispdb/cron_task.py (+1 -1)

config.py · +3 -0

@@ -4,3 +4,6 @@ SQLALCHEMY_DATABASE_URI = 'sqlite:///../ffdn-db.sqlite'
 #PASSWD_SALT = 'change this value to some random chars!'
 SECRET_KEY = '{J@uRKO,xO-PK7B,jF?>iHbxLasF9s#zjOoy=+:'
 DEBUG = True
+CRAWLER_MIN_CACHE_TIME=60*60 # 1 hour
+CRAWLER_MAX_CACHE_TIME=60*60*24*14 # 2 weeks
+CRAWLER_DEFAULT_CACHE_TIME=60*60*12 # 12 hours

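The three settings above bound how long a crawled ISP JSON file is considered fresh before the cron task refetches it. A rough sketch of how they interact (the helper name effective_cache_time is made up for illustration; the real logic lives in crawler.py below):

    # Clamp a server-provided lifetime (in seconds) into [MIN, MAX];
    # fall back to the default when the server gave no usable value.
    MIN_CACHE_TIME = 60*60            # 1 hour
    MAX_CACHE_TIME = 60*60*24*14      # 2 weeks
    DEFAULT_CACHE_TIME = 60*60*12     # 12 hours

    def effective_cache_time(server_value):
        if not server_value:
            server_value = DEFAULT_CACHE_TIME
        return min(MAX_CACHE_TIME, max(MIN_CACHE_TIME, server_value))

    assert effective_cache_time(None) == 60*60*12       # default applies
    assert effective_cache_time(60) == 60*60            # raised to the floor
    assert effective_cache_time(10**9) == 60*60*24*14   # capped at the ceiling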
ffdnispdb/crawler.py · +73 -4

@@ -1,9 +1,12 @@
 
 
-from flask import escape, json
-import requests
 import io
 import cgi
+import pytz
+from datetime import datetime, timedelta
+from werkzeug.http import parse_date
+from flask import escape, json
+import requests
 
 from ispformat.validator import validate_isp
 from .models import ISP
@@ -20,7 +23,7 @@ class Crawler(object):
 
     MAX_JSON_SIZE=1*1024*1024
 
-    escape=staticmethod(lambda x: unicode(str(x), 'utf8'))
+    escape=staticmethod(lambda x: unicode(str(x), 'utf8') if type(x) != unicode else x)
 
     def __init__(self):
         self.success=False
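The change to escape avoids a crash on values that are already unicode: under Python 2, str() implicitly encodes with the ASCII codec, so a unicode string containing non-ASCII characters would raise before it could be re-decoded. An illustration, with made-up values:

    u = u'r\xe9seau'             # already unicode, contains 'é'
    str(u)                       # raises UnicodeEncodeError under Python 2
    unicode(str('abc'), 'utf8')  # fine for plain byte strings
    # hence: decode only when the input is not already unicode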
@@ -69,6 +72,24 @@ class Crawler(object):
     def done_cb(self):
         pass
 
+    def config(self, name):
+        return app.config.get('CRAWLER_'+name)
+
+    def parse_cache_control(self, _cachectl):
+        cachectl={}
+        for cc in _cachectl.split(','):
+            cc=cc.strip()
+            if not cc:
+                continue
+            cc=cc.split('=')
+            if cc[0] not in ('max-age', 's-maxage'):
+                continue
+            try:
+                cachectl[cc[0]]=cc[1]
+            except IndexError:
+                cachectl[cc[0]]=True
+        return cachectl
+
     def __call__(self, url):
         esc=self.escape
         yield self.m('Starting the validation process...')
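parse_cache_control keeps only the max-age and s-maxage directives and leaves their values as strings (the int() conversion and error reporting happen at the call site); a directive present without a value is recorded as True. For example:

    crawler.parse_cache_control('no-cache, max-age=3600, s-maxage=7200')
    # -> {'max-age': '3600', 's-maxage': '7200'}
    crawler.parse_cache_control('max-age')   # value missing
    # -> {'max-age': True}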
@@ -124,6 +145,54 @@ class Crawler(object):
         elif int(cl) > self.MAX_JSON_SIZE:
             yield self.abort('File too big! File size must be less than 1MiB')
 
+
+        _cachecontrol=r.headers.get('cache-control')
+        cachecontrol=self.parse_cache_control(_cachecontrol) if _cachecontrol else None
+        max_age=None
+        if cachecontrol:
+            try:
+                _maxage=cachecontrol.get('max-age')
+                _maxage=cachecontrol.get('s-maxage', _maxage) # s-maxage takes precedence
+                max_age=int(_maxage)
+            except ValueError:
+                yield self.warn('Invalid max-age '+esc(_maxage))
+
+            yield self.info('Cache control: '+self.bold(esc(
+                ', '.join([k+'='+v if type(v) != bool else k for k, v in cachecontrol.iteritems()]))
+            ))
+
+        _expires=r.headers.get('expires')
+        expires=parse_date(_expires)
+        if expires:
+            _now=r.headers.get('date')
+            if _now: # use server date when possible
+                now=parse_date(_now)
+            else:
+                now=datetime.utcnow()
+
+            if expires > now:
+                expires=(expires-now).total_seconds()
+                yield self.info('Expires: '+self.bold(esc(_expires)))
+            else:
+                yield self.warn('Invalid Expires header. Expiry date must be in the future.')
+                expires=None
+        elif _expires:
+            yield self.warn('Invalid Expires header %r'%esc(_expires))
+
+        if not max_age and not expires:
+            yield self.warn('No valid expiration time provided! Please provide it either '
+                             'with a Cache-Control or Expires header.')
+            max_age=self.config('DEFAULT_CACHE_TIME')
+            yield self.info('Using default expiration time of %d seconds'%(max_age))
+
+        self.jdict_max_age = max_age if max_age else expires
+        self.jdict_max_age = min(
+            self.config('MAX_CACHE_TIME'),
+            max(self.config('MIN_CACHE_TIME'), self.jdict_max_age)
+        )
+        yield self.info('Next update will be in %s'%(timedelta(seconds=self.jdict_max_age)))
+
+
         yield self.info('Reading response into memory...')
         b=io.BytesIO()
         for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):
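The resulting lifetime resolution: s-maxage wins over max-age, which wins over Expires; an Expires date is converted to a relative lifetime against the server's own Date header when available (avoiding clock skew between server and crawler), and the final value is clamped between CRAWLER_MIN_CACHE_TIME and CRAWLER_MAX_CACHE_TIME. A condensed sketch of the Expires conversion, with made-up header values:

    from datetime import datetime
    from werkzeug.http import parse_date

    expires = parse_date('Sat, 02 Nov 2013 12:00:00 GMT')
    # prefer the server's clock; fall back to ours
    now = parse_date('Sat, 02 Nov 2013 00:00:00 GMT') or datetime.utcnow()
    lifetime = (expires - now).total_seconds()   # 43200.0 seconds, i.e. 12 hours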
@@ -203,7 +272,7 @@ class PrettyValidator(Crawler):
     def __init__(self, session=None, *args, **kwargs):
         super(PrettyValidator, self).__init__(*args, **kwargs)
         self.session=session
-        self.escape=lambda x: escape(unicode(str(x), 'utf8'))
+        self.escape=lambda x: escape(unicode(str(x), 'utf8') if type(x) != unicode else x)
 
     def m(self, msg, evt=None):
         return u'%sdata: %s\n\n'%(u'event: %s\n'%evt if evt else '', msg)

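The m() helper visible as context in the last hunk frames every message as a Server-Sent Events chunk, which is how PrettyValidator streams progress to the browser. Its output looks like this (the event name here is illustrative):

    m('Validating JSON...')
    # -> u'data: Validating JSON...\n\n'
    m('finished', evt='done')
    # -> u'event: done\ndata: finished\n\n'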
ffdnispdb/cron_task.py · +1 -1

@@ -93,7 +93,7 @@ try:
             db.session.commit()
 
             validator=TextValidator()
-            log=''.join(validator(isp.json_url+'ab'))
+            log=''.join(validator(isp.json_url))
             if not validator.success: # handle error
                 isp.update_error_strike += 1
                 #isp.next_update = bla