Browse Source

crawler: add support for conditional GET

Gu1 11 years ago
parent
commit
d3da980712
3 changed files with 60 additions and 26 deletions
  1. 53 24
      ffdnispdb/crawler.py
  2. 6 2
      ffdnispdb/cron_task.py
  3. 1 0
      ffdnispdb/models.py

+ 53 - 24
ffdnispdb/crawler.py

@@ -28,7 +28,10 @@ class Crawler(object):
 
     def __init__(self):
         self.success=False
+        self.modified=True
         self.jdict={}
+        self.cache_info=None
+        self.jdict_max_age=self.config('DEFAULT_CACHE_TIME')
 
     def m(self, msg, evt=None):
         if not evt:
@@ -91,15 +94,19 @@ class Crawler(object):
                 cachectl[cc[0]]=True
         return cachectl
 
-    def __call__(self, url):
+    def __call__(self, url, cache_info={}):
         esc=self.escape
         yield self.m('Starting the validation process...')
         r=None
         try:
             yield self.m('* Attempting to retreive %s'%self.bold(url))
+            headers={'User-Agent': 'FFDN DB validator'}
+            if cache_info.get('etag'):
+                headers['If-None-Match'] = cache_info['etag']
+            if cache_info.get('last-modified'):
+                headers['If-Modified-Since'] = cache_info['last-modified']
             r=requests.get(url, verify='/etc/ssl/certs/ca-certificates.crt',
-                           headers={'User-Agent': 'FFDN DB validator'},
-                           stream=True, timeout=10)
+                           headers=headers, stream=True, timeout=10)
         except requests.exceptions.SSLError as e:
             yield self.err('Unable to connect, SSL Error: '+self.color('#dd1144', esc(e)))
         except requests.exceptions.ConnectionError as e:
@@ -127,26 +134,6 @@ class Crawler(object):
             yield self.abort('Invalid response code')
             return
 
-        yield self.info('Content type: '+self.bold(esc(r.headers.get('content-type', 'not defined'))))
-        if not r.headers.get('content-type'):
-            yield self.error('Content-type '+self.bold('MUST')+' be defined')
-            yield self.abort('The file must have a proper content-type to continue')
-        elif r.headers.get('content-type').lower() != 'application/json':
-            yield self.warn('Content-type '+self.italics('SHOULD')+' be application/json')
-
-        encoding=get_encoding(r.headers.get('content-type'))
-        if not encoding:
-            yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
-
-        yield self.info('Content length: %s'%(self.bold(esc(r.headers.get('content-length', 'not set')))))
-
-        cl=r.headers.get('content-length')
-        if not cl:
-            yield self.warn('No content-length. Note that we will not process a file whose size exceed 1MiB')
-        elif int(cl) > self.MAX_JSON_SIZE:
-            yield self.abort('File too big ! File size must be less then 1MiB')
-
-
         _cachecontrol=r.headers.get('cache-control')
         cachecontrol=self.parse_cache_control(_cachecontrol) if _cachecontrol else None
         max_age=None
@@ -177,7 +164,7 @@ class Crawler(object):
             else:
                 yield self.warn('Invalid Expires header. Expiry date must be in the future.')
                 expires=None
-        else:
+        elif _expires and not expires:
             yield self.warn('Invalid Expires header %r'%esc(_expires))
 
         if not max_age and not expires:
@@ -194,6 +181,48 @@ class Crawler(object):
         yield self.info('Next update will be in %s'%(timedelta(seconds=self.jdict_max_age)))
 
 
+        etag=r.headers.get('etag')
+        last_modified=r.headers.get('last-modified')
+        if not etag and not last_modified:
+            yield self.warn('Please, provide at least an ETag or Last-Modified header for '
+                            'conditional requests')
+
+        self.cache_info={}
+        if etag:
+            self.cache_info['etag']=etag
+        if last_modified:
+            self.cache_info['last-modified']=last_modified
+
+        if cache_info and r.status_code == 304: # not modified
+            self.m('== '+self.color('forestgreen', 'Response not modified. All good !'))
+            self.modified=False
+            self.success=True
+            self.done_cb()
+            return
+
+
+        yield self.info('Content type: '+self.bold(esc(r.headers.get('content-type', 'not defined'))))
+        if not r.headers.get('content-type'):
+            yield self.err('Content-type '+self.bold('MUST')+' be defined')
+            yield self.abort('The file must have a proper content-type to continue')
+            return
+        elif r.headers.get('content-type').lower() != 'application/json':
+            yield self.warn('Content-type '+self.italics('SHOULD')+' be application/json')
+
+        encoding=get_encoding(r.headers.get('content-type'))
+        if not encoding:
+            yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
+
+        yield self.info('Content length: %s'%(self.bold(esc(r.headers.get('content-length', 'not set')))))
+
+        cl=r.headers.get('content-length')
+        if not cl:
+            yield self.warn('No content-length. Note that we will not process a file whose size exceeds 1MiB')
+        elif int(cl) > self.MAX_JSON_SIZE:
+            yield self.abort('File too big ! File size must be less than 1MiB')
+            return
+
+
         yield self.info('Reading response into memory...')
         b=io.BytesIO()
         for d in r.iter_content(requests.models.CONTENT_CHUNK_SIZE):

+ 6 - 2
ffdnispdb/cron_task.py

@@ -93,9 +93,11 @@ try:
             db.session.commit()
 
             validator=TextValidator()
-            log=''.join(validator(isp.json_url))
+            log=''.join(validator(isp.json_url, isp.cache_info or {}))
             if not validator.success: # handle error
                 isp.update_error_strike += 1
+                if isp.cache_info:
+                    isp.cache_info = validator.cache_info
                 isp.next_update = datetime.now()+timedelta(seconds=validator.jdict_max_age)
                 db.session.add(isp)
                 db.session.commit()
@@ -107,7 +109,9 @@ try:
                 print log.rstrip()+'\n'
                 continue
 
-            isp.json = validator.jdict
+            if validator.modified:
+                isp.json = validator.jdict
+            isp.cache_info = validator.cache_info
             isp.last_update_success = isp.last_update_attempt
             isp.update_error_strike = 0
             isp.next_update = datetime.now()+timedelta(seconds=validator.jdict_max_age)

+ 1 - 0
ffdnispdb/models.py

@@ -55,6 +55,7 @@ class ISP(db.Model):
     update_error_strike = db.Column(db.Integer, default=0) # if >= 3; then updates are disabled
     next_update = db.Column(db.DateTime, default=datetime.now())
     tech_email = db.Column(db.String)
+    cache_info = db.Column(MutableDict.as_mutable(JSONEncodedDict))
     json = db.Column(MutableDict.as_mutable(JSONEncodedDict))
 
     def __init__(self, *args, **kwargs):