Browse Source

Fix charset detection when content-type is text/plain

Gu1 11 years ago
parent
commit
896c39a861
1 changed file with 13 additions and 4 deletions
  1. 13 4
      ffdnispdb/crawler.py

+ 13 - 4
ffdnispdb/crawler.py

@@ -3,11 +3,19 @@
 from flask import escape, json
 import requests
 import io
+import cgi
 
 from ispformat.validator import validate_isp
 from .models import ISP
 
 
+def get_encoding(content_type):
+    content_type, params = cgi.parse_header(content_type)
+
+    if 'charset' in params:
+        return params['charset'].strip("'\"")
+
+
 class Crawler(object):
 
     MAX_JSON_SIZE=1*1024*1024
@@ -77,7 +85,8 @@ class Crawler(object):
         elif r.headers.get('content-type').lower() != 'application/json':
             yield self.warn('Content-type <em>SHOULD</em> be application/json')
 
-        if not r.encoding:
+        encoding=get_encoding(r.headers.get('content-type'))
+        if not encoding:
             yield self.warn('Encoding not set. Assuming it\'s unicode, as per RFC4627 section 3')
 
         yield self.info('Content length: <strong>%s</strong>'%(esc(r.headers.get('content-length', 'not set'))))
@@ -100,7 +109,7 @@ class Crawler(object):
         yield self.info('Successfully read %d bytes'%len(r.content))
 
         yield self.m('<br>* Parsing the JSON file')
-        if not r.encoding:
+        if not encoding:
             charset=requests.utils.guess_json_utf(r.content)
             if not charset:
                 yield self.err('Unable to guess unicode charset')
@@ -110,8 +119,8 @@ class Crawler(object):
             yield self.info('Guessed charset: <strong>%s</strong>'%charset)
 
         try:
-            txt=r.content.decode(r.encoding or charset)
-            yield self.info('Successfully decoded file as %s'%esc(r.encoding or charset))
+            txt=r.content.decode(encoding or charset)
+            yield self.info('Successfully decoded file as %s'%esc(encoding or charset))
         except LookupError as e:
             yield self.err('Invalid/unknown charset: %s'%esc(e))
             yield self.abort('Charset error, Cannot continue')