@@ -0,0 +1,262 @@
+'''
+Created on 12 Jan 2012
+
+@author: rwilkinson
+'''
+import base64
+import socket
+import urlparse
+from urllib import unquote, addinfourl
+from urllib2 import _parse_proxy, URLError, HTTPError
+from urllib2 import (AbstractHTTPHandler as _AbstractHTTPHandler,
+                     BaseHandler as _BaseHandler,
+                     HTTPRedirectHandler as _HTTPRedirectHandler,
+                     Request as _Request,
+                     OpenerDirector as _OpenerDirector)
+
+from ndg.httpsclient.httplib_proxy import HTTPConnection
+
+
+class Request(_Request):
+
+    def __init__(self, *args, **kw):
+        _Request.__init__(self, *args, **kw)
+        self._tunnel_host = None
+
+    def set_proxy(self, host, type):
+        if self.type == 'https' and not self._tunnel_host:
+            self._tunnel_host = self.host
+        else:
+            self.type = type
+            self.__r_host = self.__original
+        self.host = host
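+
+# A minimal usage sketch (host names are placeholders; get_type()/get_host()
+# are normally called by the opener before set_proxy() runs):
+#
+#     r = Request('https://data.example.org/resource')
+#     r.get_type(); r.get_host()     # parse scheme and host
+#     r.set_proxy('proxy.example.org:8080', 'http')
+#     assert r._tunnel_host == 'data.example.org'
+#     assert r.get_host() == 'proxy.example.org:8080'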
+
+
+class BaseHandler(_BaseHandler):
+    def proxy_open(self, req, proxy, type):
+        if req.get_type() == 'https':
+            orig_type = req.get_type()
+            proxy_type, user, password, hostport = _parse_proxy(proxy)
+            if proxy_type is None:
+                proxy_type = orig_type
+            if user and password:
+                user_pass = '%s:%s' % (unquote(user), unquote(password))
+                creds = base64.b64encode(user_pass).strip()
+                req.add_header('Proxy-authorization', 'Basic ' + creds)
+            hostport = unquote(hostport)
+            req.set_proxy(hostport, proxy_type)
+            # let other handlers take care of it
+            return None
+        else:
+            return _BaseHandler.proxy_open(self, req, proxy, type)
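+
+# A minimal sketch of the effect of proxy_open() on an https request
+# (proxy URL and credentials are placeholders):
+#
+#     handler = BaseHandler()
+#     r = Request('https://data.example.org/resource')
+#     r.get_type(); r.get_host()     # normally done by the opener's pre-processing
+#     handler.proxy_open(r, 'http://user:secret@proxy.example.org:8080', 'http')
+#     # r now carries a 'Proxy-authorization: Basic ...' header, its host is
+#     # the proxy ('proxy.example.org:8080') and r._tunnel_host is
+#     # 'data.example.org'; the HTTP handler later passes that host to
+#     # set_tunnel() so the connection goes through a CONNECT tunnel.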
+
+class AbstractHTTPHandler(_AbstractHTTPHandler):
+    def do_open(self, http_class, req):
+        """Return an addinfourl object for the request, using http_class.
+
+        http_class must implement the HTTPConnection API from httplib.
+        The addinfourl return value is a file-like object.  It also
+        has methods and attributes including:
+            - info(): return a mimetools.Message object for the headers
+            - geturl(): return the original request URL
+            - code: HTTP status code
+        """
+        host = req.get_host()
+        if not host:
+            raise URLError('no host given')
+
+        h = http_class(host, timeout=req.timeout)  # will parse host:port
+        h.set_debuglevel(self._debuglevel)
+
+        headers = dict(req.headers)
+        headers.update(req.unredirected_hdrs)
+        # We want to make an HTTP/1.1 request, but the addinfourl
+        # class isn't prepared to deal with a persistent connection.
+        # It will try to read all remaining data from the socket,
+        # which will block while the server waits for the next request.
+        # So make sure the connection gets closed after the (only)
+        # request.
+        headers["Connection"] = "close"
+        headers = dict(
+            (name.title(), val) for name, val in headers.items())
+
+        # Set up a CONNECT tunnel if the request recorded a tunnel host
+        # (requests without the attribute are left untouched).
+        if getattr(req, '_tunnel_host', None):
+            h.set_tunnel(req._tunnel_host)
+        try:
+            h.request(req.get_method(), req.get_selector(), req.data, headers)
+            r = h.getresponse()
+        except socket.error, err:  # XXX what error?
+            raise URLError(err)
+
+        # Pick apart the HTTPResponse object to get the addinfourl
+        # object initialized properly.
+
+        # Wrap the HTTPResponse object in socket's file object adapter
+        # for Windows.  That adapter calls recv(), so delegate recv()
+        # to read().  This weird wrapping allows the returned object to
+        # have readline() and readlines() methods.
+
+        # XXX It might be better to extract the read buffering code
+        # out of socket._fileobject() and into a base class.
+
+        r.recv = r.read
+        fp = socket._fileobject(r, close=True)
+
+        resp = addinfourl(fp, r.msg, req.get_full_url())
+        resp.code = r.status
+        resp.msg = r.reason
+        return resp
+
+
+class HTTPHandler(AbstractHTTPHandler):
+
+    def http_open(self, req):
+        return self.do_open(HTTPConnection, req)
+
+    http_request = AbstractHTTPHandler.do_request_
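+
+# A rough sketch of driving HTTPHandler directly (hypothetical URL; assumes
+# ndg.httpsclient.httplib_proxy.HTTPConnection follows the httplib
+# HTTPConnection API, as do_open() requires):
+#
+#     handler = HTTPHandler()
+#     req = Request('http://www.example.com/')
+#     req.get_type()                 # parse the scheme; the opener does this
+#     req.timeout = 10
+#     resp = handler.http_open(req)
+#     print resp.code, resp.msg      # e.g. 200 OK
+#     print resp.info().gettype()    # content type from the response headers
+#     body = resp.read()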
+
+#if hasattr(httplib, 'HTTPS'):
+#    class HTTPSHandler(AbstractHTTPHandler):
+#
+#        def https_open(self, req):
+#            return self.do_open(httplib.HTTPSConnection, req)
+#
+#        https_request = AbstractHTTPHandler.do_request_
+
+
+class HTTPRedirectHandler(BaseHandler):
+    # maximum number of redirections to any single URL
+    # this is needed because of the state that cookies introduce
+    max_repeats = 4
+    # maximum total number of redirections (regardless of URL) before
+    # assuming we're in a loop
+    max_redirections = 10
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        """Return a Request or None in response to a redirect.
+
+        This is called by the http_error_30x methods when a
+        redirection response is received.  If a redirection should
+        take place, return a new Request to allow http_error_30x to
+        perform the redirect.  Otherwise, raise HTTPError if no-one
+        else should try to handle this url.  Return None if you can't
+        but another Handler might.
+        """
+        m = req.get_method()
+        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
+            or code in (301, 302, 303) and m == "POST"):
+            # Strictly (according to RFC 2616), 301 or 302 in response
+            # to a POST MUST NOT cause a redirection without confirmation
+            # from the user (of urllib2, in this case).  In practice,
+            # essentially all clients do redirect in this case, so we
+            # do the same.
+            # be conciliant with URIs containing a space
+            newurl = newurl.replace(' ', '%20')
+            newheaders = dict((k, v) for k, v in req.headers.items()
+                              if k.lower() not in ("content-length", "content-type")
+                              )
+            return Request(newurl,
+                           headers=newheaders,
+                           origin_req_host=req.get_origin_req_host(),
+                           unverifiable=True)
+        else:
+            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+    # Implementation note: To avoid the server sending us into an
+    # infinite loop, the request object needs to track what URLs we
+    # have already seen.  Do this by adding a handler-specific
+    # attribute to the Request object.
+    def http_error_302(self, req, fp, code, msg, headers):
+        # Some servers (incorrectly) return multiple Location headers
+        # (so probably same goes for URI).  Use first header.
+        if 'location' in headers:
+            newurl = headers.getheaders('location')[0]
+        elif 'uri' in headers:
+            newurl = headers.getheaders('uri')[0]
+        else:
+            return
+
+        # fix a possible malformed URL
+        urlparts = urlparse.urlparse(newurl)
+        if not urlparts.path:
+            urlparts = list(urlparts)
+            urlparts[2] = "/"
+            newurl = urlparse.urlunparse(urlparts)
+
+        newurl = urlparse.urljoin(req.get_full_url(), newurl)
+
+        # For security reasons we do not allow redirects to protocols
+        # other than HTTP, HTTPS or FTP.
+        newurl_lower = newurl.lower()
+        if not (newurl_lower.startswith('http://') or
+                newurl_lower.startswith('https://') or
+                newurl_lower.startswith('ftp://')):
+            raise HTTPError(newurl, code,
+                            msg + " - Redirection to url '%s' is not allowed" %
+                            newurl,
+                            headers, fp)
+
+        # XXX Probably want to forget about the state of the current
+        # request, although that might interact poorly with other
+        # handlers that also use handler-specific request attributes
+        new = self.redirect_request(req, fp, code, msg, headers, newurl)
+        if new is None:
+            return
+
+        # loop detection
+        # .redirect_dict has a key url if url was previously visited.
+        if hasattr(req, 'redirect_dict'):
+            visited = new.redirect_dict = req.redirect_dict
+            if (visited.get(newurl, 0) >= self.max_repeats or
+                len(visited) >= self.max_redirections):
+                raise HTTPError(req.get_full_url(), code,
+                                self.inf_msg + msg, headers, fp)
+        else:
+            visited = new.redirect_dict = req.redirect_dict = {}
+        visited[newurl] = visited.get(newurl, 0) + 1
+
+        # Don't close the fp until we are sure that we won't use it
+        # with HTTPError.
+        fp.read()
+        fp.close()
+
+        return self.parent.open(new, timeout=req.timeout)
+
+    http_error_301 = http_error_303 = http_error_307 = http_error_302
+
+    inf_msg = "The HTTP server returned a redirect error that would " \
+              "lead to an infinite loop.\n" \
+              "The last 30x error message was:\n"
+
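+
+# The redirection limits are plain class attributes, so a caller needing a
+# stricter policy could subclass (a sketch, not part of the package API):
+#
+#     class StrictRedirectHandler(HTTPRedirectHandler):
+#         max_repeats = 2
+#         max_redirections = 5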
+
+
+class OpenerDirector(_OpenerDirector):
+    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
+        # accept a URL or a Request object
+        if isinstance(fullurl, basestring):
+            req = Request(fullurl, data)
+        else:
+            req = fullurl
+            if data is not None:
+                req.add_data(data)
+
+        req.timeout = timeout
+        protocol = req.get_type()
+
+        # pre-process request
+        meth_name = protocol + "_request"
+        for processor in self.process_request.get(protocol, []):
+            meth = getattr(processor, meth_name)
+            req = meth(req)
+
+        response = self._open(req, data)
+
+        # post-process response
+        meth_name = protocol + "_response"
+        for processor in self.process_response.get(protocol, []):
+            meth = getattr(processor, meth_name)
+            response = meth(req, response)
+
+        return response
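+
+# A minimal end-to-end sketch (placeholder URL).  Redirect responses are only
+# followed if an error processor such as urllib2.HTTPErrorProcessor is also
+# added to the chain; without one, 3xx responses are returned unchanged.
+#
+#     opener = OpenerDirector()
+#     opener.add_handler(HTTPHandler())
+#     opener.add_handler(HTTPRedirectHandler())
+#     response = opener.open('http://www.example.com/')
+#     print response.code, response.geturl()
+#     print response.read()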