123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262 |
- '''
- Created on 12 Jan 2012
- @author: rwilkinson
- '''
- import base64
- import socket
- import urlparse
- from urllib import unquote, addinfourl
- from urllib2 import _parse_proxy, URLError, HTTPError
- from urllib2 import (AbstractHTTPHandler as _AbstractHTTPHandler,
- BaseHandler as _BaseHandler,
- HTTPRedirectHandler as _HTTPRedirectHandler,
- Request as _Request,
- OpenerDirector as _OpenerDirector)
- from ndg.httpsclient.httplib_proxy import HTTPConnection
class Request(_Request):
    """urllib2 ``Request`` that records the origin host for tunnelled HTTPS.

    The instance always carries a ``_tunnel_host`` attribute (initially
    ``None``) so that ``set_proxy`` and the HTTP handler can rely on it.
    """

    def __init__(self, *args, **kw):
        """Initialise the base request, then start with no tunnel host."""
        _Request.__init__(self, *args, **kw)
        self._tunnel_host = None

    def set_proxy(self, host, type):
        """Point this request at the proxy ``host``.

        For an ``https`` request that has no tunnel host yet, the current
        host is saved in ``_tunnel_host`` and the request type is left
        untouched.  In every other case the request type is replaced by
        ``type`` and the selector is reset to the original URL.  In both
        cases ``self.host`` ends up being the proxy host.

        :param host: proxy ``host[:port]`` to connect to.
        :param type: scheme to use when talking to the proxy.
        """
        if self.type != 'https' or self._tunnel_host:
            self.type = type
            # Inside a class named ``Request`` these private names mangle to
            # ``_Request__r_host`` / ``_Request__original``.
            # NOTE(review): presumably these are the attributes the urllib2
            # base class maintains -- confirm against the targeted Python.
            self.__r_host = self.__original
        else:
            self._tunnel_host = self.host
        self.host = host
class BaseHandler(_BaseHandler):
    """Handler base class whose ``proxy_open`` prepares HTTPS requests for a proxy."""

    def proxy_open(self, req, proxy, type):
        """Configure ``req`` to go through ``proxy``.

        For ``https`` requests the proxy URL is parsed, a Basic
        ``Proxy-authorization`` header is added when the URL carries both a
        user and a password, and ``req.set_proxy`` is called with the proxy
        host and scheme.  ``None`` is then returned so that other handlers
        perform the actual open.

        Requests of any other type are delegated to the base class.
        NOTE(review): confirm that the urllib2 base class really provides
        ``proxy_open``; if it does not, the non-HTTPS branch raises
        ``AttributeError``.

        :param req: request object being opened.
        :param proxy: proxy URL, optionally including ``user:password@``.
        :param type: scheme the proxy was registered for.
        """
        if req.get_type() != 'https':
            return _BaseHandler.proxy_open(self, req, proxy, type)

        request_scheme = req.get_type()
        proxy_scheme, user, password, hostport = _parse_proxy(proxy)
        if proxy_scheme is None:
            # A proxy URL without a scheme inherits the request's scheme.
            proxy_scheme = request_scheme

        if user and password:
            credentials = '%s:%s' % (unquote(user), unquote(password))
            encoded = base64.b64encode(credentials).strip()
            req.add_header('Proxy-authorization', 'Basic ' + encoded)

        req.set_proxy(unquote(hostport), proxy_scheme)
        # let other handlers take care of it
        return None
class AbstractHTTPHandler(_AbstractHTTPHandler):
    """HTTP handler base whose ``do_open`` honours a request's tunnel host."""

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        If the request has a truthy ``_tunnel_host`` attribute the
        connection is asked to tunnel to that host; requests that lack the
        attribute altogether are treated as not tunnelled.

        :raises URLError: if the request has no host, or if a
            ``socket.error`` occurs while sending the request or reading
            the response (the connection is closed first).
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        # Request objects not created by this module may have no
        # ``_tunnel_host`` attribute at all; treat that as "no tunnel".
        tunnel_host = getattr(req, '_tunnel_host', None)
        if tunnel_host:
            h.set_tunnel(tunnel_host)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err:  # XXX what error?
            # Do not leak the half-open connection when the exchange fails.
            h.close()
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.
        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
class HTTPHandler(AbstractHTTPHandler):
    """Handler for plain ``http`` URLs."""

    # Request pre-processing is the ``do_request_`` hook inherited from
    # AbstractHTTPHandler.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open ``req`` via ``do_open`` using this module's ``HTTPConnection``."""
        return self.do_open(HTTPConnection, req)
- #if hasattr(httplib, 'HTTPS'):
- # class HTTPSHandler(AbstractHTTPHandler):
- #
- # def https_open(self, req):
- # return self.do_open(httplib.HTTPSConnection, req)
- #
- # https_request = AbstractHTTPHandler.do_request_
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 redirects using this module's ``Request``."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    # Prefix of the HTTPError message raised when a redirect loop is detected.
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.

        The new request carries the original headers minus
        ``Content-Length`` and ``Content-Type`` and has no body.

        :raises HTTPError: if the code/method combination must not be
            redirected automatically.
        """
        m = req.get_method()
        redirectable = (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
                        or code in (301, 302, 303) and m == "POST")
        if not redirectable:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response
        # to a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib2, in this case).  In practice,
        # essentially all clients do redirect in this case, so we
        # do the same.
        # be conciliant with URIs containing a space
        newurl = newurl.replace(' ', '%20')
        newheaders = dict(
            (k, v) for k, v in req.headers.items()
            if k.lower() not in ("content-length", "content-type"))
        return Request(newurl,
                       headers=newheaders,
                       origin_req_host=req.get_origin_req_host(),
                       unverifiable=True)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response by opening the redirect target.

        Returns the response of ``self.parent.open`` for the new request,
        or ``None`` when the response has no ``location``/``uri`` header or
        ``redirect_request`` declines.

        :raises HTTPError: if the target is not an http, https or ftp URL,
            if ``redirect_request`` refuses the redirect, or if the
            redirect limits (``max_repeats`` / ``max_redirections``) are
            exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return

        # fix a possible malformed URL: an absolute URL without a path gets
        # "/".  Only do this when a network location is present; forcing
        # "/" onto a relative reference such as "?page=2" would make it
        # resolve against the server root instead of the current path.
        urlparts = urlparse.urlparse(newurl)
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlparse.urlunparse(urlparts)

        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # For security reasons we do not allow redirects to protocols
        # other than HTTP, HTTPS or FTP.
        if not newurl.lower().startswith(('http://', 'https://', 'ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
-
class OpenerDirector(_OpenerDirector):
    """OpenerDirector whose ``open`` wraps URL strings in this module's ``Request``."""

    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        """Open ``fullurl`` and return the processed response.

        :param fullurl: a URL string or an already constructed request
            object.
        :param data: optional request body; for a string URL it is passed
            to ``Request``, for a request object it is attached with
            ``add_data`` when not ``None``.
        :param timeout: stored on the request as ``req.timeout``.
        :return: the result of ``self._open`` after every registered
            ``<protocol>_response`` processor has been applied.
        """
        # accept a URL or a Request object
        if not isinstance(fullurl, basestring):
            req = fullurl
            if data is not None:
                req.add_data(data)
        else:
            req = Request(fullurl, data)

        req.timeout = timeout
        scheme = req.get_type()

        # pre-process request
        request_hook = scheme + "_request"
        for processor in self.process_request.get(scheme, []):
            req = getattr(processor, request_hook)(req)

        response = self._open(req, data)

        # post-process response
        response_hook = scheme + "_response"
        for processor in self.process_response.get(scheme, []):
            response = getattr(processor, response_hook)(req, response)

        return response
|