'''
Created on 12 Jan 2012

@author: rwilkinson
'''
import base64
import socket
import urlparse
from urllib import unquote, addinfourl
from urllib2 import _parse_proxy, URLError, HTTPError
from urllib2 import (AbstractHTTPHandler as _AbstractHTTPHandler,
                     BaseHandler as _BaseHandler,
                     HTTPRedirectHandler as _HTTPRedirectHandler,
                     Request as _Request,
                     OpenerDirector as _OpenerDirector)

from ndg.httpsclient.httplib_proxy import HTTPConnection


class Request(_Request):
    def __init__(self, *args, **kw):
        _Request.__init__(self, *args, **kw)
        self._tunnel_host = None

    def set_proxy(self, host, type):
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.__r_host = self.__original

        self.host = host
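

# Illustrative sketch, not part of the original module: how the tunnelling-
# aware Request behaves when a proxy is set.  The host names used here are
# hypothetical.
def _example_set_proxy():
    req = Request('https://www.example.org/resource')
    req.get_type()   # populate req.type from the URL, as urllib2 does
    req.get_host()   # populate req.host, normally done during pre-processing
    req.set_proxy('proxy.example.org:8080', 'http')
    # For an HTTPS request the original host is remembered for a later
    # CONNECT tunnel, while the proxy becomes the host actually dialled.
    # Returns ('www.example.org', 'proxy.example.org:8080').
    return req._tunnel_host, req.get_host()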


class BaseHandler(_BaseHandler):
    def proxy_open(self, req, proxy, type):
        if req.get_type() == 'https':
            orig_type = req.get_type()
            proxy_type, user, password, hostport = _parse_proxy(proxy)
            if proxy_type is None:
                proxy_type = orig_type

            if user and password:
                user_pass = '%s:%s' % (unquote(user), unquote(password))
                creds = base64.b64encode(user_pass).strip()
                req.add_header('Proxy-authorization', 'Basic ' + creds)
            hostport = unquote(hostport)
            req.set_proxy(hostport, proxy_type)

            # let other handlers take care of it
            return None
        else:
            return _BaseHandler.proxy_open(self, req, proxy, type)
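

# Illustrative sketch, not part of the original module: what _parse_proxy()
# returns for a proxy URL carrying credentials, and the Proxy-authorization
# value that proxy_open() above derives from it.  The proxy URL is
# hypothetical.
def _example_proxy_credentials():
    proxy_type, user, password, hostport = _parse_proxy(
        'http://alice:secret@proxy.example.org:8080')
    # -> ('http', 'alice', 'secret', 'proxy.example.org:8080')
    user_pass = '%s:%s' % (unquote(user), unquote(password))
    return 'Basic ' + base64.b64encode(user_pass).strip()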


class AbstractHTTPHandler(_AbstractHTTPHandler):
    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        # Requests created with the plain urllib2.Request class do not have
        # the _tunnel_host attribute set by the Request subclass above, so
        # guard the attribute access rather than assuming it is present.
        if getattr(req, '_tunnel_host', None):
            h.set_tunnel(req._tunnel_host)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.
        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
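

# Illustrative sketch, not part of the original module: the file-object
# wrapping done at the end of do_open(), demonstrated on an in-memory buffer
# instead of a live HTTPResponse.
def _example_fileobject_wrapping():
    import StringIO
    r = StringIO.StringIO('hello\nworld\n')
    r.recv = r.read                       # _fileobject calls recv(), not read()
    fp = socket._fileobject(r, close=True)
    return fp.readline(), fp.readlines()  # ('hello\n', ['world\n'])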


class HTTPHandler(AbstractHTTPHandler):

    def http_open(self, req):
        return self.do_open(HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

#if hasattr(httplib, 'HTTPS'):
#    class HTTPSHandler(AbstractHTTPHandler):
#
#        def https_open(self, req):
#            return self.do_open(httplib.HTTPSConnection, req)
#
#        https_request = AbstractHTTPHandler.do_request_


class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in ("content-length",
                                                   "content-type"))
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse.urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
            newurl = urlparse.urlunparse(urlparts)

        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # For security reasons we do not allow redirects to protocols
        # other than HTTP, HTTPS or FTP.
        newurl_lower = newurl.lower()
        if not (newurl_lower.startswith('http://') or
                newurl_lower.startswith('https://') or
                newurl_lower.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"


class OpenerDirector(_OpenerDirector):
    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol + "_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol + "_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response
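

# Illustrative usage sketch, not part of the original module: wiring the
# classes above into an opener for a plain HTTP request.  The URL is
# hypothetical; a proxied HTTPS request would additionally need a
# ProxyHandler routed through the overridden proxy_open() and an
# HTTPS-capable connection handler, both of which live outside this module.
if __name__ == '__main__':
    import urllib2

    opener = OpenerDirector()
    for handler in (urllib2.HTTPDefaultErrorHandler(),
                    urllib2.HTTPErrorProcessor(),
                    HTTPRedirectHandler(),
                    HTTPHandler()):
        opener.add_handler(handler)

    response = opener.open('http://www.example.org/')
    print response.code, response.geturl()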