utils.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. """Utilities using NDG HTTPS Client, including a main module that can be used to
  2. fetch from a URL.
  3. """
  4. __author__ = "R B Wilkinson"
  5. __date__ = "09/12/11"
  6. __copyright__ = "(C) 2011 Science and Technology Facilities Council"
  7. __license__ = "BSD - see LICENSE file in top-level directory"
  8. __contact__ = "Philip.Kershaw@stfc.ac.uk"
  9. __revision__ = '$Id$'
  10. import cookielib
  11. import httplib
  12. import logging
  13. from optparse import OptionParser
  14. import os
  15. import urllib2
  16. from urllib2 import (HTTPHandler, HTTPCookieProcessor,
  17. HTTPBasicAuthHandler, HTTPPasswordMgrWithDefaultRealm)
  18. import urlparse
  19. from ndg.httpsclient.urllib2_build_opener import build_opener
  20. from ndg.httpsclient.https import HTTPSContextHandler
  21. from ndg.httpsclient import ssl_context_util
  22. log = logging.getLogger(__name__)
  23. class AccumulatingHTTPCookieProcessor(HTTPCookieProcessor):
  24. """Cookie processor that adds new cookies (instead of replacing the existing
  25. ones as HTTPCookieProcessor does)
  26. """
  27. def http_request(self, request):
  28. """Processes cookies for a HTTP request.
  29. @param request: request to process
  30. @type request: urllib2.Request
  31. @return: request
  32. @rtype: urllib2.Request
  33. """
  34. COOKIE_HEADER_NAME = "Cookie"
  35. tmp_request = urllib2.Request(request.get_full_url(), request.data, {},
  36. request.origin_req_host,
  37. request.unverifiable)
  38. self.cookiejar.add_cookie_header(tmp_request)
  39. # Combine existing and new cookies.
  40. new_cookies = tmp_request.get_header(COOKIE_HEADER_NAME)
  41. if new_cookies:
  42. if request.has_header(COOKIE_HEADER_NAME):
  43. # Merge new cookies with existing ones.
  44. old_cookies = request.get_header(COOKIE_HEADER_NAME)
  45. merged_cookies = '; '.join([old_cookies, new_cookies])
  46. request.add_unredirected_header(COOKIE_HEADER_NAME,
  47. merged_cookies)
  48. else:
  49. # No existing cookies so just set new ones.
  50. request.add_unredirected_header(COOKIE_HEADER_NAME, new_cookies)
  51. return request
  52. # Process cookies for HTTPS in the same way.
  53. https_request = http_request
  54. class URLFetchError(Exception):
  55. """Error fetching content from URL"""
  56. def fetch_from_url(url, config, data=None, handlers=None):
  57. """Returns data retrieved from a URL.
  58. @param url: URL to attempt to open
  59. @type url: basestring
  60. @param config: SSL context configuration
  61. @type config: Configuration
  62. @return data retrieved from URL or None
  63. """
  64. return_code, return_message, response = open_url(url, config, data=data,
  65. handlers=handlers)
  66. if return_code and return_code == httplib.OK:
  67. return_data = response.read()
  68. response.close()
  69. return return_data
  70. else:
  71. raise URLFetchError(return_message)
  72. def fetch_from_url_to_file(url, config, output_file, data=None, handlers=None):
  73. """Writes data retrieved from a URL to a file.
  74. @param url: URL to attempt to open
  75. @type url: basestring
  76. @param config: SSL context configuration
  77. @type config: Configuration
  78. @param output_file: output file
  79. @type output_file: basestring
  80. @return: tuple (
  81. returned HTTP status code or 0 if an error occurred
  82. returned message
  83. boolean indicating whether access was successful)
  84. """
  85. return_code, return_message, response = open_url(url, config, data=data,
  86. handlers=handlers)
  87. if return_code == httplib.OK:
  88. return_data = response.read()
  89. response.close()
  90. outfile = open(output_file, "w")
  91. outfile.write(return_data)
  92. outfile.close()
  93. return return_code, return_message, return_code == httplib.OK
  94. def fetch_stream_from_url(url, config, data=None, handlers=None):
  95. """Returns data retrieved from a URL.
  96. @param url: URL to attempt to open
  97. @type url: basestring
  98. @param config: SSL context configuration
  99. @type config: Configuration
  100. @return: data retrieved from URL or None
  101. @rtype: file derived type
  102. """
  103. return_code, return_message, response = open_url(url, config, data=data,
  104. handlers=handlers)
  105. if return_code and return_code == httplib.OK:
  106. return response
  107. else:
  108. raise URLFetchError(return_message)
  109. def open_url(url, config, data=None, handlers=None):
  110. """Attempts to open a connection to a specified URL.
  111. @param url: URL to attempt to open
  112. @param config: SSL context configuration
  113. @type config: Configuration
  114. @param data: HTTP POST data
  115. @type data: str
  116. @param handlers: list of custom urllib2 handlers to add to the request
  117. @type handlers: iterable
  118. @return: tuple (
  119. returned HTTP status code or 0 if an error occurred
  120. returned message or error description
  121. response object)
  122. """
  123. debuglevel = 1 if config.debug else 0
  124. # Set up handlers for URL opener.
  125. if config.cookie:
  126. cj = config.cookie
  127. else:
  128. cj = cookielib.CookieJar()
  129. # Use a cookie processor that accumulates cookies when redirects occur so
  130. # that an application can redirect for authentication and retain both any
  131. # cookies for the application and the security system (c.f.,
  132. # urllib2.HTTPCookieProcessor which replaces cookies).
  133. cookie_handler = AccumulatingHTTPCookieProcessor(cj)
  134. if not handlers:
  135. handlers = []
  136. handlers.append(cookie_handler)
  137. if config.debug:
  138. http_handler = HTTPHandler(debuglevel=debuglevel)
  139. https_handler = HTTPSContextHandler(config.ssl_context,
  140. debuglevel=debuglevel)
  141. handlers.extend([http_handler, https_handler])
  142. if config.http_basicauth:
  143. # currently only supports http basic auth
  144. auth_handler = HTTPBasicAuthHandler(HTTPPasswordMgrWithDefaultRealm())
  145. auth_handler.add_password(realm=None, uri=url,
  146. user=config.httpauth[0],
  147. passwd=config.httpauth[1])
  148. handlers.append(auth_handler)
  149. # Explicitly remove proxy handling if the host is one listed in the value of
  150. # the no_proxy environment variable because urllib2 does use proxy settings
  151. # set via http_proxy and https_proxy, but does not take the no_proxy value
  152. # into account.
  153. if not _should_use_proxy(url, config.no_proxy):
  154. handlers.append(urllib2.ProxyHandler({}))
  155. log.debug("Not using proxy")
  156. elif config.proxies:
  157. handlers.append(urllib2.ProxyHandler(config.proxies))
  158. log.debug("Configuring proxies: %s" % config.proxies)
  159. opener = build_opener(*handlers, ssl_context=config.ssl_context)
  160. headers = config.headers
  161. if headers is None:
  162. headers = {}
  163. request = urllib2.Request(url, data, headers)
  164. # Open the URL and check the response.
  165. return_code = 0
  166. return_message = ''
  167. response = None
  168. try:
  169. response = opener.open(request)
  170. return_message = response.msg
  171. return_code = response.code
  172. if log.isEnabledFor(logging.DEBUG):
  173. for index, cookie in enumerate(cj):
  174. log.debug("%s : %s", index, cookie)
  175. except urllib2.HTTPError, exc:
  176. return_code = exc.code
  177. return_message = "Error: %s" % exc.msg
  178. if log.isEnabledFor(logging.DEBUG):
  179. log.debug("%s %s", exc.code, exc.msg)
  180. except Exception, exc:
  181. return_message = "Error: %s" % exc.__str__()
  182. if log.isEnabledFor(logging.DEBUG):
  183. import traceback
  184. log.debug(traceback.format_exc())
  185. return (return_code, return_message, response)
  186. def _should_use_proxy(url, no_proxy=None):
  187. """Determines whether a proxy should be used to open a connection to the
  188. specified URL, based on the value of the no_proxy environment variable.
  189. @param url: URL
  190. @type url: basestring or urllib2.Request
  191. """
  192. if no_proxy is None:
  193. no_proxy_effective = os.environ.get('no_proxy', '')
  194. else:
  195. no_proxy_effective = no_proxy
  196. urlObj = urlparse.urlparse(_url_as_string(url))
  197. for np in [h.strip() for h in no_proxy_effective.split(',')]:
  198. if urlObj.hostname == np:
  199. return False
  200. return True
  201. def _url_as_string(url):
  202. """Returns the URL string from a URL value that is either a string or
  203. urllib2.Request..
  204. @param url: URL
  205. @type url: basestring or urllib2.Request
  206. @return: URL string
  207. @rtype: basestring
  208. """
  209. if isinstance(url, urllib2.Request):
  210. return url.get_full_url()
  211. elif isinstance(url, basestring):
  212. return url
  213. else:
  214. raise TypeError("Expected type %r or %r" %
  215. (basestring, urllib2.Request))
  216. class Configuration(object):
  217. """Connection configuration.
  218. """
  219. def __init__(self, ssl_context, debug=False, proxies=None, no_proxy=None,
  220. cookie=None, http_basicauth=None, headers=None):
  221. """
  222. @param ssl_context: SSL context to use with this configuration
  223. @type ssl_context: OpenSSL.SSL.Context
  224. @param debug: if True, output debugging information
  225. @type debug: bool
  226. @param proxies: proxies to use for
  227. @type proxies: dict with basestring keys and values
  228. @param no_proxy: hosts for which a proxy should not be used
  229. @type no_proxy: basestring
  230. @param cookie: cookies to set for request
  231. @type cookie: cookielib.CookieJar
  232. @param http_basicauth: http authentication, or None
  233. @type http_basicauth: tuple of (username,password)
  234. @param headers: http headers
  235. @type headers: dict
  236. """
  237. self.ssl_context = ssl_context
  238. self.debug = debug
  239. self.proxies = proxies
  240. self.no_proxy = no_proxy
  241. self.cookie = cookie
  242. self.http_basicauth = http_basicauth
  243. self.headers = headers
  244. def main():
  245. '''Utility to fetch data using HTTP or HTTPS GET from a specified URL.
  246. '''
  247. parser = OptionParser(usage="%prog [options] url")
  248. parser.add_option("-c", "--certificate", dest="cert_file", metavar="FILE",
  249. default=os.path.expanduser("~/credentials.pem"),
  250. help="Certificate file - defaults to $HOME/credentials.pem")
  251. parser.add_option("-k", "--private-key", dest="key_file", metavar="FILE",
  252. default=None,
  253. help="Private key file - defaults to the certificate file")
  254. parser.add_option("-t", "--ca-certificate-dir", dest="ca_dir",
  255. metavar="PATH",
  256. default=None,
  257. help="Trusted CA certificate file directory")
  258. parser.add_option("-d", "--debug", action="store_true", dest="debug",
  259. default=False,
  260. help="Print debug information.")
  261. parser.add_option("-p", "--post-data-file", dest="data_file",
  262. metavar="FILE", default=None,
  263. help="POST data file")
  264. parser.add_option("-f", "--fetch", dest="output_file", metavar="FILE",
  265. default=None, help="Output file")
  266. parser.add_option("-n", "--no-verify-peer", action="store_true",
  267. dest="no_verify_peer", default=False,
  268. help="Skip verification of peer certificate.")
  269. parser.add_option("-a", "--basicauth", dest="auth", metavar="USER:PASSWD",
  270. default=None,
  271. help="HTTP authentication credentials")
  272. parser.add_option("--header", action="append", dest="headers",
  273. metavar="HEADER: VALUE",
  274. help="Add HTTP header to request")
  275. (options, args) = parser.parse_args()
  276. if len(args) != 1:
  277. parser.error("Incorrect number of arguments")
  278. url = args[0]
  279. if options.debug:
  280. logging.getLogger().setLevel(logging.DEBUG)
  281. if options.key_file and os.path.exists(options.key_file):
  282. key_file = options.key_file
  283. else:
  284. key_file = None
  285. if options.cert_file and os.path.exists(options.cert_file):
  286. cert_file = options.cert_file
  287. else:
  288. cert_file = None
  289. if options.ca_dir and os.path.exists(options.ca_dir):
  290. ca_dir = options.ca_dir
  291. else:
  292. ca_dir = None
  293. verify_peer = not options.no_verify_peer
  294. if options.data_file and os.path.exists(options.data_file):
  295. data_file = open(options.data_file)
  296. data = data_file.read()
  297. data_file.close()
  298. else:
  299. data = None
  300. if options.basicauth:
  301. http_basicauth = options.auth.split(':', 1)
  302. else:
  303. http_basicauth = None
  304. headers = {}
  305. if options.headers:
  306. for h in options.headers:
  307. key, val = h.split(':', 1)
  308. headers[key.strip()] = val.lstrip()
  309. # If a private key file is not specified, the key is assumed to be stored in
  310. # the certificate file.
  311. ssl_context = ssl_context_util.make_ssl_context(key_file,
  312. cert_file,
  313. None,
  314. ca_dir,
  315. verify_peer,
  316. url)
  317. config = Configuration(ssl_context,
  318. options.debug,
  319. http_basicauth=http_basicauth,
  320. headers=headers)
  321. if options.output_file:
  322. return_code, return_message = fetch_from_url_to_file(
  323. url,
  324. config,
  325. options.output_file,
  326. data)[:2]
  327. raise SystemExit(return_code, return_message)
  328. else:
  329. data = fetch_from_url(url, config)
  330. print(data)
  331. if __name__=='__main__':
  332. logging.basicConfig()
  333. main()