Fixes the hwpack download failure because of EULA

author: Deepti B. Kalakeri <deepti.kalakeri@linaro.org> 2012-01-06 11:09:01 +0000
committer: Deepti B. Kalakeri <deepti.kalakeri@linaro.org> 2012-01-06 11:09:01 +0000
commit: 25ffb0f293fcb99b353a79721d4f3924dcbbbb4c (patch)
tree: 2fed81853fae1f158f45e7387a976257f8c8f411 /download_content_yes_to_lic.py
parent: bdff58b9571dfdbda7c7a36b436d5868bf7d59b0 (diff)
1 files changed, 118 insertions, 0 deletions
diff --git a/download_content_yes_to_lic.py b/download_content_yes_to_lic.py
new file mode 100644
index 0000000..4a31941
--- /dev/null
+++ b/download_content_yes_to_lic.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+# Changes required to address EULA for the origen hwpacks
+
+import argparse
+import os
+import pycurl
+import re
+import urlparse
+
+class LicenseProtectedFileFetcher:
+    """Fetch a file from the web that may be protected by a license redirect
+
+    This is designed to run on snapshots.linaro.org. License HTML files are
+    in the form:
+
+    <vendor>.html has a link to <vendor>-accepted.html
+
+    If self.get is pointed at a file that has to go through one of these
+    licenses, it should be able to automatically accept the license and
+    download the file.
+
+    Once a license has been accepted, it will be used for all following
+    downloads.
+
+    If self.close() is called before the object is deleted, cURL will store
+    the license accept cookie to cookies.txt, so it can be used for later
+    downloads.
+
+    """
+    def __init__(self):
+        """Set up cURL and start with empty header and body storage."""
+        self.body = ""
+        self.header = ""
+        self.curl = pycurl.Curl()
+        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
+        self.curl.setopt(pycurl.WRITEFUNCTION, self._write_body)
+        self.curl.setopt(pycurl.HEADERFUNCTION, self._write_header)
+        # Cookies are loaded from and saved to the same file.
+        self.curl.setopt(pycurl.COOKIEFILE, "cookies.txt")
+        self.curl.setopt(pycurl.COOKIEJAR, "cookies.txt")
+
+    def _get(self, url):
+        """Clear out header and body storage, fetch URL, filling them in."""
+        self.curl.setopt(pycurl.URL, url)
+        self.body = ""
+        self.header = ""
+        self.curl.perform()
+
+    def get(self, url):
+        """Fetch the requested URL, accepting licenses, returns file body
+
+        Fetches the file at url. If a redirect is encountered, it is
+        expected to be to a license that has an accept link. Follow that link,
+        then download the original file.
+
+        """
+        self._get(url)
+
+        location = self._get_location()
+        if location:
+            # We have been redirected. Expect the page we landed on to link
+            # to the license page name with -accepted inserted before the
+            # .html, i.e. ste.html -> ste-accepted.html
+            # File name of the license page, without the rest of the path
+            license_page = os.path.split(urlparse.urlparse(location).path)[-1]
+
+            # Look for a link with -accepted.html in the page name; if there
+            # are several, the last one in the page is used.
+            link_re = re.compile(r"""href=.*?["']([^"']*-accepted\.html)""")
+            accept_page = None
+            for line in self.body.splitlines():
+                link_search = link_re.search(line)
+                if link_search:
+                    accept_page = link_search.group(1)
+
+            if accept_page:
+                # Accept the license. The names are swapped as literal text:
+                # a regex substitution would treat "." in the file name as
+                # a wildcard and backslashes in the link as escapes.
+                accept_url = location.replace(license_page, accept_page)
+                self._get(accept_url)
+
+                # The above get *should* take us to the file requested via
+                # a redirect. If we manually need to follow that redirect,
+                # do that now.
+                if self._get_location():
+                    # If we haven't been redirected to our original file,
+                    # we should be able to just download it now.
+                    self._get(url)
+
+        return self.body
+
+    def _search_header(self, field):
+        """Return the value of header `field` (case-insensitive), or None."""
+        # Anchored at line start so "Location" skips "Content-Location".
+        pattern = re.compile(re.escape(field) + r":\s*(.*?)\s*$", re.I)
+        for line in self.header.splitlines():
+            found = pattern.match(line)
+            if found:
+                return found.group(1)
+        return None
+
+    def _get_location(self):
+        """Return content of Location field in header / None"""
+        return self._search_header("Location")
+
+    def _write_body(self, buf):
+        """Used by curl as a sink for body content"""
+        self.body += buf
+
+    def _write_header(self, buf):
+        """Used by curl as a sink for header content"""
+        self.header += buf
+
+    def close(self):
+        """Wrapper to close curl - this will allow curl to write out cookies"""
+        self.curl.close()
author	Deepti B. Kalakeri <deepti.kalakeri@linaro.org>	2012-01-06 11:09:01 +0000
committer	Deepti B. Kalakeri <deepti.kalakeri@linaro.org>	2012-01-06 11:09:01 +0000
commit	25ffb0f293fcb99b353a79721d4f3924dcbbbb4c (patch)
tree	2fed81853fae1f158f45e7387a976257f8c8f411 /download_content_yes_to_lic.py
parent	bdff58b9571dfdbda7c7a36b436d5868bf7d59b0 (diff)