... |
... |
@@ -2,6 +2,13 @@ |
2
|
2
|
DownloadableFileSource - Abstract class for downloading files
|
3
|
3
|
=============================================================
|
4
|
4
|
A base abstract class for source implementations which download a file.
|
|
5
|
+
|
|
6
|
+Derived classes must write their own stage() implementation, using the
|
|
7
|
+public APIs exposed in this class.
|
|
8
|
+
|
|
9
|
+Derived classes must also chain up to the parent method in their get_unique_key()
|
|
10
|
+implementations.
|
|
11
|
+
|
5
|
12
|
"""
|
6
|
13
|
|
7
|
14
|
import os
|
... |
... |
@@ -19,50 +26,54 @@ class DownloadableFileSource(Source): |
19
|
26
|
|
20
|
27
|
COMMON_CONFIG_KEYS = Source.COMMON_CONFIG_KEYS + ['url', 'ref', 'etag']
|
21
|
28
|
|
|
29
|
+ #####################################
|
|
30
|
+ # Implementations of abstract methods
|
|
31
|
+ #####################################
|
|
32
|
+
|
22
|
33
|
def configure(self, node):
|
23
|
|
- self.original_url = self.node_get_member(node, str, 'url')
|
24
|
|
- self.ref = self.node_get_member(node, str, 'ref', None)
|
25
|
|
- self.url = self.translate_url(self.original_url)
|
26
|
|
- self._warn_deprecated_etag(node)
|
|
34
|
+ self.__original_url = self.node_get_member(node, str, 'url')
|
|
35
|
+ self.__ref = self.node_get_member(node, str, 'ref', None)
|
|
36
|
+ self.__url = self.translate_url(self.__original_url)
|
|
37
|
+ self.__warn_deprecated_etag(node)
|
27
|
38
|
|
28
|
39
|
def preflight(self):
|
29
|
40
|
return
|
30
|
41
|
|
31
|
42
|
def get_unique_key(self):
|
32
|
|
- return [self.original_url, self.ref]
|
|
43
|
+ return [self.__original_url, self.__ref]
|
33
|
44
|
|
34
|
45
|
def get_consistency(self):
|
35
|
|
- if self.ref is None:
|
|
46
|
+ if self.__ref is None:
|
36
|
47
|
return Consistency.INCONSISTENT
|
37
|
48
|
|
38
|
|
- if os.path.isfile(self._get_mirror_file()):
|
|
49
|
+ if os.path.isfile(self.get_mirror_file()):
|
39
|
50
|
return Consistency.CACHED
|
40
|
51
|
|
41
|
52
|
else:
|
42
|
53
|
return Consistency.RESOLVED
|
43
|
54
|
|
44
|
55
|
def load_ref(self, node):
|
45
|
|
- self.ref = self.node_get_member(node, str, 'ref', None)
|
46
|
|
- self._warn_deprecated_etag(node)
|
|
56
|
+ self.__ref = self.node_get_member(node, str, 'ref', None)
|
|
57
|
+ self.__warn_deprecated_etag(node)
|
47
|
58
|
|
48
|
59
|
def get_ref(self):
|
49
|
|
- return self.ref
|
|
60
|
+ return self.__ref
|
50
|
61
|
|
51
|
62
|
def set_ref(self, ref, node):
|
52
|
|
- node['ref'] = self.ref = ref
|
|
63
|
+ node['ref'] = self.__ref = ref
|
53
|
64
|
|
54
|
65
|
def track(self):
|
55
|
66
|
# there is no 'track' field in the source to determine what/whether
|
56
|
67
|
# or not to update refs, because tracking a ref is always a conscious
|
57
|
68
|
# decision by the user.
|
58
|
|
- with self.timed_activity("Tracking {}".format(self.url),
|
|
69
|
+ with self.timed_activity("Tracking {}".format(self.__url),
|
59
|
70
|
silent_nested=True):
|
60
|
|
- new_ref = self._ensure_mirror()
|
|
71
|
+ new_ref = self.ensure_mirror()
|
61
|
72
|
|
62
|
|
- if self.ref and self.ref != new_ref:
|
|
73
|
+ if self.__ref and self.__ref != new_ref:
|
63
|
74
|
detail = "When tracking, new ref differs from current ref:\n" \
|
64
|
|
- + " Tracked URL: {}\n".format(self.url) \
|
65
|
|
- + " Current ref: {}\n".format(self.ref) \
|
|
75
|
+ + " Tracked URL: {}\n".format(self.__url) \
|
|
76
|
+ + " Current ref: {}\n".format(self.__ref) \
|
66
|
77
|
+ " New ref: {}\n".format(new_ref)
|
67
|
78
|
self.warn("Potential man-in-the-middle attack!", detail=detail)
|
68
|
79
|
|
... |
... |
@@ -74,49 +85,40 @@ class DownloadableFileSource(Source): |
74
|
85
|
# file to be already cached because Source.fetch() will
|
75
|
86
|
# not be called if the source is already Consistency.CACHED.
|
76
|
87
|
#
|
77
|
|
- if os.path.isfile(self._get_mirror_file()):
|
|
88
|
+ if os.path.isfile(self.get_mirror_file()):
|
78
|
89
|
return # pragma: nocover
|
79
|
90
|
|
80
|
91
|
# Download the file, raise hell if the sha256sums don't match,
|
81
|
92
|
# and mirror the file otherwise.
|
82
|
|
- with self.timed_activity("Fetching {}".format(self.url), silent_nested=True):
|
83
|
|
- sha256 = self._ensure_mirror()
|
84
|
|
- if sha256 != self.ref:
|
|
93
|
+ with self.timed_activity("Fetching {}".format(self.__url), silent_nested=True):
|
|
94
|
+ sha256 = self.ensure_mirror()
|
|
95
|
+ if sha256 != self.__ref:
|
85
|
96
|
raise SourceError("File downloaded from {} has sha256sum '{}', not '{}'!"
|
86
|
|
- .format(self.url, sha256, self.ref))
|
|
97
|
+ .format(self.__url, sha256, self.__ref))
|
|
98
|
+ ################
|
|
99
|
+ # Public methods
|
|
100
|
+ ################
|
87
|
101
|
|
88
|
|
- def _warn_deprecated_etag(self, node):
|
89
|
|
- etag = self.node_get_member(node, str, 'etag', None)
|
90
|
|
- if etag:
|
91
|
|
- provenance = self.node_provenance(node, member_name='etag')
|
92
|
|
- self.warn('{} "etag" is deprecated and ignored.'.format(provenance))
|
|
102
|
+ def ensure_mirror(self):
|
|
103
|
+ """Downloads from the url and caches it according to its sha256sum
|
93
|
104
|
|
94
|
|
- def _get_etag(self, ref):
|
95
|
|
- etagfilename = os.path.join(self._get_mirror_dir(), '{}.etag'.format(ref))
|
96
|
|
- if os.path.exists(etagfilename):
|
97
|
|
- with open(etagfilename, 'r') as etagfile:
|
98
|
|
- return etagfile.read()
|
99
|
|
-
|
100
|
|
- return None
|
|
105
|
+ Returns:
|
|
106
|
+ (str): The sha256sum of the mirrored file
|
101
|
107
|
|
102
|
|
- def _store_etag(self, ref, etag):
|
103
|
|
- etagfilename = os.path.join(self._get_mirror_dir(), '{}.etag'.format(ref))
|
104
|
|
- with utils.save_file_atomic(etagfilename) as etagfile:
|
105
|
|
- etagfile.write(etag)
|
106
|
|
-
|
107
|
|
- def _ensure_mirror(self):
|
108
|
|
- # Downloads from the url and caches it according to its sha256sum.
|
|
108
|
+ Raises:
|
|
109
|
+ :class:`.SourceError`
|
|
110
|
+ """
|
109
|
111
|
try:
|
110
|
112
|
with self.tempdir() as td:
|
111
|
|
- default_name = os.path.basename(self.url)
|
112
|
|
- request = urllib.request.Request(self.url)
|
|
113
|
+ default_name = os.path.basename(self.__url)
|
|
114
|
+ request = urllib.request.Request(self.__url)
|
113
|
115
|
request.add_header('Accept', '*/*')
|
114
|
116
|
|
115
|
117
|
# We do not use etag in case what we have in cache is
|
116
|
118
|
# not matching ref in order to be able to recover from
|
117
|
119
|
# corrupted download.
|
118
|
|
- if self.ref:
|
119
|
|
- etag = self._get_etag(self.ref)
|
|
120
|
+ if self.__ref:
|
|
121
|
+ etag = self.__get_etag(self.__ref)
|
120
|
122
|
|
121
|
123
|
# Do not re-download the file if the ETag matches.
|
122
|
124
|
if etag and self.get_consistency() == Consistency.CACHED:
|
... |
... |
@@ -134,17 +136,17 @@ class DownloadableFileSource(Source): |
134
|
136
|
shutil.copyfileobj(response, dest)
|
135
|
137
|
|
136
|
138
|
# Make sure url-specific mirror dir exists.
|
137
|
|
- if not os.path.isdir(self._get_mirror_dir()):
|
138
|
|
- os.makedirs(self._get_mirror_dir())
|
|
139
|
+ if not os.path.isdir(self.__get_mirror_dir()):
|
|
140
|
+ os.makedirs(self.__get_mirror_dir())
|
139
|
141
|
|
140
|
142
|
# Store by sha256sum
|
141
|
143
|
sha256 = utils.sha256sum(local_file)
|
142
|
144
|
# Even if the file already exists, move the new file over.
|
143
|
145
|
# In case the old file was corrupted somehow.
|
144
|
|
- os.rename(local_file, self._get_mirror_file(sha256))
|
|
146
|
+ os.rename(local_file, self.get_mirror_file(sha=sha256))
|
145
|
147
|
|
146
|
148
|
if etag:
|
147
|
|
- self._store_etag(sha256, etag)
|
|
149
|
+ self.__store_etag(sha256, etag)
|
148
|
150
|
return sha256
|
149
|
151
|
|
150
|
152
|
except urllib.error.HTTPError as e:
|
... |
... |
@@ -152,17 +154,51 @@ class DownloadableFileSource(Source): |
152
|
154
|
# 304 Not Modified.
|
153
|
155
|
# Because we use etag only for matching ref, currently specified ref is what
|
154
|
156
|
# we would have downloaded.
|
155
|
|
- return self.ref
|
|
157
|
+ return self.__ref
|
156
|
158
|
raise SourceError("{}: Error mirroring {}: {}"
|
157
|
|
- .format(self, self.url, e), temporary=True) from e
|
|
159
|
+ .format(self, self.__url, e), temporary=True) from e
|
158
|
160
|
|
159
|
161
|
except (urllib.error.URLError, urllib.error.ContentTooShortError, OSError) as e:
|
160
|
162
|
raise SourceError("{}: Error mirroring {}: {}"
|
161
|
|
- .format(self, self.url, e), temporary=True) from e
|
|
163
|
+ .format(self, self.__url, e), temporary=True) from e
|
|
164
|
+
|
|
165
|
+ def get_mirror_file(self, *, sha=None):
|
|
166
|
+ """Calculates the path to where this source stores the downloaded file
|
162
|
167
|
|
163
|
|
- def _get_mirror_dir(self):
|
|
168
|
+ Users are expected to read the file this points to when staging their source.
|
|
169
|
+
|
|
170
|
+ Returns:
|
|
171
|
+ (str): A path to the file the source should be cached at
|
|
172
|
+ """
|
|
173
|
+ return os.path.join(self.__get_mirror_dir(), sha or self.__ref)
|
|
174
|
+
|
|
175
|
+ #######################
|
|
176
|
+ # Local Private methods
|
|
177
|
+ #######################
|
|
178
|
+
|
|
179
|
+ def __get_mirror_dir(self):
|
|
180
|
+ # Get the directory this source should store things in, for a given URL
|
164
|
181
|
return os.path.join(self.get_mirror_directory(),
|
165
|
|
- utils.url_directory_name(self.original_url))
|
|
182
|
+ utils.url_directory_name(self.__original_url))
|
|
183
|
+
|
|
184
|
+ def __warn_deprecated_etag(self, node):
|
|
185
|
+ # Warn the user if the 'etag' field is being used
|
|
186
|
+ etag = self.node_get_member(node, str, 'etag', None)
|
|
187
|
+ if etag:
|
|
188
|
+ provenance = self.node_provenance(node, member_name='etag')
|
|
189
|
+ self.warn('{} "etag" is deprecated and ignored.'.format(provenance))
|
166
|
190
|
|
167
|
|
- def _get_mirror_file(self, sha=None):
|
168
|
|
- return os.path.join(self._get_mirror_dir(), sha or self.ref)
|
|
191
|
+ def __get_etag(self, ref):
|
|
192
|
+ # Retrieve the etag's data from disk
|
|
193
|
+ etagfilename = os.path.join(self.__get_mirror_dir(), '{}.etag'.format(ref))
|
|
194
|
+ if os.path.exists(etagfilename):
|
|
195
|
+ with open(etagfilename, 'r') as etagfile:
|
|
196
|
+ return etagfile.read()
|
|
197
|
+
|
|
198
|
+ return None
|
|
199
|
+
|
|
200
|
+ def __store_etag(self, ref, etag):
|
|
201
|
+ # Write the etag's data to disk
|
|
202
|
+ etagfilename = os.path.join(self.__get_mirror_dir(), '{}.etag'.format(ref))
|
|
203
|
+ with utils.save_file_atomic(etagfilename) as etagfile:
|
|
204
|
+ etagfile.write(etag)
|