Planner: remove backend application
[autotest-zwu.git] / mirror / source.py
blobec602507a1835c70785678276879bb971376a0c1
1 # Copyright 2009 Google Inc. Released under the GPL v2
3 import os, re, time, urllib2, urlparse, HTMLParser
5 from autotest_lib.mirror import database
6 from autotest_lib.client.common_lib import utils
class source(object):
    """
    Abstract Base Class for the source classes.
    """
    def __init__(self, database):
        # Persistent database with known kernels information; must provide
        # get_dictionary() and merge_dictionary() (see mirror.database).
        self.database = database


    def _get_new_files(self, files):
        """
        Return a copy of "files" after filtering out known old files
        from "files".

        @param files: Dictionary of items indexed by file name.
        @return: New dictionary holding only the entries whose names are
                not already recorded in the persistent database.
        """
        old_files = self.database.get_dictionary()
        # Build a fresh dict; works on both Python 2 and 3 (no iteritems).
        return dict((name, item) for name, item in files.items()
                    if name not in old_files)


    def get_new_files(self):
        # Subclasses must override. Raise NotImplementedError -- the
        # original raised the NotImplemented singleton, which is not
        # callable and produced a confusing TypeError instead.
        raise NotImplementedError('get_new_files not implemented')


    def store_files(self, files):
        """
        Record the given files in the persistent database.

        @param files: Dictionary of items indexed by file name.
        """
        self.database.merge_dictionary(files)
class rsync_source(source):
    """
    Source that discovers kernel files by running rsync in listing mode
    against one or more remote paths.
    """
    _cmd_template = '/usr/bin/rsync -rltz --no-motd %s %s/%s'

    def __init__(self, database, prefix, excludes=()):
        """
        @param database: Persistent database with known kernels information.
        @param prefix: rsync source prefix prepended to every added path.
        @param excludes: Iterable of rsync --exclude patterns.
                (Default changed from a mutable [] to an equivalent
                immutable () to avoid the shared-mutable-default pitfall.)
        """
        super(rsync_source, self).__init__(database)

        self.prefix = prefix
        self.exclude = ' '.join(['--exclude "' + x + '"' for x in excludes])
        self.sources = []


    def _parse_output(self, output, prefix):
        """
        Parse rsync's "ls -l" style output and return a dictionary of
        database.item indexed by the "name" field.

        @param output: Raw rsync listing output.
        @param prefix: Optional path prefix prepended to each file name.
        """
        # Raw string so the \d escapes reach the regex engine verbatim.
        regex = re.compile(
            r'-[rwx-]{9} +(\d+) (\d{4}/\d\d/\d\d \d\d:\d\d:\d\d) (.*)')
        res = {}
        for line in output.splitlines():
            match = regex.match(line)
            if match:
                groups = match.groups()
                timestamp = time.mktime(time.strptime(groups[1],
                                                      '%Y/%m/%d %H:%M:%S'))
                if prefix:
                    fname = '%s/%s' % (prefix, groups[2])
                else:
                    fname = groups[2]

                item = database.item(fname, int(groups[0]), int(timestamp))
                res[item.name] = item

        return res


    def add_path(self, src, prefix=''):
        """
        Add paths to synchronize from the source.
        """
        self.sources.append((src, prefix))


    def get_new_files(self):
        """
        Implement source.get_new_files by using rsync listing feature.
        """
        files = {}
        for src, prefix in self.sources:
            output = utils.system_output(self._cmd_template %
                                         (self.exclude, self.prefix, src))
            files.update(self._parse_output(output, prefix))

        return self._get_new_files(files)
class _ahref_parser(HTMLParser.HTMLParser):
    """
    HTML parser that collects the absolute URLs of all <a href=...> links
    matching a given compiled regex pattern.
    """
    def reset(self, url=None, pattern=None):
        # Extends HTMLParser.reset() so the parser can be reused across
        # documents; also (re)initializes the per-document state.
        HTMLParser.HTMLParser.reset(self)
        self.url = url
        self.pattern = pattern
        self.links = []


    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        for attr_name, attr_value in attrs:
            if attr_name != 'href':
                continue
            # Relative hrefs are resolved against the page URL.
            absolute = urlparse.urljoin(self.url, attr_value)
            if self.pattern.match(absolute):
                self.links.append(absolute)


    def get_ahref_list(self, url, pattern):
        """
        Fetch the document at "url" and return the list of link URLs in it
        that match the compiled regex "pattern".
        """
        self.reset(url, pattern)
        self.feed(urllib2.urlopen(url).read())
        self.close()

        return self.links
class url_source(source):
    """
    A simple URL based source that parses HTML to find references to
    kernel files.
    """
    # Matches names that end in a dot-extension (i.e. "files", not dirs).
    _extension_pattern = re.compile(r'.*\.[^/.]+$')

    def __init__(self, database, prefix):
        """
        @param database: Persistent database with known kernels information.
        @param prefix: Base URL that relative URLs are resolved against.
        """
        super(url_source, self).__init__(database)
        self.prefix = prefix
        self.urls = []


    def add_url(self, url, pattern):
        """
        Add a URL path to a HTML document with links to kernel files.

        @param url: URL path to a HTML file with links to kernel files
                (can be either an absolute URL or one relative to self.prefix)
        @param pattern: regex pattern to filter kernel files links out of
                all other links found in the HTML document
        """
        # if it does not have an extension then it's a directory and it needs
        # a trailing '/'. NOTE: there are some false positives such as
        # directories named "v2.6" where ".6" will be assumed to be extension.
        # In order for these to work the caller must provide a trailing /
        if url[-1:] != '/' and not self._extension_pattern.match(url):
            url = url + '/'
        self.urls.append((url, re.compile(pattern)))


    @staticmethod
    def _get_item(url):
        """
        Get a database.item object by fetching relevant HTTP information
        from the document pointed to by the given url.

        @return: database.item instance, or None if the URL is unreachable.
        """
        try:
            info = urllib2.urlopen(url).info()
        except IOError as err:
            # file is referenced but does not exist
            print('WARNING: %s' % err)
            return None

        size = info.get('content-length')
        if size:
            size = int(size)
        else:
            # Content-Length header missing; flag the size as unknown.
            size = -1

        # getdate() returns None when the Date header is absent or
        # unparsable; guard it -- time.mktime(None) raises TypeError.
        date_tuple = info.getdate('date')
        if date_tuple:
            timestamp = int(time.mktime(date_tuple))
        else:
            timestamp = 0

        return database.item(url, size, timestamp)


    def get_new_files(self):
        """
        Implement source.get_new_files by fetching and parsing each
        registered HTML index document for matching links.
        """
        parser = _ahref_parser()

        files = {}
        for url, pattern in self.urls:
            links = parser.get_ahref_list(urlparse.urljoin(self.prefix, url),
                                          pattern)
            for link in links:
                item = self._get_item(link)
                if item:
                    files[item.name] = item

        return self._get_new_files(files)
class directory_source(source):
    """
    Source that finds kernel files by listing the contents of a directory.
    """
    def __init__(self, database, path):
        """
        Initialize a directory_source instance.

        @param database: Persistent database with known kernels information.
        @param path: Path to the directory with the kernel files found by
                this source.
        """
        super(directory_source, self).__init__(database)

        self._path = path


    def get_new_files(self, _stat_func=os.stat):
        """
        Main function, see source.get_new_files().

        @param _stat_func: Used for unit testing, if we stub os.stat in the
                unit test then unit test failures get reported confusingly
                because the unit test framework tries to stat() the unit test
                file.
        """
        all_files = {}
        for entry in os.listdir(self._path):
            entry_path = os.path.join(self._path, entry)
            try:
                stat_info = _stat_func(entry_path)
            except OSError:
                # File might have been removed/renamed since we listed the
                # directory so skip it.
                continue

            # NOTE: keyed by bare name while item carries the full path --
            # preserved as-is since the persistent database stores the keys.
            all_files[entry] = database.item(entry_path, stat_info.st_size,
                                             int(stat_info.st_mtime))

        return self._get_new_files(all_files)