1 # Copyright 2009 Google Inc. Released under the GPL v2
3 import os
, re
, time
, urllib2
, urlparse
, HTMLParser
5 from autotest_lib
.mirror
import database
6 from autotest_lib
.client
.common_lib
import utils
class source(object):
    """
    Abstract Base Class for the source classes.

    A source enumerates candidate kernel files and uses a persistent
    database (autotest_lib.mirror.database) to remember which files it
    has already seen.
    """
    def __init__(self, database):
        # Persistent store of already-known files; must provide
        # get_dictionary() and merge_dictionary() (see mirror.database).
        self.database = database


    def _get_new_files(self, files):
        """
        Return a copy of "files" after filtering out known old files.

        @param files: dictionary of items indexed by file name.
        @return: dictionary containing only the entries of "files" whose
                names are not already recorded in the database.
        """
        old_files = self.database.get_dictionary()
        return dict(filter(lambda x: x[0] not in old_files,
                           files.iteritems()))


    def get_new_files(self):
        # Bug fix: NotImplemented is a constant, not an exception class;
        # "raise NotImplemented(...)" would die with a confusing TypeError.
        raise NotImplementedError('get_new_files not implemented')


    def store_files(self, files):
        """Record "files" in the persistent database of known files."""
        self.database.merge_dictionary(files)
class rsync_source(source):
    """
    Source that finds kernel files by listing rsync paths on a server.
    """
    _cmd_template = '/usr/bin/rsync -rltz --no-motd %s %s/%s'

    def __init__(self, database, prefix, excludes=None):
        """
        @param database: Persistent database with known kernels information.
        @param prefix: Common rsync URL prefix for all paths added through
                add_path().
        @param excludes: Optional list of rsync --exclude patterns.
        """
        super(rsync_source, self).__init__(database)

        self.prefix = prefix
        # Bug fix: a mutable default argument ([]) is shared between calls;
        # use a None sentinel instead (backward compatible for callers).
        if excludes is None:
            excludes = []
        self.exclude = ' '.join(['--exclude "' + x + '"' for x in excludes])
        self.sources = []


    def _parse_output(self, output, prefix):
        """
        Parse rsync's "ls -l" style output and return a dictionary of
        database.item indexed by the "name" field.

        @param output: raw rsync listing output.
        @param prefix: prefix prepended to each parsed file name.
        """
        # matches regular-file lines: permissions, size, timestamp, name
        regex = re.compile(
            r'-[rwx-]{9} +(\d+) (\d{4}/\d\d/\d\d \d\d:\d\d:\d\d) (.*)')
        res = {}
        for line in output.splitlines():
            match = regex.match(line)
            if match:
                groups = match.groups()
                timestamp = time.mktime(time.strptime(groups[1],
                                                      '%Y/%m/%d %H:%M:%S'))
                if prefix:
                    fname = '%s/%s' % (prefix, groups[2])
                else:
                    fname = groups[2]

                item = database.item(fname, int(groups[0]), int(timestamp))
                res[item.name] = item

        return res


    def add_path(self, src, prefix=''):
        """
        Add paths to synchronize from the source.

        @param src: Path (relative to self.prefix) to list on the server.
        @param prefix: Prefix prepended to the names of the files found
                under "src" when building their database keys.
        """
        self.sources.append((src, prefix))


    def get_new_files(self):
        """
        Implement source.get_new_files by using rsync listing feature.
        """
        files = {}
        for src, prefix in self.sources:
            output = utils.system_output(self._cmd_template %
                                         (self.exclude, self.prefix, src))
            files.update(self._parse_output(output, prefix))

        return self._get_new_files(files)
class _ahref_parser(HTMLParser.HTMLParser):
    """
    HTML parser that collects the absolute URLs of all <a href=...> links
    whose resolved URL matches a given regular expression.
    """
    def reset(self, url=None, pattern=None):
        """
        Reset parser state and set the base url/pattern for a new document.

        @param url: base URL used to resolve relative hrefs.
        @param pattern: compiled regex that accepted links must match.
        """
        HTMLParser.HTMLParser.reset(self)
        self.url = url
        self.pattern = pattern
        self.links = []


    def handle_starttag(self, tag, attrs):
        # only anchor tags carry the links we are interested in
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # compose absolute URL if relative "href" found
                    url = urlparse.urljoin(self.url, value)
                    if self.pattern.match(url):
                        self.links.append(url)


    def get_ahref_list(self, url, pattern):
        """
        Fetch the document at "url" and return the list of links in it
        matching "pattern".
        """
        self.reset(url, pattern)
        self.feed(urllib2.urlopen(url).read())
        self.close()

        return self.links
116 class url_source(source
):
118 A simple URL based source that parses HTML to find references to
121 _extension_pattern
= re
.compile(r
'.*\.[^/.]+$')
123 def __init__(self
, database
, prefix
):
124 super(url_source
, self
).__init
__(database
)
129 def add_url(self
, url
, pattern
):
131 Add a URL path to a HTML document with links to kernel files.
133 @param url: URL path to a HTML file with links to kernel files
134 (can be either an absolute URL or one relative to self.prefix)
135 @param pattern: regex pattern to filter kernel files links out of
136 all othe links found in the HTML document
138 # if it does not have an extension then it's a directory and it needs
139 # a trailing '/'. NOTE: there are some false positives such as
140 # directories named "v2.6" where ".6" will be assumed to be extension.
141 # In order for these to work the caller must provide a trailing /
142 if url
[-1:] != '/' and not self
._extension
_pattern
.match(url
):
144 self
.urls
.append((url
, re
.compile(pattern
)))
150 Get a database.item object by fetching relevant HTTP information
151 from the document pointed to by the given url.
154 info
= urllib2
.urlopen(url
).info()
156 # file is referenced but does not exist
157 print 'WARNING: %s' % err
160 size
= info
.get('content-length')
166 timestamp
= int(time
.mktime(info
.getdate('date')))
170 return database
.item(url
, size
, timestamp
)
173 def get_new_files(self
):
174 parser
= _ahref_parser()
177 for url
, pattern
in self
.urls
:
178 links
= parser
.get_ahref_list(urlparse
.urljoin(self
.prefix
, url
),
181 item
= self
._get
_item
(link
)
183 files
[item
.name
] = item
185 return self
._get
_new
_files
(files
)
class directory_source(source):
    """
    Source that finds kernel files by listing the contents of a directory.
    """
    def __init__(self, database, path):
        """
        Initialize a directory_source instance.

        @param database: Persistent database with known kernels information.
        @param path: Path to the directory with the kernel files found by
                this source.
        """
        super(directory_source, self).__init__(database)

        self._path = path


    def get_new_files(self, _stat_func=os.stat):
        """
        Main function, see source.get_new_files().

        @param _stat_func: Used for unit testing, if we stub os.stat in the
                unit test then unit test failures get reported confusingly
                because the unit test framework tries to stat() the unit
                test files.
        """
        all_files = {}
        for filename in os.listdir(self._path):
            full_filename = os.path.join(self._path, filename)
            try:
                stat_data = _stat_func(full_filename)
            except OSError:
                # File might have been removed/renamed since we listed the
                # directory so skip it.
                continue

            item = database.item(full_filename, stat_data.st_size,
                                 int(stat_data.st_mtime))
            all_files[filename] = item

        return self._get_new_files(all_files)