Planner: remove backend application
[autotest-zwu.git] / mirror / source.py
blobec602507a1835c70785678276879bb971376a0c1
1 # Copyright 2009 Google Inc. Released under the GPL v2
3 import os, re, time, urllib2, urlparse, HTMLParser
5 from autotest_lib.mirror import database
6 from autotest_lib.client.common_lib import utils
class source(object):
    """
    Abstract Base Class for the source classes.
    """
    def __init__(self, database):
        # Persistent database with known kernels information; must provide
        # get_dictionary() and merge_dictionary() (see mirror.database).
        self.database = database


    def _get_new_files(self, files):
        """
        Return a copy of "files" after filtering out known old files
        from "files".

        @param files: Dictionary of items indexed by file name.
        @return: New dictionary holding only the entries whose names are
                not already recorded in the persistent database.
        """
        old_files = self.database.get_dictionary()
        # Build a fresh dict; works on both Python 2 and 3 (no iteritems).
        return dict((name, item) for name, item in files.items()
                    if name not in old_files)


    def get_new_files(self):
        # Subclasses must override. Raise NotImplementedError -- the
        # original raised the NotImplemented singleton, which is not
        # callable and produced a confusing TypeError instead.
        raise NotImplementedError('get_new_files not implemented')


    def store_files(self, files):
        """
        Record the given files in the persistent database.

        @param files: Dictionary of items indexed by file name.
        """
        self.database.merge_dictionary(files)
class rsync_source(source):
    """
    Source that discovers kernel files by running rsync in listing mode
    against one or more remote paths.
    """
    _cmd_template = '/usr/bin/rsync -rltz --no-motd %s %s/%s'

    def __init__(self, database, prefix, excludes=()):
        """
        @param database: Persistent database with known kernels information.
        @param prefix: rsync source prefix prepended to every added path.
        @param excludes: Iterable of rsync --exclude patterns.
                (Default changed from a mutable [] to an equivalent
                immutable () to avoid the shared-mutable-default pitfall.)
        """
        super(rsync_source, self).__init__(database)

        self.prefix = prefix
        self.exclude = ' '.join(['--exclude "' + x + '"' for x in excludes])
        self.sources = []


    def _parse_output(self, output, prefix):
        """
        Parse rsync's "ls -l" style output and return a dictionary of
        database.item indexed by the "name" field.

        @param output: Raw rsync listing output.
        @param prefix: Optional path prefix prepended to each file name.
        """
        # Raw string so the \d escapes reach the regex engine verbatim.
        regex = re.compile(
            r'-[rwx-]{9} +(\d+) (\d{4}/\d\d/\d\d \d\d:\d\d:\d\d) (.*)')
        res = {}
        for line in output.splitlines():
            match = regex.match(line)
            if match:
                groups = match.groups()
                timestamp = time.mktime(time.strptime(groups[1],
                                                      '%Y/%m/%d %H:%M:%S'))
                if prefix:
                    fname = '%s/%s' % (prefix, groups[2])
                else:
                    fname = groups[2]

                item = database.item(fname, int(groups[0]), int(timestamp))
                res[item.name] = item

        return res


    def add_path(self, src, prefix=''):
        """
        Add paths to synchronize from the source.
        """
        self.sources.append((src, prefix))


    def get_new_files(self):
        """
        Implement source.get_new_files by using rsync listing feature.
        """
        files = {}
        for src, prefix in self.sources:
            output = utils.system_output(self._cmd_template %
                                         (self.exclude, self.prefix, src))
            files.update(self._parse_output(output, prefix))

        return self._get_new_files(files)
class _ahref_parser(HTMLParser.HTMLParser):
    """
    HTML parser that collects the absolute URLs of all <a href=...> links
    matching a given compiled regex pattern.
    """
    def reset(self, url=None, pattern=None):
        # Extends HTMLParser.reset() so the parser can be reused across
        # documents; also (re)initializes the per-document state.
        HTMLParser.HTMLParser.reset(self)
        self.url = url
        self.pattern = pattern
        self.links = []


    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        for attr_name, attr_value in attrs:
            if attr_name != 'href':
                continue
            # Relative hrefs are resolved against the page URL.
            absolute = urlparse.urljoin(self.url, attr_value)
            if self.pattern.match(absolute):
                self.links.append(absolute)


    def get_ahref_list(self, url, pattern):
        """
        Fetch the document at "url" and return the list of link URLs in it
        that match the compiled regex "pattern".
        """
        self.reset(url, pattern)
        self.feed(urllib2.urlopen(url).read())
        self.close()

        return self.links
class url_source(source):
    """
    A simple URL based source that parses HTML to find references to
    kernel files.
    """
    # Matches names that end in a dot-extension (i.e. "files", not dirs).
    _extension_pattern = re.compile(r'.*\.[^/.]+$')

    def __init__(self, database, prefix):
        """
        @param database: Persistent database with known kernels information.
        @param prefix: Base URL that relative URLs are resolved against.
        """
        super(url_source, self).__init__(database)
        self.prefix = prefix
        self.urls = []


    def add_url(self, url, pattern):
        """
        Add a URL path to a HTML document with links to kernel files.

        @param url: URL path to a HTML file with links to kernel files
                (can be either an absolute URL or one relative to self.prefix)
        @param pattern: regex pattern to filter kernel files links out of
                all other links found in the HTML document
        """
        # if it does not have an extension then it's a directory and it needs
        # a trailing '/'. NOTE: there are some false positives such as
        # directories named "v2.6" where ".6" will be assumed to be extension.
        # In order for these to work the caller must provide a trailing /
        if url[-1:] != '/' and not self._extension_pattern.match(url):
            url = url + '/'
        self.urls.append((url, re.compile(pattern)))


    @staticmethod
    def _get_item(url):
        """
        Get a database.item object by fetching relevant HTTP information
        from the document pointed to by the given url.

        @return: database.item instance, or None if the URL is unreachable.
        """
        try:
            info = urllib2.urlopen(url).info()
        except IOError as err:
            # file is referenced but does not exist
            print('WARNING: %s' % err)
            return None

        size = info.get('content-length')
        if size:
            size = int(size)
        else:
            # Content-Length header missing; flag the size as unknown.
            size = -1

        # getdate() returns None when the Date header is absent or
        # unparsable; guard it -- time.mktime(None) raises TypeError.
        date_tuple = info.getdate('date')
        if date_tuple:
            timestamp = int(time.mktime(date_tuple))
        else:
            timestamp = 0

        return database.item(url, size, timestamp)


    def get_new_files(self):
        """
        Implement source.get_new_files by fetching and parsing each
        registered HTML index document for matching links.
        """
        parser = _ahref_parser()

        files = {}
        for url, pattern in self.urls:
            links = parser.get_ahref_list(urlparse.urljoin(self.prefix, url),
                                          pattern)
            for link in links:
                item = self._get_item(link)
                if item:
                    files[item.name] = item

        return self._get_new_files(files)
class directory_source(source):
    """
    Source that finds kernel files by listing the contents of a directory.
    """
    def __init__(self, database, path):
        """
        Initialize a directory_source instance.

        @param database: Persistent database with known kernels information.
        @param path: Path to the directory with the kernel files found by
                this source.
        """
        super(directory_source, self).__init__(database)

        self._path = path


    def get_new_files(self, _stat_func=os.stat):
        """
        Main function, see source.get_new_files().

        @param _stat_func: Used for unit testing, if we stub os.stat in the
                unit test then unit test failures get reported confusingly
                because the unit test framework tries to stat() the unit test
                file.
        """
        all_files = {}
        for entry in os.listdir(self._path):
            entry_path = os.path.join(self._path, entry)
            try:
                stat_info = _stat_func(entry_path)
            except OSError:
                # File might have been removed/renamed since we listed the
                # directory so skip it.
                continue

            # NOTE: keyed by bare name while item carries the full path --
            # preserved as-is since the persistent database stores the keys.
            all_files[entry] = database.item(entry_path, stat_info.st_size,
                                             int(stat_info.st_mtime))

        return self._get_new_files(all_files)