gitscraper: support compressing the sources as 7z.
[maemo-rb.git] / utils / common / gitscraper.py
bloba6b6cf39a6164b6654ab190585d297066337fdce
1 #!/usr/bin/python
2 # __________ __ ___.
3 # Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 # Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 # Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 # Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 # \/ \/ \/ \/ \/
9 # Copyright (c) 2012 Dominik Riebeling
11 # All files in this archive are subject to the GNU General Public License.
12 # See the file COPYING in the source tree root for full license agreement.
14 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
15 # KIND, either express or implied.
18 '''Scrape files from a git repository.
20 This module provides functions to get a subset of files from a git repository.
21 The files to retrieve can be specified, and the git tree to work on can be
22 specified. That way arbitrary trees can be retrieved (like a subset of files
23 for a given tag).
25 Retrieved files can be packaged into a bzip2 compressed tarball or a 7z
26 archive, or stored in a given folder for processing afterwards.
28 Calls git commands directly for maximum compatibility.
29 '''
31 import re
32 import subprocess
33 import os
34 import tarfile
35 import tempfile
36 import shutil
def get_refs(repo):
    '''Get dict matching refs to hashes from repository pointed to by repo.
    @param repo Path to repository root.
    @return Dict mapping each ref name to its hash. Empty if git wrote
            anything to stderr.
    '''
    print("Getting list of refs")
    # universal_newlines makes communicate() hand back text instead of bytes
    # on Python 3, so the str regex below works on both major versions.
    output = subprocess.Popen(["git", "show-ref"], stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE, cwd=repo,
                              universal_newlines=True)
    cmdout = output.communicate()
    refs = {}

    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return refs

    # Only stdout carries refs; each line reads "<hash> <refname>".
    entry = re.compile(r'([a-f0-9]+)\s+(\S+)')
    for line in cmdout[0].splitlines():
        found = entry.search(line)
        if found is None:
            continue
        # ref is the key, hash its value.
        refs[found.group(2)] = found.group(1)

    return refs
def get_lstree(repo, start, filterlist=None):
    '''Get recursive list of tree objects for a given tree.
    @param repo Path to repository root.
    @param start Hash identifying the tree.
    @param filterlist List of path prefixes to retrieve object hashes for.
                      An empty list (or None) will retrieve all paths.
    @return Dict mapping filename to blob hash. Empty if git wrote anything
            to stderr or if a filename showed up twice.
    '''
    if filterlist is None:
        filterlist = []
    # Text mode so the output can be split and matched as str on Python 3 too.
    output = subprocess.Popen(["git", "ls-tree", "-r", start],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo, universal_newlines=True)
    cmdout = output.communicate()
    objects = {}

    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return objects

    prefixes = tuple(filterlist)
    # ls-tree prints "<mode> <type> <hash>", then a TAB, then the path. Taking
    # everything after the TAB keeps file names that contain spaces intact.
    entry = re.compile(r'([0-9]+)\s+([a-z]+)\s+([0-9a-f]+)\t(.+)$')
    for line in cmdout[0].split('\n'):
        found = entry.match(line)
        if found is None:
            continue
        objhash = found.group(3)
        path = found.group(4)
        # filter: keep the entry only if it starts with a requested prefix.
        if prefixes and not path.startswith(prefixes):
            continue
        # If two files have the same content they have the same hash, so
        # the filename has to be used as key.
        if path in objects:
            print("FATAL: key already exists in dict!")
            return {}
        objects[path] = objhash
    return objects
def get_object(repo, blob, destfile):
    '''Get an identified object from the repository.
    @param repo Path to repository root.
    @param blob hash for blob to retrieve.
    @param destfile filename for blob output. Missing parent folders get
                    created.
    @return True if file was successfully written, False if git wrote
            anything to stderr.
    '''
    # No text mode here: the blob is written out exactly as git returns it.
    output = subprocess.Popen(["git", "cat-file", "-p", blob],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo)
    cmdout = output.communicate()
    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return False
    # make sure output path exists. A bare filename has an empty dirname,
    # which os.makedirs() would reject, so skip the creation in that case.
    destdir = os.path.dirname(destfile)
    if destdir and not os.path.exists(destdir):
        os.makedirs(destdir)
    # Write the whole blob in one call; the with-block closes the file even
    # if the write fails.
    with open(destfile, 'wb') as f:
        f.write(cmdout[0])
    return True
def describe_treehash(repo, treehash):
    '''Retrieve output of git-describe for a given hash.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree / commit to describe.
    @return Description string with trailing whitespace removed, or an empty
            string if git wrote anything to stderr.
    '''
    # Text mode so the description comes back as str on Python 3 as well.
    output = subprocess.Popen(["git", "describe", treehash],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                              cwd=repo, universal_newlines=True)
    cmdout = output.communicate()
    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return ""
    return cmdout[0].rstrip()
def scrape_files(repo, treehash, filelist, dest=""):
    '''Scrape list of files from repository.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to get from repository.
    @param dest Destination path for files. Each file is written below dest
                under its full repository path, with folders created as
                needed. If empty, a fresh temporary folder is used.
    @return Destination path.
    '''
    print("Scraping files from repository")

    # Fall back to a new temporary folder when no destination was given.
    target = dest if dest != "" else tempfile.mkdtemp()
    wanted = get_lstree(repo, treehash, filelist)
    for path, blobhash in wanted.items():
        get_object(repo, blobhash, os.path.join(target, path))

    return target
def archive_files(repo, treehash, filelist, basename, tmpfolder="",
                  archive="tbz"):
    '''Archive list of files into tarball.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to archive. All files in the archive if left
                    empty.
    @param basename Basename (including path) of output file. Will get used as
                    basename inside of the archive as well (i.e. no tarbomb).
    @param tmpfolder Folder to put intermediate files in. If no folder is given
                     a temporary one will get used.
    @param archive Type of archive to create. Supported values are "tbz" and
                   "7z". The latter requires the 7z binary available in the
                   system's path. Any other value falls back to "tbz".
    @return Output filename, or an empty string if basename is empty.
    '''
    if tmpfolder == "":
        temp_remove = True
        tmpfolder = tempfile.mkdtemp()
    else:
        temp_remove = False
    try:
        workfolder = scrape_files(repo, treehash, filelist,
                                  os.path.join(tmpfolder, basename))
        # Compare by value; an identity check against a literal is unreliable.
        if basename == "":
            return ""
        print("Archiving files from repository")
        if archive == "7z":
            outfile = basename + ".7z"
            # 7z runs inside tmpfolder so that the archive members start with
            # basename; the archive itself goes to the caller's working
            # directory, hence the absolute output path.
            output = subprocess.Popen(
                ["7z", "a", os.path.join(os.getcwd(), outfile), basename],
                cwd=tmpfolder, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            cmdout = output.communicate()
            if output.returncode != 0:
                # Report the failure instead of dropping it silently.
                print("An error occured!\n")
                print(cmdout[1])
        else:
            outfile = basename + ".tar.bz2"
            tf = tarfile.open(outfile, "w:bz2")
            try:
                tf.add(workfolder, basename)
            finally:
                tf.close()
        # Drop the scraped tree once it has been archived.
        if tmpfolder != workfolder:
            shutil.rmtree(workfolder)
        return outfile
    finally:
        # A temporary folder created here is removed on every exit path,
        # including the early return above and any exception.
        if temp_remove:
            shutil.rmtree(tmpfolder)