Update bulgarian.lang by Zahari Yurukov
[maemo-rb.git] / utils / common / gitscraper.py
blob774867f7ba7cea0b25af5de93a499565c0d96427
1 #!/usr/bin/python
2 # __________ __ ___.
3 # Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 # Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 # Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 # Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 # \/ \/ \/ \/ \/
9 # Copyright (c) 2012 Dominik Riebeling
11 # All files in this archive are subject to the GNU General Public License.
12 # See the file COPYING in the source tree root for full license agreement.
14 # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
15 # KIND, either express or implied.
'''Scrape files from a git repository.

This module provides functions to get a subset of files from a git repository.
Both the files to retrieve and the git tree to work on can be specified. That
way arbitrary trees can be retrieved (like a subset of files for a given tag).

Retrieved files can be packaged into a bzip2 compressed tarball (or a 7z
archive) or stored in a given folder for processing afterwards.

Calls git commands directly for maximum compatibility.
'''
31 import re
32 import subprocess
33 import os
34 import tarfile
35 import tempfile
36 import shutil
def get_refs(repo):
    '''Get dict matching refs to hashes from repository pointed to by repo.
    @param repo Path to repository root.
    @return Dict mapping each ref name to its abbreviated hash, both as text.
            Empty if git wrote anything to stderr.
    '''
    print("Getting list of refs")
    output = subprocess.Popen(["git", "show-ref", "--abbrev"],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    cmdout = output.communicate()
    refs = {}

    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return refs

    # Only stdout (cmdout[0]) carries the ref list; every line reads
    # "<hash> <refname>". Raw bytes pattern so \s / \S reach the regex engine
    # untouched.
    ref_line = re.compile(br'([a-f0-9]+)\s+(\S+)')
    for line in cmdout[0].splitlines():
        match = ref_line.match(line)
        if match is None:
            continue
        # ref is the key, hash its value.
        refs[match.group(2).decode()] = match.group(1).decode()

    return refs
def get_lstree(repo, start, filterlist=None):
    '''Get recursive list of tree objects for a given tree.
    @param repo Path to repository root.
    @param start Hash identifying the tree.
    @param filterlist List of paths to retrieve objects hashes for. A file is
                      kept when its path starts with one of the entries.
                      An empty list (or None) will retrieve all paths.
    @return Dict mapping filename to blob hash, both exactly as git printed
            them (raw bytes). Empty if git wrote anything to stderr or if a
            path shows up twice.
    '''
    if filterlist is None:
        filterlist = []
    # -z: entries are NUL terminated and paths are printed verbatim instead of
    # being C-quoted, so names containing blanks or non-ASCII bytes survive.
    output = subprocess.Popen(["git", "ls-tree", "-r", "-z", start],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    cmdout = output.communicate()
    objects = {}

    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return objects

    for entry in cmdout[0].split(b'\0'):
        # Each entry reads "<mode> <type> <hash>\t<path>"; the tab is the only
        # reliable boundary since the path itself may contain blanks.
        meta, tab, path = entry.partition(b'\t')
        fields = meta.split()
        if not tab or len(fields) != 3:
            continue

        # filter (entries of filterlist are text, so compare against text)
        if filterlist:
            name = path.decode('utf-8', 'replace')
            if not any(name.startswith(f) for f in filterlist):
                continue

        # If two files have the same content they have the same hash, so
        # the filename has to be used as key.
        if path in objects:
            print("FATAL: key already exists in dict!")
            return {}
        objects[path] = fields[2]
    return objects
def get_file_timestamp(repo, tree, filename):
    '''Get timestamp for a file.
    @param repo Path to repository root.
    @param tree Hash of tree to use.
    @param filename Filename in tree
    @return Timestamp as string: whatever "git log --format=%ai -n 1" printed
            for the file, with trailing whitespace removed. Empty if git
            printed nothing.
    '''
    # "--" marks everything after it as a path. Without it git has to guess
    # whether filename is a revision or a path, and gives up when the file is
    # not present in the current working tree.
    output = subprocess.Popen(
            ["git", "log", "--format=%ai", "-n", "1", tree, "--", filename],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    cmdout = output.communicate()

    # Report problems like the other helpers do, but still hand back whatever
    # git printed so callers see the same value as before.
    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])

    return cmdout[0].decode().rstrip()
def get_object(repo, blob, destfile):
    '''Get an identified object from the repository.
    @param repo Path to repository root.
    @param blob hash for blob to retrieve.
    @param destfile filename for blob output. Missing parent folders are
                    created.
    @return True if file was successfully written, False if git wrote anything
            to stderr (nothing is written in that case).
    '''
    output = subprocess.Popen(["git", "cat-file", "-p", blob],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    cmdout = output.communicate()
    if len(cmdout[1]) > 0:
        print("An error occured!\n")
        print(cmdout[1])
        return False

    # make sure output path exists. A bare filename has no folder part, and
    # os.makedirs() refuses an empty path, so skip it then.
    destdir = os.path.dirname(destfile)
    if destdir and not os.path.exists(destdir):
        os.makedirs(destdir)

    # "with" closes the handle even if the write fails.
    with open(destfile, 'wb') as f:
        f.write(cmdout[0])
    return True
def describe_treehash(repo, treehash):
    '''Retrieve output of git-describe for a given hash.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree / commit to describe.
    @return Description string: git's stdout with trailing whitespace removed,
            passed through undecoded (i.e. a bytes object on Python 3). The
            empty text string "" if git wrote anything to stderr.
    '''
    output = subprocess.Popen(["git", "describe", treehash],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    described, errors = output.communicate()
    if len(errors) > 0:
        print("An error occured!\n")
        print(errors)
        return ""
    # NOTE(review): success and error paths differ in type on Python 3 (bytes
    # vs. str). Left as is because callers may already rely on getting bytes.
    return described.rstrip()
def scrape_files(repo, treehash, filelist, dest="", timestamp_files=None):
    '''Scrape list of files from repository.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to get from repository.
    @param dest Destination path for files. Files will get retrieved with full
                path from the repository, and the folder structure will get
                created below dest as necessary. If empty, a new temporary
                folder is created and used.
    @param timestamp_files List of files to also get the last modified date.
                           A file qualifies when its path starts with one of
                           the entries.
                           WARNING: this is SLOW!
    @return Destination path, filename:timestamp dict. The dict is keyed by
            the filenames as returned by get_lstree().
    '''
    print("Scraping files from repository")

    if timestamp_files is None:
        timestamp_files = []
    if dest == "":
        dest = tempfile.mkdtemp()
    treeobjects = get_lstree(repo, treehash, filelist)
    timestamps = {}
    for obj in treeobjects:
        get_object(repo, treeobjects[obj], os.path.join(dest.encode(), obj))

        # get_lstree() hands out raw bytes while timestamp_files holds text;
        # comparing the two directly raises TypeError on Python 3.
        if isinstance(obj, bytes):
            name = obj.decode('utf-8', 'replace')
        else:
            name = obj
        # any() stops at the first hit, so git log runs at most once per file.
        if any(name.startswith(f) for f in timestamp_files):
            timestamps[obj] = get_file_timestamp(repo, treehash, obj)

    return [dest, timestamps]
def archive_files(repo, treehash, filelist, basename, tmpfolder="",
                  archive="tbz"):
    '''Archive list of files into tarball.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to archive. All files in the archive if left
                    empty.
    @param basename Basename (including path) of output file. Will get used as
                    basename inside of the archive as well (i.e. no tarbomb).
                    If empty the files are only scraped, nothing is archived.
    @param tmpfolder Folder to put intermediate files in. If no folder is given
                     a temporary one will get used (and removed afterwards).
    @param archive Type of archive to create. Supported values are "tbz" and
                   "7z". The latter requires the 7z binary available in the
                   system's path.
    @return Output filename. Empty if basename is empty or the archive type is
            not supported.
    '''
    temp_remove = tmpfolder == ""
    if temp_remove:
        tmpfolder = tempfile.mkdtemp()

    # Preset so an unsupported archive type returns "" instead of failing with
    # an unbound local at the end.
    outfile = ""
    try:
        workfolder = scrape_files(repo, treehash, filelist,
                                  os.path.join(tmpfolder, basename))[0]
        # Compare by value; identity against a literal is not reliable.
        if basename == "":
            return outfile
        print("Archiving files from repository")
        if archive == "7z":
            outfile = basename + ".7z"
            # 7z runs inside tmpfolder, hence the absolute output path.
            output = subprocess.Popen(["7z", "a",
                    os.path.join(os.getcwd(), basename + ".7z"), basename],
                    cwd=tmpfolder, stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
            output.communicate()
        elif archive == "tbz":
            outfile = basename + ".tar.bz2"
            tf = tarfile.open(outfile, "w:bz2")
            try:
                tf.add(workfolder, basename)
            finally:
                # release the archive handle even if adding files fails
                tf.close()
        else:
            print("Files not archived")
        if tmpfolder != workfolder:
            shutil.rmtree(workfolder)
    finally:
        # A folder we created ourselves is unknown to the caller, so it must
        # not outlive this call, whatever happened above.
        if temp_remove:
            shutil.rmtree(tmpfolder)
    return outfile