cvs2svn.git: cvs2svn_lib/sort.py
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2009 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals.  For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================

"""Functions to sort large files.

The functions in this module were originally downloaded from the
following URL:

    http://code.activestate.com/recipes/466302/

It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
According to the terms of service of that website, the code is usable
under the MIT license.

"""


import os
import shutil
import heapq
import itertools
import tempfile


# The buffer size to use for open files:
BUFSIZE = 64 * 1024


def get_default_max_merge():
  """Return the default maximum number of files to merge at once."""

  # The maximum number of files to merge at once.  This number cannot
  # be unlimited because there are operating system restrictions on
  # the number of files that a process can have open at once.  So...
  try:
    # If this constant is available via sysconf, we use half the
    # number available to the process as a whole.
    _SC_OPEN_MAX = os.sysconf('SC_OPEN_MAX')
    if _SC_OPEN_MAX == -1:
      # This also indicates an error:
      raise ValueError()
    return min(_SC_OPEN_MAX // 2, 100)
  except:
    # Otherwise, simply limit the number to this constant, which will
    # hopefully be OK on all operating systems:
    return 50


DEFAULT_MAX_MERGE = get_default_max_merge()

def merge(iterables, key=None):
  """Merge (in the sense of mergesort) ITERABLES.

  Generate the output in order.  If KEY is specified, it should be a
  function that returns the sort key."""

  if key is None:
    key = lambda x : x

  values = []

  for index, iterable in enumerate(iterables):
    try:
      iterator = iter(iterable)
      value = iterator.next()
    except StopIteration:
      pass
    else:
      values.append((key(value), index, value, iterator))

  heapq.heapify(values)

  while values:
    k, index, value, iterator = heapq.heappop(values)
    yield value
    try:
      value = iterator.next()
    except StopIteration:
      pass
    else:
      heapq.heappush(values, (key(value), index, value, iterator))

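
# Illustrative note (not part of the original recipe): merge() keeps only
# one pending value per input on the heap, so already-sorted streams are
# combined lazily.  For example, assuming each input list is sorted,
#
#   list(merge([[1, 4, 7], [2, 5], [3, 6, 8]]))
#
# would produce [1, 2, 3, 4, 5, 6, 7, 8]; a KEY function such as str.lower
# could be used to merge streams of case-insensitively sorted strings.
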
def merge_files_onepass(input_filenames, output_filename, key=None):
  """Merge a number of input files into one output file.

  This is a merge in the sense of mergesort; namely, it is assumed
  that the input files are each sorted, and (under that assumption)
  the output file will also be sorted."""

  input_filenames = list(input_filenames)
  if len(input_filenames) == 1:
    shutil.move(input_filenames[0], output_filename)
  else:
    output_file = file(output_filename, 'wb', BUFSIZE)
    try:
      chunks = []
      try:
        for input_filename in input_filenames:
          chunks.append(open(input_filename, 'rb', BUFSIZE))
        output_file.writelines(merge(chunks, key))
      finally:
        for chunk in chunks:
          try:
            chunk.close()
          except:
            pass
    finally:
      output_file.close()

def _try_delete_files(filenames):
  """Try to remove the named files.  Ignore errors."""

  for filename in filenames:
    try:
      os.remove(filename)
    except:
      pass

def tempfile_generator(tempdirs=[]):
  """Yield filenames of temporary files."""

  # Create an iterator that will choose directories to hold the
  # temporary files:
  if tempdirs:
    tempdirs = itertools.cycle(tempdirs)
  else:
    tempdirs = itertools.repeat(tempfile.gettempdir())

  i = 0
  while True:
    (fd, filename) = tempfile.mkstemp(
        '', 'sort%06i-' % (i,), tempdirs.next(), False
        )
    os.close(fd)
    yield filename
    i += 1

def _merge_file_generation(
    input_filenames, delete_inputs, key=None,
    max_merge=DEFAULT_MAX_MERGE, tempfiles=None,
    ):
  """Merge multiple input files into fewer output files.

  This is a merge in the sense of mergesort; namely, it is assumed
  that the input files are each sorted, and (under that assumption)
  the output file will also be sorted.  At most MAX_MERGE input files
  will be merged at once, to avoid exceeding operating system
  restrictions on the number of files that can be open at one time.

  If DELETE_INPUTS is True, then the input files will be deleted when
  they are no longer needed.

  If temporary files need to be used, they will be created using the
  specified TEMPFILES tempfile generator.

  Generate the names of the output files."""

  if max_merge <= 1:
    raise ValueError('max_merge must be greater than one')

  if tempfiles is None:
    tempfiles = tempfile_generator()

  filenames = list(input_filenames)
  if len(filenames) <= 1:
    raise ValueError('It makes no sense to merge a single file')

  while filenames:
    group = filenames[:max_merge]
    del filenames[:max_merge]
    group_output = tempfiles.next()
    merge_files_onepass(group, group_output, key=key)
    if delete_inputs:
      _try_delete_files(group)
    yield group_output

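
# Worked example (illustrative, not part of the original code): with
# max_merge=100, a single generation pass over 250 sorted input files
# yields ceil(250 / 100) = 3 intermediate files -- two merged from 100
# inputs each, plus one merged from the remaining 50.
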
def merge_files(
    input_filenames, output_filename, key=None, delete_inputs=False,
    max_merge=DEFAULT_MAX_MERGE, tempfiles=None,
    ):
  """Merge a number of input files into one output file.

  This is a merge in the sense of mergesort; namely, it is assumed
  that the input files are each sorted, and (under that assumption)
  the output file will also be sorted.  At most MAX_MERGE input files
  will be merged at once, to avoid exceeding operating system
  restrictions on the number of files that can be open at one time.

  If DELETE_INPUTS is True, then the input files will be deleted when
  they are no longer needed.

  If temporary files need to be used, they will be created using the
  specified TEMPFILES tempfile generator."""

  filenames = list(input_filenames)
  if not filenames:
    # Create an empty file:
    open(output_filename, 'wb').close()
  else:
    if tempfiles is None:
      tempfiles = tempfile_generator()
    while len(filenames) > max_merge:
      # Reduce the number of files by performing groupwise merges:
      filenames = list(
          _merge_file_generation(
              filenames, delete_inputs, key=key,
              max_merge=max_merge, tempfiles=tempfiles
              )
          )
      # After the first iteration, we are only working with temporary
      # files, so they can definitely be deleted when we are done
      # with them:
      delete_inputs = True

    # The last merge writes the results directly into the output
    # file:
    merge_files_onepass(filenames, output_filename, key=key)
    if delete_inputs:
      _try_delete_files(filenames)

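
# Minimal usage sketch (hypothetical filenames, not part of the original
# module): assuming 'part1.sorted' and 'part2.sorted' each contain sorted
# lines, the following call would combine them into 'combined.sorted' and
# delete the inputs afterwards:
#
#   merge_files(
#       ['part1.sorted', 'part2.sorted'], 'combined.sorted',
#       delete_inputs=True,
#       )
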
def sort_file(
    input, output, key=None,
    buffer_size=32000, tempdirs=[], max_merge=DEFAULT_MAX_MERGE,
    ):
  """Sort the lines of file INPUT, writing the result to file OUTPUT.

  The input is read and sorted in chunks of BUFFER_SIZE lines; each
  sorted chunk is written to a temporary file, and the temporary files
  are then merged into OUTPUT.  If KEY is specified, it should be a
  function that returns the sort key."""

  tempfiles = tempfile_generator(tempdirs)

  filenames = []

  input_file = file(input, 'rb', BUFSIZE)
  try:
    try:
      input_iterator = iter(input_file)
      while True:
        current_chunk = list(itertools.islice(input_iterator, buffer_size))
        if not current_chunk:
          break
        current_chunk.sort(key=key)
        filename = tempfiles.next()
        filenames.append(filename)
        f = open(filename, 'w+b', BUFSIZE)
        try:
          f.writelines(current_chunk)
        finally:
          f.close()
    finally:
      input_file.close()

    merge_files(
        filenames, output, key=key,
        delete_inputs=True, max_merge=max_merge, tempfiles=tempfiles,
        )
  finally:
    _try_delete_files(filenames)
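

if __name__ == '__main__':
  # Minimal smoke test (not part of the original module): sort this file's
  # own lines into a temporary file and report how many lines came out.
  # The small buffer_size forces the chunked-sort-then-merge path; nothing
  # below is required by cvs2svn itself.
  import sys

  (fd, demo_output) = tempfile.mkstemp('', 'sort-demo-')
  os.close(fd)
  try:
    sort_file(__file__, demo_output, buffer_size=16)
    f = open(demo_output, 'rb')
    try:
      line_count = len(f.readlines())
    finally:
      f.close()
    sys.stdout.write('sorted %d lines from %r\n' % (line_count, __file__))
  finally:
    _try_delete_files([demo_output])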