Extract functions generate_edits_from_blocks() and write_edits().
[cvs2svn.git] / cvs2svn_lib / sort.py
blob1d0040b639220d1f32daad8ae3222ed16a2b28ec
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Functions to sort large files.
19 The functions in this module were originally downloaded from the
20 following URL:
22 http://code.activestate.com/recipes/466302/
24 It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
25 According to the terms of service of that website, the code is usable
26 under the MIT license.
28 """
31 import os
32 import heapq
33 import itertools
34 import tempfile
37 # The buffer size to use for open files:
38 BUFSIZE = 64 * 1024
41 def get_default_max_merge():
42 """Return the default maximum number of files to merge at once."""
44 # The maximum number of files to merge at once. This number cannot
45 # be unlimited because there are operating system restrictions on
46 # the number of files that a process can have open at once. So...
47 try:
48 # If this constant is available via sysconf, we use half the
49 # number available to the process as a whole.
50 _SC_OPEN_MAX = os.sysconf('SC_OPEN_MAX')
51 if _SC_OPEN_MAX == -1:
52 # This also indicates an error:
53 raise ValueError()
54 return min(_SC_OPEN_MAX // 2, 100)
55 except:
56 # Otherwise, simply limit the number to this constant, which will
57 # hopefully be OK on all operating systems:
58 return 50
61 DEFAULT_MAX_MERGE = get_default_max_merge()
64 def merge(iterables, key=None):
65 if key is None:
66 key = lambda x : x
68 values = []
70 for index, iterable in enumerate(iterables):
71 try:
72 iterator = iter(iterable)
73 value = iterator.next()
74 except StopIteration:
75 pass
76 else:
77 values.append((key(value), index, value, iterator))
79 heapq.heapify(values)
81 while values:
82 k, index, value, iterator = heapq.heappop(values)
83 yield value
84 try:
85 value = iterator.next()
86 except StopIteration:
87 pass
88 else:
89 heapq.heappush(values, (key(value), index, value, iterator))
92 def merge_files(input_filenames, output_filename, key=None):
93 output_file = file(output_filename, 'wb', BUFSIZE)
94 try:
95 chunks = []
96 try:
97 for input_filename in input_filenames:
98 chunks.append(open(input_filename, 'rb', BUFSIZE))
99 output_file.writelines(merge(chunks, key))
100 finally:
101 for chunk in chunks:
102 try:
103 chunk.close()
104 except:
105 pass
106 finally:
107 output_file.close()
110 def tempfile_generator(tempdirs=[]):
111 """Yield filenames of temporary files."""
113 # Create an iterator that will choose directories to hold the
114 # temporary files:
115 if tempdirs:
116 tempdirs = itertools.cycle(tempdirs)
117 else:
118 tempdirs = itertools.repeat(tempfile.gettempdir())
120 i = 0
121 while True:
122 (fd, filename) = tempfile.mkstemp(
123 '', 'sort%06i-' % (i,), tempdirs.next(), False
125 os.close(fd)
126 yield filename
127 i += 1
130 def sort_file(
131 input, output, key=None,
132 buffer_size=32000, tempdirs=[], max_merge=DEFAULT_MAX_MERGE,
134 tempfiles = tempfile_generator(tempdirs)
136 filenames = []
137 try:
138 input_file = file(input, 'rb', BUFSIZE)
139 try:
140 input_iterator = iter(input_file)
141 while True:
142 current_chunk = list(itertools.islice(input_iterator, buffer_size))
143 if not current_chunk:
144 break
145 current_chunk.sort(key=key)
146 filename = tempfiles.next()
147 filenames.append(filename)
148 f = open(filename, 'w+b', BUFSIZE)
149 try:
150 f.writelines(current_chunk)
151 finally:
152 f.close()
153 finally:
154 input_file.close()
156 while len(filenames) > max_merge:
157 generation = list(filenames)
158 while generation:
159 group = generation[:max_merge]
160 generation = generation[max_merge:]
161 group_output = tempfiles.next()
162 filenames.append(group_output)
163 merge_files(group, group_output, key)
164 for filename in group:
165 filenames.remove(filename)
166 try:
167 os.remove(filename)
168 except:
169 pass
171 merge_files(filenames, output, key)
172 finally:
173 for filename in filenames:
174 try:
175 os.remove(filename)
176 except:
177 pass