1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Functions to sort large files.
19 The functions in this module were originally downloaded from the
22 http://code.activestate.com/recipes/466302/
24 It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
25 According to the terms of service of that website, the code is usable
26 under the MIT license.
37 # The buffer size to use for open files:
41 def get_default_max_merge():
42 """Return the default maximum number of files to merge at once."""
44 # The maximum number of files to merge at once. This number cannot
45 # be unlimited because there are operating system restrictions on
46 # the number of files that a process can have open at once. So...
48 # If this constant is available via sysconf, we use half the
49 # number available to the process as a whole.
50 _SC_OPEN_MAX
= os
.sysconf('SC_OPEN_MAX')
51 if _SC_OPEN_MAX
== -1:
52 # This also indicates an error:
54 return min(_SC_OPEN_MAX
// 2, 100)
56 # Otherwise, simply limit the number to this constant, which will
57 # hopefully be OK on all operating systems:
61 DEFAULT_MAX_MERGE
= get_default_max_merge()
64 def merge(iterables
, key
=None):
70 for index
, iterable
in enumerate(iterables
):
72 iterator
= iter(iterable
)
73 value
= iterator
.next()
77 values
.append((key(value
), index
, value
, iterator
))
82 k
, index
, value
, iterator
= heapq
.heappop(values
)
85 value
= iterator
.next()
89 heapq
.heappush(values
, (key(value
), index
, value
, iterator
))
92 def merge_files(input_filenames
, output_filename
, key
=None):
93 output_file
= file(output_filename
, 'wb', BUFSIZE
)
97 for input_filename
in input_filenames
:
98 chunks
.append(open(input_filename
, 'rb', BUFSIZE
))
99 output_file
.writelines(merge(chunks
, key
))
110 def tempfile_generator(tempdirs
=[]):
111 """Yield filenames of temporary files."""
113 # Create an iterator that will choose directories to hold the
116 tempdirs
= itertools
.cycle(tempdirs
)
118 tempdirs
= itertools
.repeat(tempfile
.gettempdir())
122 (fd
, filename
) = tempfile
.mkstemp(
123 '', 'sort%06i-' % (i
,), tempdirs
.next(), False
131 input, output
, key
=None,
132 buffer_size
=32000, tempdirs
=[], max_merge
=DEFAULT_MAX_MERGE
,
134 tempfiles
= tempfile_generator(tempdirs
)
138 input_file
= file(input, 'rb', BUFSIZE
)
140 input_iterator
= iter(input_file
)
142 current_chunk
= list(itertools
.islice(input_iterator
, buffer_size
))
143 if not current_chunk
:
145 current_chunk
.sort(key
=key
)
146 filename
= tempfiles
.next()
147 filenames
.append(filename
)
148 f
= open(filename
, 'w+b', BUFSIZE
)
150 f
.writelines(current_chunk
)
156 while len(filenames
) > max_merge
:
157 generation
= list(filenames
)
159 group
= generation
[:max_merge
]
160 generation
= generation
[max_merge
:]
161 group_output
= tempfiles
.next()
162 filenames
.append(group_output
)
163 merge_files(group
, group_output
, key
)
164 for filename
in group
:
165 filenames
.remove(filename
)
171 merge_files(filenames
, output
, key
)
173 for filename
in filenames
: