1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Functions to sort large files.
19 The functions in this module were originally downloaded from the
22 http://code.activestate.com/recipes/466302/
24 It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
25 According to the terms of service of that website, the code is usable
26 under the MIT license.
37 # The buffer size to use for open files:
def merge(iterables, key=None):
    """Merge (in the sense of mergesort) ITERABLES, yielding items in order.

    Each element of ITERABLES must itself already be sorted according to
    KEY; the result is then one sorted stream containing every item of
    every input.  The inputs are consumed lazily, one item at a time, so
    they may be arbitrarily large (e.g. open files).

    KEY, if given, is a one-argument function returning the sort key of
    an item.  If KEY is None, items are compared directly.

    Items with equal keys are yielded in the order of the iterables they
    came from (and, within one iterable, in their original order)."""

    if key is None:
        key = lambda item: item

    # Heap entries are (sort_key, index, value, iterator).  INDEX is the
    # position of the source iterable; since at most one entry per
    # iterable is on the heap at any time, (sort_key, index) is always
    # unique and the comparison never falls through to VALUE or ITERATOR.
    heap = []

    for index, iterable in enumerate(iterables):
        iterator = iter(iterable)
        try:
            value = next(iterator)
        except StopIteration:
            # Empty input: it contributes nothing to the merge.
            continue
        heap.append((key(value), index, value, iterator))

    heapq.heapify(heap)

    while heap:
        k, index, value, iterator = heapq.heappop(heap)
        yield value
        try:
            value = next(iterator)
        except StopIteration:
            # This input is exhausted; carry on with the remaining ones.
            continue
        heapq.heappush(heap, (key(value), index, value, iterator))
def merge_files(input_filenames, output_filename, key=None):
    """Merge the lines of several sorted files into one output file.

    INPUT_FILENAMES is an iterable of names of files to read; the lines
    of each file are fed to merge(), so each file must already be sorted
    according to KEY for the output to be sorted.  OUTPUT_FILENAME is
    created (or truncated) and receives the merged lines.  KEY is passed
    through to merge().

    All files are opened in binary mode with buffer size BUFSIZE, and
    every file that was opened is closed again, even if an error occurs
    part-way through."""

    output_file = open(output_filename, 'wb', BUFSIZE)
    try:
        chunks = []
        try:
            for input_filename in input_filenames:
                chunks.append(open(input_filename, 'rb', BUFSIZE))
            # merge() is lazy, so the inputs must stay open until
            # writelines() has consumed the whole merged stream.
            output_file.writelines(merge(chunks, key))
        finally:
            for chunk in chunks:
                chunk.close()
    finally:
        output_file.close()
def tempfile_generator(tempdirs=[]):
    """Yield filenames of temporary files.

    TEMPDIRS is a sequence of directories in which to create the files;
    they are used in rotation.  If TEMPDIRS is empty, every file is
    created in the directory returned by tempfile.gettempdir().

    Each yielded name refers to a file that has already been created
    (empty) by tempfile.mkstemp(); its name starts with 'sort' followed
    by a six-digit running counter and '-'.  Removing the files is the
    caller's responsibility.  The generator never terminates by itself.

    The default value of TEMPDIRS is only read, never mutated, so the
    shared default list is harmless."""

    # Imported locally so that this function does not rely on a
    # module-level import of os.
    import os

    # Create an iterator that will choose directories to hold the
    # temporary files:
    if tempdirs:
        directories = itertools.cycle(tempdirs)
    else:
        directories = itertools.repeat(tempfile.gettempdir())

    for i in itertools.count():
        (fd, filename) = tempfile.mkstemp(
            '', 'sort%06i-' % (i,), next(directories), False
            )
        # mkstemp() returns an open OS-level descriptor; only the name is
        # wanted here, so close it right away to avoid leaking it.
        os.close(fd)
        yield filename
107 def sort_file(input, output
, key
=None, buffer_size
=32000, tempdirs
=[]):
108 tempfiles
= tempfile_generator(tempdirs
)
112 input_file
= file(input, 'rb', BUFSIZE
)
114 input_iterator
= iter(input_file
)
116 current_chunk
= list(itertools
.islice(input_iterator
, buffer_size
))
117 if not current_chunk
:
119 current_chunk
.sort(key
=key
)
120 filename
= tempfiles
.next()
121 filenames
.append(filename
)
122 f
= open(filename
, 'w+b', BUFSIZE
)
124 f
.writelines(current_chunk
)
130 merge_files(filenames
, output
, key
)
132 for filename
in filenames
: