1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Functions to sort large files.
The functions in this module were originally downloaded from the
following URL:

    http://code.activestate.com/recipes/466302/
24 It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
25 According to the terms of service of that website, the code is usable
26 under the MIT license.
def merge(chunks, key=None):
    """Merge already-sorted iterables into one sorted stream.

    This is the merge step of a merge sort: every element of CHUNKS must
    itself be an iterable that yields its items in ascending KEY order.
    The result is a generator, so nothing is consumed until the caller
    starts iterating.

    Args:
        chunks: an iterable of sorted iterables (for example, open files
            whose lines are already sorted).
        key: optional one-argument function returning the sort key of an
            item.  When omitted, the items themselves are compared.

    Yields:
        Every item of every chunk, in ascending key order.  Items with
        equal keys are yielded in the order of the chunks they came from,
        so the items themselves are never compared with each other.

    Note:
        Uses the ``next()`` builtin, which requires Python 2.6 or newer.
    """
    if key is None:
        key = lambda item: item

    # Each heap entry is (sort key, chunk position, item, iterator).  The
    # chunk position is unique per entry, so tuple comparison is always
    # decided by the first two fields and never reaches the item or the
    # iterator (which may not be orderable).
    heap = []

    for index, chunk in enumerate(chunks):
        iterator = iter(chunk)
        try:
            value = next(iterator)
        except StopIteration:
            # Empty chunk: it contributes nothing to the output.
            continue
        heapq.heappush(heap, (key(value), index, value, iterator))

    while heap:
        k, index, value, iterator = heapq.heappop(heap)
        yield value
        try:
            value = next(iterator)
        except StopIteration:
            # This chunk is exhausted; carry on with the remaining ones.
            continue
        heapq.heappush(heap, (key(value), index, value, iterator))
def sort_file(input, output, key=None, buffer_size=32000, tempdirs=None):
    """Sort the lines of the file INPUT into the file OUTPUT.

    The input is read BUFFER_SIZE lines at a time.  Each batch is sorted
    in memory and written to its own temporary file; the temporary files
    are then merged into OUTPUT with merge().  This keeps memory usage
    bounded by roughly BUFFER_SIZE lines regardless of the input size.

    Args:
        input: path of the file to sort (opened in binary mode).
        output: path of the file to write (created or truncated, binary
            mode).
        key: optional one-argument function returning the sort key of a
            line.  When omitted, the raw lines are compared.
        buffer_size: maximum number of lines sorted in memory at once.
        tempdirs: directories in which to create the temporary chunk
            files; they are used in round-robin order.  When empty or
            None, the system default temporary directory is used.

    Raises:
        IOError / OSError: if a file cannot be opened, read, or written.
            The temporary files are removed before the error propagates.

    Note:
        Lines are copied verbatim, including their line terminators, so
        the last line of INPUT should end with a newline; otherwise it is
        joined to whichever line follows it in the sorted output.
    """
    if not tempdirs:
        tempdirs = [tempfile.gettempdir()]

    chunks = []     # open chunk files, each rewound to its beginning
    filenames = []  # path of every temporary file created so far

    try:
        input_file = open(input, 'rb', 64 * 1024)
        try:
            input_iterator = iter(input_file)
            for tempdir in itertools.cycle(tempdirs):
                current_chunk = list(
                    itertools.islice(input_iterator, buffer_size))
                if not current_chunk:
                    # The input is exhausted.
                    break
                current_chunk.sort(key=key)

                fd, filename = tempfile.mkstemp(
                    prefix='sort%06i' % (len(chunks),), dir=tempdir)
                # Record the path immediately so that it is cleaned up
                # even if reopening or writing the file fails.
                filenames.append(filename)
                # Reopen by name (rather than os.fdopen) so that the file
                # object's .name attribute is the path of the chunk.
                os.close(fd)
                output_chunk = open(filename, 'w+b', 64 * 1024)
                chunks.append(output_chunk)

                output_chunk.writelines(current_chunk)
                # Push the data to disk and rewind so that the merge
                # phase reads the chunk from its beginning.
                output_chunk.flush()
                output_chunk.seek(0)
        finally:
            input_file.close()

        output_file = open(output, 'wb', 64 * 1024)
        try:
            output_file.writelines(merge(chunks, key))
        finally:
            output_file.close()
    finally:
        # Best-effort cleanup, on success and on failure alike.  Errors
        # are ignored here so that they cannot mask the original
        # exception; a chunk may also already be closed or deleted.
        for chunk in chunks:
            try:
                chunk.close()
            except (IOError, OSError):
                pass
        for filename in filenames:
            try:
                os.remove(filename)
            except OSError:
                pass