Make temporary filenames more legible.
[cvs2svn.git] / cvs2svn_lib / sort.py
blobe1a64b22a6b1d817c6778b7e7d730e88a61f2576
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Functions to sort large files.
19 The functions in this module were originally downloaded from the
20 following URL:
22 http://code.activestate.com/recipes/466302/
24 It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
25 According to the terms of service of that website, the code is usable
26 under the MIT license.
28 """
31 import os
32 import heapq
33 import itertools
34 import tempfile
# The buffer size, in bytes, to use for open files (64 KiB):
BUFSIZE = 64 * 1024
def merge(iterables, key=None):
    """Lazily merge several sorted iterables into one sorted stream.

    iterables is an iterable of iterables.  Each one is expected to be
    sorted already according to key; this is not checked.  key, if
    given, is a one-argument function mapping an item to the value it
    is ordered by.  By default the items themselves are compared.

    Yield the items of all inputs in ascending key order.  Only one
    pending item per input is held at any time.  Items with equal keys
    are yielded in the order of the inputs they came from.

    """

    if key is None:
        key = lambda x: x

    # Heap entries are (key, input index, item, iterator).  The input
    # index is unique per entry, so tuple comparison is always decided
    # by the key or the index and never reaches the item or the
    # iterator (which might not be comparable).
    heap = []

    for index, iterable in enumerate(iterables):
        iterator = iter(iterable)
        try:
            value = next(iterator)
        except StopIteration:
            # An empty input contributes nothing.
            continue
        heap.append((key(value), index, value, iterator))

    heapq.heapify(heap)

    while heap:
        _, index, value, iterator = heapq.heappop(heap)
        yield value
        try:
            value = next(iterator)
        except StopIteration:
            # This input is exhausted; carry on with the others.
            continue
        heapq.heappush(heap, (key(value), index, value, iterator))
def merge_files(input_filenames, output_filename, key=None):
    """Merge the lines of several sorted files into one output file.

    input_filenames is an iterable of names of files whose lines are
    expected to be sorted already according to key.  All of them are
    opened at the same time (in binary mode) and their lines are merged
    with merge() into the file named output_filename, which is created
    or truncated.  key is passed through to merge().

    Every file opened here is closed again before returning, whether or
    not an error occurred.

    """

    output_file = open(output_filename, 'wb', BUFSIZE)
    try:
        chunks = []
        try:
            for input_filename in input_filenames:
                chunks.append(open(input_filename, 'rb', BUFSIZE))
            output_file.writelines(merge(chunks, key))
        finally:
            # Closing the inputs is best-effort: an I/O error on one
            # must not stop the rest from being closed.
            for chunk in chunks:
                try:
                    chunk.close()
                except (IOError, OSError):
                    pass
    finally:
        output_file.close()
def tempfile_generator(tempdirs=[]):
    """Yield filenames of temporary files.

    Each yielded name refers to a file that has just been created
    (empty) with tempfile.mkstemp() and whose descriptor has already
    been closed.  The generator never removes the files it creates.

    If tempdirs is non-empty, the files are spread over those
    directories in round-robin order; otherwise they are all created
    in tempfile.gettempdir().  File names start with 'sortNNNNNN-',
    where NNNNNN counts up from 000000.  The generator never
    terminates on its own.

    """

    # Create an iterator that will choose directories to hold the
    # temporary files (the default list is only read, never mutated):
    if tempdirs:
        directories = itertools.cycle(tempdirs)
    else:
        directories = itertools.repeat(tempfile.gettempdir())

    for i in itertools.count():
        (fd, filename) = tempfile.mkstemp(
            suffix='',
            prefix='sort%06i-' % (i,),
            dir=next(directories),
            text=False,
            )
        # Only the name is handed out; the caller reopens the file.
        os.close(fd)
        yield filename
def sort_file(input, output, key=None, buffer_size=32000, tempdirs=[]):
    """Sort the lines of the file named input into the file named output.

    The input is read in chunks of at most buffer_size lines (a line
    count, not a byte count).  Each chunk is sorted in memory and
    written to its own temporary file; the temporary files are then
    merged into output with merge_files().

    key, if given, is a one-argument function applied to each line to
    obtain its sort key; it is used both for sorting the chunks and for
    merging them.  tempdirs is passed to tempfile_generator() to choose
    where the temporary files are created.

    The temporary files are removed before returning, whether or not an
    error occurred.

    NOTE(review): lines are copied verbatim, terminators included.  If
    the last input line lacks a newline, it will run together with
    whichever line follows it in the output.

    """

    tempfiles = tempfile_generator(tempdirs)

    filenames = []
    try:
        input_file = open(input, 'rb', BUFSIZE)
        try:
            input_iterator = iter(input_file)
            while True:
                current_chunk = list(
                    itertools.islice(input_iterator, buffer_size)
                    )
                if not current_chunk:
                    break
                current_chunk.sort(key=key)
                filename = next(tempfiles)
                # Record the name before writing, so that the file is
                # cleaned up even if writing it fails.
                filenames.append(filename)
                f = open(filename, 'w+b', BUFSIZE)
                try:
                    f.writelines(current_chunk)
                finally:
                    f.close()
        finally:
            input_file.close()

        merge_files(filenames, output, key)
    finally:
        # Removing the temporary files is best-effort: a failure on one
        # must not stop the others from being removed.
        for filename in filenames:
            try:
                os.remove(filename)
            except OSError:
                pass