# Source: CPython repository, Lib/csv.py (Python 2 era).
# The two lines below are extraction residue from a git web view, kept as comments:
#   commit message: "Added WatchedFileHandler (based on SF patch #1598415)"
#   blob: f213854783eb67f1483e3abeeb94c70afefa8193
2 """
3 csv.py - read/write/investigate CSV files
4 """
6 import re
7 from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 field_size_limit, \
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
12 from _csv import Dialect as _Dialect
14 try:
15 from cStringIO import StringIO
16 except ImportError:
17 from StringIO import StringIO
# Public names re-exported by "from csv import *".
__all__ = [
    "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
    "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
    "register_dialect", "get_dialect", "list_dialects", "Sniffer",
    "unregister_dialect", "__version__", "DictReader", "DictWriter",
]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.
    """
    _name = ""      # registered dialect name; subclasses may override
    _valid = False  # set True for subclass instances (see __init__)
    # placeholders -- concrete subclasses are expected to override these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclass instances are marked valid; a bare Dialect()
        # keeps _valid == False.  Validation runs in both cases.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate attribute checking to the C-level Dialect in _csv;
        # re-raise its TypeError as the module's own Error.
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Dialect matching the CSV flavour written by Microsoft Excel."""
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
register_dialect("excel", excel)
class excel_tab(excel):
    """The Excel dialect, but with fields separated by tabs."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
class DictReader:
    """Iterate over CSV rows as dicts keyed by field names.

    If fieldnames is not given, the first non-blank row read supplies
    the keys.  Rows longer than fieldnames collect the extra values in
    a list under restkey; rows shorter than fieldnames get restval for
    the missing keys.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: produce one dict per data row.
        row = self.reader.next()
        if self.fieldnames is None:
            # first row read becomes the field names
            self.fieldnames = row
            row = self.reader.next()

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: stash the surplus fields under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad missing fields with restval
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
class DictWriter:
    """Write dicts as CSV rows, ordered by a fieldnames list.

    restval supplies values for keys missing from a row dict.
    extrasaction says what to do when a row dict has keys not listed
    in fieldnames: 'raise' (ValueError) or 'ignore' (drop them).
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        # NOTE(review): validation above is case-insensitive but
        # _dict_to_list compares == "raise" exactly, so e.g. "Raise"
        # passes validation yet behaves like "ignore" -- confirm intended.
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        # Convert one row dict to a list in fieldnames order.
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError, "dict contains fields not in fieldnames"
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        # Write a single dict as one CSV row.
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        # Convert all rows up front, then hand them to the C writer.
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
# (has_header() probes column values with complex(); on minimal builds
# we fall back to float so the probe list still works).
try:
    complex
except NameError:
    complex = float
138 class Sniffer:
140 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
141 Returns a Dialect object.
143 def __init__(self):
144 # in case there is more than one possible delimiter
145 self.preferred = [',', '\t', ';', ' ', ':']
148 def sniff(self, sample, delimiters=None):
150 Returns a dialect (or None) corresponding to the sample
153 quotechar, delimiter, skipinitialspace = \
154 self._guess_quote_and_delimiter(sample, delimiters)
155 if not delimiter:
156 delimiter, skipinitialspace = self._guess_delimiter(sample,
157 delimiters)
159 if not delimiter:
160 raise Error, "Could not determine delimiter"
162 class dialect(Dialect):
163 _name = "sniffed"
164 lineterminator = '\r\n'
165 quoting = QUOTE_MINIMAL
166 # escapechar = ''
167 doublequote = False
169 dialect.delimiter = delimiter
170 # _csv.reader won't accept a quotechar of ''
171 dialect.quotechar = quotechar or '"'
172 dialect.skipinitialspace = skipinitialspace
174 return dialect
177 def _guess_quote_and_delimiter(self, data, delimiters):
179 Looks for text enclosed between two identical quotes
180 (the probable quotechar) which are preceded and followed
181 by the same character (the probable delimiter).
182 For example:
183 ,'some text',
184 The quote with the most wins, same with the delimiter.
185 If there is no quotechar the delimiter can't be determined
186 this way.
189 matches = []
190 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
192 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
193 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
194 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
195 matches = regexp.findall(data)
196 if matches:
197 break
199 if not matches:
200 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
202 quotes = {}
203 delims = {}
204 spaces = 0
205 for m in matches:
206 n = regexp.groupindex['quote'] - 1
207 key = m[n]
208 if key:
209 quotes[key] = quotes.get(key, 0) + 1
210 try:
211 n = regexp.groupindex['delim'] - 1
212 key = m[n]
213 except KeyError:
214 continue
215 if key and (delimiters is None or key in delimiters):
216 delims[key] = delims.get(key, 0) + 1
217 try:
218 n = regexp.groupindex['space'] - 1
219 except KeyError:
220 continue
221 if m[n]:
222 spaces += 1
224 quotechar = reduce(lambda a, b, quotes = quotes:
225 (quotes[a] > quotes[b]) and a or b, quotes.keys())
227 if delims:
228 delim = reduce(lambda a, b, delims = delims:
229 (delims[a] > delims[b]) and a or b, delims.keys())
230 skipinitialspace = delims[delim] == spaces
231 if delim == '\n': # most likely a file with a single column
232 delim = ''
233 else:
234 # there is *no* delimiter, it's a single column of quoted data
235 delim = ''
236 skipinitialspace = 0
238 return (quotechar, delim, skipinitialspace)
241 def _guess_delimiter(self, data, delimiters):
243 The delimiter /should/ occur the same number of times on
244 each row. However, due to malformed data, it may not. We don't want
245 an all or nothing approach, so we allow for small variations in this
246 number.
247 1) build a table of the frequency of each character on every line.
248 2) build a table of freqencies of this frequency (meta-frequency?),
249 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
250 7 times in 2 rows'
251 3) use the mode of the meta-frequency to determine the /expected/
252 frequency for that character
253 4) find out how often the character actually meets that goal
254 5) the character that best meets its goal is the delimiter
255 For performance reasons, the data is evaluated in chunks, so it can
256 try and evaluate the smallest portion of the data possible, evaluating
257 additional chunks as necessary.
260 data = filter(None, data.split('\n'))
262 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
264 # build frequency tables
265 chunkLength = min(10, len(data))
266 iteration = 0
267 charFrequency = {}
268 modes = {}
269 delims = {}
270 start, end = 0, min(chunkLength, len(data))
271 while start < len(data):
272 iteration += 1
273 for line in data[start:end]:
274 for char in ascii:
275 metaFrequency = charFrequency.get(char, {})
276 # must count even if frequency is 0
277 freq = line.count(char)
278 # value is the mode
279 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
280 charFrequency[char] = metaFrequency
282 for char in charFrequency.keys():
283 items = charFrequency[char].items()
284 if len(items) == 1 and items[0][0] == 0:
285 continue
286 # get the mode of the frequencies
287 if len(items) > 1:
288 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
289 items)
290 # adjust the mode - subtract the sum of all
291 # other frequencies
292 items.remove(modes[char])
293 modes[char] = (modes[char][0], modes[char][1]
294 - reduce(lambda a, b: (0, a[1] + b[1]),
295 items)[1])
296 else:
297 modes[char] = items[0]
299 # build a list of possible delimiters
300 modeList = modes.items()
301 total = float(chunkLength * iteration)
302 # (rows of consistent data) / (number of rows) = 100%
303 consistency = 1.0
304 # minimum consistency threshold
305 threshold = 0.9
306 while len(delims) == 0 and consistency >= threshold:
307 for k, v in modeList:
308 if v[0] > 0 and v[1] > 0:
309 if ((v[1]/total) >= consistency and
310 (delimiters is None or k in delimiters)):
311 delims[k] = v
312 consistency -= 0.01
314 if len(delims) == 1:
315 delim = delims.keys()[0]
316 skipinitialspace = (data[0].count(delim) ==
317 data[0].count("%c " % delim))
318 return (delim, skipinitialspace)
320 # analyze another chunkLength lines
321 start = end
322 end += chunkLength
324 if not delims:
325 return ('', 0)
327 # if there's more than one, fall back to a 'preferred' list
328 if len(delims) > 1:
329 for d in self.preferred:
330 if d in delims.keys():
331 skipinitialspace = (data[0].count(d) ==
332 data[0].count("%c " % d))
333 return (d, skipinitialspace)
335 # nothing else indicates a preference, pick the character that
336 # dominates(?)
337 items = [(v,k) for (k,v) in delims.items()]
338 items.sort()
339 delim = items[-1][1]
341 skipinitialspace = (data[0].count(delim) ==
342 data[0].count("%c " % delim))
343 return (delim, skipinitialspace)
346 def has_header(self, sample):
347 # Creates a dictionary of types of data in each column. If any
348 # column is of a single type (say, integers), *except* for the first
349 # row, then the first row is presumed to be labels. If the type
350 # can't be determined, it is assumed to be a string in which case
351 # the length of the string is the determining factor: if all of the
352 # rows except for the first are the same length, it's a header.
353 # Finally, a 'vote' is taken at the end for each column, adding or
354 # subtracting from the likelihood of the first row being a header.
356 rdr = reader(StringIO(sample), self.sniff(sample))
358 header = rdr.next() # assume first row is header
360 columns = len(header)
361 columnTypes = {}
362 for i in range(columns): columnTypes[i] = None
364 checked = 0
365 for row in rdr:
366 # arbitrary number of rows to check, to keep it sane
367 if checked > 20:
368 break
369 checked += 1
371 if len(row) != columns:
372 continue # skip rows that have irregular number of columns
374 for col in columnTypes.keys():
376 for thisType in [int, long, float, complex]:
377 try:
378 thisType(row[col])
379 break
380 except (ValueError, OverflowError):
381 pass
382 else:
383 # fallback to length of string
384 thisType = len(row[col])
386 # treat longs as ints
387 if thisType == long:
388 thisType = int
390 if thisType != columnTypes[col]:
391 if columnTypes[col] is None: # add new column type
392 columnTypes[col] = thisType
393 else:
394 # type is inconsistent, remove column from
395 # consideration
396 del columnTypes[col]
398 # finally, compare results against first row and "vote"
399 # on whether it's a header
400 hasHeader = 0
401 for col, colType in columnTypes.items():
402 if type(colType) == type(0): # it's a length
403 if len(header[col]) != colType:
404 hasHeader += 1
405 else:
406 hasHeader -= 1
407 else: # attempt typecast
408 try:
409 colType(header[col])
410 except (ValueError, TypeError):
411 hasHeader += 1
412 else:
413 hasHeader -= 1
415 return hasHeader > 0