# CPython Lib/csv.py (blob 4f6e5c03a27bcce4dec8ab139a0a0743953bd925).
# Changelog note carried with this revision: "Silence the
# DeprecationWarning raised by importing mimetools in BaseHTTPServer."
"""
csv.py - read/write/investigate CSV files
"""
6 import re
7 from functools import reduce
8 from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
10 field_size_limit, \
11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
13 from _csv import Dialect as _Dialect
15 try:
16 from cStringIO import StringIO
17 except ImportError:
18 from StringIO import StringIO
# Names exported by "from csv import *".
__all__ = ["QUOTE_MINIMAL",
           "QUOTE_ALL",
           "QUOTE_NONNUMERIC",
           "QUOTE_NONE",
           "Error",
           "Dialect",
           "excel",
           "excel_tab",
           "reader",
           "writer",
           "register_dialect",
           "get_dialect",
           "list_dialects",
           "Sniffer",
           "unregister_dialect",
           "__version__",
           "DictReader",
           "DictWriter"]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are considered valid dialects; the base class
        # still runs validation (and will fail on the None placeholders).
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate validation to the C implementation and re-raise its
        # TypeError as csv.Error for compatibility with py2.3.
        try:
            _Dialect(self)
        except TypeError as e:  # "as" form: valid on py2.6+ and py3,
            raise Error(str(e))  # unlike the removed "except E, e" form
class excel(Dialect):
    """The dialect Excel itself writes: comma-delimited, CRLF rows."""
    delimiter = ','              # field separator
    quotechar = '"'              # fields are wrapped in double quotes
    doublequote = True           # embedded quotes are doubled, not escaped
    skipinitialspace = False     # whitespace after the delimiter is data
    lineterminator = '\r\n'      # Excel emits DOS line endings
    quoting = QUOTE_MINIMAL      # quote only when necessary
register_dialect("excel", excel)
class excel_tab(excel):
    """Excel's TAB-delimited export: identical to excel except the separator."""
    delimiter = '\t'             # the only attribute that differs from excel
register_dialect("excel-tab", excel_tab)
72 class DictReader:
73 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
74 dialect="excel", *args, **kwds):
75 self._fieldnames = fieldnames # list of keys for the dict
76 self.restkey = restkey # key to catch long rows
77 self.restval = restval # default value for short rows
78 self.reader = reader(f, dialect, *args, **kwds)
79 self.dialect = dialect
80 self.line_num = 0
82 def __iter__(self):
83 return self
85 @property
86 def fieldnames(self):
87 if self._fieldnames is None:
88 try:
89 self._fieldnames = self.reader.next()
90 except StopIteration:
91 pass
92 self.line_num = self.reader.line_num
93 return self._fieldnames
95 @fieldnames.setter
96 def fieldnames(self, value):
97 self._fieldnames = value
99 def next(self):
100 if self.line_num == 0:
101 # Used only for its side effect.
102 self.fieldnames
103 row = self.reader.next()
104 self.line_num = self.reader.line_num
106 # unlike the basic reader, we prefer not to return blanks,
107 # because we will typically wind up with a dict full of None
108 # values
109 while row == []:
110 row = self.reader.next()
111 d = dict(zip(self.fieldnames, row))
112 lf = len(self.fieldnames)
113 lr = len(row)
114 if lf < lr:
115 d[self.restkey] = row[lf:]
116 elif lf > lr:
117 for key in self.fieldnames[lr:]:
118 d[key] = self.restval
119 return d
class DictWriter:
    """Write dicts to a CSV file as rows ordered by *fieldnames*."""

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        """f: file-like object; fieldnames: ordered list of row-dict keys.

        restval supplies the value for keys missing from a row dict.
        extrasaction controls what happens when a row dict has keys not
        in fieldnames: 'raise' (default) or 'ignore'.
        Raises ValueError for any other extrasaction value.
        """
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # call form instead of the py2-only "raise E, v" statement,
            # consistent with _dict_to_list below; message unchanged
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        # Map rowdict onto fieldnames order; missing keys get restval.
        if self.extrasaction == "raise":
            # NOTE(review): only the exact lowercase string "raise" takes
            # this branch, although __init__ accepted any case — confirm
            # whether mixed-case values silently acting as "ignore" is
            # intended.
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as CSV rows."""
        # build the full list first, then hand it to the C writer in one call
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    # NameError here means this interpreter build omits complex()
    complex
except NameError:
    # degrade to float so Sniffer.has_header's numeric probing still works
    complex = float
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample.

        If *delimiters* is given it restricts the candidate delimiter
        characters.  Raises Error when no delimiter can be determined.
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # call form (not the py2-only "raise E, v" statement);
            # behavior is identical
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # Fixed: the third alternate previously read (?P<delim>>[^\w\n"\'])
        # — the stray '>' made it require a literal '>' delimiter.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # pick the quote (and delimiter) seen most often
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # list comprehension instead of filter(None, ...): identical result
        # on py2 (filter returns a list there) and explicitly indexable
        data = [line for line in data.split('\n') if line]

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # explicit list: .items() already is one on py2, and the
                # code below mutates it with remove()
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate a copy of the keys: the loop below deletes entries
            # (keys() already is a copy on py2)
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0