# Lib/csv.py -- CSV reading/writing (CPython standard library, Python 2 line)
# blob 41026e03bdb2532a3744ee7a8e359276dab414fc
2 """
3 csv.py - read/write/investigate CSV files
4 """
6 import re
7 from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 field_size_limit, \
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
12 from _csv import Dialect as _Dialect
14 try:
15 from cStringIO import StringIO
16 except ImportError:
17 from StringIO import StringIO
# Public names exported by ``from csv import *``: the quoting constants and
# C-level functions re-exported from _csv, plus the classes defined below.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders -- subclasses override these with concrete values
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are marked valid; instantiating Dialect itself
        # leaves _valid False.  Validation runs in either case.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Hand the attributes to the C-level Dialect, which does the real
        # checking; its TypeError is re-raised as csv.Error.
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Dialect describing the CSV variant produced by Microsoft Excel."""
    lineterminator = '\r\n'     # Excel terminates rows with CRLF
    delimiter = ','
    quotechar = '"'
    doublequote = True          # embedded quotes are doubled, not escaped
    quoting = QUOTE_MINIMAL     # quote a field only when it needs it
    skipinitialspace = False    # whitespace after a delimiter is data
register_dialect("excel", excel)
class excel_tab(excel):
    """The Excel dialect, but TAB-delimited instead of comma-delimited."""
    delimiter = '\t'        # everything else is inherited from excel
register_dialect("excel-tab", excel_tab)
71 class DictReader:
72 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
73 dialect="excel", *args, **kwds):
74 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
77 self.reader = reader(f, dialect, *args, **kwds)
78 self.dialect = dialect
79 self.line_num = 0
81 def __iter__(self):
82 return self
84 def next(self):
85 row = self.reader.next()
86 if self.fieldnames is None:
87 self.fieldnames = row
88 row = self.reader.next()
89 self.line_num = self.reader.line_num
91 # unlike the basic reader, we prefer not to return blanks,
92 # because we will typically wind up with a dict full of None
93 # values
94 while row == []:
95 row = self.reader.next()
96 d = dict(zip(self.fieldnames, row))
97 lf = len(self.fieldnames)
98 lr = len(row)
99 if lf < lr:
100 d[self.restkey] = row[lf:]
101 elif lf > lr:
102 for key in self.fieldnames[lr:]:
103 d[key] = self.restval
104 return d
class DictWriter:
    """Write mappings to a CSV file as rows ordered by *fieldnames*.

    restval      -- value written for any fieldname missing from a row
                    dict (default "")
    extrasaction -- "raise" (default) to error on keys not listed in
                    fieldnames, or "ignore" to silently drop them
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call form of raise: identical behavior to the old
            # ``raise ValueError, ...`` statement, but also valid py3.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the fieldnames themselves.

        Backward-compatible addition, mirroring the method that the
        standard library gained in Python 2.7.
        """
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Project the dict onto fieldnames order, substituting restval
        # for any missing key; honor extrasaction for unknown keys.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as CSV rows."""
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Fall back to float so has_header's type-probing loop still works.
    complex = float
142 class Sniffer:
144 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
145 Returns a Dialect object.
147 def __init__(self):
148 # in case there is more than one possible delimiter
149 self.preferred = [',', '\t', ';', ' ', ':']
152 def sniff(self, sample, delimiters=None):
154 Returns a dialect (or None) corresponding to the sample
157 quotechar, delimiter, skipinitialspace = \
158 self._guess_quote_and_delimiter(sample, delimiters)
159 if not delimiter:
160 delimiter, skipinitialspace = self._guess_delimiter(sample,
161 delimiters)
163 if not delimiter:
164 raise Error, "Could not determine delimiter"
166 class dialect(Dialect):
167 _name = "sniffed"
168 lineterminator = '\r\n'
169 quoting = QUOTE_MINIMAL
170 # escapechar = ''
171 doublequote = False
173 dialect.delimiter = delimiter
174 # _csv.reader won't accept a quotechar of ''
175 dialect.quotechar = quotechar or '"'
176 dialect.skipinitialspace = skipinitialspace
178 return dialect
181 def _guess_quote_and_delimiter(self, data, delimiters):
183 Looks for text enclosed between two identical quotes
184 (the probable quotechar) which are preceded and followed
185 by the same character (the probable delimiter).
186 For example:
187 ,'some text',
188 The quote with the most wins, same with the delimiter.
189 If there is no quotechar the delimiter can't be determined
190 this way.
193 matches = []
194 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
195 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
196 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
197 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
198 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
199 matches = regexp.findall(data)
200 if matches:
201 break
203 if not matches:
204 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
206 quotes = {}
207 delims = {}
208 spaces = 0
209 for m in matches:
210 n = regexp.groupindex['quote'] - 1
211 key = m[n]
212 if key:
213 quotes[key] = quotes.get(key, 0) + 1
214 try:
215 n = regexp.groupindex['delim'] - 1
216 key = m[n]
217 except KeyError:
218 continue
219 if key and (delimiters is None or key in delimiters):
220 delims[key] = delims.get(key, 0) + 1
221 try:
222 n = regexp.groupindex['space'] - 1
223 except KeyError:
224 continue
225 if m[n]:
226 spaces += 1
228 quotechar = reduce(lambda a, b, quotes = quotes:
229 (quotes[a] > quotes[b]) and a or b, quotes.keys())
231 if delims:
232 delim = reduce(lambda a, b, delims = delims:
233 (delims[a] > delims[b]) and a or b, delims.keys())
234 skipinitialspace = delims[delim] == spaces
235 if delim == '\n': # most likely a file with a single column
236 delim = ''
237 else:
238 # there is *no* delimiter, it's a single column of quoted data
239 delim = ''
240 skipinitialspace = 0
242 return (quotechar, delim, skipinitialspace)
245 def _guess_delimiter(self, data, delimiters):
247 The delimiter /should/ occur the same number of times on
248 each row. However, due to malformed data, it may not. We don't want
249 an all or nothing approach, so we allow for small variations in this
250 number.
251 1) build a table of the frequency of each character on every line.
252 2) build a table of freqencies of this frequency (meta-frequency?),
253 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
254 7 times in 2 rows'
255 3) use the mode of the meta-frequency to determine the /expected/
256 frequency for that character
257 4) find out how often the character actually meets that goal
258 5) the character that best meets its goal is the delimiter
259 For performance reasons, the data is evaluated in chunks, so it can
260 try and evaluate the smallest portion of the data possible, evaluating
261 additional chunks as necessary.
264 data = filter(None, data.split('\n'))
266 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
268 # build frequency tables
269 chunkLength = min(10, len(data))
270 iteration = 0
271 charFrequency = {}
272 modes = {}
273 delims = {}
274 start, end = 0, min(chunkLength, len(data))
275 while start < len(data):
276 iteration += 1
277 for line in data[start:end]:
278 for char in ascii:
279 metaFrequency = charFrequency.get(char, {})
280 # must count even if frequency is 0
281 freq = line.count(char)
282 # value is the mode
283 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
284 charFrequency[char] = metaFrequency
286 for char in charFrequency.keys():
287 items = charFrequency[char].items()
288 if len(items) == 1 and items[0][0] == 0:
289 continue
290 # get the mode of the frequencies
291 if len(items) > 1:
292 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
293 items)
294 # adjust the mode - subtract the sum of all
295 # other frequencies
296 items.remove(modes[char])
297 modes[char] = (modes[char][0], modes[char][1]
298 - reduce(lambda a, b: (0, a[1] + b[1]),
299 items)[1])
300 else:
301 modes[char] = items[0]
303 # build a list of possible delimiters
304 modeList = modes.items()
305 total = float(chunkLength * iteration)
306 # (rows of consistent data) / (number of rows) = 100%
307 consistency = 1.0
308 # minimum consistency threshold
309 threshold = 0.9
310 while len(delims) == 0 and consistency >= threshold:
311 for k, v in modeList:
312 if v[0] > 0 and v[1] > 0:
313 if ((v[1]/total) >= consistency and
314 (delimiters is None or k in delimiters)):
315 delims[k] = v
316 consistency -= 0.01
318 if len(delims) == 1:
319 delim = delims.keys()[0]
320 skipinitialspace = (data[0].count(delim) ==
321 data[0].count("%c " % delim))
322 return (delim, skipinitialspace)
324 # analyze another chunkLength lines
325 start = end
326 end += chunkLength
328 if not delims:
329 return ('', 0)
331 # if there's more than one, fall back to a 'preferred' list
332 if len(delims) > 1:
333 for d in self.preferred:
334 if d in delims.keys():
335 skipinitialspace = (data[0].count(d) ==
336 data[0].count("%c " % d))
337 return (d, skipinitialspace)
339 # nothing else indicates a preference, pick the character that
340 # dominates(?)
341 items = [(v,k) for (k,v) in delims.items()]
342 items.sort()
343 delim = items[-1][1]
345 skipinitialspace = (data[0].count(delim) ==
346 data[0].count("%c " % delim))
347 return (delim, skipinitialspace)
350 def has_header(self, sample):
351 # Creates a dictionary of types of data in each column. If any
352 # column is of a single type (say, integers), *except* for the first
353 # row, then the first row is presumed to be labels. If the type
354 # can't be determined, it is assumed to be a string in which case
355 # the length of the string is the determining factor: if all of the
356 # rows except for the first are the same length, it's a header.
357 # Finally, a 'vote' is taken at the end for each column, adding or
358 # subtracting from the likelihood of the first row being a header.
360 rdr = reader(StringIO(sample), self.sniff(sample))
362 header = rdr.next() # assume first row is header
364 columns = len(header)
365 columnTypes = {}
366 for i in range(columns): columnTypes[i] = None
368 checked = 0
369 for row in rdr:
370 # arbitrary number of rows to check, to keep it sane
371 if checked > 20:
372 break
373 checked += 1
375 if len(row) != columns:
376 continue # skip rows that have irregular number of columns
378 for col in columnTypes.keys():
380 for thisType in [int, long, float, complex]:
381 try:
382 thisType(row[col])
383 break
384 except (ValueError, OverflowError):
385 pass
386 else:
387 # fallback to length of string
388 thisType = len(row[col])
390 # treat longs as ints
391 if thisType == long:
392 thisType = int
394 if thisType != columnTypes[col]:
395 if columnTypes[col] is None: # add new column type
396 columnTypes[col] = thisType
397 else:
398 # type is inconsistent, remove column from
399 # consideration
400 del columnTypes[col]
402 # finally, compare results against first row and "vote"
403 # on whether it's a header
404 hasHeader = 0
405 for col, colType in columnTypes.items():
406 if type(colType) == type(0): # it's a length
407 if len(header[col]) != colType:
408 hasHeader += 1
409 else:
410 hasHeader -= 1
411 else: # attempt typecast
412 try:
413 colType(header[col])
414 except (ValueError, TypeError):
415 hasHeader += 1
416 else:
417 hasHeader -= 1
419 return hasHeader > 0