3 csv.py - read/write/investigate CSV files
7 from functools
import reduce
8 from _csv
import Error
, __version__
, writer
, reader
, register_dialect
, \
9 unregister_dialect
, get_dialect
, list_dialects
, \
11 QUOTE_MINIMAL
, QUOTE_ALL
, QUOTE_NONNUMERIC
, QUOTE_NONE
, \
13 from _csv
import Dialect
as _Dialect
16 from cStringIO
import StringIO
18 from StringIO
import StringIO
# Public API of this module.
__all__ = [
    "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
    "Error", "Dialect", "__doc__", "excel", "excel_tab",
    "field_size_limit", "reader", "writer",
    "register_dialect", "get_dialect", "list_dialects", "Sniffer",
    "unregister_dialect", "__version__", "DictReader", "DictWriter",
]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders for the formatting attributes; subclasses override them
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as user-defined, valid dialects; the base
        # class itself is just the abstract template.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate the actual attribute checking to the C implementation,
        # which knows the full set of constraints; re-raise its TypeError
        # as this module's Error type.
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
74 def __init__(self
, f
, fieldnames
=None, restkey
=None, restval
=None,
75 dialect
="excel", *args
, **kwds
):
76 self
._fieldnames
= fieldnames
# list of keys for the dict
77 self
.restkey
= restkey
# key to catch long rows
78 self
.restval
= restval
# default value for short rows
79 self
.reader
= reader(f
, dialect
, *args
, **kwds
)
80 self
.dialect
= dialect
88 if self
._fieldnames
is None:
90 self
._fieldnames
= self
.reader
.next()
93 self
.line_num
= self
.reader
.line_num
94 return self
._fieldnames
97 def fieldnames(self
, value
):
98 self
._fieldnames
= value
101 if self
.line_num
== 0:
102 # Used only for its side effect.
104 row
= self
.reader
.next()
105 self
.line_num
= self
.reader
.line_num
107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
111 row
= self
.reader
.next()
112 d
= dict(zip(self
.fieldnames
, row
))
113 lf
= len(self
.fieldnames
)
116 d
[self
.restkey
] = row
[lf
:]
118 for key
in self
.fieldnames
[lr
:]:
119 d
[key
] = self
.restval
124 def __init__(self
, f
, fieldnames
, restval
="", extrasaction
="raise",
125 dialect
="excel", *args
, **kwds
):
126 self
.fieldnames
= fieldnames
# list of keys for the dict
127 self
.restval
= restval
# for writing short dicts
128 if extrasaction
.lower() not in ("raise", "ignore"):
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
132 self
.extrasaction
= extrasaction
133 self
.writer
= writer(f
, dialect
, *args
, **kwds
)
135 def _dict_to_list(self
, rowdict
):
136 if self
.extrasaction
== "raise":
137 wrong_fields
= [k
for k
in rowdict
if k
not in self
.fieldnames
]
139 raise ValueError("dict contains fields not in fieldnames: " +
140 ", ".join(wrong_fields
))
141 return [rowdict
.get(key
, self
.restval
) for key
in self
.fieldnames
]
143 def writerow(self
, rowdict
):
144 return self
.writer
.writerow(self
._dict
_to
_list
(rowdict
))
146 def writerows(self
, rowdicts
):
148 for rowdict
in rowdicts
:
149 rows
.append(self
._dict
_to
_list
(rowdict
))
150 return self
.writer
.writerows(rows
)
152 # Guard Sniffer's type checking against builds that exclude complex()
160 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
161 Returns a Dialect object.
164 # in case there is more than one possible delimiter
165 self
.preferred
= [',', '\t', ';', ' ', ':']
168 def sniff(self
, sample
, delimiters
=None):
170 Returns a dialect (or None) corresponding to the sample
173 quotechar
, delimiter
, skipinitialspace
= \
174 self
._guess
_quote
_and
_delimiter
(sample
, delimiters
)
176 delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
,
180 raise Error
, "Could not determine delimiter"
182 class dialect(Dialect
):
184 lineterminator
= '\r\n'
185 quoting
= QUOTE_MINIMAL
189 dialect
.delimiter
= delimiter
190 # _csv.reader won't accept a quotechar of ''
191 dialect
.quotechar
= quotechar
or '"'
192 dialect
.skipinitialspace
= skipinitialspace
197 def _guess_quote_and_delimiter(self
, data
, delimiters
):
199 Looks for text enclosed between two identical quotes
200 (the probable quotechar) which are preceded and followed
201 by the same character (the probable delimiter).
204 The quote with the most wins, same with the delimiter.
205 If there is no quotechar the delimiter can't be determined
210 for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
211 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
212 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
213 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
214 regexp
= re
.compile(restr
, re
.DOTALL | re
.MULTILINE
)
215 matches
= regexp
.findall(data
)
220 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
226 n
= regexp
.groupindex
['quote'] - 1
229 quotes
[key
] = quotes
.get(key
, 0) + 1
231 n
= regexp
.groupindex
['delim'] - 1
235 if key
and (delimiters
is None or key
in delimiters
):
236 delims
[key
] = delims
.get(key
, 0) + 1
238 n
= regexp
.groupindex
['space'] - 1
244 quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
245 (quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
248 delim
= reduce(lambda a
, b
, delims
= delims
:
249 (delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
250 skipinitialspace
= delims
[delim
] == spaces
251 if delim
== '\n': # most likely a file with a single column
254 # there is *no* delimiter, it's a single column of quoted data
258 return (quotechar
, delim
, skipinitialspace
)
261 def _guess_delimiter(self
, data
, delimiters
):
263 The delimiter /should/ occur the same number of times on
264 each row. However, due to malformed data, it may not. We don't want
265 an all or nothing approach, so we allow for small variations in this
267 1) build a table of the frequency of each character on every line.
268 2) build a table of freqencies of this frequency (meta-frequency?),
269 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
271 3) use the mode of the meta-frequency to determine the /expected/
272 frequency for that character
273 4) find out how often the character actually meets that goal
274 5) the character that best meets its goal is the delimiter
275 For performance reasons, the data is evaluated in chunks, so it can
276 try and evaluate the smallest portion of the data possible, evaluating
277 additional chunks as necessary.
280 data
= filter(None, data
.split('\n'))
282 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
284 # build frequency tables
285 chunkLength
= min(10, len(data
))
290 start
, end
= 0, min(chunkLength
, len(data
))
291 while start
< len(data
):
293 for line
in data
[start
:end
]:
295 metaFrequency
= charFrequency
.get(char
, {})
296 # must count even if frequency is 0
297 freq
= line
.count(char
)
299 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
300 charFrequency
[char
] = metaFrequency
302 for char
in charFrequency
.keys():
303 items
= charFrequency
[char
].items()
304 if len(items
) == 1 and items
[0][0] == 0:
306 # get the mode of the frequencies
308 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
310 # adjust the mode - subtract the sum of all
312 items
.remove(modes
[char
])
313 modes
[char
] = (modes
[char
][0], modes
[char
][1]
314 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
317 modes
[char
] = items
[0]
319 # build a list of possible delimiters
320 modeList
= modes
.items()
321 total
= float(chunkLength
* iteration
)
322 # (rows of consistent data) / (number of rows) = 100%
324 # minimum consistency threshold
326 while len(delims
) == 0 and consistency
>= threshold
:
327 for k
, v
in modeList
:
328 if v
[0] > 0 and v
[1] > 0:
329 if ((v
[1]/total
) >= consistency
and
330 (delimiters
is None or k
in delimiters
)):
335 delim
= delims
.keys()[0]
336 skipinitialspace
= (data
[0].count(delim
) ==
337 data
[0].count("%c " % delim
))
338 return (delim
, skipinitialspace
)
340 # analyze another chunkLength lines
347 # if there's more than one, fall back to a 'preferred' list
349 for d
in self
.preferred
:
350 if d
in delims
.keys():
351 skipinitialspace
= (data
[0].count(d
) ==
352 data
[0].count("%c " % d
))
353 return (d
, skipinitialspace
)
355 # nothing else indicates a preference, pick the character that
357 items
= [(v
,k
) for (k
,v
) in delims
.items()]
361 skipinitialspace
= (data
[0].count(delim
) ==
362 data
[0].count("%c " % delim
))
363 return (delim
, skipinitialspace
)
366 def has_header(self
, sample
):
367 # Creates a dictionary of types of data in each column. If any
368 # column is of a single type (say, integers), *except* for the first
369 # row, then the first row is presumed to be labels. If the type
370 # can't be determined, it is assumed to be a string in which case
371 # the length of the string is the determining factor: if all of the
372 # rows except for the first are the same length, it's a header.
373 # Finally, a 'vote' is taken at the end for each column, adding or
374 # subtracting from the likelihood of the first row being a header.
376 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
378 header
= rdr
.next() # assume first row is header
380 columns
= len(header
)
382 for i
in range(columns
): columnTypes
[i
] = None
386 # arbitrary number of rows to check, to keep it sane
391 if len(row
) != columns
:
392 continue # skip rows that have irregular number of columns
394 for col
in columnTypes
.keys():
396 for thisType
in [int, long, float, complex]:
400 except (ValueError, OverflowError):
403 # fallback to length of string
404 thisType
= len(row
[col
])
406 # treat longs as ints
410 if thisType
!= columnTypes
[col
]:
411 if columnTypes
[col
] is None: # add new column type
412 columnTypes
[col
] = thisType
414 # type is inconsistent, remove column from
418 # finally, compare results against first row and "vote"
419 # on whether it's a header
421 for col
, colType
in columnTypes
.items():
422 if type(colType
) == type(0): # it's a length
423 if len(header
[col
]) != colType
:
427 else: # attempt typecast
430 except (ValueError, TypeError):