"""
csv.py - read/write/investigate CSV files
"""
import re
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# Public API of the csv module; star-imports pick up exactly these names.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "__doc__", "excel", "excel_tab",
           "field_size_limit", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter"]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # Placeholders; subclasses are expected to override these.
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Direct instantiation of Dialect is considered invalid; only
        # subclasses (which supply real attribute values) are valid.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate validation to the C implementation: building a _csv
        # Dialect from our attributes raises TypeError on bad values.
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
class DictReader:
    """Read CSV rows as dictionaries keyed by the field names.

    If *fieldnames* is omitted, the first row read is consumed and used as
    the field names.  Extra values on long rows are collected in a list
    under *restkey*; missing values on short rows are filled with *restval*.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily read the header row the first time field names are needed.
        if self._fieldnames is None:
            try:
                # next() builtin works on both Python 2.6+ and 3.x readers.
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Used only for its side effect (reading the header row).
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing fields with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
class DictWriter:
    """Write dictionaries as CSV rows, ordered by *fieldnames*.

    Keys missing from a row dict are written as *restval*.  Keys not in
    *fieldnames* either raise ValueError or are dropped, per *extrasaction*
    ("raise" or "ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        # Flatten a row dict into a list ordered like self.fieldnames.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])
# Guard Sniffer's type checking against builds that exclude complex()
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """
        quotechar, doublequote, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """
        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',    #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',    # ,".*?"  (stray '>' typo removed)
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                             #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        # Tally candidate quote characters, delimiters, and leading spaces.
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # the last regex has no delim/space groups
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequent quote character wins.
        quotechar = reduce(lambda a, b, quotes=quotes:
                           (quotes[a] > quotes[b]) and a or b,
                           list(quotes.keys()))

        if delims:
            delim = reduce(lambda a, b, delims=delims:
                           (delims[a] > delims[b]) and a or b,
                           list(delims.keys()))
            skipinitialspace = delims[delim] == spaces
            if delim == '\n':  # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            # escape the delimiter so metacharacters like '|' don't break the pattern
            {'delim': re.escape(delim), 'quote': quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
        data = [line for line in data.split('\n') if line]

        ascii_chars = [chr(c) for c in range(127)]  # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1] / total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v, k) for (k, v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)
378 def has_header(self
, sample
):
379 # Creates a dictionary of types of data in each column. If any
380 # column is of a single type (say, integers), *except* for the first
381 # row, then the first row is presumed to be labels. If the type
382 # can't be determined, it is assumed to be a string in which case
383 # the length of the string is the determining factor: if all of the
384 # rows except for the first are the same length, it's a header.
385 # Finally, a 'vote' is taken at the end for each column, adding or
386 # subtracting from the likelihood of the first row being a header.
388 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
390 header
= rdr
.next() # assume first row is header
392 columns
= len(header
)
394 for i
in range(columns
): columnTypes
[i
] = None
398 # arbitrary number of rows to check, to keep it sane
403 if len(row
) != columns
:
404 continue # skip rows that have irregular number of columns
406 for col
in columnTypes
.keys():
408 for thisType
in [int, long, float, complex]:
412 except (ValueError, OverflowError):
415 # fallback to length of string
416 thisType
= len(row
[col
])
418 # treat longs as ints
422 if thisType
!= columnTypes
[col
]:
423 if columnTypes
[col
] is None: # add new column type
424 columnTypes
[col
] = thisType
426 # type is inconsistent, remove column from
430 # finally, compare results against first row and "vote"
431 # on whether it's a header
433 for col
, colType
in columnTypes
.items():
434 if type(colType
) == type(0): # it's a length
435 if len(header
[col
]) != colType
:
439 else: # attempt typecast
442 except (ValueError, TypeError):