"""
csv.py - read/write/investigate CSV files
"""
import re
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

# Prefer the C StringIO implementation; fall back to the pure-Python one.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# Public API of the csv module.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter"]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders -- subclasses must supply real values
    # NOTE(review): the attribute lines other than skipinitialspace are not
    # visible in the mangled view; restored from the surrounding fragments.
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Let the C implementation vet the attribute values and re-raise
        # any TypeError as csv.Error (py2.6+ `as` syntax replaces the old
        # `except TypeError, e` form).
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    # NOTE(review): the class header and the delimiter/quotechar/doublequote
    # lines are not visible in the mangled view; restored to match the
    # skipinitialspace/lineterminator/quoting fragments that are.
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # NOTE(review): this override is not visible in the mangled view, but a
    # tab-delimited dialect must override the comma inherited from excel.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
def __init__(self, f, fieldnames=None, restkey=None, restval=None,
             dialect="excel", *args, **kwds):
    """Read rows from *f* as dicts keyed on *fieldnames*.

    f          -- iterable of CSV text lines, handed to reader()
    fieldnames -- list of keys for the dicts; if None, the first row read
                  supplies the names (see the fieldnames property)
    restkey    -- key under which extra values of over-long rows collect
    restval    -- value used to pad out short rows
    dialect    -- dialect name or Dialect instance passed to reader()
    """
    self._fieldnames = fieldnames   # list of keys for the dict
    self.restkey = restkey          # key to catch long rows
    self.restval = restval          # default value for short rows
    self.reader = reader(f, dialect, *args, **kwds)
    self.dialect = dialect
    # NOTE(review): restored; the sibling next() method reads
    # self.line_num, so it must be initialised here.
    self.line_num = 0

def __iter__(self):
    # NOTE(review): restored from the stdlib original (its lines are
    # entirely absent from this view); DictReader is its own iterator.
    return self
def fieldnames(self):
    """Return the field names, reading them from the first row if needed.

    NOTE(review): decorated with @property in the stdlib original; the
    decorator and def lines are not visible in this mangled view.
    """
    if self._fieldnames is None:
        # Lazily pull the header row off the underlying reader; an empty
        # input simply leaves fieldnames as None.
        try:
            self._fieldnames = self.reader.next()
        except StopIteration:
            pass
    self.line_num = self.reader.line_num
    return self._fieldnames
def fieldnames(self, value):
    """Replace the cached field-name list (property setter in the original;
    the @fieldnames.setter decorator is not visible in this view)."""
    self._fieldnames = value
def next(self):
    """Return the next row as a dict mapping fieldnames to values.

    Over-long rows stow their extra values in a list under self.restkey;
    short rows are padded with self.restval.  (Py2 iterator protocol.)

    NOTE(review): the def line, the blank-row loop, the long/short-row
    branches and the return are not visible in the mangled view; restored
    around the fragments that are.
    """
    if self.line_num == 0:
        # Used only for its side effect: the fieldnames property reads
        # the header row if the caller did not supply field names.
        self.fieldnames
    row = self.reader.next()
    self.line_num = self.reader.line_num

    # unlike the basic reader, we prefer not to return blanks,
    # because we will typically wind up with a dict full of None values
    while row == []:
        row = self.reader.next()
    d = dict(zip(self.fieldnames, row))
    lf = len(self.fieldnames)
    lr = len(row)
    if lf < lr:
        # Long row: collect the spill-over under restkey.
        d[self.restkey] = row[lf:]
    elif lf > lr:
        # Short row: pad the missing keys with restval.
        for key in self.fieldnames[lr:]:
            d[key] = self.restval
    return d
def __init__(self, f, fieldnames, restval="", extrasaction="raise",
             dialect="excel", *args, **kwds):
    """Write dicts to *f* as CSV rows ordered by *fieldnames*.

    restval      -- value written for keys missing from a row dict
    extrasaction -- "raise" (default) or "ignore": what to do when a row
                    dict contains keys not in fieldnames
    dialect      -- dialect name or Dialect instance passed to writer()
    """
    self.fieldnames = fieldnames    # list of keys for the dict
    self.restval = restval          # for writing short dicts
    if extrasaction.lower() not in ("raise", "ignore"):
        # py2.6+/py3-compatible call form replaces the original's old
        # `raise ValueError, msg` statement syntax.
        raise ValueError(
            "extrasaction (%s) must be 'raise' or 'ignore'" %
            extrasaction)
    self.extrasaction = extrasaction
    self.writer = writer(f, dialect, *args, **kwds)
def _dict_to_list(self, rowdict):
    """Convert *rowdict* into a list of values ordered by self.fieldnames.

    Missing keys are filled with self.restval; unexpected keys raise
    ValueError when self.extrasaction == "raise".
    """
    if self.extrasaction == "raise":
        wrong_fields = [k for k in rowdict if k not in self.fieldnames]
        # NOTE(review): this guard line is missing from the mangled view;
        # restored so we only raise when unexpected keys actually exist.
        if wrong_fields:
            raise ValueError("dict contains fields not in fieldnames: " +
                             ", ".join(wrong_fields))
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
def writerow(self, rowdict):
    """Write a single dict as one CSV row."""
    fields = self._dict_to_list(rowdict)
    return self.writer.writerow(fields)
def writerows(self, rowdicts):
    """Write a sequence of dicts, one CSV row each."""
    # NOTE(review): the accumulator initialisation is missing from the
    # mangled view; the append loop is rebuilt as a list comprehension.
    rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
    return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

# NOTE(review): the enclosing `class Sniffer:` header is not visible in this
# view; its docstring reads:
#     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
#     Returns a Dialect object.

def __init__(self):
    # in case there is more than one possible delimiter
    self.preferred = [',', '\t', ';', ' ', ':']
def sniff(self, sample, delimiters=None):
    """
    Returns a dialect (or None) corresponding to the sample

    NOTE(review): the guard lines and the sniffed-dialect class header are
    missing from the mangled view; restored around the visible fragments.
    """
    quotechar, delimiter, skipinitialspace = \
               self._guess_quote_and_delimiter(sample, delimiters)
    if not delimiter:
        delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                            delimiters)

    if not delimiter:
        # py2.6+/py3-compatible call form replaces `raise Error, "msg"`.
        raise Error("Could not determine delimiter")

    class dialect(Dialect):
        _name = "sniffed"
        lineterminator = '\r\n'
        quoting = QUOTE_MINIMAL
        # escapechar = ''
        doublequote = False

    dialect.delimiter = delimiter
    # _csv.reader won't accept a quotechar of ''
    dialect.quotechar = quotechar or '"'
    dialect.skipinitialspace = skipinitialspace

    return dialect
def _guess_quote_and_delimiter(self, data, delimiters):
    """
    Looks for text enclosed between two identical quotes
    (the probable quotechar) which are preceded and followed
    by the same character (the probable delimiter).
    For example:
                     ,'some text',
    The quote with the most wins, same with the delimiter.
    If there is no quotechar the delimiter can't be determined
    this way.

    NOTE(review): the accumulator/guard lines are missing from the mangled
    view; restored around the visible fragments.  The third pattern's
    delim group had a stray '>' (`(?P<delim>>...`), which made it match
    only '>' -- fixed here, as it later was upstream in CPython.
    """
    matches = []
    for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                  r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                  r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                  r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
        regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
        matches = regexp.findall(data)
        if matches:
            break

    if not matches:
        return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

    quotes = {}
    delims = {}
    spaces = 0
    for m in matches:
        n = regexp.groupindex['quote'] - 1
        key = m[n]
        if key:
            quotes[key] = quotes.get(key, 0) + 1
        try:
            n = regexp.groupindex['delim'] - 1
            key = m[n]
        except KeyError:
            continue
        if key and (delimiters is None or key in delimiters):
            delims[key] = delims.get(key, 0) + 1
        try:
            n = regexp.groupindex['space'] - 1
        except KeyError:
            continue
        if m[n]:
            spaces += 1

    # the quote character seen most often wins
    quotechar = reduce(lambda a, b, quotes = quotes:
                       (quotes[a] > quotes[b]) and a or b, quotes.keys())

    if delims:
        # likewise for the delimiter
        delim = reduce(lambda a, b, delims = delims:
                       (delims[a] > delims[b]) and a or b, delims.keys())
        skipinitialspace = delims[delim] == spaces
        if delim == '\n': # most likely a file with a single column
            delim = ''
    else:
        # there is *no* delimiter, it's a single column of quoted data
        delim = ''
        skipinitialspace = 0

    return (quotechar, delim, skipinitialspace)
def _guess_delimiter(self, data, delimiters):
    """
    The delimiter /should/ occur the same number of times on
    each row. However, due to malformed data, it may not. We don't want
    an all or nothing approach, so we allow for small variations in this
    number.
      1) build a table of the frequency of each character on every line.
      2) build a table of freqencies of this frequency (meta-frequency?),
         e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
         7 times in 2 rows'
      3) use the mode of the meta-frequency to determine the /expected/
         frequency for that character
      4) find out how often the character actually meets that goal
      5) the character that best meets its goal is the delimiter
    For performance reasons, the data is evaluated in chunks, so it can
    try and evaluate the smallest portion of the data possible, evaluating
    additional chunks as necessary.

    NOTE(review): the chunk-loop scaffolding (iteration counter,
    accumulator dicts, continue/fallback branches) is missing from the
    mangled view; restored around the visible fragments.  Py2 container
    semantics (list-returning filter/items/keys) are kept, matching the
    rest of the file.
    """
    data = filter(None, data.split('\n'))

    ascii = [chr(c) for c in range(127)] # 7-bit ASCII

    # build frequency tables
    chunkLength = min(10, len(data))
    iteration = 0
    charFrequency = {}
    modes = {}
    delims = {}
    start, end = 0, min(chunkLength, len(data))
    while start < len(data):
        iteration += 1
        for line in data[start:end]:
            for char in ascii:
                metaFrequency = charFrequency.get(char, {})
                # must count even if frequency is 0
                freq = line.count(char)
                # value is the mode
                metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                charFrequency[char] = metaFrequency

        for char in charFrequency.keys():
            items = charFrequency[char].items()
            if len(items) == 1 and items[0][0] == 0:
                continue
            # get the mode of the frequencies
            if len(items) > 1:
                modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                     items)
                # adjust the mode - subtract the sum of all
                # other frequencies
                items.remove(modes[char])
                modes[char] = (modes[char][0], modes[char][1]
                               - reduce(lambda a, b: (0, a[1] + b[1]),
                                        items)[1])
            else:
                modes[char] = items[0]

        # build a list of possible delimiters
        modeList = modes.items()
        total = float(chunkLength * iteration)
        # (rows of consistent data) / (number of rows) = 100%
        consistency = 1.0
        # minimum consistency threshold
        threshold = 0.9
        while len(delims) == 0 and consistency >= threshold:
            for k, v in modeList:
                if v[0] > 0 and v[1] > 0:
                    if ((v[1]/total) >= consistency and
                        (delimiters is None or k in delimiters)):
                        delims[k] = v
            consistency -= 0.01

        if len(delims) == 1:
            delim = delims.keys()[0]
            skipinitialspace = (data[0].count(delim) ==
                                data[0].count("%c " % delim))
            return (delim, skipinitialspace)

        # analyze another chunkLength lines
        start = end
        end += chunkLength

    if not delims:
        return ('', 0)

    # if there's more than one, fall back to a 'preferred' list
    if len(delims) > 1:
        for d in self.preferred:
            if d in delims.keys():
                skipinitialspace = (data[0].count(d) ==
                                    data[0].count("%c " % d))
                return (d, skipinitialspace)

    # nothing else indicates a preference, pick the character that
    # dominates(?)
    items = [(v,k) for (k,v) in delims.items()]
    items.sort()
    delim = items[-1][1]

    skipinitialspace = (data[0].count(delim) ==
                        data[0].count("%c " % delim))
    return (delim, skipinitialspace)
365 def has_header(self
, sample
):
366 # Creates a dictionary of types of data in each column. If any
367 # column is of a single type (say, integers), *except* for the first
368 # row, then the first row is presumed to be labels. If the type
369 # can't be determined, it is assumed to be a string in which case
370 # the length of the string is the determining factor: if all of the
371 # rows except for the first are the same length, it's a header.
372 # Finally, a 'vote' is taken at the end for each column, adding or
373 # subtracting from the likelihood of the first row being a header.
375 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
377 header
= rdr
.next() # assume first row is header
379 columns
= len(header
)
381 for i
in range(columns
): columnTypes
[i
] = None
385 # arbitrary number of rows to check, to keep it sane
390 if len(row
) != columns
:
391 continue # skip rows that have irregular number of columns
393 for col
in columnTypes
.keys():
395 for thisType
in [int, long, float, complex]:
399 except (ValueError, OverflowError):
402 # fallback to length of string
403 thisType
= len(row
[col
])
405 # treat longs as ints
409 if thisType
!= columnTypes
[col
]:
410 if columnTypes
[col
] is None: # add new column type
411 columnTypes
[col
] = thisType
413 # type is inconsistent, remove column from
417 # finally, compare results against first row and "vote"
418 # on whether it's a header
420 for col
, colType
in columnTypes
.items():
421 if type(colType
) == type(0): # it's a length
422 if len(header
[col
]) != colType
:
426 else: # attempt typecast
429 except (ValueError, TypeError):