3 csv.py - read/write/investigate CSV files
7 from _csv
import Error
, __version__
, writer
, reader
, register_dialect
, \
8 unregister_dialect
, get_dialect
, list_dialects
, \
10 QUOTE_MINIMAL
, QUOTE_ALL
, QUOTE_NONNUMERIC
, QUOTE_NONE
, \
12 from _csv
import Dialect
as _Dialect
15 from cStringIO
import StringIO
17 from StringIO
import StringIO
19 __all__
= [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25 """Describe an Excel dialect.
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
39 skipinitialspace
= None
44 if self
.__class
__ != Dialect
:
52 # We do this for compatibility with py2.3
56 """Describe the usual properties of Excel-generated CSV files."""
60 skipinitialspace
= False
61 lineterminator
= '\r\n'
62 quoting
= QUOTE_MINIMAL
63 register_dialect("excel", excel
)
65 class excel_tab(excel
):
66 """Describe the usual properties of Excel-generated TAB-delimited files."""
68 register_dialect("excel-tab", excel_tab
)
72 def __init__(self
, f
, fieldnames
=None, restkey
=None, restval
=None,
73 dialect
="excel", *args
, **kwds
):
74 self
.fieldnames
= fieldnames
# list of keys for the dict
75 self
.restkey
= restkey
# key to catch long rows
76 self
.restval
= restval
# default value for short rows
77 self
.reader
= reader(f
, dialect
, *args
, **kwds
)
83 row
= self
.reader
.next()
84 if self
.fieldnames
is None:
86 row
= self
.reader
.next()
88 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
92 row
= self
.reader
.next()
93 d
= dict(zip(self
.fieldnames
, row
))
94 lf
= len(self
.fieldnames
)
97 d
[self
.restkey
] = row
[lf
:]
99 for key
in self
.fieldnames
[lr
:]:
100 d
[key
] = self
.restval
105 def __init__(self
, f
, fieldnames
, restval
="", extrasaction
="raise",
106 dialect
="excel", *args
, **kwds
):
107 self
.fieldnames
= fieldnames
# list of keys for the dict
108 self
.restval
= restval
# for writing short dicts
109 if extrasaction
.lower() not in ("raise", "ignore"):
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
113 self
.extrasaction
= extrasaction
114 self
.writer
= writer(f
, dialect
, *args
, **kwds
)
116 def _dict_to_list(self
, rowdict
):
117 if self
.extrasaction
== "raise":
118 wrong_fields
= [k
for k
in rowdict
if k
not in self
.fieldnames
]
120 raise ValueError("dict contains fields not in fieldnames: " +
121 ", ".join(wrong_fields
))
122 return [rowdict
.get(key
, self
.restval
) for key
in self
.fieldnames
]
124 def writerow(self
, rowdict
):
125 return self
.writer
.writerow(self
._dict
_to
_list
(rowdict
))
127 def writerows(self
, rowdicts
):
129 for rowdict
in rowdicts
:
130 rows
.append(self
._dict
_to
_list
(rowdict
))
131 return self
.writer
.writerows(rows
)
133 # Guard Sniffer's type checking against builds that exclude complex()
141 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
142 Returns a Dialect object.
145 # in case there is more than one possible delimiter
146 self
.preferred
= [',', '\t', ';', ' ', ':']
149 def sniff(self
, sample
, delimiters
=None):
151 Returns a dialect (or None) corresponding to the sample
154 quotechar
, delimiter
, skipinitialspace
= \
155 self
._guess
_quote
_and
_delimiter
(sample
, delimiters
)
157 delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
,
161 raise Error
, "Could not determine delimiter"
163 class dialect(Dialect
):
165 lineterminator
= '\r\n'
166 quoting
= QUOTE_MINIMAL
170 dialect
.delimiter
= delimiter
171 # _csv.reader won't accept a quotechar of ''
172 dialect
.quotechar
= quotechar
or '"'
173 dialect
.skipinitialspace
= skipinitialspace
178 def _guess_quote_and_delimiter(self
, data
, delimiters
):
180 Looks for text enclosed between two identical quotes
181 (the probable quotechar) which are preceded and followed
182 by the same character (the probable delimiter).
185 The quote with the most wins, same with the delimiter.
186 If there is no quotechar the delimiter can't be determined
191 for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
192 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
193 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
194 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
195 regexp
= re
.compile(restr
, re
.DOTALL | re
.MULTILINE
)
196 matches
= regexp
.findall(data
)
201 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
207 n
= regexp
.groupindex
['quote'] - 1
210 quotes
[key
] = quotes
.get(key
, 0) + 1
212 n
= regexp
.groupindex
['delim'] - 1
216 if key
and (delimiters
is None or key
in delimiters
):
217 delims
[key
] = delims
.get(key
, 0) + 1
219 n
= regexp
.groupindex
['space'] - 1
225 quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
226 (quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
229 delim
= reduce(lambda a
, b
, delims
= delims
:
230 (delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
231 skipinitialspace
= delims
[delim
] == spaces
232 if delim
== '\n': # most likely a file with a single column
235 # there is *no* delimiter, it's a single column of quoted data
239 return (quotechar
, delim
, skipinitialspace
)
242 def _guess_delimiter(self
, data
, delimiters
):
244 The delimiter /should/ occur the same number of times on
245 each row. However, due to malformed data, it may not. We don't want
246 an all or nothing approach, so we allow for small variations in this
248 1) build a table of the frequency of each character on every line.
249 2) build a table of freqencies of this frequency (meta-frequency?),
250 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
252 3) use the mode of the meta-frequency to determine the /expected/
253 frequency for that character
254 4) find out how often the character actually meets that goal
255 5) the character that best meets its goal is the delimiter
256 For performance reasons, the data is evaluated in chunks, so it can
257 try and evaluate the smallest portion of the data possible, evaluating
258 additional chunks as necessary.
261 data
= filter(None, data
.split('\n'))
263 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
265 # build frequency tables
266 chunkLength
= min(10, len(data
))
271 start
, end
= 0, min(chunkLength
, len(data
))
272 while start
< len(data
):
274 for line
in data
[start
:end
]:
276 metaFrequency
= charFrequency
.get(char
, {})
277 # must count even if frequency is 0
278 freq
= line
.count(char
)
280 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
281 charFrequency
[char
] = metaFrequency
283 for char
in charFrequency
.keys():
284 items
= charFrequency
[char
].items()
285 if len(items
) == 1 and items
[0][0] == 0:
287 # get the mode of the frequencies
289 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
291 # adjust the mode - subtract the sum of all
293 items
.remove(modes
[char
])
294 modes
[char
] = (modes
[char
][0], modes
[char
][1]
295 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
298 modes
[char
] = items
[0]
300 # build a list of possible delimiters
301 modeList
= modes
.items()
302 total
= float(chunkLength
* iteration
)
303 # (rows of consistent data) / (number of rows) = 100%
305 # minimum consistency threshold
307 while len(delims
) == 0 and consistency
>= threshold
:
308 for k
, v
in modeList
:
309 if v
[0] > 0 and v
[1] > 0:
310 if ((v
[1]/total
) >= consistency
and
311 (delimiters
is None or k
in delimiters
)):
316 delim
= delims
.keys()[0]
317 skipinitialspace
= (data
[0].count(delim
) ==
318 data
[0].count("%c " % delim
))
319 return (delim
, skipinitialspace
)
321 # analyze another chunkLength lines
328 # if there's more than one, fall back to a 'preferred' list
330 for d
in self
.preferred
:
331 if d
in delims
.keys():
332 skipinitialspace
= (data
[0].count(d
) ==
333 data
[0].count("%c " % d
))
334 return (d
, skipinitialspace
)
336 # nothing else indicates a preference, pick the character that
338 items
= [(v
,k
) for (k
,v
) in delims
.items()]
342 skipinitialspace
= (data
[0].count(delim
) ==
343 data
[0].count("%c " % delim
))
344 return (delim
, skipinitialspace
)
347 def has_header(self
, sample
):
348 # Creates a dictionary of types of data in each column. If any
349 # column is of a single type (say, integers), *except* for the first
350 # row, then the first row is presumed to be labels. If the type
351 # can't be determined, it is assumed to be a string in which case
352 # the length of the string is the determining factor: if all of the
353 # rows except for the first are the same length, it's a header.
354 # Finally, a 'vote' is taken at the end for each column, adding or
355 # subtracting from the likelihood of the first row being a header.
357 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
359 header
= rdr
.next() # assume first row is header
361 columns
= len(header
)
363 for i
in range(columns
): columnTypes
[i
] = None
367 # arbitrary number of rows to check, to keep it sane
372 if len(row
) != columns
:
373 continue # skip rows that have irregular number of columns
375 for col
in columnTypes
.keys():
377 for thisType
in [int, long, float, complex]:
381 except (ValueError, OverflowError):
384 # fallback to length of string
385 thisType
= len(row
[col
])
387 # treat longs as ints
391 if thisType
!= columnTypes
[col
]:
392 if columnTypes
[col
] is None: # add new column type
393 columnTypes
[col
] = thisType
395 # type is inconsistent, remove column from
399 # finally, compare results against first row and "vote"
400 # on whether it's a header
402 for col
, colType
in columnTypes
.items():
403 if type(colType
) == type(0): # it's a length
404 if len(header
[col
]) != colType
:
408 else: # attempt typecast
411 except (ValueError, TypeError):