"""
csv.py - read/write/investigate CSV files
"""
import re
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

# Prefer the C StringIO implementation; fall back to the pure-Python one.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# Public API of the csv module.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter"]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders -- subclasses must supply real values
    # NOTE(review): the attribute lines other than skipinitialspace are not
    # visible in the mangled view; restored from the surrounding fragments.
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Let the C implementation vet the attribute values and re-raise
        # any TypeError as csv.Error (py2.6+ `as` syntax replaces the old
        # `except TypeError, e` form).
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    # NOTE(review): the class header and the delimiter/quotechar/doublequote
    # lines are not visible in the mangled view; restored to match the
    # skipinitialspace/lineterminator/quoting fragments that are.
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # NOTE(review): this override is not visible in the mangled view, but a
    # tab-delimited dialect must override the comma inherited from excel.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
def __init__(self, f, fieldnames=None, restkey=None, restval=None,
             dialect="excel", *args, **kwds):
    """Read rows from *f* as dicts keyed on *fieldnames*.

    f          -- iterable of CSV text lines, handed to reader()
    fieldnames -- list of keys for the dicts; if None, the first row read
                  supplies the names (see the fieldnames property)
    restkey    -- key under which extra values of over-long rows collect
    restval    -- value used to pad out short rows
    dialect    -- dialect name or Dialect instance passed to reader()
    """
    self._fieldnames = fieldnames   # list of keys for the dict
    self.restkey = restkey          # key to catch long rows
    self.restval = restval          # default value for short rows
    self.reader = reader(f, dialect, *args, **kwds)
    self.dialect = dialect
    # NOTE(review): restored; the sibling next() method reads
    # self.line_num, so it must be initialised here.
    self.line_num = 0

def __iter__(self):
    # NOTE(review): restored from the stdlib original (its lines are
    # entirely absent from this view); DictReader is its own iterator.
    return self
def fieldnames(self):
    """Return the field names, reading them from the first row if needed.

    NOTE(review): decorated with @property in the stdlib original; the
    decorator and def lines are not visible in this mangled view.
    """
    if self._fieldnames is None:
        # Lazily pull the header row off the underlying reader; an empty
        # input simply leaves fieldnames as None.
        try:
            self._fieldnames = self.reader.next()
        except StopIteration:
            pass
    self.line_num = self.reader.line_num
    return self._fieldnames
def fieldnames(self, value):
    """Replace the cached field-name list (property setter in the original;
    the @fieldnames.setter decorator is not visible in this view)."""
    self._fieldnames = value
def next(self):
    """Return the next row as a dict mapping fieldnames to values.

    Over-long rows stow their extra values in a list under self.restkey;
    short rows are padded with self.restval.  (Py2 iterator protocol.)

    NOTE(review): the def line, the blank-row loop, the long/short-row
    branches and the return are not visible in the mangled view; restored
    around the fragments that are.
    """
    if self.line_num == 0:
        # Used only for its side effect: the fieldnames property reads
        # the header row if the caller did not supply field names.
        self.fieldnames
    row = self.reader.next()
    self.line_num = self.reader.line_num

    # unlike the basic reader, we prefer not to return blanks,
    # because we will typically wind up with a dict full of None values
    while row == []:
        row = self.reader.next()
    d = dict(zip(self.fieldnames, row))
    lf = len(self.fieldnames)
    lr = len(row)
    if lf < lr:
        # Long row: collect the spill-over under restkey.
        d[self.restkey] = row[lf:]
    elif lf > lr:
        # Short row: pad the missing keys with restval.
        for key in self.fieldnames[lr:]:
            d[key] = self.restval
    return d
def __init__(self, f, fieldnames, restval="", extrasaction="raise",
             dialect="excel", *args, **kwds):
    """Write dicts to *f* as CSV rows ordered by *fieldnames*.

    restval      -- value written for keys missing from a row dict
    extrasaction -- "raise" (default) or "ignore": what to do when a row
                    dict contains keys not in fieldnames
    dialect      -- dialect name or Dialect instance passed to writer()
    """
    self.fieldnames = fieldnames    # list of keys for the dict
    self.restval = restval          # for writing short dicts
    if extrasaction.lower() not in ("raise", "ignore"):
        # py2.6+/py3-compatible call form replaces the original's old
        # `raise ValueError, msg` statement syntax.
        raise ValueError(
            "extrasaction (%s) must be 'raise' or 'ignore'" %
            extrasaction)
    self.extrasaction = extrasaction
    self.writer = writer(f, dialect, *args, **kwds)
def _dict_to_list(self, rowdict):
    """Convert *rowdict* into a list of values ordered by self.fieldnames.

    Missing keys are filled with self.restval; unexpected keys raise
    ValueError when self.extrasaction == "raise".
    """
    if self.extrasaction == "raise":
        wrong_fields = [k for k in rowdict if k not in self.fieldnames]
        # NOTE(review): this guard line is missing from the mangled view;
        # restored so we only raise when unexpected keys actually exist.
        if wrong_fields:
            raise ValueError("dict contains fields not in fieldnames: " +
                             ", ".join(wrong_fields))
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
def writerow(self, rowdict):
    """Write a single dict as one CSV row."""
    fields = self._dict_to_list(rowdict)
    return self.writer.writerow(fields)
def writerows(self, rowdicts):
    """Write a sequence of dicts, one CSV row each."""
    # NOTE(review): the accumulator initialisation is missing from the
    # mangled view; the append loop is rebuilt as a list comprehension.
    rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
    return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

# NOTE(review): the enclosing `class Sniffer:` header is not visible in this
# view; its docstring reads:
#     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
#     Returns a Dialect object.

def __init__(self):
    # in case there is more than one possible delimiter
    self.preferred = [',', '\t', ';', ' ', ':']
def sniff(self, sample, delimiters=None):
    """
    Returns a dialect (or None) corresponding to the sample

    NOTE(review): the guard lines and the sniffed-dialect class header are
    missing from the mangled view; restored around the visible fragments.
    """
    quotechar, delimiter, skipinitialspace = \
               self._guess_quote_and_delimiter(sample, delimiters)
    if not delimiter:
        delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                            delimiters)

    if not delimiter:
        # py2.6+/py3-compatible call form replaces `raise Error, "msg"`.
        raise Error("Could not determine delimiter")

    class dialect(Dialect):
        _name = "sniffed"
        lineterminator = '\r\n'
        quoting = QUOTE_MINIMAL
        # escapechar = ''
        doublequote = False

    dialect.delimiter = delimiter
    # _csv.reader won't accept a quotechar of ''
    dialect.quotechar = quotechar or '"'
    dialect.skipinitialspace = skipinitialspace

    return dialect
def _guess_quote_and_delimiter(self, data, delimiters):
    """
    Looks for text enclosed between two identical quotes
    (the probable quotechar) which are preceded and followed
    by the same character (the probable delimiter).
    For example:
                     ,'some text',
    The quote with the most wins, same with the delimiter.
    If there is no quotechar the delimiter can't be determined
    this way.

    NOTE(review): the accumulator/guard lines are missing from the mangled
    view; restored around the visible fragments.  The third pattern's
    delim group had a stray '>' (`(?P<delim>>...`), which made it match
    only '>' -- fixed here, as it later was upstream in CPython.
    """
    matches = []
    for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                  r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                  r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                  r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
        regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
        matches = regexp.findall(data)
        if matches:
            break

    if not matches:
        return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

    quotes = {}
    delims = {}
    spaces = 0
    for m in matches:
        n = regexp.groupindex['quote'] - 1
        key = m[n]
        if key:
            quotes[key] = quotes.get(key, 0) + 1
        try:
            n = regexp.groupindex['delim'] - 1
            key = m[n]
        except KeyError:
            continue
        if key and (delimiters is None or key in delimiters):
            delims[key] = delims.get(key, 0) + 1
        try:
            n = regexp.groupindex['space'] - 1
        except KeyError:
            continue
        if m[n]:
            spaces += 1

    # the quote character seen most often wins
    quotechar = reduce(lambda a, b, quotes = quotes:
                       (quotes[a] > quotes[b]) and a or b, quotes.keys())

    if delims:
        # likewise for the delimiter
        delim = reduce(lambda a, b, delims = delims:
                       (delims[a] > delims[b]) and a or b, delims.keys())
        skipinitialspace = delims[delim] == spaces
        if delim == '\n': # most likely a file with a single column
            delim = ''
    else:
        # there is *no* delimiter, it's a single column of quoted data
        delim = ''
        skipinitialspace = 0

    return (quotechar, delim, skipinitialspace)
def _guess_delimiter(self, data, delimiters):
    """
    The delimiter /should/ occur the same number of times on
    each row. However, due to malformed data, it may not. We don't want
    an all or nothing approach, so we allow for small variations in this
    number.
      1) build a table of the frequency of each character on every line.
      2) build a table of freqencies of this frequency (meta-frequency?),
         e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
         7 times in 2 rows'
      3) use the mode of the meta-frequency to determine the /expected/
         frequency for that character
      4) find out how often the character actually meets that goal
      5) the character that best meets its goal is the delimiter
    For performance reasons, the data is evaluated in chunks, so it can
    try and evaluate the smallest portion of the data possible, evaluating
    additional chunks as necessary.

    NOTE(review): the chunk-loop scaffolding (iteration counter,
    accumulator dicts, continue/fallback branches) is missing from the
    mangled view; restored around the visible fragments.  Py2 container
    semantics (list-returning filter/items/keys) are kept, matching the
    rest of the file.
    """
    data = filter(None, data.split('\n'))

    ascii = [chr(c) for c in range(127)] # 7-bit ASCII

    # build frequency tables
    chunkLength = min(10, len(data))
    iteration = 0
    charFrequency = {}
    modes = {}
    delims = {}
    start, end = 0, min(chunkLength, len(data))
    while start < len(data):
        iteration += 1
        for line in data[start:end]:
            for char in ascii:
                metaFrequency = charFrequency.get(char, {})
                # must count even if frequency is 0
                freq = line.count(char)
                # value is the mode
                metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                charFrequency[char] = metaFrequency

        for char in charFrequency.keys():
            items = charFrequency[char].items()
            if len(items) == 1 and items[0][0] == 0:
                continue
            # get the mode of the frequencies
            if len(items) > 1:
                modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                     items)
                # adjust the mode - subtract the sum of all
                # other frequencies
                items.remove(modes[char])
                modes[char] = (modes[char][0], modes[char][1]
                               - reduce(lambda a, b: (0, a[1] + b[1]),
                                        items)[1])
            else:
                modes[char] = items[0]

        # build a list of possible delimiters
        modeList = modes.items()
        total = float(chunkLength * iteration)
        # (rows of consistent data) / (number of rows) = 100%
        consistency = 1.0
        # minimum consistency threshold
        threshold = 0.9
        while len(delims) == 0 and consistency >= threshold:
            for k, v in modeList:
                if v[0] > 0 and v[1] > 0:
                    if ((v[1]/total) >= consistency and
                        (delimiters is None or k in delimiters)):
                        delims[k] = v
            consistency -= 0.01

        if len(delims) == 1:
            delim = delims.keys()[0]
            skipinitialspace = (data[0].count(delim) ==
                                data[0].count("%c " % delim))
            return (delim, skipinitialspace)

        # analyze another chunkLength lines
        start = end
        end += chunkLength

    if not delims:
        return ('', 0)

    # if there's more than one, fall back to a 'preferred' list
    if len(delims) > 1:
        for d in self.preferred:
            if d in delims.keys():
                skipinitialspace = (data[0].count(d) ==
                                    data[0].count("%c " % d))
                return (d, skipinitialspace)

    # nothing else indicates a preference, pick the character that
    # dominates(?)
    items = [(v,k) for (k,v) in delims.items()]
    items.sort()
    delim = items[-1][1]

    skipinitialspace = (data[0].count(delim) ==
                        data[0].count("%c " % delim))
    return (delim, skipinitialspace)
365 def has_header(self
, sample
):
366 # Creates a dictionary of types of data in each column. If any
367 # column is of a single type (say, integers), *except* for the first
368 # row, then the first row is presumed to be labels. If the type
369 # can't be determined, it is assumed to be a string in which case
370 # the length of the string is the determining factor: if all of the
371 # rows except for the first are the same length, it's a header.
372 # Finally, a 'vote' is taken at the end for each column, adding or
373 # subtracting from the likelihood of the first row being a header.
375 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
377 header
= rdr
.next() # assume first row is header
379 columns
= len(header
)
381 for i
in range(columns
): columnTypes
[i
] = None
385 # arbitrary number of rows to check, to keep it sane
390 if len(row
) != columns
:
391 continue # skip rows that have irregular number of columns
393 for col
in columnTypes
.keys():
395 for thisType
in [int, long, float, complex]:
399 except (ValueError, OverflowError):
402 # fallback to length of string
403 thisType
= len(row
[col
])
405 # treat longs as ints
409 if thisType
!= columnTypes
[col
]:
410 if columnTypes
[col
] is None: # add new column type
411 columnTypes
[col
] = thisType
413 # type is inconsistent, remove column from
417 # finally, compare results against first row and "vote"
418 # on whether it's a header
420 for col
, colType
in columnTypes
.items():
421 if type(colType
) == type(0): # it's a length
422 if len(header
[col
]) != colType
:
426 else: # attempt typecast
429 except (ValueError, TypeError):