Lib/csv.py

   1
   2 """
   3 csv.py - read/write/investigate CSV files
   4 """
   5
   6 import re
   7 from functools import reduce
   8 from _csv import Error, __version__, writer, reader, register_dialect, \
   9                  unregister_dialect, get_dialect, list_dialects, \
  10                  field_size_limit, \
  11                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
  12                  __doc__
  13 from _csv import Dialect as _Dialect
  14
  15 try:
  16     from cStringIO import StringIO
  17 except ImportError:
  18     from StringIO import StringIO
  19
  20 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
  21             "Error", "Dialect", "__doc__", "excel", "excel_tab",
  22             "field_size_limit", "reader", "writer",
  23             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
  24             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
  25
  26 class Dialect:
  27     """Describe an Excel dialect.
  28
  29     This must be subclassed (see csv.excel).  Valid attributes are:
  30     delimiter, quotechar, escapechar, doublequote, skipinitialspace,
  31     lineterminator, quoting.
  32
  33     """
  34     _name = ""
  35     _valid = False
  36     # placeholders
  37     delimiter = None
  38     quotechar = None
  39     escapechar = None
  40     doublequote = None
  41     skipinitialspace = None
  42     lineterminator = None
  43     quoting = None
  44
  45     def __init__(self):
  46         if self.__class__ != Dialect:
  47             self._valid = True
  48         self._validate()
  49
  50     def _validate(self):
  51         try:
  52             _Dialect(self)
  53         except TypeError, e:
  54             # We do this for compatibility with py2.3
  55             raise Error(str(e))
  56
  57 class excel(Dialect):
  58     """Describe the usual properties of Excel-generated CSV files."""
  59     delimiter = ','
  60     quotechar = '"'
  61     doublequote = True
  62     skipinitialspace = False
  63     lineterminator = '\r\n'
  64     quoting = QUOTE_MINIMAL
  65 register_dialect("excel", excel)
  66
  67 class excel_tab(excel):
  68     """Describe the usual properties of Excel-generated TAB-delimited files."""
  69     delimiter = '\t'
  70 register_dialect("excel-tab", excel_tab)
  71
  72
  73 class DictReader:
  74     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
  75                  dialect="excel", *args, **kwds):
  76         self._fieldnames = fieldnames   # list of keys for the dict
  77         self.restkey = restkey          # key to catch long rows
  78         self.restval = restval          # default value for short rows
  79         self.reader = reader(f, dialect, *args, **kwds)
  80         self.dialect = dialect
  81         self.line_num = 0
  82
  83     def __iter__(self):
  84         return self
  85
  86     @property
  87     def fieldnames(self):
  88         if self._fieldnames is None:
  89             try:
  90                 self._fieldnames = self.reader.next()
  91             except StopIteration:
  92                 pass
  93         self.line_num = self.reader.line_num
  94         return self._fieldnames
  95
  96     @fieldnames.setter
  97     def fieldnames(self, value):
  98         self._fieldnames = value
  99
 100     def next(self):
 101         if self.line_num == 0:
 102             # Used only for its side effect.
 103             self.fieldnames
 104         row = self.reader.next()
 105         self.line_num = self.reader.line_num
 106
 107         # unlike the basic reader, we prefer not to return blanks,
 108         # because we will typically wind up with a dict full of None
 109         # values
 110         while row == []:
 111             row = self.reader.next()
 112         d = dict(zip(self.fieldnames, row))
 113         lf = len(self.fieldnames)
 114         lr = len(row)
 115         if lf < lr:
 116             d[self.restkey] = row[lf:]
 117         elif lf > lr:
 118             for key in self.fieldnames[lr:]:
 119                 d[key] = self.restval
 120         return d
 121
 122
 123 class DictWriter:
 124     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
 125                  dialect="excel", *args, **kwds):
 126         self.fieldnames = fieldnames    # list of keys for the dict
 127         self.restval = restval          # for writing short dicts
 128         if extrasaction.lower() not in ("raise", "ignore"):
 129             raise ValueError, \
 130                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
 131                    extrasaction)
 132         self.extrasaction = extrasaction
 133         self.writer = writer(f, dialect, *args, **kwds)
 134
 135     def _dict_to_list(self, rowdict):
 136         if self.extrasaction == "raise":
 137             wrong_fields = [k for k in rowdict if k not in self.fieldnames]
 138             if wrong_fields:
 139                 raise ValueError("dict contains fields not in fieldnames: " +
 140                                  ", ".join(wrong_fields))
 141         return [rowdict.get(key, self.restval) for key in self.fieldnames]
 142
 143     def writerow(self, rowdict):
 144         return self.writer.writerow(self._dict_to_list(rowdict))
 145
 146     def writerows(self, rowdicts):
 147         rows = []
 148         for rowdict in rowdicts:
 149             rows.append(self._dict_to_list(rowdict))
 150         return self.writer.writerows(rows)
 151
 152 # Guard Sniffer's type checking against builds that exclude complex()
 153 try:
 154     complex
 155 except NameError:
 156     complex = float
 157
 158 class Sniffer:
 159     '''
 160     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
 161     Returns a Dialect object.
 162     '''
 163     def __init__(self):
 164         # in case there is more than one possible delimiter
 165         self.preferred = [',', '\t', ';', ' ', ':']
 166
 167
 168     def sniff(self, sample, delimiters=None):
 169         """
 170         Returns a dialect (or None) corresponding to the sample
 171         """
 172
 173         quotechar, doublequote, delimiter, skipinitialspace = \
 174                    self._guess_quote_and_delimiter(sample, delimiters)
 175         if not delimiter:
 176             delimiter, skipinitialspace = self._guess_delimiter(sample,
 177                                                                 delimiters)
 178
 179         if not delimiter:
 180             raise Error, "Could not determine delimiter"
 181
 182         class dialect(Dialect):
 183             _name = "sniffed"
 184             lineterminator = '\r\n'
 185             quoting = QUOTE_MINIMAL
 186             # escapechar = ''
 187
 188         dialect.doublequote = doublequote
 189         dialect.delimiter = delimiter
 190         # _csv.reader won't accept a quotechar of ''
 191         dialect.quotechar = quotechar or '"'
 192         dialect.skipinitialspace = skipinitialspace
 193
 194         return dialect
 195
 196
 197     def _guess_quote_and_delimiter(self, data, delimiters):
 198         """
 199         Looks for text enclosed between two identical quotes
 200         (the probable quotechar) which are preceded and followed
 201         by the same character (the probable delimiter).
 202         For example:
 203                          ,'some text',
 204         The quote with the most wins, same with the delimiter.
 205         If there is no quotechar the delimiter can't be determined
 206         this way.
 207         """
 208
 209         matches = []
 210         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
 211                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
 212                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
 213                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
 214             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
 215             matches = regexp.findall(data)
 216             if matches:
 217                 break
 218
 219         if not matches:
 220             # (quotechar, doublequote, delimiter, skipinitialspace)
 221             return ('', False, None, 0)
 222         quotes = {}
 223         delims = {}
 224         spaces = 0
 225         for m in matches:
 226             n = regexp.groupindex['quote'] - 1
 227             key = m[n]
 228             if key:
 229                 quotes[key] = quotes.get(key, 0) + 1
 230             try:
 231                 n = regexp.groupindex['delim'] - 1
 232                 key = m[n]
 233             except KeyError:
 234                 continue
 235             if key and (delimiters is None or key in delimiters):
 236                 delims[key] = delims.get(key, 0) + 1
 237             try:
 238                 n = regexp.groupindex['space'] - 1
 239             except KeyError:
 240                 continue
 241             if m[n]:
 242                 spaces += 1
 243
 244         quotechar = reduce(lambda a, b, quotes = quotes:
 245                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
 246
 247         if delims:
 248             delim = reduce(lambda a, b, delims = delims:
 249                            (delims[a] > delims[b]) and a or b, delims.keys())
 250             skipinitialspace = delims[delim] == spaces
 251             if delim == '\n': # most likely a file with a single column
 252                 delim = ''
 253         else:
 254             # there is *no* delimiter, it's a single column of quoted data
 255             delim = ''
 256             skipinitialspace = 0
 257
 258         # if we see an extra quote between delimiters, we've got a
 259         # double quoted format
 260         dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
 261                                {'delim':delim, 'quote':quotechar}, re.MULTILINE)
 262
 263
 264
 265         if dq_regexp.search(data):
 266             doublequote = True
 267         else:
 268             doublequote = False
 269
 270         return (quotechar, doublequote, delim, skipinitialspace)
 271
 272
 273     def _guess_delimiter(self, data, delimiters):
 274         """
 275         The delimiter /should/ occur the same number of times on
 276         each row. However, due to malformed data, it may not. We don't want
 277         an all or nothing approach, so we allow for small variations in this
 278         number.
 279           1) build a table of the frequency of each character on every line.
 280           2) build a table of freqencies of this frequency (meta-frequency?),
 281              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
 282              7 times in 2 rows'
 283           3) use the mode of the meta-frequency to determine the /expected/
 284              frequency for that character
 285           4) find out how often the character actually meets that goal
 286           5) the character that best meets its goal is the delimiter
 287         For performance reasons, the data is evaluated in chunks, so it can
 288         try and evaluate the smallest portion of the data possible, evaluating
 289         additional chunks as necessary.
 290         """
 291
 292         data = filter(None, data.split('\n'))
 293
 294         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
 295
 296         # build frequency tables
 297         chunkLength = min(10, len(data))
 298         iteration = 0
 299         charFrequency = {}
 300         modes = {}
 301         delims = {}
 302         start, end = 0, min(chunkLength, len(data))
 303         while start < len(data):
 304             iteration += 1
 305             for line in data[start:end]:
 306                 for char in ascii:
 307                     metaFrequency = charFrequency.get(char, {})
 308                     # must count even if frequency is 0
 309                     freq = line.count(char)
 310                     # value is the mode
 311                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
 312                     charFrequency[char] = metaFrequency
 313
 314             for char in charFrequency.keys():
 315                 items = charFrequency[char].items()
 316                 if len(items) == 1 and items[0][0] == 0:
 317                     continue
 318                 # get the mode of the frequencies
 319                 if len(items) > 1:
 320                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
 321                                          items)
 322                     # adjust the mode - subtract the sum of all
 323                     # other frequencies
 324                     items.remove(modes[char])
 325                     modes[char] = (modes[char][0], modes[char][1]
 326                                    - reduce(lambda a, b: (0, a[1] + b[1]),
 327                                             items)[1])
 328                 else:
 329                     modes[char] = items[0]
 330
 331             # build a list of possible delimiters
 332             modeList = modes.items()
 333             total = float(chunkLength * iteration)
 334             # (rows of consistent data) / (number of rows) = 100%
 335             consistency = 1.0
 336             # minimum consistency threshold
 337             threshold = 0.9
 338             while len(delims) == 0 and consistency >= threshold:
 339                 for k, v in modeList:
 340                     if v[0] > 0 and v[1] > 0:
 341                         if ((v[1]/total) >= consistency and
 342                             (delimiters is None or k in delimiters)):
 343                             delims[k] = v
 344                 consistency -= 0.01
 345
 346             if len(delims) == 1:
 347                 delim = delims.keys()[0]
 348                 skipinitialspace = (data[0].count(delim) ==
 349                                     data[0].count("%c " % delim))
 350                 return (delim, skipinitialspace)
 351
 352             # analyze another chunkLength lines
 353             start = end
 354             end += chunkLength
 355
 356         if not delims:
 357             return ('', 0)
 358
 359         # if there's more than one, fall back to a 'preferred' list
 360         if len(delims) > 1:
 361             for d in self.preferred:
 362                 if d in delims.keys():
 363                     skipinitialspace = (data[0].count(d) ==
 364                                         data[0].count("%c " % d))
 365                     return (d, skipinitialspace)
 366
 367         # nothing else indicates a preference, pick the character that
 368         # dominates(?)
 369         items = [(v,k) for (k,v) in delims.items()]
 370         items.sort()
 371         delim = items[-1][1]
 372
 373         skipinitialspace = (data[0].count(delim) ==
 374                             data[0].count("%c " % delim))
 375         return (delim, skipinitialspace)
 376
 377
 378     def has_header(self, sample):
 379         # Creates a dictionary of types of data in each column. If any
 380         # column is of a single type (say, integers), *except* for the first
 381         # row, then the first row is presumed to be labels. If the type
 382         # can't be determined, it is assumed to be a string in which case
 383         # the length of the string is the determining factor: if all of the
 384         # rows except for the first are the same length, it's a header.
 385         # Finally, a 'vote' is taken at the end for each column, adding or
 386         # subtracting from the likelihood of the first row being a header.
 387
 388         rdr = reader(StringIO(sample), self.sniff(sample))
 389
 390         header = rdr.next() # assume first row is header
 391
 392         columns = len(header)
 393         columnTypes = {}
 394         for i in range(columns): columnTypes[i] = None
 395
 396         checked = 0
 397         for row in rdr:
 398             # arbitrary number of rows to check, to keep it sane
 399             if checked > 20:
 400                 break
 401             checked += 1
 402
 403             if len(row) != columns:
 404                 continue # skip rows that have irregular number of columns
 405
 406             for col in columnTypes.keys():
 407
 408                 for thisType in [int, long, float, complex]:
 409                     try:
 410                         thisType(row[col])
 411                         break
 412                     except (ValueError, OverflowError):
 413                         pass
 414                 else:
 415                     # fallback to length of string
 416                     thisType = len(row[col])
 417
 418                 # treat longs as ints
 419                 if thisType == long:
 420                     thisType = int
 421
 422                 if thisType != columnTypes[col]:
 423                     if columnTypes[col] is None: # add new column type
 424                         columnTypes[col] = thisType
 425                     else:
 426                         # type is inconsistent, remove column from
 427                         # consideration
 428                         del columnTypes[col]
 429
 430         # finally, compare results against first row and "vote"
 431         # on whether it's a header
 432         hasHeader = 0
 433         for col, colType in columnTypes.items():
 434             if type(colType) == type(0): # it's a length
 435                 if len(header[col]) != colType:
 436                     hasHeader += 1
 437                 else:
 438                     hasHeader -= 1
 439             else: # attempt typecast
 440                 try:
 441                     colType(header[col])
 442                 except (ValueError, TypeError):
 443                     hasHeader += 1
 444                 else:
 445                     hasHeader -= 1
 446
 447         return hasHeader > 0