Lib/csv.py

   1
   2 """
   3 csv.py - read/write/investigate CSV files
   4 """
   5
   6 import re
   7 from functools import reduce
   8 from _csv import Error, __version__, writer, reader, register_dialect, \
   9                  unregister_dialect, get_dialect, list_dialects, \
  10                  field_size_limit, \
  11                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
  12                  __doc__
  13 from _csv import Dialect as _Dialect
  14
  15 try:
  16     from cStringIO import StringIO
  17 except ImportError:
  18     from StringIO import StringIO
  19
  20 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
  21             "Error", "Dialect", "__doc__", "excel", "excel_tab",
  22             "field_size_limit", "reader", "writer",
  23             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
  24             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
  25
  26 class Dialect:
  27     """Describe an Excel dialect.
  28
  29     This must be subclassed (see csv.excel).  Valid attributes are:
  30     delimiter, quotechar, escapechar, doublequote, skipinitialspace,
  31     lineterminator, quoting.
  32
  33     """
  34     _name = ""
  35     _valid = False
  36     # placeholders
  37     delimiter = None
  38     quotechar = None
  39     escapechar = None
  40     doublequote = None
  41     skipinitialspace = None
  42     lineterminator = None
  43     quoting = None
  44
  45     def __init__(self):
  46         if self.__class__ != Dialect:
  47             self._valid = True
  48         self._validate()
  49
  50     def _validate(self):
  51         try:
  52             _Dialect(self)
  53         except TypeError, e:
  54             # We do this for compatibility with py2.3
  55             raise Error(str(e))
  56
  57 class excel(Dialect):
  58     """Describe the usual properties of Excel-generated CSV files."""
  59     delimiter = ','
  60     quotechar = '"'
  61     doublequote = True
  62     skipinitialspace = False
  63     lineterminator = '\r\n'
  64     quoting = QUOTE_MINIMAL
  65 register_dialect("excel", excel)
  66
  67 class excel_tab(excel):
  68     """Describe the usual properties of Excel-generated TAB-delimited files."""
  69     delimiter = '\t'
  70 register_dialect("excel-tab", excel_tab)
  71
  72
  73 class DictReader:
  74     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
  75                  dialect="excel", *args, **kwds):
  76         self._fieldnames = fieldnames   # list of keys for the dict
  77         self.restkey = restkey          # key to catch long rows
  78         self.restval = restval          # default value for short rows
  79         self.reader = reader(f, dialect, *args, **kwds)
  80         self.dialect = dialect
  81         self.line_num = 0
  82
  83     def __iter__(self):
  84         return self
  85
  86     @property
  87     def fieldnames(self):
  88         if self._fieldnames is None:
  89             try:
  90                 self._fieldnames = self.reader.next()
  91             except StopIteration:
  92                 pass
  93         self.line_num = self.reader.line_num
  94         return self._fieldnames
  95
  96     @fieldnames.setter
  97     def fieldnames(self, value):
  98         self._fieldnames = value
  99
 100     def next(self):
 101         if self.line_num == 0:
 102             # Used only for its side effect.
 103             self.fieldnames
 104         row = self.reader.next()
 105         self.line_num = self.reader.line_num
 106
 107         # unlike the basic reader, we prefer not to return blanks,
 108         # because we will typically wind up with a dict full of None
 109         # values
 110         while row == []:
 111             row = self.reader.next()
 112         d = dict(zip(self.fieldnames, row))
 113         lf = len(self.fieldnames)
 114         lr = len(row)
 115         if lf < lr:
 116             d[self.restkey] = row[lf:]
 117         elif lf > lr:
 118             for key in self.fieldnames[lr:]:
 119                 d[key] = self.restval
 120         return d
 121
 122
 123 class DictWriter:
 124     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
 125                  dialect="excel", *args, **kwds):
 126         self.fieldnames = fieldnames    # list of keys for the dict
 127         self.restval = restval          # for writing short dicts
 128         if extrasaction.lower() not in ("raise", "ignore"):
 129             raise ValueError, \
 130                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
 131                    extrasaction)
 132         self.extrasaction = extrasaction
 133         self.writer = writer(f, dialect, *args, **kwds)
 134
 135     def _dict_to_list(self, rowdict):
 136         if self.extrasaction == "raise":
 137             wrong_fields = [k for k in rowdict if k not in self.fieldnames]
 138             if wrong_fields:
 139                 raise ValueError("dict contains fields not in fieldnames: " +
 140                                  ", ".join(wrong_fields))
 141         return [rowdict.get(key, self.restval) for key in self.fieldnames]
 142
 143     def writerow(self, rowdict):
 144         return self.writer.writerow(self._dict_to_list(rowdict))
 145
 146     def writerows(self, rowdicts):
 147         rows = []
 148         for rowdict in rowdicts:
 149             rows.append(self._dict_to_list(rowdict))
 150         return self.writer.writerows(rows)
 151
# Guard Sniffer's type checking against builds that exclude complex():
# if the name cannot be looked up, bind it to float in this module so
# code that refers to complex (Sniffer.has_header) still works.
try:
    complex
except NameError:
    complex = float
 157
 158 class Sniffer:
 159     '''
 160     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
 161     Returns a Dialect object.
 162     '''
 163     def __init__(self):
 164         # in case there is more than one possible delimiter
 165         self.preferred = [',', '\t', ';', ' ', ':']
 166
 167
 168     def sniff(self, sample, delimiters=None):
 169         """
 170         Returns a dialect (or None) corresponding to the sample
 171         """
 172
 173         quotechar, delimiter, skipinitialspace = \
 174                    self._guess_quote_and_delimiter(sample, delimiters)
 175         if not delimiter:
 176             delimiter, skipinitialspace = self._guess_delimiter(sample,
 177                                                                 delimiters)
 178
 179         if not delimiter:
 180             raise Error, "Could not determine delimiter"
 181
 182         class dialect(Dialect):
 183             _name = "sniffed"
 184             lineterminator = '\r\n'
 185             quoting = QUOTE_MINIMAL
 186             # escapechar = ''
 187             doublequote = False
 188
 189         dialect.delimiter = delimiter
 190         # _csv.reader won't accept a quotechar of ''
 191         dialect.quotechar = quotechar or '"'
 192         dialect.skipinitialspace = skipinitialspace
 193
 194         return dialect
 195
 196
 197     def _guess_quote_and_delimiter(self, data, delimiters):
 198         """
 199         Looks for text enclosed between two identical quotes
 200         (the probable quotechar) which are preceded and followed
 201         by the same character (the probable delimiter).
 202         For example:
 203                          ,'some text',
 204         The quote with the most wins, same with the delimiter.
 205         If there is no quotechar the delimiter can't be determined
 206         this way.
 207         """
 208
 209         matches = []
 210         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
 211                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
 212                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
 213                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
 214             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
 215             matches = regexp.findall(data)
 216             if matches:
 217                 break
 218
 219         if not matches:
 220             return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
 221
 222         quotes = {}
 223         delims = {}
 224         spaces = 0
 225         for m in matches:
 226             n = regexp.groupindex['quote'] - 1
 227             key = m[n]
 228             if key:
 229                 quotes[key] = quotes.get(key, 0) + 1
 230             try:
 231                 n = regexp.groupindex['delim'] - 1
 232                 key = m[n]
 233             except KeyError:
 234                 continue
 235             if key and (delimiters is None or key in delimiters):
 236                 delims[key] = delims.get(key, 0) + 1
 237             try:
 238                 n = regexp.groupindex['space'] - 1
 239             except KeyError:
 240                 continue
 241             if m[n]:
 242                 spaces += 1
 243
 244         quotechar = reduce(lambda a, b, quotes = quotes:
 245                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
 246
 247         if delims:
 248             delim = reduce(lambda a, b, delims = delims:
 249                            (delims[a] > delims[b]) and a or b, delims.keys())
 250             skipinitialspace = delims[delim] == spaces
 251             if delim == '\n': # most likely a file with a single column
 252                 delim = ''
 253         else:
 254             # there is *no* delimiter, it's a single column of quoted data
 255             delim = ''
 256             skipinitialspace = 0
 257
 258         return (quotechar, delim, skipinitialspace)
 259
 260
 261     def _guess_delimiter(self, data, delimiters):
 262         """
 263         The delimiter /should/ occur the same number of times on
 264         each row. However, due to malformed data, it may not. We don't want
 265         an all or nothing approach, so we allow for small variations in this
 266         number.
 267           1) build a table of the frequency of each character on every line.
 268           2) build a table of freqencies of this frequency (meta-frequency?),
 269              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
 270              7 times in 2 rows'
 271           3) use the mode of the meta-frequency to determine the /expected/
 272              frequency for that character
 273           4) find out how often the character actually meets that goal
 274           5) the character that best meets its goal is the delimiter
 275         For performance reasons, the data is evaluated in chunks, so it can
 276         try and evaluate the smallest portion of the data possible, evaluating
 277         additional chunks as necessary.
 278         """
 279
 280         data = filter(None, data.split('\n'))
 281
 282         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
 283
 284         # build frequency tables
 285         chunkLength = min(10, len(data))
 286         iteration = 0
 287         charFrequency = {}
 288         modes = {}
 289         delims = {}
 290         start, end = 0, min(chunkLength, len(data))
 291         while start < len(data):
 292             iteration += 1
 293             for line in data[start:end]:
 294                 for char in ascii:
 295                     metaFrequency = charFrequency.get(char, {})
 296                     # must count even if frequency is 0
 297                     freq = line.count(char)
 298                     # value is the mode
 299                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
 300                     charFrequency[char] = metaFrequency
 301
 302             for char in charFrequency.keys():
 303                 items = charFrequency[char].items()
 304                 if len(items) == 1 and items[0][0] == 0:
 305                     continue
 306                 # get the mode of the frequencies
 307                 if len(items) > 1:
 308                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
 309                                          items)
 310                     # adjust the mode - subtract the sum of all
 311                     # other frequencies
 312                     items.remove(modes[char])
 313                     modes[char] = (modes[char][0], modes[char][1]
 314                                    - reduce(lambda a, b: (0, a[1] + b[1]),
 315                                             items)[1])
 316                 else:
 317                     modes[char] = items[0]
 318
 319             # build a list of possible delimiters
 320             modeList = modes.items()
 321             total = float(chunkLength * iteration)
 322             # (rows of consistent data) / (number of rows) = 100%
 323             consistency = 1.0
 324             # minimum consistency threshold
 325             threshold = 0.9
 326             while len(delims) == 0 and consistency >= threshold:
 327                 for k, v in modeList:
 328                     if v[0] > 0 and v[1] > 0:
 329                         if ((v[1]/total) >= consistency and
 330                             (delimiters is None or k in delimiters)):
 331                             delims[k] = v
 332                 consistency -= 0.01
 333
 334             if len(delims) == 1:
 335                 delim = delims.keys()[0]
 336                 skipinitialspace = (data[0].count(delim) ==
 337                                     data[0].count("%c " % delim))
 338                 return (delim, skipinitialspace)
 339
 340             # analyze another chunkLength lines
 341             start = end
 342             end += chunkLength
 343
 344         if not delims:
 345             return ('', 0)
 346
 347         # if there's more than one, fall back to a 'preferred' list
 348         if len(delims) > 1:
 349             for d in self.preferred:
 350                 if d in delims.keys():
 351                     skipinitialspace = (data[0].count(d) ==
 352                                         data[0].count("%c " % d))
 353                     return (d, skipinitialspace)
 354
 355         # nothing else indicates a preference, pick the character that
 356         # dominates(?)
 357         items = [(v,k) for (k,v) in delims.items()]
 358         items.sort()
 359         delim = items[-1][1]
 360
 361         skipinitialspace = (data[0].count(delim) ==
 362                             data[0].count("%c " % delim))
 363         return (delim, skipinitialspace)
 364
 365
 366     def has_header(self, sample):
 367         # Creates a dictionary of types of data in each column. If any
 368         # column is of a single type (say, integers), *except* for the first
 369         # row, then the first row is presumed to be labels. If the type
 370         # can't be determined, it is assumed to be a string in which case
 371         # the length of the string is the determining factor: if all of the
 372         # rows except for the first are the same length, it's a header.
 373         # Finally, a 'vote' is taken at the end for each column, adding or
 374         # subtracting from the likelihood of the first row being a header.
 375
 376         rdr = reader(StringIO(sample), self.sniff(sample))
 377
 378         header = rdr.next() # assume first row is header
 379
 380         columns = len(header)
 381         columnTypes = {}
 382         for i in range(columns): columnTypes[i] = None
 383
 384         checked = 0
 385         for row in rdr:
 386             # arbitrary number of rows to check, to keep it sane
 387             if checked > 20:
 388                 break
 389             checked += 1
 390
 391             if len(row) != columns:
 392                 continue # skip rows that have irregular number of columns
 393
 394             for col in columnTypes.keys():
 395
 396                 for thisType in [int, long, float, complex]:
 397                     try:
 398                         thisType(row[col])
 399                         break
 400                     except (ValueError, OverflowError):
 401                         pass
 402                 else:
 403                     # fallback to length of string
 404                     thisType = len(row[col])
 405
 406                 # treat longs as ints
 407                 if thisType == long:
 408                     thisType = int
 409
 410                 if thisType != columnTypes[col]:
 411                     if columnTypes[col] is None: # add new column type
 412                         columnTypes[col] = thisType
 413                     else:
 414                         # type is inconsistent, remove column from
 415                         # consideration
 416                         del columnTypes[col]
 417
 418         # finally, compare results against first row and "vote"
 419         # on whether it's a header
 420         hasHeader = 0
 421         for col, colType in columnTypes.items():
 422             if type(colType) == type(0): # it's a length
 423                 if len(header[col]) != colType:
 424                     hasHeader += 1
 425                 else:
 426                     hasHeader -= 1
 427             else: # attempt typecast
 428                 try:
 429                     colType(header[col])
 430                 except (ValueError, TypeError):
 431                     hasHeader += 1
 432                 else:
 433                     hasHeader -= 1
 434
 435         return hasHeader > 0