# Lib/csv.py -- CSV reading/writing (CPython standard library, Python 2 line)
# blob 41026e03bdb2532a3744ee7a8e359276dab414fc
2 """
3 csv.py - read/write/investigate CSV files
4 """
6 import re
7 from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 field_size_limit, \
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
12 from _csv import Dialect as _Dialect
14 try:
15 from cStringIO import StringIO
16 except ImportError:
17 from StringIO import StringIO
# Public names exported by ``from csv import *``: the quoting constants and
# C-level functions re-exported from _csv, plus the classes defined below.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders -- subclasses override these with concrete values
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are marked valid; instantiating Dialect itself
        # leaves _valid False.  Validation runs in either case.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Hand the attributes to the C-level Dialect, which does the real
        # checking; its TypeError is re-raised as csv.Error.
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Dialect describing the CSV variant produced by Microsoft Excel."""
    lineterminator = '\r\n'     # Excel terminates rows with CRLF
    delimiter = ','
    quotechar = '"'
    doublequote = True          # embedded quotes are doubled, not escaped
    quoting = QUOTE_MINIMAL     # quote a field only when it needs it
    skipinitialspace = False    # whitespace after a delimiter is data
register_dialect("excel", excel)
class excel_tab(excel):
    """The Excel dialect, but TAB-delimited instead of comma-delimited."""
    delimiter = '\t'        # everything else is inherited from excel
register_dialect("excel-tab", excel_tab)
71 class DictReader:
72 def __init__(self, f, fieldnames=None, restkey=None, restval=None,
73 dialect="excel", *args, **kwds):
74 self.fieldnames = fieldnames # list of keys for the dict
75 self.restkey = restkey # key to catch long rows
76 self.restval = restval # default value for short rows
77 self.reader = reader(f, dialect, *args, **kwds)
78 self.dialect = dialect
79 self.line_num = 0
81 def __iter__(self):
82 return self
84 def next(self):
85 row = self.reader.next()
86 if self.fieldnames is None:
87 self.fieldnames = row
88 row = self.reader.next()
89 self.line_num = self.reader.line_num
91 # unlike the basic reader, we prefer not to return blanks,
92 # because we will typically wind up with a dict full of None
93 # values
94 while row == []:
95 row = self.reader.next()
96 d = dict(zip(self.fieldnames, row))
97 lf = len(self.fieldnames)
98 lr = len(row)
99 if lf < lr:
100 d[self.restkey] = row[lf:]
101 elif lf > lr:
102 for key in self.fieldnames[lr:]:
103 d[key] = self.restval
104 return d
class DictWriter:
    """Write mappings to a CSV file as rows ordered by *fieldnames*.

    restval      -- value written for any fieldname missing from a row
                    dict (default "")
    extrasaction -- "raise" (default) to error on keys not listed in
                    fieldnames, or "ignore" to silently drop them
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call form of raise: identical behavior to the old
            # ``raise ValueError, ...`` statement, but also valid py3.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the fieldnames themselves.

        Backward-compatible addition, mirroring the method that the
        standard library gained in Python 2.7.
        """
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Project the dict onto fieldnames order, substituting restval
        # for any missing key; honor extrasaction for unknown keys.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as CSV rows."""
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Fall back to float so has_header's type-probing loop still works.
    complex = float
142 class Sniffer:
144 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
145 Returns a Dialect object.
147 def __init__(self):
148 # in case there is more than one possible delimiter
149 self.preferred = [',', '\t', ';', ' ', ':']
152 def sniff(self, sample, delimiters=None):
154 Returns a dialect (or None) corresponding to the sample
157 quotechar, delimiter, skipinitialspace = \
158 self._guess_quote_and_delimiter(sample, delimiters)
159 if not delimiter:
160 delimiter, skipinitialspace = self._guess_delimiter(sample,
161 delimiters)
163 if not delimiter:
164 raise Error, "Could not determine delimiter"
166 class dialect(Dialect):
167 _name = "sniffed"
168 lineterminator = '\r\n'
169 quoting = QUOTE_MINIMAL
170 # escapechar = ''
171 doublequote = False
173 dialect.delimiter = delimiter
174 # _csv.reader won't accept a quotechar of ''
175 dialect.quotechar = quotechar or '"'
176 dialect.skipinitialspace = skipinitialspace
178 return dialect
181 def _guess_quote_and_delimiter(self, data, delimiters):
183 Looks for text enclosed between two identical quotes
184 (the probable quotechar) which are preceded and followed
185 by the same character (the probable delimiter).
186 For example:
187 ,'some text',
188 The quote with the most wins, same with the delimiter.
189 If there is no quotechar the delimiter can't be determined
190 this way.
193 matches = []
194 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
195 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
196 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
197 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
198 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
199 matches = regexp.findall(data)
200 if matches:
201 break
203 if not matches:
204 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
206 quotes = {}
207 delims = {}
208 spaces = 0
209 for m in matches:
210 n = regexp.groupindex['quote'] - 1
211 key = m[n]
212 if key:
213 quotes[key] = quotes.get(key, 0) + 1
214 try:
215 n = regexp.groupindex['delim'] - 1
216 key = m[n]
217 except KeyError:
218 continue
219 if key and (delimiters is None or key in delimiters):
220 delims[key] = delims.get(key, 0) + 1
221 try:
222 n = regexp.groupindex['space'] - 1
223 except KeyError:
224 continue
225 if m[n]:
226 spaces += 1
228 quotechar = reduce(lambda a, b, quotes = quotes:
229 (quotes[a] > quotes[b]) and a or b, quotes.keys())
231 if delims:
232 delim = reduce(lambda a, b, delims = delims:
233 (delims[a] > delims[b]) and a or b, delims.keys())
234 skipinitialspace = delims[delim] == spaces
235 if delim == '\n': # most likely a file with a single column
236 delim = ''
237 else:
238 # there is *no* delimiter, it's a single column of quoted data
239 delim = ''
240 skipinitialspace = 0
242 return (quotechar, delim, skipinitialspace)
245 def _guess_delimiter(self, data, delimiters):
247 The delimiter /should/ occur the same number of times on
248 each row. However, due to malformed data, it may not. We don't want
249 an all or nothing approach, so we allow for small variations in this
250 number.
251 1) build a table of the frequency of each character on every line.
252 2) build a table of freqencies of this frequency (meta-frequency?),
253 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
254 7 times in 2 rows'
255 3) use the mode of the meta-frequency to determine the /expected/
256 frequency for that character
257 4) find out how often the character actually meets that goal
258 5) the character that best meets its goal is the delimiter
259 For performance reasons, the data is evaluated in chunks, so it can
260 try and evaluate the smallest portion of the data possible, evaluating
261 additional chunks as necessary.
264 data = filter(None, data.split('\n'))
266 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
268 # build frequency tables
269 chunkLength = min(10, len(data))
270 iteration = 0
271 charFrequency = {}
272 modes = {}
273 delims = {}
274 start, end = 0, min(chunkLength, len(data))
275 while start < len(data):
276 iteration += 1
277 for line in data[start:end]:
278 for char in ascii:
279 metaFrequency = charFrequency.get(char, {})
280 # must count even if frequency is 0
281 freq = line.count(char)
282 # value is the mode
283 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
284 charFrequency[char] = metaFrequency
286 for char in charFrequency.keys():
287 items = charFrequency[char].items()
288 if len(items) == 1 and items[0][0] == 0:
289 continue
290 # get the mode of the frequencies
291 if len(items) > 1:
292 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
293 items)
294 # adjust the mode - subtract the sum of all
295 # other frequencies
296 items.remove(modes[char])
297 modes[char] = (modes[char][0], modes[char][1]
298 - reduce(lambda a, b: (0, a[1] + b[1]),
299 items)[1])
300 else:
301 modes[char] = items[0]
303 # build a list of possible delimiters
304 modeList = modes.items()
305 total = float(chunkLength * iteration)
306 # (rows of consistent data) / (number of rows) = 100%
307 consistency = 1.0
308 # minimum consistency threshold
309 threshold = 0.9
310 while len(delims) == 0 and consistency >= threshold:
311 for k, v in modeList:
312 if v[0] > 0 and v[1] > 0:
313 if ((v[1]/total) >= consistency and
314 (delimiters is None or k in delimiters)):
315 delims[k] = v
316 consistency -= 0.01
318 if len(delims) == 1:
319 delim = delims.keys()[0]
320 skipinitialspace = (data[0].count(delim) ==
321 data[0].count("%c " % delim))
322 return (delim, skipinitialspace)
324 # analyze another chunkLength lines
325 start = end
326 end += chunkLength
328 if not delims:
329 return ('', 0)
331 # if there's more than one, fall back to a 'preferred' list
332 if len(delims) > 1:
333 for d in self.preferred:
334 if d in delims.keys():
335 skipinitialspace = (data[0].count(d) ==
336 data[0].count("%c " % d))
337 return (d, skipinitialspace)
339 # nothing else indicates a preference, pick the character that
340 # dominates(?)
341 items = [(v,k) for (k,v) in delims.items()]
342 items.sort()
343 delim = items[-1][1]
345 skipinitialspace = (data[0].count(delim) ==
346 data[0].count("%c " % delim))
347 return (delim, skipinitialspace)
350 def has_header(self, sample):
351 # Creates a dictionary of types of data in each column. If any
352 # column is of a single type (say, integers), *except* for the first
353 # row, then the first row is presumed to be labels. If the type
354 # can't be determined, it is assumed to be a string in which case
355 # the length of the string is the determining factor: if all of the
356 # rows except for the first are the same length, it's a header.
357 # Finally, a 'vote' is taken at the end for each column, adding or
358 # subtracting from the likelihood of the first row being a header.
360 rdr = reader(StringIO(sample), self.sniff(sample))
362 header = rdr.next() # assume first row is header
364 columns = len(header)
365 columnTypes = {}
366 for i in range(columns): columnTypes[i] = None
368 checked = 0
369 for row in rdr:
370 # arbitrary number of rows to check, to keep it sane
371 if checked > 20:
372 break
373 checked += 1
375 if len(row) != columns:
376 continue # skip rows that have irregular number of columns
378 for col in columnTypes.keys():
380 for thisType in [int, long, float, complex]:
381 try:
382 thisType(row[col])
383 break
384 except (ValueError, OverflowError):
385 pass
386 else:
387 # fallback to length of string
388 thisType = len(row[col])
390 # treat longs as ints
391 if thisType == long:
392 thisType = int
394 if thisType != columnTypes[col]:
395 if columnTypes[col] is None: # add new column type
396 columnTypes[col] = thisType
397 else:
398 # type is inconsistent, remove column from
399 # consideration
400 del columnTypes[col]
402 # finally, compare results against first row and "vote"
403 # on whether it's a header
404 hasHeader = 0
405 for col, colType in columnTypes.items():
406 if type(colType) == type(0): # it's a length
407 if len(header[col]) != colType:
408 hasHeader += 1
409 else:
410 hasHeader -= 1
411 else: # attempt typecast
412 try:
413 colType(header[col])
414 except (ValueError, TypeError):
415 hasHeader += 1
416 else:
417 hasHeader -= 1
419 return hasHeader > 0