Lib/csv.py

   1
   2 """
   3 csv.py - read/write/investigate CSV files
   4 """
   5
   6 import re
   7 from _csv import Error, __version__, writer, reader, register_dialect, \
   8                  unregister_dialect, get_dialect, list_dialects, \
   9                  field_size_limit, \
  10                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
  11                  __doc__
  12 from _csv import Dialect as _Dialect
  13
  14 try:
  15     from cStringIO import StringIO
  16 except ImportError:
  17     from StringIO import StringIO
  18
  19 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
  20             "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
  21             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
  22             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
  23
  24 class Dialect:
  25     """Describe an Excel dialect.
  26
  27     This must be subclassed (see csv.excel).  Valid attributes are:
  28     delimiter, quotechar, escapechar, doublequote, skipinitialspace,
  29     lineterminator, quoting.
  30
  31     """
  32     _name = ""
  33     _valid = False
  34     # placeholders
  35     delimiter = None
  36     quotechar = None
  37     escapechar = None
  38     doublequote = None
  39     skipinitialspace = None
  40     lineterminator = None
  41     quoting = None
  42
  43     def __init__(self):
  44         if self.__class__ != Dialect:
  45             self._valid = True
  46         self._validate()
  47
  48     def _validate(self):
  49         try:
  50             _Dialect(self)
  51         except TypeError, e:
  52             # We do this for compatibility with py2.3
  53             raise Error(str(e))
  54
  55 class excel(Dialect):
  56     """Describe the usual properties of Excel-generated CSV files."""
  57     delimiter = ','
  58     quotechar = '"'
  59     doublequote = True
  60     skipinitialspace = False
  61     lineterminator = '\r\n'
  62     quoting = QUOTE_MINIMAL
  63 register_dialect("excel", excel)
  64
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files.

    Identical to the excel dialect except that fields are separated by
    a TAB character instead of a comma.
    """
    delimiter = '\t'
# Make the dialect available by name under "excel-tab".
register_dialect("excel-tab", excel_tab)
  69
  70
  71 class DictReader:
  72     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
  73                  dialect="excel", *args, **kwds):
  74         self.fieldnames = fieldnames    # list of keys for the dict
  75         self.restkey = restkey          # key to catch long rows
  76         self.restval = restval          # default value for short rows
  77         self.reader = reader(f, dialect, *args, **kwds)
  78
  79     def __iter__(self):
  80         return self
  81
  82     def next(self):
  83         row = self.reader.next()
  84         if self.fieldnames is None:
  85             self.fieldnames = row
  86             row = self.reader.next()
  87
  88         # unlike the basic reader, we prefer not to return blanks,
  89         # because we will typically wind up with a dict full of None
  90         # values
  91         while row == []:
  92             row = self.reader.next()
  93         d = dict(zip(self.fieldnames, row))
  94         lf = len(self.fieldnames)
  95         lr = len(row)
  96         if lf < lr:
  97             d[self.restkey] = row[lf:]
  98         elif lf > lr:
  99             for key in self.fieldnames[lr:]:
 100                 d[key] = self.restval
 101         return d
 102
 103
 104 class DictWriter:
 105     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
 106                  dialect="excel", *args, **kwds):
 107         self.fieldnames = fieldnames    # list of keys for the dict
 108         self.restval = restval          # for writing short dicts
 109         if extrasaction.lower() not in ("raise", "ignore"):
 110             raise ValueError, \
 111                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
 112                    extrasaction)
 113         self.extrasaction = extrasaction
 114         self.writer = writer(f, dialect, *args, **kwds)
 115
 116     def _dict_to_list(self, rowdict):
 117         if self.extrasaction == "raise":
 118             wrong_fields = [k for k in rowdict if k not in self.fieldnames]
 119             if wrong_fields:
 120                 raise ValueError("dict contains fields not in fieldnames: " +
 121                                  ", ".join(wrong_fields))
 122         return [rowdict.get(key, self.restval) for key in self.fieldnames]
 123
 124     def writerow(self, rowdict):
 125         return self.writer.writerow(self._dict_to_list(rowdict))
 126
 127     def writerows(self, rowdicts):
 128         rows = []
 129         for rowdict in rowdicts:
 130             rows.append(self._dict_to_list(rowdict))
 131         return self.writer.writerows(rows)
 132
# Guard Sniffer's type checking against builds that exclude complex():
# merely evaluating the name raises NameError when the builtin is
# missing, in which case float is bound to the name instead so that
# Sniffer.has_header can still list it among the types it tries.
try:
    complex
except NameError:
    complex = float
 138
 139 class Sniffer:
 140     '''
 141     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
 142     Returns a Dialect object.
 143     '''
 144     def __init__(self):
 145         # in case there is more than one possible delimiter
 146         self.preferred = [',', '\t', ';', ' ', ':']
 147
 148
 149     def sniff(self, sample, delimiters=None):
 150         """
 151         Returns a dialect (or None) corresponding to the sample
 152         """
 153
 154         quotechar, delimiter, skipinitialspace = \
 155                    self._guess_quote_and_delimiter(sample, delimiters)
 156         if not delimiter:
 157             delimiter, skipinitialspace = self._guess_delimiter(sample,
 158                                                                 delimiters)
 159
 160         if not delimiter:
 161             raise Error, "Could not determine delimiter"
 162
 163         class dialect(Dialect):
 164             _name = "sniffed"
 165             lineterminator = '\r\n'
 166             quoting = QUOTE_MINIMAL
 167             # escapechar = ''
 168             doublequote = False
 169
 170         dialect.delimiter = delimiter
 171         # _csv.reader won't accept a quotechar of ''
 172         dialect.quotechar = quotechar or '"'
 173         dialect.skipinitialspace = skipinitialspace
 174
 175         return dialect
 176
 177
 178     def _guess_quote_and_delimiter(self, data, delimiters):
 179         """
 180         Looks for text enclosed between two identical quotes
 181         (the probable quotechar) which are preceded and followed
 182         by the same character (the probable delimiter).
 183         For example:
 184                          ,'some text',
 185         The quote with the most wins, same with the delimiter.
 186         If there is no quotechar the delimiter can't be determined
 187         this way.
 188         """
 189
 190         matches = []
 191         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
 192                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
 193                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
 194                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
 195             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
 196             matches = regexp.findall(data)
 197             if matches:
 198                 break
 199
 200         if not matches:
 201             return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
 202
 203         quotes = {}
 204         delims = {}
 205         spaces = 0
 206         for m in matches:
 207             n = regexp.groupindex['quote'] - 1
 208             key = m[n]
 209             if key:
 210                 quotes[key] = quotes.get(key, 0) + 1
 211             try:
 212                 n = regexp.groupindex['delim'] - 1
 213                 key = m[n]
 214             except KeyError:
 215                 continue
 216             if key and (delimiters is None or key in delimiters):
 217                 delims[key] = delims.get(key, 0) + 1
 218             try:
 219                 n = regexp.groupindex['space'] - 1
 220             except KeyError:
 221                 continue
 222             if m[n]:
 223                 spaces += 1
 224
 225         quotechar = reduce(lambda a, b, quotes = quotes:
 226                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
 227
 228         if delims:
 229             delim = reduce(lambda a, b, delims = delims:
 230                            (delims[a] > delims[b]) and a or b, delims.keys())
 231             skipinitialspace = delims[delim] == spaces
 232             if delim == '\n': # most likely a file with a single column
 233                 delim = ''
 234         else:
 235             # there is *no* delimiter, it's a single column of quoted data
 236             delim = ''
 237             skipinitialspace = 0
 238
 239         return (quotechar, delim, skipinitialspace)
 240
 241
 242     def _guess_delimiter(self, data, delimiters):
 243         """
 244         The delimiter /should/ occur the same number of times on
 245         each row. However, due to malformed data, it may not. We don't want
 246         an all or nothing approach, so we allow for small variations in this
 247         number.
 248           1) build a table of the frequency of each character on every line.
 249           2) build a table of freqencies of this frequency (meta-frequency?),
 250              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
 251              7 times in 2 rows'
 252           3) use the mode of the meta-frequency to determine the /expected/
 253              frequency for that character
 254           4) find out how often the character actually meets that goal
 255           5) the character that best meets its goal is the delimiter
 256         For performance reasons, the data is evaluated in chunks, so it can
 257         try and evaluate the smallest portion of the data possible, evaluating
 258         additional chunks as necessary.
 259         """
 260
 261         data = filter(None, data.split('\n'))
 262
 263         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
 264
 265         # build frequency tables
 266         chunkLength = min(10, len(data))
 267         iteration = 0
 268         charFrequency = {}
 269         modes = {}
 270         delims = {}
 271         start, end = 0, min(chunkLength, len(data))
 272         while start < len(data):
 273             iteration += 1
 274             for line in data[start:end]:
 275                 for char in ascii:
 276                     metaFrequency = charFrequency.get(char, {})
 277                     # must count even if frequency is 0
 278                     freq = line.count(char)
 279                     # value is the mode
 280                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
 281                     charFrequency[char] = metaFrequency
 282
 283             for char in charFrequency.keys():
 284                 items = charFrequency[char].items()
 285                 if len(items) == 1 and items[0][0] == 0:
 286                     continue
 287                 # get the mode of the frequencies
 288                 if len(items) > 1:
 289                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
 290                                          items)
 291                     # adjust the mode - subtract the sum of all
 292                     # other frequencies
 293                     items.remove(modes[char])
 294                     modes[char] = (modes[char][0], modes[char][1]
 295                                    - reduce(lambda a, b: (0, a[1] + b[1]),
 296                                             items)[1])
 297                 else:
 298                     modes[char] = items[0]
 299
 300             # build a list of possible delimiters
 301             modeList = modes.items()
 302             total = float(chunkLength * iteration)
 303             # (rows of consistent data) / (number of rows) = 100%
 304             consistency = 1.0
 305             # minimum consistency threshold
 306             threshold = 0.9
 307             while len(delims) == 0 and consistency >= threshold:
 308                 for k, v in modeList:
 309                     if v[0] > 0 and v[1] > 0:
 310                         if ((v[1]/total) >= consistency and
 311                             (delimiters is None or k in delimiters)):
 312                             delims[k] = v
 313                 consistency -= 0.01
 314
 315             if len(delims) == 1:
 316                 delim = delims.keys()[0]
 317                 skipinitialspace = (data[0].count(delim) ==
 318                                     data[0].count("%c " % delim))
 319                 return (delim, skipinitialspace)
 320
 321             # analyze another chunkLength lines
 322             start = end
 323             end += chunkLength
 324
 325         if not delims:
 326             return ('', 0)
 327
 328         # if there's more than one, fall back to a 'preferred' list
 329         if len(delims) > 1:
 330             for d in self.preferred:
 331                 if d in delims.keys():
 332                     skipinitialspace = (data[0].count(d) ==
 333                                         data[0].count("%c " % d))
 334                     return (d, skipinitialspace)
 335
 336         # nothing else indicates a preference, pick the character that
 337         # dominates(?)
 338         items = [(v,k) for (k,v) in delims.items()]
 339         items.sort()
 340         delim = items[-1][1]
 341
 342         skipinitialspace = (data[0].count(delim) ==
 343                             data[0].count("%c " % delim))
 344         return (delim, skipinitialspace)
 345
 346
 347     def has_header(self, sample):
 348         # Creates a dictionary of types of data in each column. If any
 349         # column is of a single type (say, integers), *except* for the first
 350         # row, then the first row is presumed to be labels. If the type
 351         # can't be determined, it is assumed to be a string in which case
 352         # the length of the string is the determining factor: if all of the
 353         # rows except for the first are the same length, it's a header.
 354         # Finally, a 'vote' is taken at the end for each column, adding or
 355         # subtracting from the likelihood of the first row being a header.
 356
 357         rdr = reader(StringIO(sample), self.sniff(sample))
 358
 359         header = rdr.next() # assume first row is header
 360
 361         columns = len(header)
 362         columnTypes = {}
 363         for i in range(columns): columnTypes[i] = None
 364
 365         checked = 0
 366         for row in rdr:
 367             # arbitrary number of rows to check, to keep it sane
 368             if checked > 20:
 369                 break
 370             checked += 1
 371
 372             if len(row) != columns:
 373                 continue # skip rows that have irregular number of columns
 374
 375             for col in columnTypes.keys():
 376
 377                 for thisType in [int, long, float, complex]:
 378                     try:
 379                         thisType(row[col])
 380                         break
 381                     except (ValueError, OverflowError):
 382                         pass
 383                 else:
 384                     # fallback to length of string
 385                     thisType = len(row[col])
 386
 387                 # treat longs as ints
 388                 if thisType == long:
 389                     thisType = int
 390
 391                 if thisType != columnTypes[col]:
 392                     if columnTypes[col] is None: # add new column type
 393                         columnTypes[col] = thisType
 394                     else:
 395                         # type is inconsistent, remove column from
 396                         # consideration
 397                         del columnTypes[col]
 398
 399         # finally, compare results against first row and "vote"
 400         # on whether it's a header
 401         hasHeader = 0
 402         for col, colType in columnTypes.items():
 403             if type(colType) == type(0): # it's a length
 404                 if len(header[col]) != colType:
 405                     hasHeader += 1
 406                 else:
 407                     hasHeader -= 1
 408             else: # attempt typecast
 409                 try:
 410                     colType(header[col])
 411                 except (ValueError, TypeError):
 412                     hasHeader += 1
 413                 else:
 414                     hasHeader -= 1
 415
 416         return hasHeader > 0