libgo/go/encoding/csv/reader.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package csv reads and writes comma-separated values (CSV) files.
   6 // There are many kinds of CSV files; this package supports the format
   7 // described in RFC 4180.
   8 //
   9 // A csv file contains zero or more records of one or more fields per record.
  10 // Each record is separated by the newline character. The final record may
  11 // optionally be followed by a newline character.
  12 //
  13 //      field1,field2,field3
  14 //
  15 // White space is considered part of a field.
  16 //
  17 // Carriage returns before newline characters are silently removed.
  18 //
  19 // Blank lines are ignored. A line with only whitespace characters (excluding
  20 // the ending newline character) is not considered a blank line.
  21 //
  22 // Fields which start and stop with the quote character " are called
  23 // quoted-fields. The beginning and ending quote are not part of the
  24 // field.
  25 //
  26 // The source:
  27 //
  28 //      normal string,"quoted-field"
  29 //
  30 // results in the fields
  31 //
  32 //      {`normal string`, `quoted-field`}
  33 //
  34 // Within a quoted-field a quote character followed by a second quote
  35 // character is considered a single quote.
  36 //
  37 //      "the ""word"" is true","a ""quoted-field"""
  38 //
  39 // results in
  40 //
  41 //      {`the "word" is true`, `a "quoted-field"`}
  42 //
  43 // Newlines and commas may be included in a quoted-field
  44 //
  45 //      "Multi-line
  46 //      field","comma is ,"
  47 //
  48 // results in
  49 //
  50 //      {`Multi-line
  51 //      field`, `comma is ,`}
  52 package csv
  53
  54 import (
  55         "bufio"
  56         "bytes"
  57         "errors"
  58         "fmt"
  59         "io"
  60         "unicode"
  61 )
  62
// A ParseError is returned for parsing errors.
// Line numbers start at 1; column numbers start at 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}
  70
  71 func (e *ParseError) Error() string {
  72         return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
  73 }
  74
// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)
  82
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	Comma rune
	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	Comment rune
	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int
	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes    bool
	TrailingComma bool // ignored; here for backwards compatibility
	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool
	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	// line is the current input line; it is incremented at the start of
	// every record and for each newline inside a quoted field.
	line int
	// column is the rune index within the current line; it is reset to -1
	// at the start of a line and incremented by readRune.
	column int
	// r is the buffered source that all runes are read from.
	r *bufio.Reader
	// lineBuffer holds the unescaped fields read by readField, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
	// fieldIndexes will contain the indexes 0, 1, 2, 5.
	lineBuffer bytes.Buffer
	// Indexes of fields inside lineBuffer
	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
	fieldIndexes []int

	// only used when ReuseRecord == true
	lastRecord []string
}
 133
 134 // NewReader returns a new Reader that reads from r.
 135 func NewReader(r io.Reader) *Reader {
 136         return &Reader{
 137                 Comma: ',',
 138                 r:     bufio.NewReader(r),
 139         }
 140 }
 141
 142 // error creates a new ParseError based on err.
 143 func (r *Reader) error(err error) error {
 144         return &ParseError{
 145                 Line:   r.line,
 146                 Column: r.column,
 147                 Err:    err,
 148         }
 149 }
 150
 151 // Read reads one record (a slice of fields) from r.
 152 // If the record has an unexpected number of fields,
 153 // Read returns the record along with the error ErrFieldCount.
 154 // Except for that case, Read always returns either a non-nil
 155 // record or a non-nil error, but not both.
 156 // If there is no data left to be read, Read returns nil, io.EOF.
 157 // If ReuseRecord is true, the returned slice may be shared
 158 // between multiple calls to Read.
 159 func (r *Reader) Read() (record []string, err error) {
 160         if r.ReuseRecord {
 161                 record, err = r.readRecord(r.lastRecord)
 162                 r.lastRecord = record
 163         } else {
 164                 record, err = r.readRecord(nil)
 165         }
 166
 167         return record, err
 168 }
 169
 170 // ReadAll reads all the remaining records from r.
 171 // Each record is a slice of fields.
 172 // A successful call returns err == nil, not err == io.EOF. Because ReadAll is
 173 // defined to read until EOF, it does not treat end of file as an error to be
 174 // reported.
 175 func (r *Reader) ReadAll() (records [][]string, err error) {
 176         for {
 177                 record, err := r.readRecord(nil)
 178                 if err == io.EOF {
 179                         return records, nil
 180                 }
 181                 if err != nil {
 182                         return nil, err
 183                 }
 184                 records = append(records, record)
 185         }
 186 }
 187
 188 // readRecord reads and parses a single csv record from r.
 189 // Unlike parseRecord, readRecord handles FieldsPerRecord.
 190 // If dst has enough capacity it will be used for the returned record.
 191 func (r *Reader) readRecord(dst []string) (record []string, err error) {
 192         for {
 193                 record, err = r.parseRecord(dst)
 194                 if record != nil {
 195                         break
 196                 }
 197                 if err != nil {
 198                         return nil, err
 199                 }
 200         }
 201
 202         if r.FieldsPerRecord > 0 {
 203                 if len(record) != r.FieldsPerRecord {
 204                         r.column = 0 // report at start of record
 205                         return record, r.error(ErrFieldCount)
 206                 }
 207         } else if r.FieldsPerRecord == 0 {
 208                 r.FieldsPerRecord = len(record)
 209         }
 210         return record, nil
 211 }
 212
 213 // readRune reads one rune from r, folding \r\n to \n and keeping track
 214 // of how far into the line we have read.  r.column will point to the start
 215 // of this rune, not the end of this rune.
 216 func (r *Reader) readRune() (rune, error) {
 217         r1, _, err := r.r.ReadRune()
 218
 219         // Handle \r\n here. We make the simplifying assumption that
 220         // anytime \r is followed by \n that it can be folded to \n.
 221         // We will not detect files which contain both \r\n and bare \n.
 222         if r1 == '\r' {
 223                 r1, _, err = r.r.ReadRune()
 224                 if err == nil {
 225                         if r1 != '\n' {
 226                                 r.r.UnreadRune()
 227                                 r1 = '\r'
 228                         }
 229                 }
 230         }
 231         r.column++
 232         return r1, err
 233 }
 234
 235 // skip reads runes up to and including the rune delim or until error.
 236 func (r *Reader) skip(delim rune) error {
 237         for {
 238                 r1, err := r.readRune()
 239                 if err != nil {
 240                         return err
 241                 }
 242                 if r1 == delim {
 243                         return nil
 244                 }
 245         }
 246 }
 247
 248 // parseRecord reads and parses a single csv record from r.
 249 // If dst has enough capacity it will be used for the returned fields.
 250 func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
 251         // Each record starts on a new line. We increment our line
 252         // number (lines start at 1, not 0) and set column to -1
 253         // so as we increment in readRune it points to the character we read.
 254         r.line++
 255         r.column = -1
 256
 257         // Peek at the first rune. If it is an error we are done.
 258         // If we support comments and it is the comment character
 259         // then skip to the end of line.
 260
 261         r1, _, err := r.r.ReadRune()
 262         if err != nil {
 263                 return nil, err
 264         }
 265
 266         if r.Comment != 0 && r1 == r.Comment {
 267                 return nil, r.skip('\n')
 268         }
 269         r.r.UnreadRune()
 270
 271         r.lineBuffer.Reset()
 272         r.fieldIndexes = r.fieldIndexes[:0]
 273
 274         // At this point we have at least one field.
 275         for {
 276                 idx := r.lineBuffer.Len()
 277
 278                 haveField, delim, err := r.parseField()
 279                 if haveField {
 280                         r.fieldIndexes = append(r.fieldIndexes, idx)
 281                 }
 282
 283                 if delim == '\n' || err == io.EOF {
 284                         if len(r.fieldIndexes) == 0 {
 285                                 return nil, err
 286                         }
 287                         break
 288                 }
 289
 290                 if err != nil {
 291                         return nil, err
 292                 }
 293         }
 294
 295         fieldCount := len(r.fieldIndexes)
 296         // Using this approach (creating a single string and taking slices of it)
 297         // means that a single reference to any of the fields will retain the whole
 298         // string. The risk of a nontrivial space leak caused by this is considered
 299         // minimal and a tradeoff for better performance through the combined
 300         // allocations.
 301         line := r.lineBuffer.String()
 302
 303         if cap(dst) >= fieldCount {
 304                 fields = dst[:fieldCount]
 305         } else {
 306                 fields = make([]string, fieldCount)
 307         }
 308
 309         for i, idx := range r.fieldIndexes {
 310                 if i == fieldCount-1 {
 311                         fields[i] = line[idx:]
 312                 } else {
 313                         fields[i] = line[idx:r.fieldIndexes[i+1]]
 314                 }
 315         }
 316
 317         return fields, nil
 318 }
 319
 320 // parseField parses the next field in the record. The read field is
 321 // appended to r.lineBuffer. Delim is the first character not part of the field
 322 // (r.Comma or '\n').
 323 func (r *Reader) parseField() (haveField bool, delim rune, err error) {
 324         r1, err := r.readRune()
 325         for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
 326                 r1, err = r.readRune()
 327         }
 328
 329         if err == io.EOF && r.column != 0 {
 330                 return true, 0, err
 331         }
 332         if err != nil {
 333                 return false, 0, err
 334         }
 335
 336         switch r1 {
 337         case r.Comma:
 338                 // will check below
 339
 340         case '\n':
 341                 // We are a trailing empty field or a blank line
 342                 if r.column == 0 {
 343                         return false, r1, nil
 344                 }
 345                 return true, r1, nil
 346
 347         case '"':
 348                 // quoted field
 349         Quoted:
 350                 for {
 351                         r1, err = r.readRune()
 352                         if err != nil {
 353                                 if err == io.EOF {
 354                                         if r.LazyQuotes {
 355                                                 return true, 0, err
 356                                         }
 357                                         return false, 0, r.error(ErrQuote)
 358                                 }
 359                                 return false, 0, err
 360                         }
 361                         switch r1 {
 362                         case '"':
 363                                 r1, err = r.readRune()
 364                                 if err != nil || r1 == r.Comma {
 365                                         break Quoted
 366                                 }
 367                                 if r1 == '\n' {
 368                                         return true, r1, nil
 369                                 }
 370                                 if r1 != '"' {
 371                                         if !r.LazyQuotes {
 372                                                 r.column--
 373                                                 return false, 0, r.error(ErrQuote)
 374                                         }
 375                                         // accept the bare quote
 376                                         r.lineBuffer.WriteRune('"')
 377                                 }
 378                         case '\n':
 379                                 r.line++
 380                                 r.column = -1
 381                         }
 382                         r.lineBuffer.WriteRune(r1)
 383                 }
 384
 385         default:
 386                 // unquoted field
 387                 for {
 388                         r.lineBuffer.WriteRune(r1)
 389                         r1, err = r.readRune()
 390                         if err != nil || r1 == r.Comma {
 391                                 break
 392                         }
 393                         if r1 == '\n' {
 394                                 return true, r1, nil
 395                         }
 396                         if !r.LazyQuotes && r1 == '"' {
 397                                 return false, 0, r.error(ErrBareQuote)
 398                         }
 399                 }
 400         }
 401
 402         if err != nil {
 403                 if err == io.EOF {
 404                         return true, 0, err
 405                 }
 406                 return false, 0, err
 407         }
 408
 409         return true, r1, nil
 410 }