libgo: update to go1.9
[official-gcc.git] / libgo / go / encoding / csv / reader.go
bloba3497c84f9604f872166ff053f696fc75fd0407e
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package csv reads and writes comma-separated values (CSV) files.
6 // There are many kinds of CSV files; this package supports the format
7 // described in RFC 4180.
8 //
9 // A csv file contains zero or more records of one or more fields per record.
10 // Each record is separated by the newline character. The final record may
11 // optionally be followed by a newline character.
13 // field1,field2,field3
15 // White space is considered part of a field.
17 // Carriage returns before newline characters are silently removed.
19 // Blank lines are ignored. A line with only whitespace characters (excluding
20 // the ending newline character) is not considered a blank line.
22 // Fields which start and stop with the quote character " are called
23 // quoted-fields. The beginning and ending quote are not part of the
24 // field.
26 // The source:
28 // normal string,"quoted-field"
30 // results in the fields
32 // {`normal string`, `quoted-field`}
34 // Within a quoted-field a quote character followed by a second quote
35 // character is considered a single quote.
37 // "the ""word"" is true","a ""quoted-field"""
39 // results in
41 // {`the "word" is true`, `a "quoted-field"`}
43 // Newlines and commas may be included in a quoted-field
45 // "Multi-line
46 // field","comma is ,"
48 // results in
50 // {`Multi-line
51 // field`, `comma is ,`}
52 package csv
54 import (
55 "bufio"
56 "bytes"
57 "errors"
58 "fmt"
59 "io"
60 "unicode"
// A ParseError is returned for parsing errors.
// Line numbers start at 1; column numbers start at 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error implements the error interface. The message has the form
// "line L, column C: <underlying error>".
//
// The underlying error is formatted with %v rather than %s so that a
// ParseError whose Err is nil renders as "<nil>" instead of the
// "%!s(<nil>)" bad-verb marker. For a non-nil Err both verbs call
// Err.Error(), so the message is unchanged.
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %v", e.Line, e.Column, e.Err)
}
// These are the errors that can be found in the Err field of a ParseError.
var (
	// ErrTrailingComma is retained only so existing references keep compiling.
	//
	// Deprecated: No longer used.
	ErrTrailingComma = errors.New("extra delimiter at end of line")

	// ErrBareQuote reports a quote character inside an unquoted field
	// while LazyQuotes is false.
	ErrBareQuote = errors.New("bare \" in non-quoted-field")

	// ErrQuote reports a quoted field that is unterminated, or that has a
	// non-doubled quote in it, while LazyQuotes is false.
	ErrQuote = errors.New("extraneous \" in field")

	// ErrFieldCount reports a record whose number of fields differs from
	// FieldsPerRecord.
	ErrFieldCount = errors.New("wrong number of fields in line")
)
// A Reader reads records from a CSV-encoded file.
//
// A Reader obtained from NewReader expects RFC 4180 input. Adjust the
// exported fields, if needed, before the first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter. NewReader sets it to ','.
	Comma rune

	// Comment, when non-zero, is the comment character. A line whose
	// very first character is Comment is skipped. If whitespace comes
	// first, the Comment character is treated as field data, even when
	// TrimLeadingSpace is true.
	Comment rune

	// FieldsPerRecord is the expected number of fields in each record.
	//   > 0: every record must have exactly this many fields.
	//   = 0: it is set from the first record read, so all later
	//        records must match that record's field count.
	//   < 0: no check is made; records may vary in length.
	FieldsPerRecord int

	// LazyQuotes, when true, permits a quote inside an unquoted field
	// and a non-doubled quote inside a quoted field.
	LazyQuotes bool

	TrailingComma bool // ignored; here for backwards compatibility

	// TrimLeadingSpace, when true, discards leading white space in a
	// field. This applies even if Comma itself is white space.
	TrimLeadingSpace bool

	// ReuseRecord lets Read return a slice that shares its backing
	// array with the slice returned by the previous Read, for
	// performance. When false (the default) every Read returns freshly
	// allocated memory owned by the caller.
	ReuseRecord bool

	line   int // current line number; parseRecord increments it before each line
	column int // position of the most recently read rune within the line
	r      *bufio.Reader

	// lineBuffer accumulates the unescaped text of the fields produced
	// by parseField, back to back with no separators. fieldIndexes
	// records where each field begins.
	// Example: the row `a,"b","c""d",e` yields lineBuffer `abc"de` and
	// fieldIndexes 0, 1, 2, 5.
	lineBuffer bytes.Buffer

	// fieldIndexes[i] is the offset in lineBuffer at which field i starts.
	fieldIndexes []int

	// lastRecord is the slice handed out by the previous Read; it is
	// only used when ReuseRecord == true.
	lastRecord []string
}
134 // NewReader returns a new Reader that reads from r.
135 func NewReader(r io.Reader) *Reader {
136 return &Reader{
137 Comma: ',',
138 r: bufio.NewReader(r),
142 // error creates a new ParseError based on err.
143 func (r *Reader) error(err error) error {
144 return &ParseError{
145 Line: r.line,
146 Column: r.column,
147 Err: err,
151 // Read reads one record (a slice of fields) from r.
152 // If the record has an unexpected number of fields,
153 // Read returns the record along with the error ErrFieldCount.
154 // Except for that case, Read always returns either a non-nil
155 // record or a non-nil error, but not both.
156 // If there is no data left to be read, Read returns nil, io.EOF.
157 // If ReuseRecord is true, the returned slice may be shared
158 // between multiple calls to Read.
159 func (r *Reader) Read() (record []string, err error) {
160 if r.ReuseRecord {
161 record, err = r.readRecord(r.lastRecord)
162 r.lastRecord = record
163 } else {
164 record, err = r.readRecord(nil)
167 return record, err
170 // ReadAll reads all the remaining records from r.
171 // Each record is a slice of fields.
172 // A successful call returns err == nil, not err == io.EOF. Because ReadAll is
173 // defined to read until EOF, it does not treat end of file as an error to be
174 // reported.
175 func (r *Reader) ReadAll() (records [][]string, err error) {
176 for {
177 record, err := r.readRecord(nil)
178 if err == io.EOF {
179 return records, nil
181 if err != nil {
182 return nil, err
184 records = append(records, record)
188 // readRecord reads and parses a single csv record from r.
189 // Unlike parseRecord, readRecord handles FieldsPerRecord.
190 // If dst has enough capacity it will be used for the returned record.
191 func (r *Reader) readRecord(dst []string) (record []string, err error) {
192 for {
193 record, err = r.parseRecord(dst)
194 if record != nil {
195 break
197 if err != nil {
198 return nil, err
202 if r.FieldsPerRecord > 0 {
203 if len(record) != r.FieldsPerRecord {
204 r.column = 0 // report at start of record
205 return record, r.error(ErrFieldCount)
207 } else if r.FieldsPerRecord == 0 {
208 r.FieldsPerRecord = len(record)
210 return record, nil
213 // readRune reads one rune from r, folding \r\n to \n and keeping track
214 // of how far into the line we have read. r.column will point to the start
215 // of this rune, not the end of this rune.
216 func (r *Reader) readRune() (rune, error) {
217 r1, _, err := r.r.ReadRune()
219 // Handle \r\n here. We make the simplifying assumption that
220 // anytime \r is followed by \n that it can be folded to \n.
221 // We will not detect files which contain both \r\n and bare \n.
222 if r1 == '\r' {
223 r1, _, err = r.r.ReadRune()
224 if err == nil {
225 if r1 != '\n' {
226 r.r.UnreadRune()
227 r1 = '\r'
231 r.column++
232 return r1, err
235 // skip reads runes up to and including the rune delim or until error.
236 func (r *Reader) skip(delim rune) error {
237 for {
238 r1, err := r.readRune()
239 if err != nil {
240 return err
242 if r1 == delim {
243 return nil
248 // parseRecord reads and parses a single csv record from r.
249 // If dst has enough capacity it will be used for the returned fields.
250 func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
251 // Each record starts on a new line. We increment our line
252 // number (lines start at 1, not 0) and set column to -1
253 // so as we increment in readRune it points to the character we read.
254 r.line++
255 r.column = -1
257 // Peek at the first rune. If it is an error we are done.
258 // If we support comments and it is the comment character
259 // then skip to the end of line.
261 r1, _, err := r.r.ReadRune()
262 if err != nil {
263 return nil, err
266 if r.Comment != 0 && r1 == r.Comment {
267 return nil, r.skip('\n')
269 r.r.UnreadRune()
271 r.lineBuffer.Reset()
272 r.fieldIndexes = r.fieldIndexes[:0]
274 // At this point we have at least one field.
275 for {
276 idx := r.lineBuffer.Len()
278 haveField, delim, err := r.parseField()
279 if haveField {
280 r.fieldIndexes = append(r.fieldIndexes, idx)
283 if delim == '\n' || err == io.EOF {
284 if len(r.fieldIndexes) == 0 {
285 return nil, err
287 break
290 if err != nil {
291 return nil, err
295 fieldCount := len(r.fieldIndexes)
296 // Using this approach (creating a single string and taking slices of it)
297 // means that a single reference to any of the fields will retain the whole
298 // string. The risk of a nontrivial space leak caused by this is considered
299 // minimal and a tradeoff for better performance through the combined
300 // allocations.
301 line := r.lineBuffer.String()
303 if cap(dst) >= fieldCount {
304 fields = dst[:fieldCount]
305 } else {
306 fields = make([]string, fieldCount)
309 for i, idx := range r.fieldIndexes {
310 if i == fieldCount-1 {
311 fields[i] = line[idx:]
312 } else {
313 fields[i] = line[idx:r.fieldIndexes[i+1]]
317 return fields, nil
320 // parseField parses the next field in the record. The read field is
321 // appended to r.lineBuffer. Delim is the first character not part of the field
322 // (r.Comma or '\n').
323 func (r *Reader) parseField() (haveField bool, delim rune, err error) {
324 r1, err := r.readRune()
325 for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
326 r1, err = r.readRune()
329 if err == io.EOF && r.column != 0 {
330 return true, 0, err
332 if err != nil {
333 return false, 0, err
336 switch r1 {
337 case r.Comma:
338 // will check below
340 case '\n':
341 // We are a trailing empty field or a blank line
342 if r.column == 0 {
343 return false, r1, nil
345 return true, r1, nil
347 case '"':
348 // quoted field
349 Quoted:
350 for {
351 r1, err = r.readRune()
352 if err != nil {
353 if err == io.EOF {
354 if r.LazyQuotes {
355 return true, 0, err
357 return false, 0, r.error(ErrQuote)
359 return false, 0, err
361 switch r1 {
362 case '"':
363 r1, err = r.readRune()
364 if err != nil || r1 == r.Comma {
365 break Quoted
367 if r1 == '\n' {
368 return true, r1, nil
370 if r1 != '"' {
371 if !r.LazyQuotes {
372 r.column--
373 return false, 0, r.error(ErrQuote)
375 // accept the bare quote
376 r.lineBuffer.WriteRune('"')
378 case '\n':
379 r.line++
380 r.column = -1
382 r.lineBuffer.WriteRune(r1)
385 default:
386 // unquoted field
387 for {
388 r.lineBuffer.WriteRune(r1)
389 r1, err = r.readRune()
390 if err != nil || r1 == r.Comma {
391 break
393 if r1 == '\n' {
394 return true, r1, nil
396 if !r.LazyQuotes && r1 == '"' {
397 return false, 0, r.error(ErrBareQuote)
402 if err != nil {
403 if err == io.EOF {
404 return true, 0, err
406 return false, 0, err
409 return true, r1, nil