1 /* File I/O for GNU DIFF.
3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4 2004 Free Software Foundation, Inc.
6 This file is part of GNU DIFF.
8 GNU DIFF is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GNU DIFF is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING.
20 If not, write to the Free Software Foundation,
21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
25 #include <file-type.h>
29 /* Rotate an unsigned value to the left. */
30 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
32 /* Given a hash value and a new character, return a new hash value. */
33 #define HASH(h, c) ((c) + ROL (h, 7))
35 /* The type of a hash value. */
36 typedef size_t hash_value
;
37 verify (hash_value_is_unsigned
, ! TYPE_SIGNED (hash_value
));
39 /* Lines are put into equivalence classes of lines that match in lines_differ.
40 Each equivalence class is represented by one of these structures,
41 but only while the classes are being computed.
42 Afterward, each class is represented by a number. */
45 lin next
; /* Next item in this bucket. */
46 hash_value hash
; /* Hash of lines in this class. */
47 char const *line
; /* A line that fits this class. */
48 size_t length
; /* That line's length, not counting its newline. */
51 /* Hash-table: array of buckets, each being a chain of equivalence classes.
52 buckets[-1] is reserved for incomplete lines. */
55 /* Number of buckets in the hash table array, not counting buckets[-1]. */
56 static size_t nbuckets
;
58 /* Array in which the equivalence classes are allocated.
59 The bucket-chains go through the elements in this array.
60 The number of an equivalence class is its index in this array. */
61 static struct equivclass
*equivs
;
63 /* Index of first free element in the array `equivs'. */
64 static lin equivs_index
;
66 /* Number of elements allocated in the array `equivs'. */
67 static lin equivs_alloc
;
69 /* Read a block of data into a file buffer, checking for EOF and error. */
72 file_block_read (struct file_data
*current
, size_t size
)
74 if (size
&& ! current
->eof
)
76 size_t s
= block_read (current
->desc
,
77 FILE_BUFFER (current
) + current
->buffered
, size
);
79 pfatal_with_name (current
->name
);
80 current
->buffered
+= s
;
81 current
->eof
= s
< size
;
85 /* Check for binary files and compare them for exact identity. */
87 /* Return 1 if BUF contains a non text character.
88 SIZE is the number of characters in BUF. */
90 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
92 /* Get ready to read the current file.
93 Return nonzero if SKIP_TEST is zero,
94 and if it appears to be a binary file. */
97 sip (struct file_data
*current
, bool skip_test
)
99 /* If we have a nonexistent file at this stage, treat it as empty. */
100 if (current
->desc
< 0)
102 /* Leave room for a sentinel. */
103 current
->bufsize
= sizeof (word
);
104 current
->buffer
= xmalloc (current
->bufsize
);
108 current
->bufsize
= buffer_lcm (sizeof (word
),
109 STAT_BLOCKSIZE (current
->stat
),
110 PTRDIFF_MAX
- 2 * sizeof (word
));
111 current
->buffer
= xmalloc (current
->bufsize
);
115 /* Check first part of file to see if it's a binary file. */
117 bool was_binary
= set_binary_mode (current
->desc
, true);
119 file_block_read (current
, current
->bufsize
);
120 buffered
= current
->buffered
;
124 /* Revert to text mode and seek back to the beginning to
125 reread the file. Use relative seek, since file
126 descriptors like stdin might not start at offset
129 if (lseek (current
->desc
, - buffered
, SEEK_CUR
) == -1)
130 pfatal_with_name (current
->name
);
131 set_binary_mode (current
->desc
, false);
132 current
->buffered
= 0;
133 current
->eof
= false;
136 return binary_file_p (current
->buffer
, buffered
);
140 current
->buffered
= 0;
141 current
->eof
= false;
145 /* Slurp the rest of the current file completely into memory. */
148 slurp (struct file_data
*current
)
152 if (current
->desc
< 0)
154 /* The file is nonexistent. */
158 if (S_ISREG (current
->stat
.st_mode
))
160 /* It's a regular file; slurp in the rest all at once. */
162 /* Get the size out of the stat block.
163 Allocate just enough room for appended newline plus word sentinel,
164 plus word-alignment since we want the buffer word-aligned. */
165 size_t file_size
= current
->stat
.st_size
;
166 cc
= file_size
+ 2 * sizeof (word
) - file_size
% sizeof (word
);
167 if (file_size
!= current
->stat
.st_size
|| cc
< file_size
168 || PTRDIFF_MAX
<= cc
)
171 if (current
->bufsize
< cc
)
173 current
->bufsize
= cc
;
174 current
->buffer
= xrealloc (current
->buffer
, cc
);
177 /* Try to read at least 1 more byte than the size indicates, to
178 detect whether the file is growing. This is a nicety for
179 users who run 'diff' on files while they are changing. */
181 if (current
->buffered
<= file_size
)
183 file_block_read (current
, file_size
+ 1 - current
->buffered
);
184 if (current
->buffered
<= file_size
)
189 /* It's not a regular file, or it's a growing regular file; read it,
190 growing the buffer as needed. */
192 file_block_read (current
, current
->bufsize
- current
->buffered
);
194 if (current
->buffered
)
196 while (current
->buffered
== current
->bufsize
)
198 if (PTRDIFF_MAX
/ 2 - sizeof (word
) < current
->bufsize
)
200 current
->bufsize
*= 2;
201 current
->buffer
= xrealloc (current
->buffer
, current
->bufsize
);
202 file_block_read (current
, current
->bufsize
- current
->buffered
);
205 /* Allocate just enough room for appended newline plus word
206 sentinel, plus word-alignment. */
207 cc
= current
->buffered
+ 2 * sizeof (word
);
208 current
->bufsize
= cc
- cc
% sizeof (word
);
209 current
->buffer
= xrealloc (current
->buffer
, current
->bufsize
);
213 /* Split the file into lines, simultaneously computing the equivalence
214 class for each line. */
217 find_and_hash_each_line (struct file_data
*current
)
220 char const *p
= current
->prefix_end
;
225 /* Cache often-used quantities in local variables to help the compiler. */
226 char const **linbuf
= current
->linbuf
;
227 lin alloc_lines
= current
->alloc_lines
;
229 lin linbuf_base
= current
->linbuf_base
;
230 lin
*cureqs
= xmalloc (alloc_lines
* sizeof *cureqs
);
231 struct equivclass
*eqs
= equivs
;
232 lin eqs_index
= equivs_index
;
233 lin eqs_alloc
= equivs_alloc
;
234 char const *suffix_begin
= current
->suffix_begin
;
235 char const *bufend
= FILE_BUFFER (current
) + current
->buffered
;
236 bool diff_length_compare_anyway
=
237 ignore_white_space
!= IGNORE_NO_WHITE_SPACE
;
238 bool same_length_diff_contents_compare_anyway
=
239 diff_length_compare_anyway
| ignore_case
;
241 while (p
< suffix_begin
)
247 /* Hash this line until we find a newline. */
249 switch (ignore_white_space
)
251 case IGNORE_ALL_SPACE
:
252 while ((c
= *p
++) != '\n')
254 h
= HASH (h
, tolower (c
));
257 case IGNORE_SPACE_CHANGE
:
258 while ((c
= *p
++) != '\n')
263 if ((c
= *p
++) == '\n')
270 /* C is now the first non-space. */
271 h
= HASH (h
, tolower (c
));
275 case IGNORE_TAB_EXPANSION
:
278 while ((c
= *p
++) != '\n')
280 size_t repetitions
= 1;
285 column
-= 0 < column
;
290 repetitions
= tabsize
- column
% tabsize
;
291 column
= (column
+ repetitions
< column
293 : column
+ repetitions
);
308 while (--repetitions
!= 0);
314 while ((c
= *p
++) != '\n')
315 h
= HASH (h
, tolower (c
));
319 switch (ignore_white_space
)
321 case IGNORE_ALL_SPACE
:
322 while ((c
= *p
++) != '\n')
327 case IGNORE_SPACE_CHANGE
:
328 while ((c
= *p
++) != '\n')
333 if ((c
= *p
++) == '\n')
340 /* C is now the first non-space. */
345 case IGNORE_TAB_EXPANSION
:
348 while ((c
= *p
++) != '\n')
350 size_t repetitions
= 1;
355 column
-= 0 < column
;
360 repetitions
= tabsize
- column
% tabsize
;
361 column
= (column
+ repetitions
< column
363 : column
+ repetitions
);
377 while (--repetitions
!= 0);
383 while ((c
= *p
++) != '\n')
390 bucket
= &buckets
[h
% nbuckets
];
394 && current
->missing_newline
395 && ROBUST_OUTPUT_STYLE (output_style
))
397 /* This line is incomplete. If this is significant,
398 put the line into buckets[-1]. */
399 if (ignore_white_space
< IGNORE_SPACE_CHANGE
)
400 bucket
= &buckets
[-1];
402 /* Omit the inserted newline when computing linbuf later. */
404 bufend
= suffix_begin
= p
;
407 for (i
= *bucket
; ; i
= eqs
[i
].next
)
410 /* Create a new equivalence class in this bucket. */
414 if (PTRDIFF_MAX
/ (2 * sizeof *eqs
) <= eqs_alloc
)
417 eqs
= xrealloc (eqs
, eqs_alloc
* sizeof *eqs
);
419 eqs
[i
].next
= *bucket
;
422 eqs
[i
].length
= length
;
426 else if (eqs
[i
].hash
== h
)
428 char const *eqline
= eqs
[i
].line
;
430 /* Reuse existing class if lines_differ reports the lines
432 if (eqs
[i
].length
== length
)
434 /* Reuse existing equivalence class if the lines are identical.
435 This detects the common case of exact identity
436 faster than lines_differ would. */
437 if (memcmp (eqline
, ip
, length
) == 0)
439 if (!same_length_diff_contents_compare_anyway
)
442 else if (!diff_length_compare_anyway
)
445 if (! lines_differ (eqline
, ip
))
449 /* Maybe increase the size of the line table. */
450 if (line
== alloc_lines
)
452 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
453 if (PTRDIFF_MAX
/ 3 <= alloc_lines
454 || PTRDIFF_MAX
/ sizeof *cureqs
<= 2 * alloc_lines
- linbuf_base
455 || PTRDIFF_MAX
/ sizeof *linbuf
<= alloc_lines
- linbuf_base
)
457 alloc_lines
= 2 * alloc_lines
- linbuf_base
;
458 cureqs
= xrealloc (cureqs
, alloc_lines
* sizeof *cureqs
);
459 linbuf
+= linbuf_base
;
460 linbuf
= xrealloc (linbuf
,
461 (alloc_lines
- linbuf_base
) * sizeof *linbuf
);
462 linbuf
-= linbuf_base
;
469 current
->buffered_lines
= line
;
473 /* Record the line start for lines in the suffix that we care about.
474 Record one more line start than lines,
475 so that we can compute the length of any buffered line. */
476 if (line
== alloc_lines
)
478 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
479 if (PTRDIFF_MAX
/ 3 <= alloc_lines
480 || PTRDIFF_MAX
/ sizeof *cureqs
<= 2 * alloc_lines
- linbuf_base
481 || PTRDIFF_MAX
/ sizeof *linbuf
<= alloc_lines
- linbuf_base
)
483 alloc_lines
= 2 * alloc_lines
- linbuf_base
;
484 linbuf
+= linbuf_base
;
485 linbuf
= xrealloc (linbuf
,
486 (alloc_lines
- linbuf_base
) * sizeof *linbuf
);
487 linbuf
-= linbuf_base
;
494 if (context
<= i
&& no_diff_means_no_output
)
503 /* Done with cache in local variables. */
504 current
->linbuf
= linbuf
;
505 current
->valid_lines
= line
;
506 current
->alloc_lines
= alloc_lines
;
507 current
->equivs
= cureqs
;
509 equivs_alloc
= eqs_alloc
;
510 equivs_index
= eqs_index
;
513 /* Prepare the text. Make sure the text end is initialized.
514 Make sure text ends in a newline,
515 but remember that we had to add one.
516 Strip trailing CRs, if that was requested. */
519 prepare_text (struct file_data
*current
)
521 size_t buffered
= current
->buffered
;
522 char *p
= FILE_BUFFER (current
);
525 if (buffered
== 0 || p
[buffered
- 1] == '\n')
526 current
->missing_newline
= false;
529 p
[buffered
++] = '\n';
530 current
->missing_newline
= true;
536 /* Don't use uninitialized storage when planting or using sentinels. */
537 memset (p
+ buffered
, 0, sizeof (word
));
539 if (strip_trailing_cr
&& (dst
= memchr (p
, '\r', buffered
)))
541 char const *src
= dst
;
542 char const *srclim
= p
+ buffered
;
545 dst
+= ! ((*dst
= *src
++) == '\r' && *src
== '\n');
546 while (src
< srclim
);
548 buffered
-= src
- dst
;
551 current
->buffered
= buffered
;
554 /* We have found N lines in a buffer of size S; guess the
555 proportionate number of lines that will be found in a buffer of
556 size T. However, do not guess a number of lines so large that the
557 resulting line table might cause overflow in size calculations. */
559 guess_lines (lin n
, size_t s
, size_t t
)
561 size_t guessed_bytes_per_line
= n
< 10 ? 32 : s
/ (n
- 1);
562 lin guessed_lines
= MAX (1, t
/ guessed_bytes_per_line
);
563 return MIN (guessed_lines
, PTRDIFF_MAX
/ (2 * sizeof (char *) + 1) - 5) + 5;
566 /* Given a vector of two file_data objects, find the identical
567 prefixes and suffixes of each object. */
570 find_identical_ends (struct file_data filevec
[])
573 char *p0
, *p1
, *buffer0
, *buffer1
;
574 char const *end0
, *beg0
;
575 char const **linbuf0
, **linbuf1
;
578 lin alloc_lines0
, alloc_lines1
;
579 lin buffered_prefix
, prefix_count
, prefix_mask
;
580 lin middle_guess
, suffix_guess
;
583 prepare_text (&filevec
[0]);
584 if (filevec
[0].desc
!= filevec
[1].desc
)
587 prepare_text (&filevec
[1]);
591 filevec
[1].buffer
= filevec
[0].buffer
;
592 filevec
[1].bufsize
= filevec
[0].bufsize
;
593 filevec
[1].buffered
= filevec
[0].buffered
;
594 filevec
[1].missing_newline
= filevec
[0].missing_newline
;
597 /* Find identical prefix. */
599 w0
= filevec
[0].buffer
;
600 w1
= filevec
[1].buffer
;
601 p0
= buffer0
= (char *) w0
;
602 p1
= buffer1
= (char *) w1
;
603 n0
= filevec
[0].buffered
;
604 n1
= filevec
[1].buffered
;
607 /* The buffers are the same; sentinels won't work. */
611 /* Insert end sentinels, in this case characters that are guaranteed
612 to make the equality test false, and thus terminate the loop. */
619 /* Loop until first mismatch, or to the sentinel characters. */
621 /* Compare a word at a time for speed. */
625 /* Do the last few bytes of comparison a byte at a time. */
631 /* Don't mistakenly count missing newline as part of prefix. */
632 if (ROBUST_OUTPUT_STYLE (output_style
)
633 && ((buffer0
+ n0
- filevec
[0].missing_newline
< p0
)
635 (buffer1
+ n1
- filevec
[1].missing_newline
< p1
)))
639 /* Now P0 and P1 point at the first nonmatching characters. */
641 /* Skip back to last line-beginning in the prefix,
642 and then discard up to HORIZON_LINES lines from the prefix. */
644 while (p0
!= buffer0
&& (p0
[-1] != '\n' || i
--))
647 /* Record the prefix. */
648 filevec
[0].prefix_end
= p0
;
649 filevec
[1].prefix_end
= p1
;
651 /* Find identical suffix. */
653 /* P0 and P1 point beyond the last chars not yet compared. */
657 if (! ROBUST_OUTPUT_STYLE (output_style
)
658 || filevec
[0].missing_newline
== filevec
[1].missing_newline
)
660 end0
= p0
; /* Addr of last char in file 0. */
662 /* Get value of P0 at which we should stop scanning backward:
663 this is when either P0 or P1 points just past the last char
664 of the identical prefix. */
665 beg0
= filevec
[0].prefix_end
+ (n0
< n1
? 0 : n0
- n1
);
667 /* Scan back until chars don't match or we reach that point. */
668 for (; p0
!= beg0
; p0
--, p1
--)
671 /* Point at the first char of the matching suffix. */
676 /* Are we at a line-beginning in both files? If not, add the rest of
677 this line to the main body. Discard up to HORIZON_LINES lines from
678 the identical suffix. Also, discard one extra line,
679 because shift_boundaries may need it. */
680 i
= horizon_lines
+ !((buffer0
== p0
|| p0
[-1] == '\n')
682 (buffer1
== p1
|| p1
[-1] == '\n'));
683 while (i
-- && p0
!= end0
)
684 while (*p0
++ != '\n')
690 /* Record the suffix. */
691 filevec
[0].suffix_begin
= p0
;
692 filevec
[1].suffix_begin
= p1
;
694 /* Calculate number of lines of prefix to save.
696 prefix_count == 0 means save the whole prefix;
697 we need this for options like -D that output the whole file,
698 or for enormous contexts (to avoid worrying about arithmetic overflow).
699 We also need it for options like -F that output some preceding line;
700 at least we will need to find the last few lines,
701 but since we don't know how many, it's easiest to find them all.
703 Otherwise, prefix_count != 0. Save just prefix_count lines at start
704 of the line buffer; they'll be moved to the proper location later.
705 Handle 1 more line than the context says (because we count 1 too many),
706 rounded up to the next power of 2 to speed index computation. */
708 if (no_diff_means_no_output
&& ! function_regexp
.fastmap
709 && context
< LIN_MAX
/ 4 && context
< n0
)
711 middle_guess
= guess_lines (0, 0, p0
- filevec
[0].prefix_end
);
712 suffix_guess
= guess_lines (0, 0, buffer0
+ n0
- p0
);
713 for (prefix_count
= 1; prefix_count
<= context
; prefix_count
*= 2)
715 alloc_lines0
= (prefix_count
+ middle_guess
716 + MIN (context
, suffix_guess
));
721 alloc_lines0
= guess_lines (0, 0, n0
);
724 prefix_mask
= prefix_count
- 1;
726 linbuf0
= xmalloc (alloc_lines0
* sizeof *linbuf0
);
729 /* If the prefix is needed, find the prefix lines. */
730 if (! (no_diff_means_no_output
731 && filevec
[0].prefix_end
== p0
732 && filevec
[1].prefix_end
== p1
))
734 end0
= filevec
[0].prefix_end
;
737 lin l
= lines
++ & prefix_mask
;
738 if (l
== alloc_lines0
)
740 if (PTRDIFF_MAX
/ (2 * sizeof *linbuf0
) <= alloc_lines0
)
743 linbuf0
= xrealloc (linbuf0
, alloc_lines0
* sizeof *linbuf0
);
746 while (*p0
++ != '\n')
750 buffered_prefix
= prefix_count
&& context
< lines
? context
: lines
;
752 /* Allocate line buffer 1. */
754 middle_guess
= guess_lines (lines
, p0
- buffer0
, p1
- filevec
[1].prefix_end
);
755 suffix_guess
= guess_lines (lines
, p0
- buffer0
, buffer1
+ n1
- p1
);
756 alloc_lines1
= buffered_prefix
+ middle_guess
+ MIN (context
, suffix_guess
);
757 if (alloc_lines1
< buffered_prefix
758 || PTRDIFF_MAX
/ sizeof *linbuf1
<= alloc_lines1
)
760 linbuf1
= xmalloc (alloc_lines1
* sizeof *linbuf1
);
762 if (buffered_prefix
!= lines
)
764 /* Rotate prefix lines to proper location. */
765 for (i
= 0; i
< buffered_prefix
; i
++)
766 linbuf1
[i
] = linbuf0
[(lines
- context
+ i
) & prefix_mask
];
767 for (i
= 0; i
< buffered_prefix
; i
++)
768 linbuf0
[i
] = linbuf1
[i
];
771 /* Initialize line buffer 1 from line buffer 0. */
772 for (i
= 0; i
< buffered_prefix
; i
++)
773 linbuf1
[i
] = linbuf0
[i
] - buffer0
+ buffer1
;
775 /* Record the line buffer, adjusted so that
776 linbuf[0] points at the first differing line. */
777 filevec
[0].linbuf
= linbuf0
+ buffered_prefix
;
778 filevec
[1].linbuf
= linbuf1
+ buffered_prefix
;
779 filevec
[0].linbuf_base
= filevec
[1].linbuf_base
= - buffered_prefix
;
780 filevec
[0].alloc_lines
= alloc_lines0
- buffered_prefix
;
781 filevec
[1].alloc_lines
= alloc_lines1
- buffered_prefix
;
782 filevec
[0].prefix_lines
= filevec
[1].prefix_lines
= lines
;
785 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
786 than 2**k. This table is derived from Chris K. Caldwell's list
787 <http://www.utm.edu/research/primes/lists/2small/>. */
789 static unsigned char const prime_offset
[] =
791 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
792 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
793 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
797 /* Verify that this host's size_t is not too wide for the above table. */
799 verify (enough_prime_offsets
,
800 sizeof (size_t) * CHAR_BIT
<= sizeof prime_offset
);
802 /* Given a vector of two file_data objects, read the file associated
803 with each one, and build the table of equivalence classes.
804 Return nonzero if either file appears to be a binary file.
805 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
808 read_files (struct file_data filevec
[], bool pretend_binary
)
811 bool skip_test
= text
| pretend_binary
;
812 bool appears_binary
= pretend_binary
| sip (&filevec
[0], skip_test
);
814 if (filevec
[0].desc
!= filevec
[1].desc
)
815 appears_binary
|= sip (&filevec
[1], skip_test
| appears_binary
);
818 filevec
[1].buffer
= filevec
[0].buffer
;
819 filevec
[1].bufsize
= filevec
[0].bufsize
;
820 filevec
[1].buffered
= filevec
[0].buffered
;
824 set_binary_mode (filevec
[0].desc
, true);
825 set_binary_mode (filevec
[1].desc
, true);
829 find_identical_ends (filevec
);
831 equivs_alloc
= filevec
[0].alloc_lines
+ filevec
[1].alloc_lines
+ 1;
832 if (PTRDIFF_MAX
/ sizeof *equivs
<= equivs_alloc
)
834 equivs
= xmalloc (equivs_alloc
* sizeof *equivs
);
835 /* Equivalence class 0 is permanently safe for lines that were not
836 hashed. Real equivalence classes start at 1. */
839 /* Allocate (one plus) a prime number of hash buckets. Use a prime
840 number between 1/3 and 2/3 of the value of equiv_allocs,
842 for (i
= 9; (size_t) 1 << i
< equivs_alloc
/ 3; i
++)
844 nbuckets
= ((size_t) 1 << i
) - prime_offset
[i
];
845 if (PTRDIFF_MAX
/ sizeof *buckets
<= nbuckets
)
847 buckets
= zalloc ((nbuckets
+ 1) * sizeof *buckets
);
850 for (i
= 0; i
< 2; i
++)
851 find_and_hash_each_line (&filevec
[i
]);
853 filevec
[0].equiv_max
= filevec
[1].equiv_max
= equivs_index
;