Remove extra newline
[official-gcc.git] / gcc / input.c
blobdd1d23df2f75d07edf396e21fe99705a86dba63d
/* Data and functions related to line maps and input files.
   Copyright (C) 2004-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "diagnostic-core.h"
26 #include "selftest.h"
27 #include "cpplib.h"
29 #ifndef HAVE_ICONV
30 #define HAVE_ICONV 0
31 #endif
33 /* This is a cache used by get_next_line to store the content of a
34 file to be searched for file lines. */
35 class fcache
37 public:
38 /* These are information used to store a line boundary. */
39 class line_info
41 public:
42 /* The line number. It starts from 1. */
43 size_t line_num;
45 /* The position (byte count) of the beginning of the line,
46 relative to the file data pointer. This starts at zero. */
47 size_t start_pos;
49 /* The position (byte count) of the last byte of the line. This
50 normally points to the '\n' character, or to one byte after the
51 last byte of the file, if the file doesn't contain a '\n'
52 character. */
53 size_t end_pos;
55 line_info (size_t l, size_t s, size_t e)
56 : line_num (l), start_pos (s), end_pos (e)
59 line_info ()
60 :line_num (0), start_pos (0), end_pos (0)
64 /* The number of time this file has been accessed. This is used
65 to designate which file cache to evict from the cache
66 array. */
67 unsigned use_count;
69 /* The file_path is the key for identifying a particular file in
70 the cache.
71 For libcpp-using code, the underlying buffer for this field is
72 owned by the corresponding _cpp_file within the cpp_reader. */
73 const char *file_path;
75 FILE *fp;
77 /* This points to the content of the file that we've read so
78 far. */
79 char *data;
81 /* The size of the DATA array above.*/
82 size_t size;
84 /* The number of bytes read from the underlying file so far. This
85 must be less (or equal) than SIZE above. */
86 size_t nb_read;
88 /* The index of the beginning of the current line. */
89 size_t line_start_idx;
91 /* The number of the previous line read. This starts at 1. Zero
92 means we've read no line so far. */
93 size_t line_num;
95 /* This is the total number of lines of the current file. At the
96 moment, we try to get this information from the line map
97 subsystem. Note that this is just a hint. When using the C++
98 front-end, this hint is correct because the input file is then
99 completely tokenized before parsing starts; so the line map knows
100 the number of lines before compilation really starts. For e.g,
101 the C front-end, it can happen that we start emitting diagnostics
102 before the line map has seen the end of the file. */
103 size_t total_lines;
105 /* Could this file be missing a trailing newline on its final line?
106 Initially true (to cope with empty files), set to true/false
107 as each line is read. */
108 bool missing_trailing_newline;
110 /* This is a record of the beginning and end of the lines we've seen
111 while reading the file. This is useful to avoid walking the data
112 from the beginning when we are asked to read a line that is
113 before LINE_START_IDX above. Note that the maximum size of this
114 record is fcache_line_record_size, so that the memory consumption
115 doesn't explode. We thus scale total_lines down to
116 fcache_line_record_size. */
117 vec<line_info, va_heap> line_record;
119 fcache ();
120 ~fcache ();
123 /* Current position in real source file. */
125 location_t input_location = UNKNOWN_LOCATION;
127 class line_maps *line_table;
129 /* A stashed copy of "line_table" for use by selftest::line_table_test.
130 This needs to be a global so that it can be a GC root, and thus
131 prevent the stashed copy from being garbage-collected if the GC runs
132 during a line_table_test. */
134 class line_maps *saved_line_table;
136 static fcache *fcache_tab;
137 static const size_t fcache_tab_size = 16;
138 static const size_t fcache_buffer_size = 4 * 1024;
139 static const size_t fcache_line_record_size = 100;
141 /* Expand the source location LOC into a human readable location. If
142 LOC resolves to a builtin location, the file name of the readable
143 location is set to the string "<built-in>". If EXPANSION_POINT_P is
144 TRUE and LOC is virtual, then it is resolved to the expansion
145 point of the involved macro. Otherwise, it is resolved to the
146 spelling location of the token.
148 When resolving to the spelling location of the token, if the
149 resulting location is for a built-in location (that is, it has no
150 associated line/column) in the context of a macro expansion, the
151 returned location is the first one (while unwinding the macro
152 location towards its expansion point) that is in real source
153 code.
155 ASPECT controls which part of the location to use. */
157 static expanded_location
158 expand_location_1 (location_t loc,
159 bool expansion_point_p,
160 enum location_aspect aspect)
162 expanded_location xloc;
163 const line_map_ordinary *map;
164 enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
165 tree block = NULL;
167 if (IS_ADHOC_LOC (loc))
169 block = LOCATION_BLOCK (loc);
170 loc = LOCATION_LOCUS (loc);
173 memset (&xloc, 0, sizeof (xloc));
175 if (loc >= RESERVED_LOCATION_COUNT)
177 if (!expansion_point_p)
179 /* We want to resolve LOC to its spelling location.
181 But if that spelling location is a reserved location that
182 appears in the context of a macro expansion (like for a
183 location for a built-in token), let's consider the first
184 location (toward the expansion point) that is not reserved;
185 that is, the first location that is in real source code. */
186 loc = linemap_unwind_to_first_non_reserved_loc (line_table,
187 loc, NULL);
188 lrk = LRK_SPELLING_LOCATION;
190 loc = linemap_resolve_location (line_table, loc, lrk, &map);
192 /* loc is now either in an ordinary map, or is a reserved location.
193 If it is a compound location, the caret is in a spelling location,
194 but the start/finish might still be a virtual location.
195 Depending of what the caller asked for, we may need to recurse
196 one level in order to resolve any virtual locations in the
197 end-points. */
198 switch (aspect)
200 default:
201 gcc_unreachable ();
202 /* Fall through. */
203 case LOCATION_ASPECT_CARET:
204 break;
205 case LOCATION_ASPECT_START:
207 location_t start = get_start (loc);
208 if (start != loc)
209 return expand_location_1 (start, expansion_point_p, aspect);
211 break;
212 case LOCATION_ASPECT_FINISH:
214 location_t finish = get_finish (loc);
215 if (finish != loc)
216 return expand_location_1 (finish, expansion_point_p, aspect);
218 break;
220 xloc = linemap_expand_location (line_table, map, loc);
223 xloc.data = block;
224 if (loc <= BUILTINS_LOCATION)
225 xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
227 return xloc;
230 /* Initialize the set of cache used for files accessed by caret
231 diagnostic. */
233 static void
234 diagnostic_file_cache_init (void)
236 if (fcache_tab == NULL)
237 fcache_tab = new fcache[fcache_tab_size];
240 /* Free the resources used by the set of cache used for files accessed
241 by caret diagnostic. */
243 void
244 diagnostic_file_cache_fini (void)
246 if (fcache_tab)
248 delete [] (fcache_tab);
249 fcache_tab = NULL;
253 /* Return the total lines number that have been read so far by the
254 line map (in the preprocessor) so far. For languages like C++ that
255 entirely preprocess the input file before starting to parse, this
256 equals the actual number of lines of the file. */
258 static size_t
259 total_lines_num (const char *file_path)
261 size_t r = 0;
262 location_t l = 0;
263 if (linemap_get_file_highest_location (line_table, file_path, &l))
265 gcc_assert (l >= RESERVED_LOCATION_COUNT);
266 expanded_location xloc = expand_location (l);
267 r = xloc.line;
269 return r;
272 /* Lookup the cache used for the content of a given file accessed by
273 caret diagnostic. Return the found cached file, or NULL if no
274 cached file was found. */
276 static fcache*
277 lookup_file_in_cache_tab (const char *file_path)
279 if (file_path == NULL)
280 return NULL;
282 diagnostic_file_cache_init ();
284 /* This will contain the found cached file. */
285 fcache *r = NULL;
286 for (unsigned i = 0; i < fcache_tab_size; ++i)
288 fcache *c = &fcache_tab[i];
289 if (c->file_path && !strcmp (c->file_path, file_path))
291 ++c->use_count;
292 r = c;
296 if (r)
297 ++r->use_count;
299 return r;
302 /* Purge any mention of FILENAME from the cache of files used for
303 printing source code. For use in selftests when working
304 with tempfiles. */
306 void
307 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
309 gcc_assert (file_path);
311 fcache *r = lookup_file_in_cache_tab (file_path);
312 if (!r)
313 /* Not found. */
314 return;
316 r->file_path = NULL;
317 if (r->fp)
318 fclose (r->fp);
319 r->fp = NULL;
320 r->nb_read = 0;
321 r->line_start_idx = 0;
322 r->line_num = 0;
323 r->line_record.truncate (0);
324 r->use_count = 0;
325 r->total_lines = 0;
326 r->missing_trailing_newline = true;
329 /* Return the file cache that has been less used, recently, or the
330 first empty one. If HIGHEST_USE_COUNT is non-null,
331 *HIGHEST_USE_COUNT is set to the highest use count of the entries
332 in the cache table. */
334 static fcache*
335 evicted_cache_tab_entry (unsigned *highest_use_count)
337 diagnostic_file_cache_init ();
339 fcache *to_evict = &fcache_tab[0];
340 unsigned huc = to_evict->use_count;
341 for (unsigned i = 1; i < fcache_tab_size; ++i)
343 fcache *c = &fcache_tab[i];
344 bool c_is_empty = (c->file_path == NULL);
346 if (c->use_count < to_evict->use_count
347 || (to_evict->file_path && c_is_empty))
348 /* We evict C because it's either an entry with a lower use
349 count or one that is empty. */
350 to_evict = c;
352 if (huc < c->use_count)
353 huc = c->use_count;
355 if (c_is_empty)
356 /* We've reached the end of the cache; subsequent elements are
357 all empty. */
358 break;
361 if (highest_use_count)
362 *highest_use_count = huc;
364 return to_evict;
367 /* Create the cache used for the content of a given file to be
368 accessed by caret diagnostic. This cache is added to an array of
369 cache and can be retrieved by lookup_file_in_cache_tab. This
370 function returns the created cache. Note that only the last
371 fcache_tab_size files are cached. */
373 static fcache*
374 add_file_to_cache_tab (const char *file_path)
377 FILE *fp = fopen (file_path, "r");
378 if (fp == NULL)
379 return NULL;
381 unsigned highest_use_count = 0;
382 fcache *r = evicted_cache_tab_entry (&highest_use_count);
383 r->file_path = file_path;
384 if (r->fp)
385 fclose (r->fp);
386 r->fp = fp;
387 r->nb_read = 0;
388 r->line_start_idx = 0;
389 r->line_num = 0;
390 r->line_record.truncate (0);
391 /* Ensure that this cache entry doesn't get evicted next time
392 add_file_to_cache_tab is called. */
393 r->use_count = ++highest_use_count;
394 r->total_lines = total_lines_num (file_path);
395 r->missing_trailing_newline = true;
397 return r;
400 /* Lookup the cache used for the content of a given file accessed by
401 caret diagnostic. If no cached file was found, create a new cache
402 for this file, add it to the array of cached file and return
403 it. */
405 static fcache*
406 lookup_or_add_file_to_cache_tab (const char *file_path)
408 fcache *r = lookup_file_in_cache_tab (file_path);
409 if (r == NULL)
410 r = add_file_to_cache_tab (file_path);
411 return r;
414 /* Default constructor for a cache of file used by caret
415 diagnostic. */
417 fcache::fcache ()
418 : use_count (0), file_path (NULL), fp (NULL), data (0),
419 size (0), nb_read (0), line_start_idx (0), line_num (0),
420 total_lines (0), missing_trailing_newline (true)
422 line_record.create (0);
425 /* Destructor for a cache of file used by caret diagnostic. */
427 fcache::~fcache ()
429 if (fp)
431 fclose (fp);
432 fp = NULL;
434 if (data)
436 XDELETEVEC (data);
437 data = 0;
439 line_record.release ();
442 /* Returns TRUE iff the cache would need to be filled with data coming
443 from the file. That is, either the cache is empty or full or the
444 current line is empty. Note that if the cache is full, it would
445 need to be extended and filled again. */
447 static bool
448 needs_read (fcache *c)
450 return (c->nb_read == 0
451 || c->nb_read == c->size
452 || (c->line_start_idx >= c->nb_read - 1));
455 /* Return TRUE iff the cache is full and thus needs to be
456 extended. */
458 static bool
459 needs_grow (fcache *c)
461 return c->nb_read == c->size;
464 /* Grow the cache if it needs to be extended. */
466 static void
467 maybe_grow (fcache *c)
469 if (!needs_grow (c))
470 return;
472 size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
473 c->data = XRESIZEVEC (char, c->data, size);
474 c->size = size;
477 /* Read more data into the cache. Extends the cache if need be.
478 Returns TRUE iff new data could be read. */
480 static bool
481 read_data (fcache *c)
483 if (feof (c->fp) || ferror (c->fp))
484 return false;
486 maybe_grow (c);
488 char * from = c->data + c->nb_read;
489 size_t to_read = c->size - c->nb_read;
490 size_t nb_read = fread (from, 1, to_read, c->fp);
492 if (ferror (c->fp))
493 return false;
495 c->nb_read += nb_read;
496 return !!nb_read;
499 /* Read new data iff the cache needs to be filled with more data
500 coming from the file FP. Return TRUE iff the cache was filled with
501 mode data. */
503 static bool
504 maybe_read_data (fcache *c)
506 if (!needs_read (c))
507 return false;
508 return read_data (c);
511 /* Read a new line from file FP, using C as a cache for the data
512 coming from the file. Upon successful completion, *LINE is set to
513 the beginning of the line found. *LINE points directly in the
514 line cache and is only valid until the next call of get_next_line.
515 *LINE_LEN is set to the length of the line. Note that the line
516 does not contain any terminal delimiter. This function returns
517 true if some data was read or process from the cache, false
518 otherwise. Note that subsequent calls to get_next_line might
519 make the content of *LINE invalid. */
521 static bool
522 get_next_line (fcache *c, char **line, ssize_t *line_len)
524 /* Fill the cache with data to process. */
525 maybe_read_data (c);
527 size_t remaining_size = c->nb_read - c->line_start_idx;
528 if (remaining_size == 0)
529 /* There is no more data to process. */
530 return false;
532 char *line_start = c->data + c->line_start_idx;
534 char *next_line_start = NULL;
535 size_t len = 0;
536 char *line_end = (char *) memchr (line_start, '\n', remaining_size);
537 if (line_end == NULL)
539 /* We haven't found the end-of-line delimiter in the cache.
540 Fill the cache with more data from the file and look for the
541 '\n'. */
542 while (maybe_read_data (c))
544 line_start = c->data + c->line_start_idx;
545 remaining_size = c->nb_read - c->line_start_idx;
546 line_end = (char *) memchr (line_start, '\n', remaining_size);
547 if (line_end != NULL)
549 next_line_start = line_end + 1;
550 break;
553 if (line_end == NULL)
555 /* We've loadded all the file into the cache and still no
556 '\n'. Let's say the line ends up at one byte passed the
557 end of the file. This is to stay consistent with the case
558 of when the line ends up with a '\n' and line_end points to
559 that terminal '\n'. That consistency is useful below in
560 the len calculation. */
561 line_end = c->data + c->nb_read ;
562 c->missing_trailing_newline = true;
564 else
565 c->missing_trailing_newline = false;
567 else
569 next_line_start = line_end + 1;
570 c->missing_trailing_newline = false;
573 if (ferror (c->fp))
574 return false;
576 /* At this point, we've found the end of the of line. It either
577 points to the '\n' or to one byte after the last byte of the
578 file. */
579 gcc_assert (line_end != NULL);
581 len = line_end - line_start;
583 if (c->line_start_idx < c->nb_read)
584 *line = line_start;
586 ++c->line_num;
588 /* Before we update our line record, make sure the hint about the
589 total number of lines of the file is correct. If it's not, then
590 we give up recording line boundaries from now on. */
591 bool update_line_record = true;
592 if (c->line_num > c->total_lines)
593 update_line_record = false;
595 /* Now update our line record so that re-reading lines from the
596 before c->line_start_idx is faster. */
597 if (update_line_record
598 && c->line_record.length () < fcache_line_record_size)
600 /* If the file lines fits in the line record, we just record all
601 its lines ...*/
602 if (c->total_lines <= fcache_line_record_size
603 && c->line_num > c->line_record.length ())
604 c->line_record.safe_push (fcache::line_info (c->line_num,
605 c->line_start_idx,
606 line_end - c->data));
607 else if (c->total_lines > fcache_line_record_size)
609 /* ... otherwise, we just scale total_lines down to
610 (fcache_line_record_size lines. */
611 size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
612 if (c->line_record.length () == 0
613 || n >= c->line_record.length ())
614 c->line_record.safe_push (fcache::line_info (c->line_num,
615 c->line_start_idx,
616 line_end - c->data));
620 /* Update c->line_start_idx so that it points to the next line to be
621 read. */
622 if (next_line_start)
623 c->line_start_idx = next_line_start - c->data;
624 else
625 /* We didn't find any terminal '\n'. Let's consider that the end
626 of line is the end of the data in the cache. The next
627 invocation of get_next_line will either read more data from the
628 underlying file or return false early because we've reached the
629 end of the file. */
630 c->line_start_idx = c->nb_read;
632 *line_len = len;
634 return true;
637 /* Consume the next bytes coming from the cache (or from its
638 underlying file if there are remaining unread bytes in the file)
639 until we reach the next end-of-line (or end-of-file). There is no
640 copying from the cache involved. Return TRUE upon successful
641 completion. */
643 static bool
644 goto_next_line (fcache *cache)
646 char *l;
647 ssize_t len;
649 return get_next_line (cache, &l, &len);
652 /* Read an arbitrary line number LINE_NUM from the file cached in C.
653 If the line was read successfully, *LINE points to the beginning
654 of the line in the file cache and *LINE_LEN is the length of the
655 line. *LINE is not nul-terminated, but may contain zero bytes.
656 *LINE is only valid until the next call of read_line_num.
657 This function returns bool if a line was read. */
659 static bool
660 read_line_num (fcache *c, size_t line_num,
661 char **line, ssize_t *line_len)
663 gcc_assert (line_num > 0);
665 if (line_num <= c->line_num)
667 /* We've been asked to read lines that are before c->line_num.
668 So lets use our line record (if it's not empty) to try to
669 avoid re-reading the file from the beginning again. */
671 if (c->line_record.is_empty ())
673 c->line_start_idx = 0;
674 c->line_num = 0;
676 else
678 fcache::line_info *i = NULL;
679 if (c->total_lines <= fcache_line_record_size)
681 /* In languages where the input file is not totally
682 preprocessed up front, the c->total_lines hint
683 can be smaller than the number of lines of the
684 file. In that case, only the first
685 c->total_lines have been recorded.
687 Otherwise, the first c->total_lines we've read have
688 their start/end recorded here. */
689 i = (line_num <= c->total_lines)
690 ? &c->line_record[line_num - 1]
691 : &c->line_record[c->total_lines - 1];
692 gcc_assert (i->line_num <= line_num);
694 else
696 /* So the file had more lines than our line record
697 size. Thus the number of lines we've recorded has
698 been scaled down to fcache_line_reacord_size. Let's
699 pick the start/end of the recorded line that is
700 closest to line_num. */
701 size_t n = (line_num <= c->total_lines)
702 ? line_num * fcache_line_record_size / c->total_lines
703 : c ->line_record.length () - 1;
704 if (n < c->line_record.length ())
706 i = &c->line_record[n];
707 gcc_assert (i->line_num <= line_num);
711 if (i && i->line_num == line_num)
713 /* We have the start/end of the line. */
714 *line = c->data + i->start_pos;
715 *line_len = i->end_pos - i->start_pos;
716 return true;
719 if (i)
721 c->line_start_idx = i->start_pos;
722 c->line_num = i->line_num - 1;
724 else
726 c->line_start_idx = 0;
727 c->line_num = 0;
732 /* Let's walk from line c->line_num up to line_num - 1, without
733 copying any line. */
734 while (c->line_num < line_num - 1)
735 if (!goto_next_line (c))
736 return false;
738 /* The line we want is the next one. Let's read and copy it back to
739 the caller. */
740 return get_next_line (c, line, line_len);
743 /* Return the physical source line that corresponds to FILE_PATH/LINE.
744 The line is not nul-terminated. The returned pointer is only
745 valid until the next call of location_get_source_line.
746 Note that the line can contain several null characters,
747 so the returned value's length has the actual length of the line.
748 If the function fails, a NULL char_span is returned. */
750 char_span
751 location_get_source_line (const char *file_path, int line)
753 char *buffer = NULL;
754 ssize_t len;
756 if (line == 0)
757 return char_span (NULL, 0);
759 fcache *c = lookup_or_add_file_to_cache_tab (file_path);
760 if (c == NULL)
761 return char_span (NULL, 0);
763 bool read = read_line_num (c, line, &buffer, &len);
764 if (!read)
765 return char_span (NULL, 0);
767 return char_span (buffer, len);
770 /* Determine if FILE_PATH missing a trailing newline on its final line.
771 Only valid to call once all of the file has been loaded, by
772 requesting a line number beyond the end of the file. */
774 bool
775 location_missing_trailing_newline (const char *file_path)
777 fcache *c = lookup_or_add_file_to_cache_tab (file_path);
778 if (c == NULL)
779 return false;
781 return c->missing_trailing_newline;
784 /* Test if the location originates from the spelling location of a
785 builtin-tokens. That is, return TRUE if LOC is a (possibly
786 virtual) location of a built-in token that appears in the expansion
787 list of a macro. Please note that this function also works on
788 tokens that result from built-in tokens. For instance, the
789 function would return true if passed a token "4" that is the result
790 of the expansion of the built-in __LINE__ macro. */
791 bool
792 is_location_from_builtin_token (location_t loc)
794 const line_map_ordinary *map = NULL;
795 loc = linemap_resolve_location (line_table, loc,
796 LRK_SPELLING_LOCATION, &map);
797 return loc == BUILTINS_LOCATION;
800 /* Expand the source location LOC into a human readable location. If
801 LOC is virtual, it resolves to the expansion point of the involved
802 macro. If LOC resolves to a builtin location, the file name of the
803 readable location is set to the string "<built-in>". */
805 expanded_location
806 expand_location (location_t loc)
808 return expand_location_1 (loc, /*expansion_point_p=*/true,
809 LOCATION_ASPECT_CARET);
812 /* Expand the source location LOC into a human readable location. If
813 LOC is virtual, it resolves to the expansion location of the
814 relevant macro. If LOC resolves to a builtin location, the file
815 name of the readable location is set to the string
816 "<built-in>". */
818 expanded_location
819 expand_location_to_spelling_point (location_t loc,
820 enum location_aspect aspect)
822 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
825 /* The rich_location class within libcpp requires a way to expand
826 location_t instances, and relies on the client code
827 providing a symbol named
828 linemap_client_expand_location_to_spelling_point
829 to do this.
831 This is the implementation for libcommon.a (all host binaries),
832 which simply calls into expand_location_1. */
834 expanded_location
835 linemap_client_expand_location_to_spelling_point (location_t loc,
836 enum location_aspect aspect)
838 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
842 /* If LOCATION is in a system header and if it is a virtual location for
843 a token coming from the expansion of a macro, unwind it to the
844 location of the expansion point of the macro. Otherwise, just return
845 LOCATION.
847 This is used for instance when we want to emit diagnostics about a
848 token that may be located in a macro that is itself defined in a
849 system header, for example, for the NULL macro. In such a case, if
850 LOCATION were passed directly to diagnostic functions such as
851 warning_at, the diagnostic would be suppressed (unless
852 -Wsystem-headers). */
854 location_t
855 expansion_point_location_if_in_system_header (location_t location)
857 if (in_system_header_at (location))
858 location = linemap_resolve_location (line_table, location,
859 LRK_MACRO_EXPANSION_POINT,
860 NULL);
861 return location;
864 /* If LOCATION is a virtual location for a token coming from the expansion
865 of a macro, unwind to the location of the expansion point of the macro. */
867 location_t
868 expansion_point_location (location_t location)
870 return linemap_resolve_location (line_table, location,
871 LRK_MACRO_EXPANSION_POINT, NULL);
874 /* Construct a location with caret at CARET, ranging from START to
875 finish e.g.
877 11111111112
878 12345678901234567890
880 523 return foo + bar;
881 ~~~~^~~~~
884 The location's caret is at the "+", line 523 column 15, but starts
885 earlier, at the "f" of "foo" at column 11. The finish is at the "r"
886 of "bar" at column 19. */
888 location_t
889 make_location (location_t caret, location_t start, location_t finish)
891 location_t pure_loc = get_pure_location (caret);
892 source_range src_range;
893 src_range.m_start = get_start (start);
894 src_range.m_finish = get_finish (finish);
895 location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
896 pure_loc,
897 src_range,
898 NULL);
899 return combined_loc;
902 /* Same as above, but taking a source range rather than two locations. */
904 location_t
905 make_location (location_t caret, source_range src_range)
907 location_t pure_loc = get_pure_location (caret);
908 return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
911 /* An expanded_location stores the column in byte units. This function
912 converts that column to display units. That requires reading the associated
913 source line in order to calculate the display width. If that cannot be done
914 for any reason, then returns the byte column as a fallback. */
916 location_compute_display_column (expanded_location exploc)
918 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
919 return exploc.column;
920 char_span line = location_get_source_line (exploc.file, exploc.line);
921 /* If line is NULL, this function returns exploc.column which is the
922 desired fallback. */
923 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
924 exploc.column);
927 /* Dump statistics to stderr about the memory usage of the line_table
928 set of line maps. This also displays some statistics about macro
929 expansion. */
931 void
932 dump_line_table_statistics (void)
934 struct linemap_stats s;
935 long total_used_map_size,
936 macro_maps_size,
937 total_allocated_map_size;
939 memset (&s, 0, sizeof (s));
941 linemap_get_statistics (line_table, &s);
943 macro_maps_size = s.macro_maps_used_size
944 + s.macro_maps_locations_size;
946 total_allocated_map_size = s.ordinary_maps_allocated_size
947 + s.macro_maps_allocated_size
948 + s.macro_maps_locations_size;
950 total_used_map_size = s.ordinary_maps_used_size
951 + s.macro_maps_used_size
952 + s.macro_maps_locations_size;
954 fprintf (stderr, "Number of expanded macros: %5ld\n",
955 s.num_expanded_macros);
956 if (s.num_expanded_macros != 0)
957 fprintf (stderr, "Average number of tokens per macro expansion: %5ld\n",
958 s.num_macro_tokens / s.num_expanded_macros);
959 fprintf (stderr,
960 "\nLine Table allocations during the "
961 "compilation process\n");
962 fprintf (stderr, "Number of ordinary maps used: " PRsa (5) "\n",
963 SIZE_AMOUNT (s.num_ordinary_maps_used));
964 fprintf (stderr, "Ordinary map used size: " PRsa (5) "\n",
965 SIZE_AMOUNT (s.ordinary_maps_used_size));
966 fprintf (stderr, "Number of ordinary maps allocated: " PRsa (5) "\n",
967 SIZE_AMOUNT (s.num_ordinary_maps_allocated));
968 fprintf (stderr, "Ordinary maps allocated size: " PRsa (5) "\n",
969 SIZE_AMOUNT (s.ordinary_maps_allocated_size));
970 fprintf (stderr, "Number of macro maps used: " PRsa (5) "\n",
971 SIZE_AMOUNT (s.num_macro_maps_used));
972 fprintf (stderr, "Macro maps used size: " PRsa (5) "\n",
973 SIZE_AMOUNT (s.macro_maps_used_size));
974 fprintf (stderr, "Macro maps locations size: " PRsa (5) "\n",
975 SIZE_AMOUNT (s.macro_maps_locations_size));
976 fprintf (stderr, "Macro maps size: " PRsa (5) "\n",
977 SIZE_AMOUNT (macro_maps_size));
978 fprintf (stderr, "Duplicated maps locations size: " PRsa (5) "\n",
979 SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
980 fprintf (stderr, "Total allocated maps size: " PRsa (5) "\n",
981 SIZE_AMOUNT (total_allocated_map_size));
982 fprintf (stderr, "Total used maps size: " PRsa (5) "\n",
983 SIZE_AMOUNT (total_used_map_size));
984 fprintf (stderr, "Ad-hoc table size: " PRsa (5) "\n",
985 SIZE_AMOUNT (s.adhoc_table_size));
986 fprintf (stderr, "Ad-hoc table entries used: " PRsa (5) "\n",
987 SIZE_AMOUNT (s.adhoc_table_entries_used));
988 fprintf (stderr, "optimized_ranges: " PRsa (5) "\n",
989 SIZE_AMOUNT (line_table->num_optimized_ranges));
990 fprintf (stderr, "unoptimized_ranges: " PRsa (5) "\n",
991 SIZE_AMOUNT (line_table->num_unoptimized_ranges));
993 fprintf (stderr, "\n");
996 /* Get location one beyond the final location in ordinary map IDX. */
998 static location_t
999 get_end_location (class line_maps *set, unsigned int idx)
1001 if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1002 return set->highest_location;
1004 struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1005 return MAP_START_LOCATION (next_map);
/* Helper function for write_digit_row.  Write the last decimal digit
   of DIGIT to STREAM as a single character.  */

static void
write_digit (FILE *stream, int digit)
{
  fputc ('0' + (digit % 10), stream);
}
1016 /* Helper function for dump_location_info.
1017 Write a row of numbers to STREAM, numbering a source line,
1018 giving the units, tens, hundreds etc of the column number. */
1020 static void
1021 write_digit_row (FILE *stream, int indent,
1022 const line_map_ordinary *map,
1023 location_t loc, int max_col, int divisor)
1025 fprintf (stream, "%*c", indent, ' ');
1026 fprintf (stream, "|");
1027 for (int column = 1; column < max_col; column++)
1029 location_t column_loc = loc + (column << map->m_range_bits);
1030 write_digit (stream, column_loc / divisor);
1032 fprintf (stream, "\n");
1035 /* Write a half-closed (START) / half-open (END) interval of
1036 location_t to STREAM. */
1038 static void
1039 dump_location_range (FILE *stream,
1040 location_t start, location_t end)
1042 fprintf (stream,
1043 " location_t interval: %u <= loc < %u\n",
1044 start, end);
1047 /* Write a labelled description of a half-closed (START) / half-open (END)
1048 interval of location_t to STREAM. */
1050 static void
1051 dump_labelled_location_range (FILE *stream,
1052 const char *name,
1053 location_t start, location_t end)
1055 fprintf (stream, "%s\n", name);
1056 dump_location_range (stream, start, end);
1057 fprintf (stream, "\n");
1060 /* Write a visualization of the locations in the line_table to STREAM. */
1062 void
1063 dump_location_info (FILE *stream)
1065 /* Visualize the reserved locations. */
1066 dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1067 0, RESERVED_LOCATION_COUNT);
1069 /* Visualize the ordinary line_map instances, rendering the sources. */
1070 for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1072 location_t end_location = get_end_location (line_table, idx);
1073 /* half-closed: doesn't include this one. */
1075 const line_map_ordinary *map
1076 = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1077 fprintf (stream, "ORDINARY MAP: %i\n", idx);
1078 dump_location_range (stream,
1079 MAP_START_LOCATION (map), end_location);
1080 fprintf (stream, " file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1081 fprintf (stream, " starting at line: %i\n",
1082 ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1083 fprintf (stream, " column and range bits: %i\n",
1084 map->m_column_and_range_bits);
1085 fprintf (stream, " column bits: %i\n",
1086 map->m_column_and_range_bits - map->m_range_bits);
1087 fprintf (stream, " range bits: %i\n",
1088 map->m_range_bits);
1089 const char * reason;
1090 switch (map->reason) {
1091 case LC_ENTER:
1092 reason = "LC_ENTER";
1093 break;
1094 case LC_LEAVE:
1095 reason = "LC_LEAVE";
1096 break;
1097 case LC_RENAME:
1098 reason = "LC_RENAME";
1099 break;
1100 case LC_RENAME_VERBATIM:
1101 reason = "LC_RENAME_VERBATIM";
1102 break;
1103 case LC_ENTER_MACRO:
1104 reason = "LC_RENAME_MACRO";
1105 break;
1106 default:
1107 reason = "Unknown";
1109 fprintf (stream, " reason: %d (%s)\n", map->reason, reason);
1111 const line_map_ordinary *includer_map
1112 = linemap_included_from_linemap (line_table, map);
1113 fprintf (stream, " included from location: %d",
1114 linemap_included_from (map));
1115 if (includer_map) {
1116 fprintf (stream, " (in ordinary map %d)",
1117 int (includer_map - line_table->info_ordinary.maps));
1119 fprintf (stream, "\n");
1121 /* Render the span of source lines that this "map" covers. */
1122 for (location_t loc = MAP_START_LOCATION (map);
1123 loc < end_location;
1124 loc += (1 << map->m_range_bits) )
1126 gcc_assert (pure_location_p (line_table, loc) );
1128 expanded_location exploc
1129 = linemap_expand_location (line_table, map, loc);
1131 if (exploc.column == 0)
1133 /* Beginning of a new source line: draw the line. */
1135 char_span line_text = location_get_source_line (exploc.file,
1136 exploc.line);
1137 if (!line_text)
1138 break;
1139 fprintf (stream,
1140 "%s:%3i|loc:%5i|%.*s\n",
1141 exploc.file, exploc.line,
1142 loc,
1143 (int)line_text.length (), line_text.get_buffer ());
1145 /* "loc" is at column 0, which means "the whole line".
1146 Render the locations *within* the line, by underlining
1147 it, showing the location_t numeric values
1148 at each column. */
1149 size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1150 if (max_col > line_text.length ())
1151 max_col = line_text.length () + 1;
1153 int len_lnum = num_digits (exploc.line);
1154 if (len_lnum < 3)
1155 len_lnum = 3;
1156 int len_loc = num_digits (loc);
1157 if (len_loc < 5)
1158 len_loc = 5;
1160 int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1162 /* Thousands. */
1163 if (end_location > 999)
1164 write_digit_row (stream, indent, map, loc, max_col, 1000);
1166 /* Hundreds. */
1167 if (end_location > 99)
1168 write_digit_row (stream, indent, map, loc, max_col, 100);
1170 /* Tens. */
1171 write_digit_row (stream, indent, map, loc, max_col, 10);
1173 /* Units. */
1174 write_digit_row (stream, indent, map, loc, max_col, 1);
1177 fprintf (stream, "\n");
1180 /* Visualize unallocated values. */
1181 dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1182 line_table->highest_location,
1183 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1185 /* Visualize the macro line_map instances, rendering the sources. */
1186 for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1188 /* Each macro map that is allocated owns location_t values
1189 that are *lower* that the one before them.
1190 Hence it's meaningful to view them either in order of ascending
1191 source locations, or in order of ascending macro map index. */
1192 const bool ascending_location_ts = true;
1193 unsigned int idx = (ascending_location_ts
1194 ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1195 : i);
1196 const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1197 fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1198 idx,
1199 linemap_map_get_macro_name (map),
1200 MACRO_MAP_NUM_MACRO_TOKENS (map));
1201 dump_location_range (stream,
1202 map->start_location,
1203 (map->start_location
1204 + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1205 inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1206 "expansion point is location %i",
1207 MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1208 fprintf (stream, " map->start_location: %u\n",
1209 map->start_location);
1211 fprintf (stream, " macro_locations:\n");
1212 for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1214 location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1215 location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1217 /* linemap_add_macro_token encodes token numbers in an expansion
1218 by putting them after MAP_START_LOCATION. */
1220 /* I'm typically seeing 4 uninitialized entries at the end of
1221 0xafafafaf.
1222 This appears to be due to macro.c:replace_args
1223 adding 2 extra args for padding tokens; presumably there may
1224 be a leading and/or trailing padding token injected,
1225 each for 2 more location slots.
1226 This would explain there being up to 4 location_ts slots
1227 that may be uninitialized. */
1229 fprintf (stream, " %u: %u, %u\n",
1233 if (x == y)
1235 if (x < MAP_START_LOCATION (map))
1236 inform (x, "token %u has %<x-location == y-location == %u%>",
1237 i, x);
1238 else
1239 fprintf (stream,
1240 "x-location == y-location == %u encodes token # %u\n",
1241 x, x - MAP_START_LOCATION (map));
1243 else
1245 inform (x, "token %u has %<x-location == %u%>", i, x);
1246 inform (x, "token %u has %<y-location == %u%>", i, y);
1249 fprintf (stream, "\n");
1252 /* It appears that MAX_LOCATION_T itself is never assigned to a
1253 macro map, presumably due to an off-by-one error somewhere
1254 between the logic in linemap_enter_macro and
1255 LINEMAPS_MACRO_LOWEST_LOCATION. */
1256 dump_labelled_location_range (stream, "MAX_LOCATION_T",
1257 MAX_LOCATION_T,
1258 MAX_LOCATION_T + 1);
1260 /* Visualize ad-hoc values. */
1261 dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1262 MAX_LOCATION_T + 1, UINT_MAX);
1265 /* string_concat's constructor. */
1267 string_concat::string_concat (int num, location_t *locs)
1268 : m_num (num)
1270 m_locs = ggc_vec_alloc <location_t> (num);
1271 for (int i = 0; i < num; i++)
1272 m_locs[i] = locs[i];
1275 /* string_concat_db's constructor. */
1277 string_concat_db::string_concat_db ()
1279 m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1282 /* Record that a string concatenation occurred, covering NUM
1283 string literal tokens. LOCS is an array of size NUM, containing the
1284 locations of the tokens. A copy of LOCS is taken. */
1286 void
1287 string_concat_db::record_string_concatenation (int num, location_t *locs)
1289 gcc_assert (num > 1);
1290 gcc_assert (locs);
1292 location_t key_loc = get_key_loc (locs[0]);
1294 string_concat *concat
1295 = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1296 m_table->put (key_loc, concat);
1299 /* Determine if LOC was the location of the initial token of a
1300 concatenation of string literal tokens.
1301 If so, *OUT_NUM is written to with the number of tokens, and
1302 *OUT_LOCS with the location of an array of locations of the
1303 tokens, and return true. *OUT_LOCS is a borrowed pointer to
1304 storage owned by the string_concat_db.
1305 Otherwise, return false. */
1307 bool
1308 string_concat_db::get_string_concatenation (location_t loc,
1309 int *out_num,
1310 location_t **out_locs)
1312 gcc_assert (out_num);
1313 gcc_assert (out_locs);
1315 location_t key_loc = get_key_loc (loc);
1317 string_concat **concat = m_table->get (key_loc);
1318 if (!concat)
1319 return false;
1321 *out_num = (*concat)->m_num;
1322 *out_locs =(*concat)->m_locs;
1323 return true;
1326 /* Internal function. Canonicalize LOC into a form suitable for
1327 use as a key within the database, stripping away macro expansion,
1328 ad-hoc information, and range information, using the location of
1329 the start of LOC within an ordinary linemap. */
1331 location_t
1332 string_concat_db::get_key_loc (location_t loc)
1334 loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1335 NULL);
1337 loc = get_range_from_loc (line_table, loc).m_start;
1339 return loc;
1342 /* Helper class for use within get_substring_ranges_for_loc.
1343 An vec of cpp_string with responsibility for releasing all of the
1344 str->text for each str in the vector. */
1346 class auto_cpp_string_vec : public auto_vec <cpp_string>
1348 public:
1349 auto_cpp_string_vec (int alloc)
1350 : auto_vec <cpp_string> (alloc) {}
1352 ~auto_cpp_string_vec ()
1354 /* Clean up the copies within this vec. */
1355 int i;
1356 cpp_string *str;
1357 FOR_EACH_VEC_ELT (*this, i, str)
1358 free (const_cast <unsigned char *> (str->text));
1362 /* Attempt to populate RANGES with source location information on the
1363 individual characters within the string literal found at STRLOC.
1364 If CONCATS is non-NULL, then any string literals that the token at
1365 STRLOC was concatenated with are also added to RANGES.
1367 Return NULL if successful, or an error message if any errors occurred (in
1368 which case RANGES may be only partially populated and should not
1369 be used).
1371 This is implemented by re-parsing the relevant source line(s). */
1373 static const char *
1374 get_substring_ranges_for_loc (cpp_reader *pfile,
1375 string_concat_db *concats,
1376 location_t strloc,
1377 enum cpp_ttype type,
1378 cpp_substring_ranges &ranges)
1380 gcc_assert (pfile);
1382 if (strloc == UNKNOWN_LOCATION)
1383 return "unknown location";
1385 /* Reparsing the strings requires accurate location information.
1386 If -ftrack-macro-expansion has been overridden from its default
1387 of 2, then we might have a location of a macro expansion point,
1388 rather than the location of the literal itself.
1389 Avoid this by requiring that we have full macro expansion tracking
1390 for substring locations to be available. */
1391 if (cpp_get_options (pfile)->track_macro_expansion != 2)
1392 return "track_macro_expansion != 2";
1394 /* If #line or # 44 "file"-style directives are present, then there's
1395 no guarantee that the line numbers we have can be used to locate
1396 the strings. For example, we might have a .i file with # directives
1397 pointing back to lines within a .c file, but the .c file might
1398 have been edited since the .i file was created.
1399 In such a case, the safest course is to disable on-demand substring
1400 locations. */
1401 if (line_table->seen_line_directive)
1402 return "seen line directive";
1404 /* If string concatenation has occurred at STRLOC, get the locations
1405 of all of the literal tokens making up the compound string.
1406 Otherwise, just use STRLOC. */
1407 int num_locs = 1;
1408 location_t *strlocs = &strloc;
1409 if (concats)
1410 concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1412 auto_cpp_string_vec strs (num_locs);
1413 auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1414 for (int i = 0; i < num_locs; i++)
1416 /* Get range of strloc. We will use it to locate the start and finish
1417 of the literal token within the line. */
1418 source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1420 if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1422 /* If the string token was within a macro expansion, then we can
1423 cope with it for the simple case where we have a single token.
1424 Otherwise, bail out. */
1425 if (src_range.m_start != src_range.m_finish)
1426 return "macro expansion";
1428 else
1430 if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1431 /* If so, we can't reliably determine where the token started within
1432 its line. */
1433 return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1435 if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1436 /* If so, we can't reliably determine where the token finished
1437 within its line. */
1438 return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1441 expanded_location start
1442 = expand_location_to_spelling_point (src_range.m_start,
1443 LOCATION_ASPECT_START);
1444 expanded_location finish
1445 = expand_location_to_spelling_point (src_range.m_finish,
1446 LOCATION_ASPECT_FINISH);
1447 if (start.file != finish.file)
1448 return "range endpoints are in different files";
1449 if (start.line != finish.line)
1450 return "range endpoints are on different lines";
1451 if (start.column > finish.column)
1452 return "range endpoints are reversed";
1454 char_span line = location_get_source_line (start.file, start.line);
1455 if (!line)
1456 return "unable to read source line";
1458 /* Determine the location of the literal (including quotes
1459 and leading prefix chars, such as the 'u' in a u""
1460 token). */
1461 size_t literal_length = finish.column - start.column + 1;
1463 /* Ensure that we don't crash if we got the wrong location. */
1464 if (line.length () < (start.column - 1 + literal_length))
1465 return "line is not wide enough";
1467 char_span literal = line.subspan (start.column - 1, literal_length);
1469 cpp_string from;
1470 from.len = literal_length;
1471 /* Make a copy of the literal, to avoid having to rely on
1472 the lifetime of the copy of the line within the cache.
1473 This will be released by the auto_cpp_string_vec dtor. */
1474 from.text = (unsigned char *)literal.xstrdup ();
1475 strs.safe_push (from);
1477 /* For very long lines, a new linemap could have started
1478 halfway through the token.
1479 Ensure that the loc_reader uses the linemap of the
1480 *end* of the token for its start location. */
1481 const line_map_ordinary *start_ord_map;
1482 linemap_resolve_location (line_table, src_range.m_start,
1483 LRK_SPELLING_LOCATION, &start_ord_map);
1484 const line_map_ordinary *final_ord_map;
1485 linemap_resolve_location (line_table, src_range.m_finish,
1486 LRK_SPELLING_LOCATION, &final_ord_map);
1487 if (start_ord_map == NULL || final_ord_map == NULL)
1488 return "failed to get ordinary maps";
1489 /* Bulletproofing. We ought to only have different ordinary maps
1490 for start vs finish due to line-length jumps. */
1491 if (start_ord_map != final_ord_map
1492 && start_ord_map->to_file != final_ord_map->to_file)
1493 return "start and finish are spelled in different ordinary maps";
1494 /* The file from linemap_resolve_location ought to match that from
1495 expand_location_to_spelling_point. */
1496 if (start_ord_map->to_file != start.file)
1497 return "mismatching file after resolving linemap";
1499 location_t start_loc
1500 = linemap_position_for_line_and_column (line_table, final_ord_map,
1501 start.line, start.column);
1503 cpp_string_location_reader loc_reader (start_loc, line_table);
1504 loc_readers.safe_push (loc_reader);
1507 /* Rerun cpp_interpret_string, or rather, a modified version of it. */
1508 const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1509 loc_readers.address (),
1510 num_locs, &ranges, type);
1511 if (err)
1512 return err;
1514 /* Success: "ranges" should now contain information on the string. */
1515 return NULL;
1518 /* Attempt to populate *OUT_LOC with source location information on the
1519 given characters within the string literal found at STRLOC.
1520 CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1521 character set.
1523 For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7
1524 and string literal "012345\n789"
1525 *OUT_LOC is written to with:
1526 "012345\n789"
1527 ~^~~~~
1529 If CONCATS is non-NULL, then any string literals that the token at
1530 STRLOC was concatenated with are also considered.
1532 This is implemented by re-parsing the relevant source line(s).
1534 Return NULL if successful, or an error message if any errors occurred.
1535 Error messages are intended for GCC developers (to help debugging) rather
1536 than for end-users. */
1538 const char *
1539 get_location_within_string (cpp_reader *pfile,
1540 string_concat_db *concats,
1541 location_t strloc,
1542 enum cpp_ttype type,
1543 int caret_idx, int start_idx, int end_idx,
1544 location_t *out_loc)
1546 gcc_checking_assert (caret_idx >= 0);
1547 gcc_checking_assert (start_idx >= 0);
1548 gcc_checking_assert (end_idx >= 0);
1549 gcc_assert (out_loc);
1551 cpp_substring_ranges ranges;
1552 const char *err
1553 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1554 if (err)
1555 return err;
1557 if (caret_idx >= ranges.get_num_ranges ())
1558 return "caret_idx out of range";
1559 if (start_idx >= ranges.get_num_ranges ())
1560 return "start_idx out of range";
1561 if (end_idx >= ranges.get_num_ranges ())
1562 return "end_idx out of range";
1564 *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1565 ranges.get_range (start_idx).m_start,
1566 ranges.get_range (end_idx).m_finish);
1567 return NULL;
1570 #if CHECKING_P
1572 namespace selftest {
1574 /* Selftests of location handling. */
1576 /* Attempt to populate *OUT_RANGE with source location information on the
1577 given character within the string literal found at STRLOC.
1578 CHAR_IDX refers to an offset within the execution character set.
1579 If CONCATS is non-NULL, then any string literals that the token at
1580 STRLOC was concatenated with are also considered.
1582 This is implemented by re-parsing the relevant source line(s).
1584 Return NULL if successful, or an error message if any errors occurred.
1585 Error messages are intended for GCC developers (to help debugging) rather
1586 than for end-users. */
1588 static const char *
1589 get_source_range_for_char (cpp_reader *pfile,
1590 string_concat_db *concats,
1591 location_t strloc,
1592 enum cpp_ttype type,
1593 int char_idx,
1594 source_range *out_range)
1596 gcc_checking_assert (char_idx >= 0);
1597 gcc_assert (out_range);
1599 cpp_substring_ranges ranges;
1600 const char *err
1601 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1602 if (err)
1603 return err;
1605 if (char_idx >= ranges.get_num_ranges ())
1606 return "char_idx out of range";
1608 *out_range = ranges.get_range (char_idx);
1609 return NULL;
1612 /* As get_source_range_for_char, but write to *OUT the number
1613 of ranges that are available. */
1615 static const char *
1616 get_num_source_ranges_for_substring (cpp_reader *pfile,
1617 string_concat_db *concats,
1618 location_t strloc,
1619 enum cpp_ttype type,
1620 int *out)
1622 gcc_assert (out);
1624 cpp_substring_ranges ranges;
1625 const char *err
1626 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1628 if (err)
1629 return err;
1631 *out = ranges.get_num_ranges ();
1632 return NULL;
1635 /* Selftests of location handling. */
1637 /* Verify that compare() on linenum_type handles comparisons over the full
1638 range of the type. */
1640 static void
1641 test_linenum_comparisons ()
1643 linenum_type min_line (0);
1644 linenum_type max_line (0xffffffff);
1645 ASSERT_EQ (0, compare (min_line, min_line));
1646 ASSERT_EQ (0, compare (max_line, max_line));
1648 ASSERT_GT (compare (max_line, min_line), 0);
1649 ASSERT_LT (compare (min_line, max_line), 0);
1652 /* Helper function for verifying location data: when location_t
1653 values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1654 as having column 0. */
1656 static bool
1657 should_have_column_data_p (location_t loc)
1659 if (IS_ADHOC_LOC (loc))
1660 loc = get_location_from_adhoc_loc (line_table, loc);
1661 if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1662 return false;
1663 return true;
1666 /* Selftest for should_have_column_data_p. */
1668 static void
1669 test_should_have_column_data_p ()
1671 ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1672 ASSERT_TRUE
1673 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1674 ASSERT_FALSE
1675 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1678 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1679 on LOC. */
1681 static void
1682 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1683 location_t loc)
1685 ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1686 ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1687 /* If location_t values are sufficiently high, then column numbers
1688 will be unavailable and LOCATION_COLUMN (loc) will be 0.
1689 When close to the threshold, column numbers *may* be present: if
1690 the final linemap before the threshold contains a line that straddles
1691 the threshold, locations in that line have column information. */
1692 if (should_have_column_data_p (loc))
1693 ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
     and others use zero
   - the fallback modes within line-map.c: there are various threshold
     values for location_t beyond which line-map.c changes
     behavior (disabling of the range-packing optimization, disabling
     of column-tracking).  We can exercise these by starting the line_table
     at interesting values at or near these thresholds.

   The following class describes a particular case within our test
   matrix.  */

class line_table_case
{
 public:
  /* Capture the two parameters of the case.  */
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to install as line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If nonzero, the value at which the line_table's highest_location
     and highest_line are started.  */
  int m_base_location;
};
1724 /* Constructor. Store the old value of line_table, and create a new
1725 one, using sane defaults. */
1727 line_table_test::line_table_test ()
1729 gcc_assert (saved_line_table == NULL);
1730 saved_line_table = line_table;
1731 line_table = ggc_alloc<line_maps> ();
1732 linemap_init (line_table, BUILTINS_LOCATION);
1733 gcc_assert (saved_line_table->reallocator);
1734 line_table->reallocator = saved_line_table->reallocator;
1735 gcc_assert (saved_line_table->round_alloc_size);
1736 line_table->round_alloc_size = saved_line_table->round_alloc_size;
1737 line_table->default_range_bits = 0;
1740 /* Constructor. Store the old value of line_table, and create a new
1741 one, using the sitation described in CASE_. */
1743 line_table_test::line_table_test (const line_table_case &case_)
1745 gcc_assert (saved_line_table == NULL);
1746 saved_line_table = line_table;
1747 line_table = ggc_alloc<line_maps> ();
1748 linemap_init (line_table, BUILTINS_LOCATION);
1749 gcc_assert (saved_line_table->reallocator);
1750 line_table->reallocator = saved_line_table->reallocator;
1751 gcc_assert (saved_line_table->round_alloc_size);
1752 line_table->round_alloc_size = saved_line_table->round_alloc_size;
1753 line_table->default_range_bits = case_.m_default_range_bits;
1754 if (case_.m_base_location)
1756 line_table->highest_location = case_.m_base_location;
1757 line_table->highest_line = case_.m_base_location;
1761 /* Destructor. Restore the old value of line_table. */
1763 line_table_test::~line_table_test ()
1765 gcc_assert (saved_line_table != NULL);
1766 line_table = saved_line_table;
1767 saved_line_table = NULL;
1770 /* Verify basic operation of ordinary linemaps. */
1772 static void
1773 test_accessing_ordinary_linemaps (const line_table_case &case_)
1775 line_table_test ltt (case_);
1777 /* Build a simple linemap describing some locations. */
1778 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1780 linemap_line_start (line_table, 1, 100);
1781 location_t loc_a = linemap_position_for_column (line_table, 1);
1782 location_t loc_b = linemap_position_for_column (line_table, 23);
1784 linemap_line_start (line_table, 2, 100);
1785 location_t loc_c = linemap_position_for_column (line_table, 1);
1786 location_t loc_d = linemap_position_for_column (line_table, 17);
1788 /* Example of a very long line. */
1789 linemap_line_start (line_table, 3, 2000);
1790 location_t loc_e = linemap_position_for_column (line_table, 700);
1792 /* Transitioning back to a short line. */
1793 linemap_line_start (line_table, 4, 0);
1794 location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1796 if (should_have_column_data_p (loc_back_to_short))
1798 /* Verify that we switched to short lines in the linemap. */
1799 line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1800 ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1803 /* Example of a line that will eventually be seen to be longer
1804 than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
1805 below that. */
1806 linemap_line_start (line_table, 5, 2000);
1808 location_t loc_start_of_very_long_line
1809 = linemap_position_for_column (line_table, 2000);
1810 location_t loc_too_wide
1811 = linemap_position_for_column (line_table, 4097);
1812 location_t loc_too_wide_2
1813 = linemap_position_for_column (line_table, 4098);
1815 /* ...and back to a sane line length. */
1816 linemap_line_start (line_table, 6, 100);
1817 location_t loc_sane_again = linemap_position_for_column (line_table, 10);
1819 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1821 /* Multiple files. */
1822 linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1823 linemap_line_start (line_table, 1, 200);
1824 location_t loc_f = linemap_position_for_column (line_table, 150);
1825 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1827 /* Verify that we can recover the location info. */
1828 assert_loceq ("foo.c", 1, 1, loc_a);
1829 assert_loceq ("foo.c", 1, 23, loc_b);
1830 assert_loceq ("foo.c", 2, 1, loc_c);
1831 assert_loceq ("foo.c", 2, 17, loc_d);
1832 assert_loceq ("foo.c", 3, 700, loc_e);
1833 assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1835 /* In the very wide line, the initial location should be fully tracked. */
1836 assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
1837 /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
1838 be disabled. */
1839 assert_loceq ("foo.c", 5, 0, loc_too_wide);
1840 assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
1841 /*...and column-tracking should be re-enabled for subsequent lines. */
1842 assert_loceq ("foo.c", 6, 10, loc_sane_again);
1844 assert_loceq ("bar.c", 1, 150, loc_f);
1846 ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1847 ASSERT_TRUE (pure_location_p (line_table, loc_a));
1849 /* Verify using make_location to build a range, and extracting data
1850 back from it. */
1851 location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
1852 ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
1853 ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
1854 source_range src_range = get_range_from_loc (line_table, range_c_b_d);
1855 ASSERT_EQ (loc_b, src_range.m_start);
1856 ASSERT_EQ (loc_d, src_range.m_finish);
1859 /* Verify various properties of UNKNOWN_LOCATION. */
1861 static void
1862 test_unknown_location ()
1864 ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1865 ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1866 ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1869 /* Verify various properties of BUILTINS_LOCATION. */
1871 static void
1872 test_builtins ()
1874 assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1875 ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1878 /* Regression test for make_location.
1879 Ensure that we use pure locations for the start/finish of the range,
1880 rather than storing a packed or ad-hoc range as the start/finish. */
1882 static void
1883 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1885 /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1886 with C++ frontend.
1887 ....................0000000001111111111222.
1888 ....................1234567890123456789012. */
1889 const char *content = " r += !aaa == bbb;\n";
1890 temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1891 line_table_test ltt (case_);
1892 linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1894 const location_t c11 = linemap_position_for_column (line_table, 11);
1895 const location_t c12 = linemap_position_for_column (line_table, 12);
1896 const location_t c13 = linemap_position_for_column (line_table, 13);
1897 const location_t c14 = linemap_position_for_column (line_table, 14);
1898 const location_t c21 = linemap_position_for_column (line_table, 21);
1900 if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1901 return;
1903 /* Use column 13 for the caret location, arbitrarily, to verify that we
1904 handle start != caret. */
1905 const location_t aaa = make_location (c13, c12, c14);
1906 ASSERT_EQ (c13, get_pure_location (aaa));
1907 ASSERT_EQ (c12, get_start (aaa));
1908 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1909 ASSERT_EQ (c14, get_finish (aaa));
1910 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1912 /* Make a location using a location with a range as the start-point. */
1913 const location_t not_aaa = make_location (c11, aaa, c14);
1914 ASSERT_EQ (c11, get_pure_location (not_aaa));
1915 /* It should use the start location of the range, not store the range
1916 itself. */
1917 ASSERT_EQ (c12, get_start (not_aaa));
1918 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1919 ASSERT_EQ (c14, get_finish (not_aaa));
1920 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1922 /* Similarly, make a location with a range as the end-point. */
1923 const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1924 ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1925 ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1926 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1927 ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1928 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1929 const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1930 /* It should use the finish location of the range, not store the range
1931 itself. */
1932 ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1933 ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1934 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1935 ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1936 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1939 /* Verify reading of input files (e.g. for caret-based diagnostics). */
1941 static void
1942 test_reading_source_line ()
1944 /* Create a tempfile and write some text to it. */
1945 temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1946 "01234567890123456789\n"
1947 "This is the test text\n"
1948 "This is the 3rd line");
1950 /* Read back a specific line from the tempfile. */
1951 char_span source_line = location_get_source_line (tmp.get_filename (), 3);
1952 ASSERT_TRUE (source_line);
1953 ASSERT_TRUE (source_line.get_buffer () != NULL);
1954 ASSERT_EQ (20, source_line.length ());
1955 ASSERT_TRUE (!strncmp ("This is the 3rd line",
1956 source_line.get_buffer (), source_line.length ()));
1958 source_line = location_get_source_line (tmp.get_filename (), 2);
1959 ASSERT_TRUE (source_line);
1960 ASSERT_TRUE (source_line.get_buffer () != NULL);
1961 ASSERT_EQ (21, source_line.length ());
1962 ASSERT_TRUE (!strncmp ("This is the test text",
1963 source_line.get_buffer (), source_line.length ()));
1965 source_line = location_get_source_line (tmp.get_filename (), 4);
1966 ASSERT_FALSE (source_line);
1967 ASSERT_TRUE (source_line.get_buffer () == NULL);
/* Tests of lexing.  */

/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
1981 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1982 and ranges from EXP_START_COL to EXP_FINISH_COL.
1983 Use LOC as the effective location of the selftest. */
1985 static void
1986 assert_token_loc_eq (const location &loc,
1987 const cpp_token *tok,
1988 const char *exp_filename, int exp_linenum,
1989 int exp_start_col, int exp_finish_col)
1991 location_t tok_loc = tok->src_loc;
1992 ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1993 ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1995 /* If location_t values are sufficiently high, then column numbers
1996 will be unavailable. */
1997 if (!should_have_column_data_p (tok_loc))
1998 return;
2000 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2001 source_range tok_range = get_range_from_loc (line_table, tok_loc);
2002 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2003 ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
/* Use assert_token_loc_eq to verify the TOK->src_loc, using
   SELFTEST_LOCATION as the effective location of the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
                            EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME),        \
                       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2014 /* Test of lexing a file using libcpp, verifying tokens and their
2015 location information. */
2017 static void
2018 test_lexer (const line_table_case &case_)
2020 /* Create a tempfile and write some text to it. */
2021 const char *content =
2022 /*00000000011111111112222222222333333.3333444444444.455555555556
2023 12345678901234567890123456789012345.6789012345678.901234567890. */
2024 ("test_name /* c-style comment */\n"
2025 " \"test literal\"\n"
2026 " // test c++-style comment\n"
2027 " 42\n");
2028 temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2030 line_table_test ltt (case_);
2032 cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2034 const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2035 ASSERT_NE (fname, NULL);
2037 /* Verify that we get the expected tokens back, with the correct
2038 location information. */
2040 location_t loc;
2041 const cpp_token *tok;
2042 tok = cpp_get_token_with_location (parser, &loc);
2043 ASSERT_NE (tok, NULL);
2044 ASSERT_EQ (tok->type, CPP_NAME);
2045 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2046 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2048 tok = cpp_get_token_with_location (parser, &loc);
2049 ASSERT_NE (tok, NULL);
2050 ASSERT_EQ (tok->type, CPP_STRING);
2051 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2052 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2054 tok = cpp_get_token_with_location (parser, &loc);
2055 ASSERT_NE (tok, NULL);
2056 ASSERT_EQ (tok->type, CPP_NUMBER);
2057 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2058 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2060 tok = cpp_get_token_with_location (parser, &loc);
2061 ASSERT_NE (tok, NULL);
2062 ASSERT_EQ (tok->type, CPP_EOF);
2064 cpp_finish (parser, NULL);
2065 cpp_destroy (parser);
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor,
   before the main file is read, so that implementations can adjust
   the cpp_reader's options and callbacks.  */

class lexer_test_options
{
 public:
  virtual void apply (lexer_test &) = 0;
};
2082 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2083 in its dtor.
2085 This is needed by struct lexer_test to ensure that the cleanup of the
2086 cpp_reader happens *after* the cleanup of the temp_source_file. */
2088 class cpp_reader_ptr
2090 public:
2091 cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2093 ~cpp_reader_ptr ()
2095 cpp_finish (m_ptr, NULL);
2096 cpp_destroy (m_ptr);
2099 operator cpp_reader * () const { return m_ptr; }
2101 private:
2102 cpp_reader *m_ptr;
2105 /* A struct for writing lexer tests. */
2107 class lexer_test
2109 public:
2110 lexer_test (const line_table_case &case_, const char *content,
2111 lexer_test_options *options);
2112 ~lexer_test ();
2114 const cpp_token *get_token ();
2116 /* The ordering of these fields matters.
2117 The line_table_test must be first, since the cpp_reader_ptr
2118 uses it.
2119 The cpp_reader must be cleaned up *after* the temp_source_file
2120 since the filenames in input.c's input cache are owned by the
2121 cpp_reader; in particular, when ~temp_source_file evicts the
2122 filename the filenames must still be alive. */
2123 line_table_test m_ltt;
2124 cpp_reader_ptr m_parser;
2125 temp_source_file m_tempfile;
2126 string_concat_db m_concats;
2127 bool m_implicitly_expect_EOF;
2130 /* Use an EBCDIC encoding for the execution charset, specifically
2131 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2133 This exercises iconv integration within libcpp.
2134 Not every build of iconv supports the given charset,
2135 so we need to flag this error and handle it gracefully. */
2137 class ebcdic_execution_charset : public lexer_test_options
2139 public:
2140 ebcdic_execution_charset () : m_num_iconv_errors (0)
2142 gcc_assert (s_singleton == NULL);
2143 s_singleton = this;
2145 ~ebcdic_execution_charset ()
2147 gcc_assert (s_singleton == this);
2148 s_singleton = NULL;
2151 void apply (lexer_test &test) FINAL OVERRIDE
2153 cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2154 cpp_opts->narrow_charset = "IBM1047";
2156 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2157 callbacks->diagnostic = on_diagnostic;
2160 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2161 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2162 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2163 rich_location *richloc ATTRIBUTE_UNUSED,
2164 const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2165 ATTRIBUTE_FPTR_PRINTF(5,0)
2167 gcc_assert (s_singleton);
2168 /* Avoid exgettext from picking this up, it is translated in libcpp. */
2169 const char *msg = "conversion from %s to %s not supported by iconv";
2170 #ifdef ENABLE_NLS
2171 msg = dgettext ("cpplib", msg);
2172 #endif
2173 /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2174 when the local iconv build doesn't support the conversion. */
2175 if (strcmp (msgid, msg) == 0)
2177 s_singleton->m_num_iconv_errors++;
2178 return true;
2181 /* Otherwise, we have an unexpected error. */
2182 abort ();
2185 bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2187 private:
2188 static ebcdic_execution_charset *s_singleton;
2189 int m_num_iconv_errors;
2192 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2194 /* A lexer_test_options subclass that records a list of diagnostic
2195 messages emitted by the lexer. */
2197 class lexer_diagnostic_sink : public lexer_test_options
2199 public:
2200 lexer_diagnostic_sink ()
2202 gcc_assert (s_singleton == NULL);
2203 s_singleton = this;
2205 ~lexer_diagnostic_sink ()
2207 gcc_assert (s_singleton == this);
2208 s_singleton = NULL;
2210 int i;
2211 char *str;
2212 FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2213 free (str);
2216 void apply (lexer_test &test) FINAL OVERRIDE
2218 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2219 callbacks->diagnostic = on_diagnostic;
2222 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2223 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2224 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2225 rich_location *richloc ATTRIBUTE_UNUSED,
2226 const char *msgid, va_list *ap)
2227 ATTRIBUTE_FPTR_PRINTF(5,0)
2229 char *msg = xvasprintf (msgid, *ap);
2230 s_singleton->m_diagnostics.safe_push (msg);
2231 return true;
2234 auto_vec<char *> m_diagnostics;
2236 private:
2237 static lexer_diagnostic_sink *s_singleton;
2240 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2242 /* Constructor. Override line_table with a new instance based on CASE_,
2243 and write CONTENT to a tempfile. Create a cpp_reader, and use it to
2244 start parsing the tempfile. */
2246 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2247 lexer_test_options *options)
2248 : m_ltt (case_),
2249 m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2250 /* Create a tempfile and write the text to it. */
2251 m_tempfile (SELFTEST_LOCATION, ".c", content),
2252 m_concats (),
2253 m_implicitly_expect_EOF (true)
2255 if (options)
2256 options->apply (*this);
2258 cpp_init_iconv (m_parser);
2260 /* Parse the file. */
2261 const char *fname = cpp_read_main_file (m_parser,
2262 m_tempfile.get_filename ());
2263 ASSERT_NE (fname, NULL);
2266 /* Destructor. By default, verify that the next token in m_parser is EOF. */
2268 lexer_test::~lexer_test ()
2270 location_t loc;
2271 const cpp_token *tok;
2273 if (m_implicitly_expect_EOF)
2275 tok = cpp_get_token_with_location (m_parser, &loc);
2276 ASSERT_NE (tok, NULL);
2277 ASSERT_EQ (tok->type, CPP_EOF);
2281 /* Get the next token from m_parser. */
2283 const cpp_token *
2284 lexer_test::get_token ()
2286 location_t loc;
2287 const cpp_token *tok;
2289 tok = cpp_get_token_with_location (m_parser, &loc);
2290 ASSERT_NE (tok, NULL);
2291 return tok;
2294 /* Verify that locations within string literals are correctly handled. */
2296 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2297 using the string concatenation database for TEST.
2299 Assert that the character at index IDX is on EXPECTED_LINE,
2300 and that it begins at column EXPECTED_START_COL and ends at
2301 EXPECTED_FINISH_COL (unless the locations are beyond
2302 LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2303 columns). */
2305 static void
2306 assert_char_at_range (const location &loc,
2307 lexer_test& test,
2308 location_t strloc, enum cpp_ttype type, int idx,
2309 int expected_line, int expected_start_col,
2310 int expected_finish_col)
2312 cpp_reader *pfile = test.m_parser;
2313 string_concat_db *concats = &test.m_concats;
2315 source_range actual_range = source_range();
2316 const char *err
2317 = get_source_range_for_char (pfile, concats, strloc, type, idx,
2318 &actual_range);
2319 if (should_have_column_data_p (strloc))
2320 ASSERT_EQ_AT (loc, NULL, err);
2321 else
2323 ASSERT_STREQ_AT (loc,
2324 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2325 err);
2326 return;
2329 int actual_start_line = LOCATION_LINE (actual_range.m_start);
2330 ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2331 int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2332 ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2334 if (should_have_column_data_p (actual_range.m_start))
2336 int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2337 ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2339 if (should_have_column_data_p (actual_range.m_finish))
2341 int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2342 ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
   the effective location of any errors.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
                             EXPECTED_START_COL, EXPECTED_FINISH_COL)     \
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
                        (IDX), (EXPECTED_LINE), (EXPECTED_START_COL),     \
                        (EXPECTED_FINISH_COL))
2355 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2356 using the string concatenation database for TEST.
2358 Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES. */
2360 static void
2361 assert_num_substring_ranges (const location &loc,
2362 lexer_test& test,
2363 location_t strloc,
2364 enum cpp_ttype type,
2365 int expected_num_ranges)
2367 cpp_reader *pfile = test.m_parser;
2368 string_concat_db *concats = &test.m_concats;
2370 int actual_num_ranges = -1;
2371 const char *err
2372 = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2373 &actual_num_ranges);
2374 if (should_have_column_data_p (strloc))
2375 ASSERT_EQ_AT (loc, NULL, err);
2376 else
2378 ASSERT_STREQ_AT (loc,
2379 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2380 err);
2381 return;
2383 ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
/* Macro for calling assert_num_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,           \
                                    EXPECTED_NUM_RANGES)                \
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
                               (TYPE), (EXPECTED_NUM_RANGES))
2395 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2396 returns an error (using the string concatenation database for TEST). */
2398 static void
2399 assert_has_no_substring_ranges (const location &loc,
2400 lexer_test& test,
2401 location_t strloc,
2402 enum cpp_ttype type,
2403 const char *expected_err)
2405 cpp_reader *pfile = test.m_parser;
2406 string_concat_db *concats = &test.m_concats;
2407 cpp_substring_ranges ranges;
2408 const char *actual_err
2409 = get_substring_ranges_for_loc (pfile, concats, strloc,
2410 type, ranges);
2411 if (should_have_column_data_p (strloc))
2412 ASSERT_STREQ_AT (loc, expected_err, actual_err);
2413 else
2414 ASSERT_STREQ_AT (loc,
2415 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2416 actual_err);
/* Macro for calling assert_has_no_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)   \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST),    \
                                    (STRLOC), (TYPE), (ERR))
2423 /* Lex a simple string literal. Verify the substring location data, before
2424 and after running cpp_interpret_string on it. */
2426 static void
2427 test_lexer_string_locations_simple (const line_table_case &case_)
2429 /* Digits 0-9 (with 0 at column 10), the simple way.
2430 ....................000000000.11111111112.2222222223333333333
2431 ....................123456789.01234567890.1234567890123456789
2432 We add a trailing comment to ensure that we correctly locate
2433 the end of the string literal token. */
2434 const char *content = " \"0123456789\" /* not a string */\n";
2435 lexer_test test (case_, content, NULL);
2437 /* Verify that we get the expected token back, with the correct
2438 location information. */
2439 const cpp_token *tok = test.get_token ();
2440 ASSERT_EQ (tok->type, CPP_STRING);
2441 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2442 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2444 /* At this point in lexing, the quote characters are treated as part of
2445 the string (they are stripped off by cpp_interpret_string). */
2447 ASSERT_EQ (tok->val.str.len, 12);
2449 /* Verify that cpp_interpret_string works. */
2450 cpp_string dst_string;
2451 const enum cpp_ttype type = CPP_STRING;
2452 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2453 &dst_string, type);
2454 ASSERT_TRUE (result);
2455 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2456 free (const_cast <unsigned char *> (dst_string.text));
2458 /* Verify ranges of individual characters. This no longer includes the
2459 opening quote, but does include the closing quote. */
2460 for (int i = 0; i <= 10; i++)
2461 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2462 10 + i, 10 + i);
2464 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2467 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2468 encoding. */
2470 static void
2471 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2473 /* EBCDIC support requires iconv. */
2474 if (!HAVE_ICONV)
2475 return;
2477 /* Digits 0-9 (with 0 at column 10), the simple way.
2478 ....................000000000.11111111112.2222222223333333333
2479 ....................123456789.01234567890.1234567890123456789
2480 We add a trailing comment to ensure that we correctly locate
2481 the end of the string literal token. */
2482 const char *content = " \"0123456789\" /* not a string */\n";
2483 ebcdic_execution_charset use_ebcdic;
2484 lexer_test test (case_, content, &use_ebcdic);
2486 /* Verify that we get the expected token back, with the correct
2487 location information. */
2488 const cpp_token *tok = test.get_token ();
2489 ASSERT_EQ (tok->type, CPP_STRING);
2490 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2491 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2493 /* At this point in lexing, the quote characters are treated as part of
2494 the string (they are stripped off by cpp_interpret_string). */
2496 ASSERT_EQ (tok->val.str.len, 12);
2498 /* The remainder of the test requires an iconv implementation that
2499 can convert from UTF-8 to the EBCDIC encoding requested above. */
2500 if (use_ebcdic.iconv_errors_occurred_p ())
2501 return;
2503 /* Verify that cpp_interpret_string works. */
2504 cpp_string dst_string;
2505 const enum cpp_ttype type = CPP_STRING;
2506 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2507 &dst_string, type);
2508 ASSERT_TRUE (result);
2509 /* We should now have EBCDIC-encoded text, specifically
2510 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2511 The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
2512 ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2513 (const char *)dst_string.text);
2514 free (const_cast <unsigned char *> (dst_string.text));
2516 /* Verify that we don't attempt to record substring location information
2517 for such cases. */
2518 ASSERT_HAS_NO_SUBSTRING_RANGES
2519 (test, tok->src_loc, type,
2520 "execution character set != source character set");
2523 /* Lex a string literal containing a hex-escaped character.
2524 Verify the substring location data, before and after running
2525 cpp_interpret_string on it. */
2527 static void
2528 test_lexer_string_locations_hex (const line_table_case &case_)
2530 /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2531 and with a space in place of digit 6, to terminate the escaped
2532 hex code.
2533 ....................000000000.111111.11112222.
2534 ....................123456789.012345.67890123. */
2535 const char *content = " \"01234\\x35 789\"\n";
2536 lexer_test test (case_, content, NULL);
2538 /* Verify that we get the expected token back, with the correct
2539 location information. */
2540 const cpp_token *tok = test.get_token ();
2541 ASSERT_EQ (tok->type, CPP_STRING);
2542 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2543 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2545 /* At this point in lexing, the quote characters are treated as part of
2546 the string (they are stripped off by cpp_interpret_string). */
2547 ASSERT_EQ (tok->val.str.len, 15);
2549 /* Verify that cpp_interpret_string works. */
2550 cpp_string dst_string;
2551 const enum cpp_ttype type = CPP_STRING;
2552 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2553 &dst_string, type);
2554 ASSERT_TRUE (result);
2555 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2556 free (const_cast <unsigned char *> (dst_string.text));
2558 /* Verify ranges of individual characters. This no longer includes the
2559 opening quote, but does include the closing quote. */
2560 for (int i = 0; i <= 4; i++)
2561 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2562 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2563 for (int i = 6; i <= 10; i++)
2564 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2566 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2569 /* Lex a string literal containing an octal-escaped character.
2570 Verify the substring location data after running cpp_interpret_string
2571 on it. */
2573 static void
2574 test_lexer_string_locations_oct (const line_table_case &case_)
2576 /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2577 and with a space in place of digit 6, to terminate the escaped
2578 octal code.
2579 ....................000000000.111111.11112222.2222223333333333444
2580 ....................123456789.012345.67890123.4567890123456789012 */
2581 const char *content = " \"01234\\065 789\" /* not a string */\n";
2582 lexer_test test (case_, content, NULL);
2584 /* Verify that we get the expected token back, with the correct
2585 location information. */
2586 const cpp_token *tok = test.get_token ();
2587 ASSERT_EQ (tok->type, CPP_STRING);
2588 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2590 /* Verify that cpp_interpret_string works. */
2591 cpp_string dst_string;
2592 const enum cpp_ttype type = CPP_STRING;
2593 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2594 &dst_string, type);
2595 ASSERT_TRUE (result);
2596 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2597 free (const_cast <unsigned char *> (dst_string.text));
2599 /* Verify ranges of individual characters. This no longer includes the
2600 opening quote, but does include the closing quote. */
2601 for (int i = 0; i < 5; i++)
2602 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2603 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2604 for (int i = 6; i <= 10; i++)
2605 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2607 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2610 /* Test of string literal containing letter escapes. */
2612 static void
2613 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2615 /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2616 .....................000000000.1.11111.1.1.11222.22222223333333
2617 .....................123456789.0.12345.6.7.89012.34567890123456. */
2618 const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2619 lexer_test test (case_, content, NULL);
2621 /* Verify that we get the expected tokens back. */
2622 const cpp_token *tok = test.get_token ();
2623 ASSERT_EQ (tok->type, CPP_STRING);
2624 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2626 /* Verify ranges of individual characters. */
2627 /* "\t". */
2628 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2629 0, 1, 10, 11);
2630 /* "foo". */
2631 for (int i = 1; i <= 3; i++)
2632 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2633 i, 1, 11 + i, 11 + i);
2634 /* "\\" and "\n". */
2635 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2636 4, 1, 15, 16);
2637 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2638 5, 1, 17, 18);
2640 /* "bar" and closing quote for nul-terminator. */
2641 for (int i = 6; i <= 9; i++)
2642 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2643 i, 1, 13 + i, 13 + i);
2645 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2648 /* Another test of a string literal containing a letter escape.
2649 Based on string seen in
2650 printf ("%-%\n");
2651 in gcc.dg/format/c90-printf-1.c. */
2653 static void
2654 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2656 /* .....................000000000.1111.11.1111.22222222223.
2657 .....................123456789.0123.45.6789.01234567890. */
2658 const char *content = (" \"%-%\\n\" /* non-str */\n");
2659 lexer_test test (case_, content, NULL);
2661 /* Verify that we get the expected tokens back. */
2662 const cpp_token *tok = test.get_token ();
2663 ASSERT_EQ (tok->type, CPP_STRING);
2664 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2666 /* Verify ranges of individual characters. */
2667 /* "%-%". */
2668 for (int i = 0; i < 3; i++)
2669 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2670 i, 1, 10 + i, 10 + i);
2671 /* "\n". */
2672 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2673 3, 1, 13, 14);
2675 /* Closing quote for nul-terminator. */
2676 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2677 4, 1, 15, 15);
2679 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2682 /* Lex a string literal containing UCN 4 characters.
2683 Verify the substring location data after running cpp_interpret_string
2684 on it. */
2686 static void
2687 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2689 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2690 as UCN 4.
2691 ....................000000000.111111.111122.222222223.33333333344444
2692 ....................123456789.012345.678901.234567890.12345678901234 */
2693 const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
2694 lexer_test test (case_, content, NULL);
2696 /* Verify that we get the expected token back, with the correct
2697 location information. */
2698 const cpp_token *tok = test.get_token ();
2699 ASSERT_EQ (tok->type, CPP_STRING);
2700 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2702 /* Verify that cpp_interpret_string works.
2703 The string should be encoded in the execution character
2704 set. Assuming that is UTF-8, we should have the following:
2705 ----------- ---- ----- ------- ----------------
2706 Byte offset Byte Octal Unicode Source Column(s)
2707 ----------- ---- ----- ------- ----------------
2708 0 0x30 '0' 10
2709 1 0x31 '1' 11
2710 2 0x32 '2' 12
2711 3 0x33 '3' 13
2712 4 0x34 '4' 14
2713 5 0xE2 \342 U+2174 15-20
2714 6 0x85 \205 (cont) 15-20
2715 7 0xB4 \264 (cont) 15-20
2716 8 0xE2 \342 U+2175 21-26
2717 9 0x85 \205 (cont) 21-26
2718 10 0xB5 \265 (cont) 21-26
2719 11 0x37 '7' 27
2720 12 0x38 '8' 28
2721 13 0x39 '9' 29
2722 14 0x00 30 (closing quote)
2723 ----------- ---- ----- ------- ---------------. */
2725 cpp_string dst_string;
2726 const enum cpp_ttype type = CPP_STRING;
2727 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2728 &dst_string, type);
2729 ASSERT_TRUE (result);
2730 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2731 (const char *)dst_string.text);
2732 free (const_cast <unsigned char *> (dst_string.text));
2734 /* Verify ranges of individual characters. This no longer includes the
2735 opening quote, but does include the closing quote.
2736 '01234'. */
2737 for (int i = 0; i <= 4; i++)
2738 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2739 /* U+2174. */
2740 for (int i = 5; i <= 7; i++)
2741 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2742 /* U+2175. */
2743 for (int i = 8; i <= 10; i++)
2744 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2745 /* '789' and nul terminator */
2746 for (int i = 11; i <= 14; i++)
2747 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2749 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2752 /* Lex a string literal containing UCN 8 characters.
2753 Verify the substring location data after running cpp_interpret_string
2754 on it. */
2756 static void
2757 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2759 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2760 ....................000000000.111111.1111222222.2222333333333.344444
2761 ....................123456789.012345.6789012345.6789012345678.901234 */
2762 const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
2763 lexer_test test (case_, content, NULL);
2765 /* Verify that we get the expected token back, with the correct
2766 location information. */
2767 const cpp_token *tok = test.get_token ();
2768 ASSERT_EQ (tok->type, CPP_STRING);
2769 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2770 "\"01234\\U00002174\\U00002175789\"");
2772 /* Verify that cpp_interpret_string works.
2773 The UTF-8 encoding of the string is identical to that from
2774 the ucn4 testcase above; the only difference is the column
2775 locations. */
2776 cpp_string dst_string;
2777 const enum cpp_ttype type = CPP_STRING;
2778 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2779 &dst_string, type);
2780 ASSERT_TRUE (result);
2781 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2782 (const char *)dst_string.text);
2783 free (const_cast <unsigned char *> (dst_string.text));
2785 /* Verify ranges of individual characters. This no longer includes the
2786 opening quote, but does include the closing quote.
2787 '01234'. */
2788 for (int i = 0; i <= 4; i++)
2789 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2790 /* U+2174. */
2791 for (int i = 5; i <= 7; i++)
2792 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2793 /* U+2175. */
2794 for (int i = 8; i <= 10; i++)
2795 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2796 /* '789' at columns 35-37 */
2797 for (int i = 11; i <= 13; i++)
2798 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2799 /* Closing quote/nul-terminator at column 38. */
2800 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2802 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
/* Fetch a big-endian 32-bit value and convert to host endianness.

   PTR_BE_VALUE points to four bytes holding the value most-significant
   byte first.  The bytes are read individually through an unsigned char
   pointer, so the result does not depend on the host's byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *buf = (const unsigned char *)ptr_be_value;
  return (((uint32_t) buf[0] << 24)
          | ((uint32_t) buf[1] << 16)
          | ((uint32_t) buf[2] << 8)
          | (uint32_t) buf[3]);
}
2817 /* Lex a wide string literal and verify that attempts to read substring
2818 location data from it fail gracefully. */
2820 static void
2821 test_lexer_string_locations_wide_string (const line_table_case &case_)
2823 /* Digits 0-9.
2824 ....................000000000.11111111112.22222222233333
2825 ....................123456789.01234567890.12345678901234 */
2826 const char *content = " L\"0123456789\" /* non-str */\n";
2827 lexer_test test (case_, content, NULL);
2829 /* Verify that we get the expected token back, with the correct
2830 location information. */
2831 const cpp_token *tok = test.get_token ();
2832 ASSERT_EQ (tok->type, CPP_WSTRING);
2833 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2835 /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
2836 cpp_string dst_string;
2837 const enum cpp_ttype type = CPP_WSTRING;
2838 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2839 &dst_string, type);
2840 ASSERT_TRUE (result);
2841 /* The cpp_reader defaults to big-endian with
2842 CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2843 now be encoded as UTF-32BE. */
2844 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2845 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2846 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2847 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2848 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2849 free (const_cast <unsigned char *> (dst_string.text));
2851 /* We don't yet support generating substring location information
2852 for L"" strings. */
2853 ASSERT_HAS_NO_SUBSTRING_RANGES
2854 (test, tok->src_loc, type,
2855 "execution character set != source character set");
/* Fetch a big-endian 16-bit value and convert to host endianness.

   PTR_BE_VALUE points to two bytes holding the value most-significant
   byte first.  The bytes are read individually through an unsigned char
   pointer, so the result does not depend on the host's byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *buf = (const unsigned char *)ptr_be_value;
  return ((uint16_t) buf[0] << 8) | (uint16_t) buf[1];
}
2867 /* Lex a u"" string literal and verify that attempts to read substring
2868 location data from it fail gracefully. */
2870 static void
2871 test_lexer_string_locations_string16 (const line_table_case &case_)
2873 /* Digits 0-9.
2874 ....................000000000.11111111112.22222222233333
2875 ....................123456789.01234567890.12345678901234 */
2876 const char *content = " u\"0123456789\" /* non-str */\n";
2877 lexer_test test (case_, content, NULL);
2879 /* Verify that we get the expected token back, with the correct
2880 location information. */
2881 const cpp_token *tok = test.get_token ();
2882 ASSERT_EQ (tok->type, CPP_STRING16);
2883 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2885 /* Verify that cpp_interpret_string works, using CPP_STRING16. */
2886 cpp_string dst_string;
2887 const enum cpp_ttype type = CPP_STRING16;
2888 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2889 &dst_string, type);
2890 ASSERT_TRUE (result);
2892 /* The cpp_reader defaults to big-endian, so dst_string should
2893 now be encoded as UTF-16BE. */
2894 const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2895 ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2896 ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2897 ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2898 ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2899 free (const_cast <unsigned char *> (dst_string.text));
2901 /* We don't yet support generating substring location information
2902 for L"" strings. */
2903 ASSERT_HAS_NO_SUBSTRING_RANGES
2904 (test, tok->src_loc, type,
2905 "execution character set != source character set");
2908 /* Lex a U"" string literal and verify that attempts to read substring
2909 location data from it fail gracefully. */
2911 static void
2912 test_lexer_string_locations_string32 (const line_table_case &case_)
2914 /* Digits 0-9.
2915 ....................000000000.11111111112.22222222233333
2916 ....................123456789.01234567890.12345678901234 */
2917 const char *content = " U\"0123456789\" /* non-str */\n";
2918 lexer_test test (case_, content, NULL);
2920 /* Verify that we get the expected token back, with the correct
2921 location information. */
2922 const cpp_token *tok = test.get_token ();
2923 ASSERT_EQ (tok->type, CPP_STRING32);
2924 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2926 /* Verify that cpp_interpret_string works, using CPP_STRING32. */
2927 cpp_string dst_string;
2928 const enum cpp_ttype type = CPP_STRING32;
2929 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2930 &dst_string, type);
2931 ASSERT_TRUE (result);
2933 /* The cpp_reader defaults to big-endian, so dst_string should
2934 now be encoded as UTF-32BE. */
2935 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2936 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2937 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2938 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2939 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2940 free (const_cast <unsigned char *> (dst_string.text));
2942 /* We don't yet support generating substring location information
2943 for L"" strings. */
2944 ASSERT_HAS_NO_SUBSTRING_RANGES
2945 (test, tok->src_loc, type,
2946 "execution character set != source character set");
2949 /* Lex a u8-string literal.
2950 Verify the substring location data after running cpp_interpret_string
2951 on it. */
2953 static void
2954 test_lexer_string_locations_u8 (const line_table_case &case_)
2956 /* Digits 0-9.
2957 ....................000000000.11111111112.22222222233333
2958 ....................123456789.01234567890.12345678901234 */
2959 const char *content = " u8\"0123456789\" /* non-str */\n";
2960 lexer_test test (case_, content, NULL);
2962 /* Verify that we get the expected token back, with the correct
2963 location information. */
2964 const cpp_token *tok = test.get_token ();
2965 ASSERT_EQ (tok->type, CPP_UTF8STRING);
2966 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2968 /* Verify that cpp_interpret_string works. */
2969 cpp_string dst_string;
2970 const enum cpp_ttype type = CPP_STRING;
2971 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2972 &dst_string, type);
2973 ASSERT_TRUE (result);
2974 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2975 free (const_cast <unsigned char *> (dst_string.text));
2977 /* Verify ranges of individual characters. This no longer includes the
2978 opening quote, but does include the closing quote. */
2979 for (int i = 0; i <= 10; i++)
2980 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2983 /* Lex a string literal containing UTF-8 source characters.
2984 Verify the substring location data after running cpp_interpret_string
2985 on it. */
2987 static void
2988 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2990 /* This string literal is written out to the source file as UTF-8,
2991 and is of the form "before mojibake after", where "mojibake"
2992 is written as the following four unicode code points:
2993 U+6587 CJK UNIFIED IDEOGRAPH-6587
2994 U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2995 U+5316 CJK UNIFIED IDEOGRAPH-5316
2996 U+3051 HIRAGANA LETTER KE.
2997 Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2998 "before" and "after" are 1 byte per unicode character.
3000 The numbering shown are "columns", which are *byte* numbers within
3001 the line, rather than unicode character numbers.
3003 .................... 000000000.1111111.
3004 .................... 123456789.0123456. */
3005 const char *content = (" \"before "
3006 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3007 UTF-8: 0xE6 0x96 0x87
3008 C octal escaped UTF-8: \346\226\207
3009 "column" numbers: 17-19. */
3010 "\346\226\207"
3012 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3013 UTF-8: 0xE5 0xAD 0x97
3014 C octal escaped UTF-8: \345\255\227
3015 "column" numbers: 20-22. */
3016 "\345\255\227"
3018 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3019 UTF-8: 0xE5 0x8C 0x96
3020 C octal escaped UTF-8: \345\214\226
3021 "column" numbers: 23-25. */
3022 "\345\214\226"
3024 /* U+3051 HIRAGANA LETTER KE
3025 UTF-8: 0xE3 0x81 0x91
3026 C octal escaped UTF-8: \343\201\221
3027 "column" numbers: 26-28. */
3028 "\343\201\221"
3030 /* column numbers 29 onwards
3031 2333333.33334444444444
3032 9012345.67890123456789. */
3033 " after\" /* non-str */\n");
3034 lexer_test test (case_, content, NULL);
3036 /* Verify that we get the expected token back, with the correct
3037 location information. */
3038 const cpp_token *tok = test.get_token ();
3039 ASSERT_EQ (tok->type, CPP_STRING);
3040 ASSERT_TOKEN_AS_TEXT_EQ
3041 (test.m_parser, tok,
3042 "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3044 /* Verify that cpp_interpret_string works. */
3045 cpp_string dst_string;
3046 const enum cpp_ttype type = CPP_STRING;
3047 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3048 &dst_string, type);
3049 ASSERT_TRUE (result);
3050 ASSERT_STREQ
3051 ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3052 (const char *)dst_string.text);
3053 free (const_cast <unsigned char *> (dst_string.text));
3055 /* Verify ranges of individual characters. This no longer includes the
3056 opening quote, but does include the closing quote.
3057 Assuming that both source and execution encodings are UTF-8, we have
3058 a run of 25 octets in each, plus the NUL terminator. */
3059 for (int i = 0; i < 25; i++)
3060 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3061 /* NUL-terminator should use the closing quote at column 35. */
3062 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3064 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3067 /* Test of string literal concatenation. */
3069 static void
3070 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3072 /* Digits 0-9.
3073 .....................000000000.111111.11112222222222
3074 .....................123456789.012345.67890123456789. */
3075 const char *content = (" \"01234\" /* non-str */\n"
3076 " \"56789\" /* non-str */\n");
3077 lexer_test test (case_, content, NULL);
3079 location_t input_locs[2];
3081 /* Verify that we get the expected tokens back. */
3082 auto_vec <cpp_string> input_strings;
3083 const cpp_token *tok_a = test.get_token ();
3084 ASSERT_EQ (tok_a->type, CPP_STRING);
3085 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3086 input_strings.safe_push (tok_a->val.str);
3087 input_locs[0] = tok_a->src_loc;
3089 const cpp_token *tok_b = test.get_token ();
3090 ASSERT_EQ (tok_b->type, CPP_STRING);
3091 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3092 input_strings.safe_push (tok_b->val.str);
3093 input_locs[1] = tok_b->src_loc;
3095 /* Verify that cpp_interpret_string works. */
3096 cpp_string dst_string;
3097 const enum cpp_ttype type = CPP_STRING;
3098 bool result = cpp_interpret_string (test.m_parser,
3099 input_strings.address (), 2,
3100 &dst_string, type);
3101 ASSERT_TRUE (result);
3102 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3103 free (const_cast <unsigned char *> (dst_string.text));
3105 /* Simulate c-lex.c's lex_string in order to record concatenation. */
3106 test.m_concats.record_string_concatenation (2, input_locs);
3108 location_t initial_loc = input_locs[0];
3110 /* "01234" on line 1. */
3111 for (int i = 0; i <= 4; i++)
3112 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3113 /* "56789" in line 2, plus its closing quote for the nul terminator. */
3114 for (int i = 5; i <= 10; i++)
3115 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3117 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3120 /* Another test of string literal concatenation. */
3122 static void
3123 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3125 /* Digits 0-9.
3126 .....................000000000.111.11111112222222
3127 .....................123456789.012.34567890123456. */
3128 const char *content = (" \"01\" /* non-str */\n"
3129 " \"23\" /* non-str */\n"
3130 " \"45\" /* non-str */\n"
3131 " \"67\" /* non-str */\n"
3132 " \"89\" /* non-str */\n");
3133 lexer_test test (case_, content, NULL);
3135 auto_vec <cpp_string> input_strings;
3136 location_t input_locs[5];
3138 /* Verify that we get the expected tokens back. */
3139 for (int i = 0; i < 5; i++)
3141 const cpp_token *tok = test.get_token ();
3142 ASSERT_EQ (tok->type, CPP_STRING);
3143 input_strings.safe_push (tok->val.str);
3144 input_locs[i] = tok->src_loc;
3147 /* Verify that cpp_interpret_string works. */
3148 cpp_string dst_string;
3149 const enum cpp_ttype type = CPP_STRING;
3150 bool result = cpp_interpret_string (test.m_parser,
3151 input_strings.address (), 5,
3152 &dst_string, type);
3153 ASSERT_TRUE (result);
3154 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3155 free (const_cast <unsigned char *> (dst_string.text));
3157 /* Simulate c-lex.c's lex_string in order to record concatenation. */
3158 test.m_concats.record_string_concatenation (5, input_locs);
3160 location_t initial_loc = input_locs[0];
3162 /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3163 detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3164 and expect get_source_range_for_substring to fail.
3165 However, for a string concatenation test, we can have a case
3166 where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3167 but subsequent strings can be after it.
3168 Attempting to detect this within assert_char_at_range
3169 would overcomplicate the logic for the common test cases, so
3170 we detect it here. */
3171 if (should_have_column_data_p (input_locs[0])
3172 && !should_have_column_data_p (input_locs[4]))
3174 /* Verify that get_source_range_for_substring gracefully rejects
3175 this case. */
3176 source_range actual_range;
3177 const char *err
3178 = get_source_range_for_char (test.m_parser, &test.m_concats,
3179 initial_loc, type, 0, &actual_range);
3180 ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3181 return;
3184 for (int i = 0; i < 5; i++)
3185 for (int j = 0; j < 2; j++)
3186 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3187 i + 1, 10 + j, 10 + j);
3189 /* NUL-terminator should use the final closing quote at line 5 column 12. */
3190 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3192 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3195 /* Another test of string literal concatenation, this time combined with
3196 various kinds of escaped characters. */
3198 static void
3199 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3201 /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3202 digit 6 in ASCII as octal "\066", concatenating multiple strings. */
3203 const char *content
3204 /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3205 .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3206 = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
3207 lexer_test test (case_, content, NULL);
3209 auto_vec <cpp_string> input_strings;
3210 location_t input_locs[4];
3212 /* Verify that we get the expected tokens back. */
3213 for (int i = 0; i < 4; i++)
3215 const cpp_token *tok = test.get_token ();
3216 ASSERT_EQ (tok->type, CPP_STRING);
3217 input_strings.safe_push (tok->val.str);
3218 input_locs[i] = tok->src_loc;
3221 /* Verify that cpp_interpret_string works. */
3222 cpp_string dst_string;
3223 const enum cpp_ttype type = CPP_STRING;
3224 bool result = cpp_interpret_string (test.m_parser,
3225 input_strings.address (), 4,
3226 &dst_string, type);
3227 ASSERT_TRUE (result);
3228 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3229 free (const_cast <unsigned char *> (dst_string.text));
3231 /* Simulate c-lex.c's lex_string in order to record concatenation. */
3232 test.m_concats.record_string_concatenation (4, input_locs);
3234 location_t initial_loc = input_locs[0];
3236 for (int i = 0; i <= 4; i++)
3237 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3238 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3239 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3240 for (int i = 7; i <= 9; i++)
3241 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3243 /* NUL-terminator should use the location of the final closing quote. */
3244 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3246 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3249 /* Test of string literal in a macro. */
3251 static void
3252 test_lexer_string_locations_macro (const line_table_case &case_)
3254 /* Digits 0-9.
3255 .....................0000000001111111111.22222222223.
3256 .....................1234567890123456789.01234567890. */
3257 const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
3258 " MACRO");
3259 lexer_test test (case_, content, NULL);
3261 /* Verify that we get the expected tokens back. */
3262 const cpp_token *tok = test.get_token ();
3263 ASSERT_EQ (tok->type, CPP_PADDING);
3265 tok = test.get_token ();
3266 ASSERT_EQ (tok->type, CPP_STRING);
3267 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3269 /* Verify ranges of individual characters. We ought to
3270 see columns within the macro definition. */
3271 for (int i = 0; i <= 10; i++)
3272 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3273 i, 1, 20 + i, 20 + i);
3275 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3277 tok = test.get_token ();
3278 ASSERT_EQ (tok->type, CPP_PADDING);
3281 /* Test of stringification of a macro argument. */
3283 static void
3284 test_lexer_string_locations_stringified_macro_argument
3285 (const line_table_case &case_)
3287 /* .....................000000000111111111122222222223.
3288 .....................123456789012345678901234567890. */
3289 const char *content = ("#define MACRO(X) #X /* non-str */\n"
3290 "MACRO(foo)\n");
3291 lexer_test test (case_, content, NULL);
3293 /* Verify that we get the expected token back. */
3294 const cpp_token *tok = test.get_token ();
3295 ASSERT_EQ (tok->type, CPP_PADDING);
3297 tok = test.get_token ();
3298 ASSERT_EQ (tok->type, CPP_STRING);
3299 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3301 /* We don't support getting the location of a stringified macro
3302 argument. Verify that it fails gracefully. */
3303 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3304 "cpp_interpret_string_1 failed");
3306 tok = test.get_token ();
3307 ASSERT_EQ (tok->type, CPP_PADDING);
3309 tok = test.get_token ();
3310 ASSERT_EQ (tok->type, CPP_PADDING);
3313 /* Ensure that we are fail gracefully if something attempts to pass
3314 in a location that isn't a string literal token. Seen on this code:
3316 const char a[] = " %d ";
3317 __builtin_printf (a, 0.5);
3320 when c-format.c erroneously used the indicated one-character
3321 location as the format string location, leading to a read past the
3322 end of a string buffer in cpp_interpret_string_1. */
3324 static void
3325 test_lexer_string_locations_non_string (const line_table_case &case_)
3327 /* .....................000000000111111111122222222223.
3328 .....................123456789012345678901234567890. */
3329 const char *content = (" a\n");
3330 lexer_test test (case_, content, NULL);
3332 /* Verify that we get the expected token back. */
3333 const cpp_token *tok = test.get_token ();
3334 ASSERT_EQ (tok->type, CPP_NAME);
3335 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3337 /* At this point, libcpp is attempting to interpret the name as a
3338 string literal, despite it not starting with a quote. We don't detect
3339 that, but we should at least fail gracefully. */
3340 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3341 "cpp_interpret_string_1 failed");
3344 /* Ensure that we can read substring information for a token which
3345 starts in one linemap and ends in another . Adapted from
3346 gcc.dg/cpp/pr69985.c. */
3348 static void
3349 test_lexer_string_locations_long_line (const line_table_case &case_)
3351 /* .....................000000.000111111111
3352 .....................123456.789012346789. */
3353 const char *content = ("/* A very long line, so that we start a new line map. */\n"
3354 " \"0123456789012345678901234567890123456789"
3355 "0123456789012345678901234567890123456789"
3356 "0123456789012345678901234567890123456789"
3357 "0123456789\"\n");
3359 lexer_test test (case_, content, NULL);
3361 /* Verify that we get the expected token back. */
3362 const cpp_token *tok = test.get_token ();
3363 ASSERT_EQ (tok->type, CPP_STRING);
3365 if (!should_have_column_data_p (line_table->highest_location))
3366 return;
3368 /* Verify ranges of individual characters. */
3369 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3370 for (int i = 0; i < 131; i++)
3371 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3372 i, 2, 7 + i, 7 + i);
3375 /* Test of locations within a raw string that doesn't contain a newline. */
3377 static void
3378 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3380 /* .....................00.0000000111111111122.
3381 .....................12.3456789012345678901. */
3382 const char *content = ("R\"foo(0123456789)foo\"\n");
3383 lexer_test test (case_, content, NULL);
3385 /* Verify that we get the expected token back. */
3386 const cpp_token *tok = test.get_token ();
3387 ASSERT_EQ (tok->type, CPP_STRING);
3389 /* Verify that cpp_interpret_string works. */
3390 cpp_string dst_string;
3391 const enum cpp_ttype type = CPP_STRING;
3392 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3393 &dst_string, type);
3394 ASSERT_TRUE (result);
3395 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3396 free (const_cast <unsigned char *> (dst_string.text));
3398 if (!should_have_column_data_p (line_table->highest_location))
3399 return;
3401 /* 0-9, plus the nil terminator. */
3402 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3403 for (int i = 0; i < 11; i++)
3404 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3405 i, 1, 7 + i, 7 + i);
3408 /* Test of locations within a raw string that contains a newline. */
3410 static void
3411 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3413 /* .....................00.0000.
3414 .....................12.3456. */
3415 const char *content = ("R\"foo(\n"
3416 /* .....................00000.
3417 .....................12345. */
3418 "hello\n"
3419 "world\n"
3420 /* .....................00000.
3421 .....................12345. */
3422 ")foo\"\n");
3423 lexer_test test (case_, content, NULL);
3425 /* Verify that we get the expected token back. */
3426 const cpp_token *tok = test.get_token ();
3427 ASSERT_EQ (tok->type, CPP_STRING);
3429 /* Verify that cpp_interpret_string works. */
3430 cpp_string dst_string;
3431 const enum cpp_ttype type = CPP_STRING;
3432 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3433 &dst_string, type);
3434 ASSERT_TRUE (result);
3435 ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3436 free (const_cast <unsigned char *> (dst_string.text));
3438 if (!should_have_column_data_p (line_table->highest_location))
3439 return;
3441 /* Currently we don't support locations within raw strings that
3442 contain newlines. */
3443 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3444 "range endpoints are on different lines");
3447 /* Test of parsing an unterminated raw string. */
3449 static void
3450 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3452 const char *content = "R\"ouch()ouCh\" /* etc */";
3454 lexer_diagnostic_sink diagnostics;
3455 lexer_test test (case_, content, &diagnostics);
3456 test.m_implicitly_expect_EOF = false;
3458 /* Attempt to parse the raw string. */
3459 const cpp_token *tok = test.get_token ();
3460 ASSERT_EQ (tok->type, CPP_EOF);
3462 ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3463 /* We expect the message "unterminated raw string"
3464 in the "cpplib" translation domain.
3465 It's not clear that dgettext is available on all supported hosts,
3466 so this assertion is commented-out for now.
3467 ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3468 diagnostics.m_diagnostics[0]);
3472 /* Test of lexing char constants. */
3474 static void
3475 test_lexer_char_constants (const line_table_case &case_)
3477 /* Various char constants.
3478 .....................0000000001111111111.22222222223.
3479 .....................1234567890123456789.01234567890. */
3480 const char *content = (" 'a'\n"
3481 " u'a'\n"
3482 " U'a'\n"
3483 " L'a'\n"
3484 " 'abc'\n");
3485 lexer_test test (case_, content, NULL);
3487 /* Verify that we get the expected tokens back. */
3488 /* 'a'. */
3489 const cpp_token *tok = test.get_token ();
3490 ASSERT_EQ (tok->type, CPP_CHAR);
3491 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3493 unsigned int chars_seen;
3494 int unsignedp;
3495 cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3496 &chars_seen, &unsignedp);
3497 ASSERT_EQ (cc, 'a');
3498 ASSERT_EQ (chars_seen, 1);
3500 /* u'a'. */
3501 tok = test.get_token ();
3502 ASSERT_EQ (tok->type, CPP_CHAR16);
3503 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3505 /* U'a'. */
3506 tok = test.get_token ();
3507 ASSERT_EQ (tok->type, CPP_CHAR32);
3508 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3510 /* L'a'. */
3511 tok = test.get_token ();
3512 ASSERT_EQ (tok->type, CPP_WCHAR);
3513 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3515 /* 'abc' (c-char-sequence). */
3516 tok = test.get_token ();
3517 ASSERT_EQ (tok->type, CPP_CHAR);
3518 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3520 /* A table of interesting location_t values, giving one axis of our test
3521 matrix. */
3523 static const location_t boundary_locations[] = {
3524 /* Zero means "don't override the default values for a new line_table". */
3527 /* An arbitrary non-zero value that isn't close to one of
3528 the boundary values below. */
3529 0x10000,
3531 /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES. */
3532 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3533 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3534 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3535 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3536 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3538 /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS. */
3539 LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3540 LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3541 LINE_MAP_MAX_LOCATION_WITH_COLS,
3542 LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3543 LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3546 /* Run TESTCASE multiple times, once for each case in our test matrix. */
3548 void
3549 for_each_line_table_case (void (*testcase) (const line_table_case &))
3551 /* As noted above in the description of struct line_table_case,
3552 we want to explore a test matrix of interesting line_table
3553 situations, running various selftests for each case within the
3554 matrix. */
3556 /* Run all tests with:
3557 (a) line_table->default_range_bits == 0, and
3558 (b) line_table->default_range_bits == 5. */
3559 int num_cases_tested = 0;
3560 for (int default_range_bits = 0; default_range_bits <= 5;
3561 default_range_bits += 5)
3563 /* ...and use each of the "interesting" location values as
3564 the starting location within line_table. */
3565 const int num_boundary_locations
3566 = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3567 for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3569 line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3571 testcase (c);
3573 num_cases_tested++;
3577 /* Verify that we fully covered the test matrix. */
3578 ASSERT_EQ (num_cases_tested, 2 * 12);
3581 /* Verify that when presented with a consecutive pair of locations with
3582 a very large line offset, we don't attempt to consolidate them into
3583 a single ordinary linemap where the line offsets within the line map
3584 would lead to overflow (PR lto/88147). */
3586 static void
3587 test_line_offset_overflow ()
3589 line_table_test ltt (line_table_case (5, 0));
3591 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3592 linemap_line_start (line_table, 1, 100);
3593 location_t loc_a = linemap_line_start (line_table, 2578, 255);
3594 assert_loceq ("foo.c", 2578, 0, loc_a);
3596 const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3597 ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3598 ASSERT_EQ (ordmap_a->m_range_bits, 5);
3600 location_t loc_b = linemap_line_start (line_table, 404198, 512);
3601 assert_loceq ("foo.c", 404198, 0, loc_b);
3603 /* We should have started a new linemap, rather than attempting to store
3604 a very large line offset. */
3605 const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3606 ASSERT_NE (ordmap_a, ordmap_b);
3609 void test_cpp_utf8 ()
3611 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
3613 int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8);
3614 ASSERT_EQ (8, w_bad);
3615 int w_ctrl = cpp_display_width ("\r\t\n\v\0\1", 6);
3616 ASSERT_EQ (6, w_ctrl);
3619 /* Verify that wcwidth of valid UTF-8 is as expected. */
3621 const int w_pi = cpp_display_width ("\xcf\x80", 2);
3622 ASSERT_EQ (1, w_pi);
3623 const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4);
3624 ASSERT_EQ (2, w_emoji);
3625 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2);
3626 ASSERT_EQ (1, w_umlaut_precomposed);
3627 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3);
3628 ASSERT_EQ (1, w_umlaut_combining);
3629 const int w_han = cpp_display_width ("\xe4\xb8\xba", 3);
3630 ASSERT_EQ (2, w_han);
3631 const int w_ascii = cpp_display_width ("GCC", 3);
3632 ASSERT_EQ (3, w_ascii);
3633 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3634 "\x9f! \xe4\xb8\xba y\xcc\x88", 24);
3635 ASSERT_EQ (18, w_mixed);
3638 /* Verify that cpp_byte_column_to_display_column can go past the end,
3639 and similar edge cases. */
3641 const char *str
3642 /* Display columns.
3643 111111112345 */
3644 = "\xcf\x80 abc";
3645 /* 111122223456
3646 Byte columns. */
3648 ASSERT_EQ (5, cpp_display_width (str, 6));
3649 ASSERT_EQ (105, cpp_byte_column_to_display_column (str, 6, 106));
3650 ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000));
3651 ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0));
3654 /* Verify that cpp_display_column_to_byte_column can go past the end,
3655 and similar edge cases, and check invertibility. */
3657 const char *str
3658 /* Display columns.
3659 000000000000000000000000000000000000011
3660 111111112222222234444444455555555678901 */
3661 = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3662 /* 000000000000000000000000000000000111111
3663 111122223333444456666777788889999012345
3664 Byte columns. */
3665 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2));
3666 ASSERT_EQ (15, cpp_display_column_to_byte_column (str, 15, 11));
3667 ASSERT_EQ (115, cpp_display_column_to_byte_column (str, 15, 111));
3668 ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000));
3669 ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0));
3671 /* Verify that we do not interrupt a UTF-8 sequence. */
3672 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1));
3674 for (int byte_col = 1; byte_col <= 15; ++byte_col)
3676 const int disp_col = cpp_byte_column_to_display_column (str, 15,
3677 byte_col);
3678 const int byte_col2 = cpp_display_column_to_byte_column (str, 15,
3679 disp_col);
3681 /* If we ask for the display column in the middle of a UTF-8
3682 sequence, it will return the length of the partial sequence,
3683 matching the behavior of GCC before display column support.
3684 Otherwise check the round trip was successful. */
3685 if (byte_col < 4)
3686 ASSERT_EQ (byte_col, disp_col);
3687 else if (byte_col >= 6 && byte_col < 9)
3688 ASSERT_EQ (3 + (byte_col - 5), disp_col);
3689 else
3690 ASSERT_EQ (byte_col2, byte_col);
3696 /* Run all of the selftests within this file. */
3698 void
3699 input_c_tests ()
3701 test_linenum_comparisons ();
3702 test_should_have_column_data_p ();
3703 test_unknown_location ();
3704 test_builtins ();
3705 for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3707 for_each_line_table_case (test_accessing_ordinary_linemaps);
3708 for_each_line_table_case (test_lexer);
3709 for_each_line_table_case (test_lexer_string_locations_simple);
3710 for_each_line_table_case (test_lexer_string_locations_ebcdic);
3711 for_each_line_table_case (test_lexer_string_locations_hex);
3712 for_each_line_table_case (test_lexer_string_locations_oct);
3713 for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3714 for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3715 for_each_line_table_case (test_lexer_string_locations_ucn4);
3716 for_each_line_table_case (test_lexer_string_locations_ucn8);
3717 for_each_line_table_case (test_lexer_string_locations_wide_string);
3718 for_each_line_table_case (test_lexer_string_locations_string16);
3719 for_each_line_table_case (test_lexer_string_locations_string32);
3720 for_each_line_table_case (test_lexer_string_locations_u8);
3721 for_each_line_table_case (test_lexer_string_locations_utf8_source);
3722 for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3723 for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3724 for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3725 for_each_line_table_case (test_lexer_string_locations_macro);
3726 for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3727 for_each_line_table_case (test_lexer_string_locations_non_string);
3728 for_each_line_table_case (test_lexer_string_locations_long_line);
3729 for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3730 for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3731 for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3732 for_each_line_table_case (test_lexer_char_constants);
3734 test_reading_source_line ();
3736 test_line_offset_overflow ();
3738 test_cpp_utf8 ();
3741 } // namespace selftest
3743 #endif /* CHECKING_P */