ada: Fix infinite loop with multiple limited with clauses
[official-gcc.git] / gcc / input.cc
blobfd09fccb0e3c0e41e102e68242ceddecde2468cd
1 /* Data and functions related to line maps and input files.
2 Copyright (C) 2004-2023 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "selftest.h"
26 #include "cpplib.h"
28 #ifndef HAVE_ICONV
29 #define HAVE_ICONV 0
30 #endif
32 const char *
33 special_fname_builtin ()
35 return _("<built-in>");
/* Input charset configuration.  */

/* Fallback charset callback: the file name argument is ignored and
   the answer is always nullptr.  */

static const char *
default_charset_callback (const char *)
{
  const char *const no_charset = nullptr;
  return no_charset;
}
44 void
45 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
46 bool should_skip_bom)
48 in_context.ccb = (ccb ? ccb : default_charset_callback);
49 in_context.should_skip_bom = should_skip_bom;
52 /* This is a cache used by get_next_line to store the content of a
53 file to be searched for file lines. */
54 class file_cache_slot
56 public:
57 file_cache_slot ();
58 ~file_cache_slot ();
60 bool read_line_num (size_t line_num,
61 char ** line, ssize_t *line_len);
63 /* Accessors. */
64 const char *get_file_path () const { return m_file_path; }
65 unsigned get_use_count () const { return m_use_count; }
66 bool missing_trailing_newline_p () const
68 return m_missing_trailing_newline;
70 char_span get_full_file_content ();
72 void inc_use_count () { m_use_count++; }
74 bool create (const file_cache::input_context &in_context,
75 const char *file_path, FILE *fp, unsigned highest_use_count);
76 void evict ();
78 private:
79 /* These are information used to store a line boundary. */
80 class line_info
82 public:
83 /* The line number. It starts from 1. */
84 size_t line_num;
86 /* The position (byte count) of the beginning of the line,
87 relative to the file data pointer. This starts at zero. */
88 size_t start_pos;
90 /* The position (byte count) of the last byte of the line. This
91 normally points to the '\n' character, or to one byte after the
92 last byte of the file, if the file doesn't contain a '\n'
93 character. */
94 size_t end_pos;
96 line_info (size_t l, size_t s, size_t e)
97 : line_num (l), start_pos (s), end_pos (e)
100 line_info ()
101 :line_num (0), start_pos (0), end_pos (0)
105 bool needs_read_p () const;
106 bool needs_grow_p () const;
107 void maybe_grow ();
108 bool read_data ();
109 bool maybe_read_data ();
110 bool get_next_line (char **line, ssize_t *line_len);
111 bool read_next_line (char ** line, ssize_t *line_len);
112 bool goto_next_line ();
114 static const size_t buffer_size = 4 * 1024;
115 static const size_t line_record_size = 100;
117 /* The number of time this file has been accessed. This is used
118 to designate which file cache to evict from the cache
119 array. */
120 unsigned m_use_count;
122 /* The file_path is the key for identifying a particular file in
123 the cache.
124 For libcpp-using code, the underlying buffer for this field is
125 owned by the corresponding _cpp_file within the cpp_reader. */
126 const char *m_file_path;
128 FILE *m_fp;
130 /* This points to the content of the file that we've read so
131 far. */
132 char *m_data;
134 /* The allocated buffer to be freed may start a little earlier than DATA,
135 e.g. if a UTF8 BOM was skipped at the beginning. */
136 int m_alloc_offset;
138 /* The size of the DATA array above.*/
139 size_t m_size;
141 /* The number of bytes read from the underlying file so far. This
142 must be less (or equal) than SIZE above. */
143 size_t m_nb_read;
145 /* The index of the beginning of the current line. */
146 size_t m_line_start_idx;
148 /* The number of the previous line read. This starts at 1. Zero
149 means we've read no line so far. */
150 size_t m_line_num;
152 /* This is the total number of lines of the current file. At the
153 moment, we try to get this information from the line map
154 subsystem. Note that this is just a hint. When using the C++
155 front-end, this hint is correct because the input file is then
156 completely tokenized before parsing starts; so the line map knows
157 the number of lines before compilation really starts. For e.g,
158 the C front-end, it can happen that we start emitting diagnostics
159 before the line map has seen the end of the file. */
160 size_t m_total_lines;
162 /* Could this file be missing a trailing newline on its final line?
163 Initially true (to cope with empty files), set to true/false
164 as each line is read. */
165 bool m_missing_trailing_newline;
167 /* This is a record of the beginning and end of the lines we've seen
168 while reading the file. This is useful to avoid walking the data
169 from the beginning when we are asked to read a line that is
170 before LINE_START_IDX above. Note that the maximum size of this
171 record is line_record_size, so that the memory consumption
172 doesn't explode. We thus scale total_lines down to
173 line_record_size. */
174 vec<line_info, va_heap> m_line_record;
176 void offset_buffer (int offset)
178 gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
179 : (size_t) offset <= m_size);
180 gcc_assert (m_data);
181 m_alloc_offset += offset;
182 m_data += offset;
183 m_size -= offset;
188 /* Current position in real source file. */
190 location_t input_location = UNKNOWN_LOCATION;
192 class line_maps *line_table;
194 /* A stashed copy of "line_table" for use by selftest::line_table_test.
195 This needs to be a global so that it can be a GC root, and thus
196 prevent the stashed copy from being garbage-collected if the GC runs
197 during a line_table_test. */
199 class line_maps *saved_line_table;
201 /* Expand the source location LOC into a human readable location. If
202 LOC resolves to a builtin location, the file name of the readable
203 location is set to the string "<built-in>". If EXPANSION_POINT_P is
204 TRUE and LOC is virtual, then it is resolved to the expansion
205 point of the involved macro. Otherwise, it is resolved to the
206 spelling location of the token.
208 When resolving to the spelling location of the token, if the
209 resulting location is for a built-in location (that is, it has no
210 associated line/column) in the context of a macro expansion, the
211 returned location is the first one (while unwinding the macro
212 location towards its expansion point) that is in real source
213 code.
215 ASPECT controls which part of the location to use. */
217 static expanded_location
218 expand_location_1 (location_t loc,
219 bool expansion_point_p,
220 enum location_aspect aspect)
222 expanded_location xloc;
223 const line_map_ordinary *map;
224 enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
225 tree block = NULL;
227 if (IS_ADHOC_LOC (loc))
229 block = LOCATION_BLOCK (loc);
230 loc = LOCATION_LOCUS (loc);
233 memset (&xloc, 0, sizeof (xloc));
235 if (loc >= RESERVED_LOCATION_COUNT)
237 if (!expansion_point_p)
239 /* We want to resolve LOC to its spelling location.
241 But if that spelling location is a reserved location that
242 appears in the context of a macro expansion (like for a
243 location for a built-in token), let's consider the first
244 location (toward the expansion point) that is not reserved;
245 that is, the first location that is in real source code. */
246 loc = linemap_unwind_to_first_non_reserved_loc (line_table,
247 loc, NULL);
248 lrk = LRK_SPELLING_LOCATION;
250 loc = linemap_resolve_location (line_table, loc, lrk, &map);
252 /* loc is now either in an ordinary map, or is a reserved location.
253 If it is a compound location, the caret is in a spelling location,
254 but the start/finish might still be a virtual location.
255 Depending of what the caller asked for, we may need to recurse
256 one level in order to resolve any virtual locations in the
257 end-points. */
258 switch (aspect)
260 default:
261 gcc_unreachable ();
262 /* Fall through. */
263 case LOCATION_ASPECT_CARET:
264 break;
265 case LOCATION_ASPECT_START:
267 location_t start = get_start (loc);
268 if (start != loc)
269 return expand_location_1 (start, expansion_point_p, aspect);
271 break;
272 case LOCATION_ASPECT_FINISH:
274 location_t finish = get_finish (loc);
275 if (finish != loc)
276 return expand_location_1 (finish, expansion_point_p, aspect);
278 break;
280 xloc = linemap_expand_location (line_table, map, loc);
283 xloc.data = block;
284 if (loc <= BUILTINS_LOCATION)
285 xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
287 return xloc;
290 /* Initialize the set of cache used for files accessed by caret
291 diagnostic. */
293 static void
294 diagnostic_file_cache_init (void)
296 gcc_assert (global_dc);
297 if (global_dc->m_file_cache == NULL)
298 global_dc->m_file_cache = new file_cache ();
301 /* Free the resources used by the set of cache used for files accessed
302 by caret diagnostic. */
304 void
305 diagnostic_file_cache_fini (void)
307 if (global_dc->m_file_cache)
309 delete global_dc->m_file_cache;
310 global_dc->m_file_cache = NULL;
314 /* Return the total lines number that have been read so far by the
315 line map (in the preprocessor) so far. For languages like C++ that
316 entirely preprocess the input file before starting to parse, this
317 equals the actual number of lines of the file. */
319 static size_t
320 total_lines_num (const char *file_path)
322 size_t r = 0;
323 location_t l = 0;
324 if (linemap_get_file_highest_location (line_table, file_path, &l))
326 gcc_assert (l >= RESERVED_LOCATION_COUNT);
327 expanded_location xloc = expand_location (l);
328 r = xloc.line;
330 return r;
333 /* Lookup the cache used for the content of a given file accessed by
334 caret diagnostic. Return the found cached file, or NULL if no
335 cached file was found. */
337 file_cache_slot *
338 file_cache::lookup_file (const char *file_path)
340 gcc_assert (file_path);
342 /* This will contain the found cached file. */
343 file_cache_slot *r = NULL;
344 for (unsigned i = 0; i < num_file_slots; ++i)
346 file_cache_slot *c = &m_file_slots[i];
347 if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
349 c->inc_use_count ();
350 r = c;
354 if (r)
355 r->inc_use_count ();
357 return r;
360 /* Purge any mention of FILENAME from the cache of files used for
361 printing source code. For use in selftests when working
362 with tempfiles. */
364 void
365 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
367 gcc_assert (file_path);
369 if (!global_dc->m_file_cache)
370 return;
372 global_dc->m_file_cache->forcibly_evict_file (file_path);
375 void
376 file_cache::forcibly_evict_file (const char *file_path)
378 gcc_assert (file_path);
380 file_cache_slot *r = lookup_file (file_path);
381 if (!r)
382 /* Not found. */
383 return;
385 r->evict ();
388 void
389 file_cache_slot::evict ()
391 m_file_path = NULL;
392 if (m_fp)
393 fclose (m_fp);
394 m_fp = NULL;
395 m_nb_read = 0;
396 m_line_start_idx = 0;
397 m_line_num = 0;
398 m_line_record.truncate (0);
399 m_use_count = 0;
400 m_total_lines = 0;
401 m_missing_trailing_newline = true;
404 /* Return the file cache that has been less used, recently, or the
405 first empty one. If HIGHEST_USE_COUNT is non-null,
406 *HIGHEST_USE_COUNT is set to the highest use count of the entries
407 in the cache table. */
409 file_cache_slot*
410 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
412 diagnostic_file_cache_init ();
414 file_cache_slot *to_evict = &m_file_slots[0];
415 unsigned huc = to_evict->get_use_count ();
416 for (unsigned i = 1; i < num_file_slots; ++i)
418 file_cache_slot *c = &m_file_slots[i];
419 bool c_is_empty = (c->get_file_path () == NULL);
421 if (c->get_use_count () < to_evict->get_use_count ()
422 || (to_evict->get_file_path () && c_is_empty))
423 /* We evict C because it's either an entry with a lower use
424 count or one that is empty. */
425 to_evict = c;
427 if (huc < c->get_use_count ())
428 huc = c->get_use_count ();
430 if (c_is_empty)
431 /* We've reached the end of the cache; subsequent elements are
432 all empty. */
433 break;
436 if (highest_use_count)
437 *highest_use_count = huc;
439 return to_evict;
442 /* Create the cache used for the content of a given file to be
443 accessed by caret diagnostic. This cache is added to an array of
444 cache and can be retrieved by lookup_file_in_cache_tab. This
445 function returns the created cache. Note that only the last
446 num_file_slots files are cached.
448 This can return nullptr if the FILE_PATH can't be opened for
449 reading, or if the content can't be converted to the input_charset. */
451 file_cache_slot*
452 file_cache::add_file (const char *file_path)
455 FILE *fp = fopen (file_path, "r");
456 if (fp == NULL)
457 return NULL;
459 unsigned highest_use_count = 0;
460 file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
461 if (!r->create (in_context, file_path, fp, highest_use_count))
462 return NULL;
463 return r;
466 /* Get a borrowed char_span to the full content of this file
467 as decoded according to the input charset, encoded as UTF-8. */
469 char_span
470 file_cache_slot::get_full_file_content ()
472 char *line;
473 ssize_t line_len;
474 while (get_next_line (&line, &line_len))
477 return char_span (m_data, m_nb_read);
480 /* Populate this slot for use on FILE_PATH and FP, dropping any
481 existing cached content within it. */
483 bool
484 file_cache_slot::create (const file_cache::input_context &in_context,
485 const char *file_path, FILE *fp,
486 unsigned highest_use_count)
488 m_file_path = file_path;
489 if (m_fp)
490 fclose (m_fp);
491 m_fp = fp;
492 if (m_alloc_offset)
493 offset_buffer (-m_alloc_offset);
494 m_nb_read = 0;
495 m_line_start_idx = 0;
496 m_line_num = 0;
497 m_line_record.truncate (0);
498 /* Ensure that this cache entry doesn't get evicted next time
499 add_file_to_cache_tab is called. */
500 m_use_count = ++highest_use_count;
501 m_total_lines = total_lines_num (file_path);
502 m_missing_trailing_newline = true;
505 /* Check the input configuration to determine if we need to do any
506 transformations, such as charset conversion or BOM skipping. */
507 if (const char *input_charset = in_context.ccb (file_path))
509 /* Need a full-blown conversion of the input charset. */
510 fclose (m_fp);
511 m_fp = NULL;
512 const cpp_converted_source cs
513 = cpp_get_converted_source (file_path, input_charset);
514 if (!cs.data)
515 return false;
516 if (m_data)
517 XDELETEVEC (m_data);
518 m_data = cs.data;
519 m_nb_read = m_size = cs.len;
520 m_alloc_offset = cs.data - cs.to_free;
522 else if (in_context.should_skip_bom)
524 if (read_data ())
526 const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
527 offset_buffer (offset);
528 m_nb_read -= offset;
532 return true;
535 /* file_cache's ctor. */
537 file_cache::file_cache ()
538 : m_file_slots (new file_cache_slot[num_file_slots])
540 initialize_input_context (nullptr, false);
543 /* file_cache's dtor. */
545 file_cache::~file_cache ()
547 delete[] m_file_slots;
550 /* Lookup the cache used for the content of a given file accessed by
551 caret diagnostic. If no cached file was found, create a new cache
552 for this file, add it to the array of cached file and return
555 This can return nullptr on a cache miss if FILE_PATH can't be opened for
556 reading, or if the content can't be converted to the input_charset. */
558 file_cache_slot*
559 file_cache::lookup_or_add_file (const char *file_path)
561 file_cache_slot *r = lookup_file (file_path);
562 if (r == NULL)
563 r = add_file (file_path);
564 return r;
567 /* Default constructor for a cache of file used by caret
568 diagnostic. */
570 file_cache_slot::file_cache_slot ()
571 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
572 m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
573 m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
575 m_line_record.create (0);
578 /* Destructor for a cache of file used by caret diagnostic. */
580 file_cache_slot::~file_cache_slot ()
582 if (m_fp)
584 fclose (m_fp);
585 m_fp = NULL;
587 if (m_data)
589 offset_buffer (-m_alloc_offset);
590 XDELETEVEC (m_data);
591 m_data = 0;
593 m_line_record.release ();
596 /* Returns TRUE iff the cache would need to be filled with data coming
597 from the file. That is, either the cache is empty or full or the
598 current line is empty. Note that if the cache is full, it would
599 need to be extended and filled again. */
601 bool
602 file_cache_slot::needs_read_p () const
604 return m_fp && (m_nb_read == 0
605 || m_nb_read == m_size
606 || (m_line_start_idx >= m_nb_read - 1));
609 /* Return TRUE iff the cache is full and thus needs to be
610 extended. */
612 bool
613 file_cache_slot::needs_grow_p () const
615 return m_nb_read == m_size;
618 /* Grow the cache if it needs to be extended. */
620 void
621 file_cache_slot::maybe_grow ()
623 if (!needs_grow_p ())
624 return;
626 if (!m_data)
628 gcc_assert (m_size == 0 && m_alloc_offset == 0);
629 m_size = buffer_size;
630 m_data = XNEWVEC (char, m_size);
632 else
634 const int offset = m_alloc_offset;
635 offset_buffer (-offset);
636 m_size *= 2;
637 m_data = XRESIZEVEC (char, m_data, m_size);
638 offset_buffer (offset);
642 /* Read more data into the cache. Extends the cache if need be.
643 Returns TRUE iff new data could be read. */
645 bool
646 file_cache_slot::read_data ()
648 if (feof (m_fp) || ferror (m_fp))
649 return false;
651 maybe_grow ();
653 char * from = m_data + m_nb_read;
654 size_t to_read = m_size - m_nb_read;
655 size_t nb_read = fread (from, 1, to_read, m_fp);
657 if (ferror (m_fp))
658 return false;
660 m_nb_read += nb_read;
661 return !!nb_read;
664 /* Read new data iff the cache needs to be filled with more data
665 coming from the file FP. Return TRUE iff the cache was filled with
666 mode data. */
668 bool
669 file_cache_slot::maybe_read_data ()
671 if (!needs_read_p ())
672 return false;
673 return read_data ();
/* Helper for file_cache_slot::get_next_line (): locate the end of
   the line starting at S within the LEN bytes available.  The
   memchr convention is used, i.e. nullptr means that no line
   terminator was found.  Line endings are recognized the way libcpp
   does: \n, \r\n and \r all end a line; for \r\n the returned
   pointer designates the \n.  */

static char *
find_end_of_line (char *s, size_t len)
{
  char *const limit = s + len;

  while (s != limit)
    {
      if (*s == '\n')
        return s;

      if (*s == '\r')
        {
          /* A \r in the very last position is not reported: it may
             be the end of the file, or just the end of what has been
             read so far with the \n of a \r\n pair still to come.
             The caller deals with a file ending in \r.  */
          if (s + 1 == limit)
            return nullptr;

          return s[1] == '\n' ? s + 1 : s;
        }

      ++s;
    }

  return nullptr;
}
707 /* Read a new line from file FP, using C as a cache for the data
708 coming from the file. Upon successful completion, *LINE is set to
709 the beginning of the line found. *LINE points directly in the
710 line cache and is only valid until the next call of get_next_line.
711 *LINE_LEN is set to the length of the line. Note that the line
712 does not contain any terminal delimiter. This function returns
713 true if some data was read or process from the cache, false
714 otherwise. Note that subsequent calls to get_next_line might
715 make the content of *LINE invalid. */
717 bool
718 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
720 /* Fill the cache with data to process. */
721 maybe_read_data ();
723 size_t remaining_size = m_nb_read - m_line_start_idx;
724 if (remaining_size == 0)
725 /* There is no more data to process. */
726 return false;
728 char *line_start = m_data + m_line_start_idx;
730 char *next_line_start = NULL;
731 size_t len = 0;
732 char *line_end = find_end_of_line (line_start, remaining_size);
733 if (line_end == NULL)
735 /* We haven't found an end-of-line delimiter in the cache.
736 Fill the cache with more data from the file and look again. */
737 while (maybe_read_data ())
739 line_start = m_data + m_line_start_idx;
740 remaining_size = m_nb_read - m_line_start_idx;
741 line_end = find_end_of_line (line_start, remaining_size);
742 if (line_end != NULL)
744 next_line_start = line_end + 1;
745 break;
748 if (line_end == NULL)
750 /* We've loaded all the file into the cache and still no
751 terminator. Let's say the line ends up at one byte past the
752 end of the file. This is to stay consistent with the case
753 of when the line ends up with a terminator and line_end points to
754 that. That consistency is useful below in the len calculation.
756 If the file ends in a \r, we didn't identify it as a line
757 terminator above, so do that now instead. */
758 line_end = m_data + m_nb_read;
759 if (m_nb_read && line_end[-1] == '\r')
761 --line_end;
762 m_missing_trailing_newline = false;
764 else
765 m_missing_trailing_newline = true;
767 else
768 m_missing_trailing_newline = false;
770 else
772 next_line_start = line_end + 1;
773 m_missing_trailing_newline = false;
776 if (m_fp && ferror (m_fp))
777 return false;
779 /* At this point, we've found the end of the of line. It either points to
780 the line terminator or to one byte after the last byte of the file. */
781 gcc_assert (line_end != NULL);
783 len = line_end - line_start;
785 if (m_line_start_idx < m_nb_read)
786 *line = line_start;
788 ++m_line_num;
790 /* Before we update our line record, make sure the hint about the
791 total number of lines of the file is correct. If it's not, then
792 we give up recording line boundaries from now on. */
793 bool update_line_record = true;
794 if (m_line_num > m_total_lines)
795 update_line_record = false;
797 /* Now update our line record so that re-reading lines from the
798 before m_line_start_idx is faster. */
799 if (update_line_record
800 && m_line_record.length () < line_record_size)
802 /* If the file lines fits in the line record, we just record all
803 its lines ...*/
804 if (m_total_lines <= line_record_size
805 && m_line_num > m_line_record.length ())
806 m_line_record.safe_push
807 (file_cache_slot::line_info (m_line_num,
808 m_line_start_idx,
809 line_end - m_data));
810 else if (m_total_lines > line_record_size)
812 /* ... otherwise, we just scale total_lines down to
813 (line_record_size lines. */
814 size_t n = (m_line_num * line_record_size) / m_total_lines;
815 if (m_line_record.length () == 0
816 || n >= m_line_record.length ())
817 m_line_record.safe_push
818 (file_cache_slot::line_info (m_line_num,
819 m_line_start_idx,
820 line_end - m_data));
824 /* Update m_line_start_idx so that it points to the next line to be
825 read. */
826 if (next_line_start)
827 m_line_start_idx = next_line_start - m_data;
828 else
829 /* We didn't find any terminal '\n'. Let's consider that the end
830 of line is the end of the data in the cache. The next
831 invocation of get_next_line will either read more data from the
832 underlying file or return false early because we've reached the
833 end of the file. */
834 m_line_start_idx = m_nb_read;
836 *line_len = len;
838 return true;
841 /* Consume the next bytes coming from the cache (or from its
842 underlying file if there are remaining unread bytes in the file)
843 until we reach the next end-of-line (or end-of-file). There is no
844 copying from the cache involved. Return TRUE upon successful
845 completion. */
847 bool
848 file_cache_slot::goto_next_line ()
850 char *l;
851 ssize_t len;
853 return get_next_line (&l, &len);
856 /* Read an arbitrary line number LINE_NUM from the file cached in C.
857 If the line was read successfully, *LINE points to the beginning
858 of the line in the file cache and *LINE_LEN is the length of the
859 line. *LINE is not nul-terminated, but may contain zero bytes.
860 *LINE is only valid until the next call of read_line_num.
861 This function returns bool if a line was read. */
863 bool
864 file_cache_slot::read_line_num (size_t line_num,
865 char ** line, ssize_t *line_len)
867 gcc_assert (line_num > 0);
869 if (line_num <= m_line_num)
871 /* We've been asked to read lines that are before m_line_num.
872 So lets use our line record (if it's not empty) to try to
873 avoid re-reading the file from the beginning again. */
875 if (m_line_record.is_empty ())
877 m_line_start_idx = 0;
878 m_line_num = 0;
880 else
882 file_cache_slot::line_info *i = NULL;
883 if (m_total_lines <= line_record_size)
885 /* In languages where the input file is not totally
886 preprocessed up front, the m_total_lines hint
887 can be smaller than the number of lines of the
888 file. In that case, only the first
889 m_total_lines have been recorded.
891 Otherwise, the first m_total_lines we've read have
892 their start/end recorded here. */
893 i = (line_num <= m_total_lines)
894 ? &m_line_record[line_num - 1]
895 : &m_line_record[m_total_lines - 1];
896 gcc_assert (i->line_num <= line_num);
898 else
900 /* So the file had more lines than our line record
901 size. Thus the number of lines we've recorded has
902 been scaled down to line_record_size. Let's
903 pick the start/end of the recorded line that is
904 closest to line_num. */
905 size_t n = (line_num <= m_total_lines)
906 ? line_num * line_record_size / m_total_lines
907 : m_line_record.length () - 1;
908 if (n < m_line_record.length ())
910 i = &m_line_record[n];
911 gcc_assert (i->line_num <= line_num);
915 if (i && i->line_num == line_num)
917 /* We have the start/end of the line. */
918 *line = m_data + i->start_pos;
919 *line_len = i->end_pos - i->start_pos;
920 return true;
923 if (i)
925 m_line_start_idx = i->start_pos;
926 m_line_num = i->line_num - 1;
928 else
930 m_line_start_idx = 0;
931 m_line_num = 0;
936 /* Let's walk from line m_line_num up to line_num - 1, without
937 copying any line. */
938 while (m_line_num < line_num - 1)
939 if (!goto_next_line ())
940 return false;
942 /* The line we want is the next one. Let's read and copy it back to
943 the caller. */
944 return get_next_line (line, line_len);
947 /* Return the physical source line that corresponds to FILE_PATH/LINE.
948 The line is not nul-terminated. The returned pointer is only
949 valid until the next call of location_get_source_line.
950 Note that the line can contain several null characters,
951 so the returned value's length has the actual length of the line.
952 If the function fails, a NULL char_span is returned. */
954 char_span
955 file_cache::get_source_line (const char *file_path, int line)
957 char *buffer = NULL;
958 ssize_t len;
960 if (line == 0)
961 return char_span (NULL, 0);
963 if (file_path == NULL)
964 return char_span (NULL, 0);
966 file_cache_slot *c = lookup_or_add_file (file_path);
967 if (c == NULL)
968 return char_span (NULL, 0);
970 bool read = c->read_line_num (line, &buffer, &len);
971 if (!read)
972 return char_span (NULL, 0);
974 return char_span (buffer, len);
977 char_span
978 location_get_source_line (const char *file_path, int line)
980 diagnostic_file_cache_init ();
981 return global_dc->m_file_cache->get_source_line (file_path, line);
984 /* Return a NUL-terminated copy of the source text between two locations, or
985 NULL if the arguments are invalid. The caller is responsible for freeing
986 the return value. */
988 char *
989 get_source_text_between (location_t start, location_t end)
991 expanded_location expstart =
992 expand_location_to_spelling_point (start, LOCATION_ASPECT_START);
993 expanded_location expend =
994 expand_location_to_spelling_point (end, LOCATION_ASPECT_FINISH);
996 /* If the locations are in different files or the end comes before the
997 start, give up and return nothing. */
998 if (!expstart.file || !expend.file)
999 return NULL;
1000 if (strcmp (expstart.file, expend.file) != 0)
1001 return NULL;
1002 if (expstart.line > expend.line)
1003 return NULL;
1004 if (expstart.line == expend.line
1005 && expstart.column > expend.column)
1006 return NULL;
1007 /* These aren't real column numbers, give up. */
1008 if (expstart.column == 0 || expend.column == 0)
1009 return NULL;
1011 /* For a single line we need to trim both edges. */
1012 if (expstart.line == expend.line)
1014 char_span line = location_get_source_line (expstart.file, expstart.line);
1015 if (line.length () < 1)
1016 return NULL;
1017 int s = expstart.column - 1;
1018 int len = expend.column - s;
1019 if (line.length () < (size_t)expend.column)
1020 return NULL;
1021 return line.subspan (s, len).xstrdup ();
1024 struct obstack buf_obstack;
1025 obstack_init (&buf_obstack);
1027 /* Loop through all lines in the range and append each to buf; may trim
1028 parts of the start and end lines off depending on column values. */
1029 for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1031 char_span line = location_get_source_line (expstart.file, lnum);
1032 if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1033 continue;
1035 /* For the first line in the range, only start at expstart.column */
1036 if (lnum == expstart.line)
1038 unsigned off = expstart.column - 1;
1039 if (line.length () < off)
1040 return NULL;
1041 line = line.subspan (off, line.length() - off);
1043 /* For the last line, don't go past expend.column */
1044 else if (lnum == expend.line)
1046 if (line.length () < (size_t)expend.column)
1047 return NULL;
1048 line = line.subspan (0, expend.column);
1051 /* Combine spaces at the beginning of later lines. */
1052 if (lnum > expstart.line)
1054 unsigned off;
1055 for (off = 0; off < line.length(); ++off)
1056 if (line[off] != ' ' && line[off] != '\t')
1057 break;
1058 if (off > 0)
1060 obstack_1grow (&buf_obstack, ' ');
1061 line = line.subspan (off, line.length() - off);
1065 /* This does not include any trailing newlines. */
1066 obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1069 /* NUL-terminate and finish the buf obstack. */
1070 obstack_1grow (&buf_obstack, 0);
1071 const char *buf = (const char *) obstack_finish (&buf_obstack);
1073 return xstrdup (buf);
1077 char_span
1078 file_cache::get_source_file_content (const char *file_path)
1080 file_cache_slot *c = lookup_or_add_file (file_path);
1081 if (c == nullptr)
1082 return char_span (nullptr, 0);
1083 return c->get_full_file_content ();
1087 /* Get a borrowed char_span to the full content of FILE_PATH
1088 as decoded according to the input charset, encoded as UTF-8. */
1090 char_span
1091 get_source_file_content (const char *file_path)
1093 diagnostic_file_cache_init ();
1094 return global_dc->m_file_cache->get_source_file_content (file_path);
1097 /* Determine if FILE_PATH missing a trailing newline on its final line.
1098 Only valid to call once all of the file has been loaded, by
1099 requesting a line number beyond the end of the file. */
1101 bool
1102 location_missing_trailing_newline (const char *file_path)
1104 diagnostic_file_cache_init ();
1106 file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
1107 if (c == NULL)
1108 return false;
1110 return c->missing_trailing_newline_p ();
1113 /* Test if the location originates from the spelling location of a
1114 builtin-tokens. That is, return TRUE if LOC is a (possibly
1115 virtual) location of a built-in token that appears in the expansion
1116 list of a macro. Please note that this function also works on
1117 tokens that result from built-in tokens. For instance, the
1118 function would return true if passed a token "4" that is the result
1119 of the expansion of the built-in __LINE__ macro. */
1120 bool
1121 is_location_from_builtin_token (location_t loc)
1123 const line_map_ordinary *map = NULL;
1124 loc = linemap_resolve_location (line_table, loc,
1125 LRK_SPELLING_LOCATION, &map);
1126 return loc == BUILTINS_LOCATION;
1129 /* Expand the source location LOC into a human readable location. If
1130 LOC is virtual, it resolves to the expansion point of the involved
1131 macro. If LOC resolves to a builtin location, the file name of the
1132 readable location is set to the string "<built-in>". */
1134 expanded_location
1135 expand_location (location_t loc)
1137 return expand_location_1 (loc, /*expansion_point_p=*/true,
1138 LOCATION_ASPECT_CARET);
1141 /* Expand the source location LOC into a human readable location. If
1142 LOC is virtual, it resolves to the expansion location of the
1143 relevant macro. If LOC resolves to a builtin location, the file
1144 name of the readable location is set to the string
1145 "<built-in>". */
1147 expanded_location
1148 expand_location_to_spelling_point (location_t loc,
1149 enum location_aspect aspect)
1151 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1154 /* The rich_location class within libcpp requires a way to expand
1155 location_t instances, and relies on the client code
1156 providing a symbol named
1157 linemap_client_expand_location_to_spelling_point
1158 to do this.
1160 This is the implementation for libcommon.a (all host binaries),
1161 which simply calls into expand_location_1. */
1163 expanded_location
1164 linemap_client_expand_location_to_spelling_point (location_t loc,
1165 enum location_aspect aspect)
1167 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1171 /* If LOCATION is in a system header and if it is a virtual location
1172 for a token coming from the expansion of a macro, unwind it to
1173 the location of the expansion point of the macro. If the expansion
1174 point is also in a system header return the original LOCATION.
1175 Otherwise, return the location of the expansion point.
1177 This is used for instance when we want to emit diagnostics about a
1178 token that may be located in a macro that is itself defined in a
1179 system header, for example, for the NULL macro. In such a case, if
1180 LOCATION were passed directly to diagnostic functions such as
1181 warning_at, the diagnostic would be suppressed (unless
1182 -Wsystem-headers). */
1184 location_t
1185 expansion_point_location_if_in_system_header (location_t location)
1187 if (!in_system_header_at (location))
1188 return location;
1190 location_t xloc = linemap_resolve_location (line_table, location,
1191 LRK_MACRO_EXPANSION_POINT,
1192 NULL);
1193 return in_system_header_at (xloc) ? location : xloc;
1196 /* If LOCATION is a virtual location for a token coming from the expansion
1197 of a macro, unwind to the location of the expansion point of the macro. */
1199 location_t
1200 expansion_point_location (location_t location)
1202 return linemap_resolve_location (line_table, location,
1203 LRK_MACRO_EXPANSION_POINT, NULL);
1206 /* Construct a location with caret at CARET, ranging from START to
1207 FINISH.
1209 For example, consider:
1211 11111111112
1212 12345678901234567890
1214 523 return foo + bar;
1215 ~~~~^~~~~
1218 The location's caret is at the "+", line 523 column 15, but starts
1219 earlier, at the "f" of "foo" at column 11. The finish is at the "r"
1220 of "bar" at column 19. */
1222 location_t
1223 make_location (location_t caret, location_t start, location_t finish)
1225 return line_table->make_location (caret, start, finish);
1228 /* Same as above, but taking a source range rather than two locations. */
1230 location_t
1231 make_location (location_t caret, source_range src_range)
1233 location_t pure_loc = get_pure_location (caret);
1234 return line_table->get_or_create_combined_loc (pure_loc, src_range,
1235 nullptr, 0);
1238 /* An expanded_location stores the column in byte units. This function
1239 converts that column to display units. That requires reading the associated
1240 source line in order to calculate the display width. If that cannot be done
1241 for any reason, then returns the byte column as a fallback. */
1243 location_compute_display_column (expanded_location exploc,
1244 const cpp_char_column_policy &policy)
1246 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1247 return exploc.column;
1248 char_span line = location_get_source_line (exploc.file, exploc.line);
1249 /* If line is NULL, this function returns exploc.column which is the
1250 desired fallback. */
1251 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1252 exploc.column, policy);
1255 /* Dump statistics to stderr about the memory usage of the line_table
1256 set of line maps. This also displays some statistics about macro
1257 expansion. */
1259 void
1260 dump_line_table_statistics (void)
1262 struct linemap_stats s;
1263 long total_used_map_size,
1264 macro_maps_size,
1265 total_allocated_map_size;
1267 memset (&s, 0, sizeof (s));
1269 linemap_get_statistics (line_table, &s);
1271 macro_maps_size = s.macro_maps_used_size
1272 + s.macro_maps_locations_size;
1274 total_allocated_map_size = s.ordinary_maps_allocated_size
1275 + s.macro_maps_allocated_size
1276 + s.macro_maps_locations_size;
1278 total_used_map_size = s.ordinary_maps_used_size
1279 + s.macro_maps_used_size
1280 + s.macro_maps_locations_size;
1282 fprintf (stderr, "Number of expanded macros: %5ld\n",
1283 s.num_expanded_macros);
1284 if (s.num_expanded_macros != 0)
1285 fprintf (stderr, "Average number of tokens per macro expansion: %5ld\n",
1286 s.num_macro_tokens / s.num_expanded_macros);
1287 fprintf (stderr,
1288 "\nLine Table allocations during the "
1289 "compilation process\n");
1290 fprintf (stderr, "Number of ordinary maps used: " PRsa (5) "\n",
1291 SIZE_AMOUNT (s.num_ordinary_maps_used));
1292 fprintf (stderr, "Ordinary map used size: " PRsa (5) "\n",
1293 SIZE_AMOUNT (s.ordinary_maps_used_size));
1294 fprintf (stderr, "Number of ordinary maps allocated: " PRsa (5) "\n",
1295 SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1296 fprintf (stderr, "Ordinary maps allocated size: " PRsa (5) "\n",
1297 SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1298 fprintf (stderr, "Number of macro maps used: " PRsa (5) "\n",
1299 SIZE_AMOUNT (s.num_macro_maps_used));
1300 fprintf (stderr, "Macro maps used size: " PRsa (5) "\n",
1301 SIZE_AMOUNT (s.macro_maps_used_size));
1302 fprintf (stderr, "Macro maps locations size: " PRsa (5) "\n",
1303 SIZE_AMOUNT (s.macro_maps_locations_size));
1304 fprintf (stderr, "Macro maps size: " PRsa (5) "\n",
1305 SIZE_AMOUNT (macro_maps_size));
1306 fprintf (stderr, "Duplicated maps locations size: " PRsa (5) "\n",
1307 SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1308 fprintf (stderr, "Total allocated maps size: " PRsa (5) "\n",
1309 SIZE_AMOUNT (total_allocated_map_size));
1310 fprintf (stderr, "Total used maps size: " PRsa (5) "\n",
1311 SIZE_AMOUNT (total_used_map_size));
1312 fprintf (stderr, "Ad-hoc table size: " PRsa (5) "\n",
1313 SIZE_AMOUNT (s.adhoc_table_size));
1314 fprintf (stderr, "Ad-hoc table entries used: " PRsa (5) "\n",
1315 SIZE_AMOUNT (s.adhoc_table_entries_used));
1316 fprintf (stderr, "optimized_ranges: " PRsa (5) "\n",
1317 SIZE_AMOUNT (line_table->m_num_optimized_ranges));
1318 fprintf (stderr, "unoptimized_ranges: " PRsa (5) "\n",
1319 SIZE_AMOUNT (line_table->m_num_unoptimized_ranges));
1321 fprintf (stderr, "\n");
1324 /* Get location one beyond the final location in ordinary map IDX. */
1326 static location_t
1327 get_end_location (class line_maps *set, unsigned int idx)
1329 if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1330 return set->highest_location;
1332 struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1333 return MAP_START_LOCATION (next_map);
/* Helper function for write_digit_row.
   Write the character for the lowest decimal digit of DIGIT to
   STREAM.  */

static void
write_digit (FILE *stream, int digit)
{
  const int lowest = digit % 10;
  fputc ('0' + lowest, stream);
}
1344 /* Helper function for dump_location_info.
1345 Write a row of numbers to STREAM, numbering a source line,
1346 giving the units, tens, hundreds etc of the column number. */
1348 static void
1349 write_digit_row (FILE *stream, int indent,
1350 const line_map_ordinary *map,
1351 location_t loc, int max_col, int divisor)
1353 fprintf (stream, "%*c", indent, ' ');
1354 fprintf (stream, "|");
1355 for (int column = 1; column < max_col; column++)
1357 location_t column_loc = loc + (column << map->m_range_bits);
1358 write_digit (stream, column_loc / divisor);
1360 fprintf (stream, "\n");
1363 /* Write a half-closed (START) / half-open (END) interval of
1364 location_t to STREAM. */
1366 static void
1367 dump_location_range (FILE *stream,
1368 location_t start, location_t end)
1370 fprintf (stream,
1371 " location_t interval: %u <= loc < %u\n",
1372 start, end);
1375 /* Write a labelled description of a half-closed (START) / half-open (END)
1376 interval of location_t to STREAM. */
1378 static void
1379 dump_labelled_location_range (FILE *stream,
1380 const char *name,
1381 location_t start, location_t end)
1383 fprintf (stream, "%s\n", name);
1384 dump_location_range (stream, start, end);
1385 fprintf (stream, "\n");
1388 /* Write a visualization of the locations in the line_table to STREAM. */
1390 void
1391 dump_location_info (FILE *stream)
1393 /* Visualize the reserved locations. */
1394 dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1395 0, RESERVED_LOCATION_COUNT);
1397 /* Visualize the ordinary line_map instances, rendering the sources. */
1398 for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1400 location_t end_location = get_end_location (line_table, idx);
1401 /* half-closed: doesn't include this one. */
1403 const line_map_ordinary *map
1404 = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1405 fprintf (stream, "ORDINARY MAP: %i\n", idx);
1406 dump_location_range (stream,
1407 MAP_START_LOCATION (map), end_location);
1408 fprintf (stream, " file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1409 fprintf (stream, " starting at line: %i\n",
1410 ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1411 fprintf (stream, " column and range bits: %i\n",
1412 map->m_column_and_range_bits);
1413 fprintf (stream, " column bits: %i\n",
1414 map->m_column_and_range_bits - map->m_range_bits);
1415 fprintf (stream, " range bits: %i\n",
1416 map->m_range_bits);
1417 const char * reason;
1418 switch (map->reason) {
1419 case LC_ENTER:
1420 reason = "LC_ENTER";
1421 break;
1422 case LC_LEAVE:
1423 reason = "LC_LEAVE";
1424 break;
1425 case LC_RENAME:
1426 reason = "LC_RENAME";
1427 break;
1428 case LC_RENAME_VERBATIM:
1429 reason = "LC_RENAME_VERBATIM";
1430 break;
1431 case LC_ENTER_MACRO:
1432 reason = "LC_RENAME_MACRO";
1433 break;
1434 default:
1435 reason = "Unknown";
1437 fprintf (stream, " reason: %d (%s)\n", map->reason, reason);
1439 const line_map_ordinary *includer_map
1440 = linemap_included_from_linemap (line_table, map);
1441 fprintf (stream, " included from location: %d",
1442 linemap_included_from (map));
1443 if (includer_map) {
1444 fprintf (stream, " (in ordinary map %d)",
1445 int (includer_map - line_table->info_ordinary.maps));
1447 fprintf (stream, "\n");
1449 /* Render the span of source lines that this "map" covers. */
1450 for (location_t loc = MAP_START_LOCATION (map);
1451 loc < end_location;
1452 loc += (1 << map->m_range_bits) )
1454 gcc_assert (pure_location_p (line_table, loc) );
1456 expanded_location exploc
1457 = linemap_expand_location (line_table, map, loc);
1459 if (exploc.column == 0)
1461 /* Beginning of a new source line: draw the line. */
1463 char_span line_text = location_get_source_line (exploc.file,
1464 exploc.line);
1465 if (!line_text)
1466 break;
1467 fprintf (stream,
1468 "%s:%3i|loc:%5i|%.*s\n",
1469 exploc.file, exploc.line,
1470 loc,
1471 (int)line_text.length (), line_text.get_buffer ());
1473 /* "loc" is at column 0, which means "the whole line".
1474 Render the locations *within* the line, by underlining
1475 it, showing the location_t numeric values
1476 at each column. */
1477 size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1478 if (max_col > line_text.length ())
1479 max_col = line_text.length () + 1;
1481 int len_lnum = num_digits (exploc.line);
1482 if (len_lnum < 3)
1483 len_lnum = 3;
1484 int len_loc = num_digits (loc);
1485 if (len_loc < 5)
1486 len_loc = 5;
1488 int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1490 /* Thousands. */
1491 if (end_location > 999)
1492 write_digit_row (stream, indent, map, loc, max_col, 1000);
1494 /* Hundreds. */
1495 if (end_location > 99)
1496 write_digit_row (stream, indent, map, loc, max_col, 100);
1498 /* Tens. */
1499 write_digit_row (stream, indent, map, loc, max_col, 10);
1501 /* Units. */
1502 write_digit_row (stream, indent, map, loc, max_col, 1);
1505 fprintf (stream, "\n");
1508 /* Visualize unallocated values. */
1509 dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1510 line_table->highest_location,
1511 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1513 /* Visualize the macro line_map instances, rendering the sources. */
1514 for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1516 /* Each macro map that is allocated owns location_t values
1517 that are *lower* that the one before them.
1518 Hence it's meaningful to view them either in order of ascending
1519 source locations, or in order of ascending macro map index. */
1520 const bool ascending_location_ts = true;
1521 unsigned int idx = (ascending_location_ts
1522 ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1523 : i);
1524 const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1525 fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1526 idx,
1527 linemap_map_get_macro_name (map),
1528 MACRO_MAP_NUM_MACRO_TOKENS (map));
1529 dump_location_range (stream,
1530 map->start_location,
1531 (map->start_location
1532 + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1533 inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1534 "expansion point is location %i",
1535 MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1536 fprintf (stream, " map->start_location: %u\n",
1537 map->start_location);
1539 fprintf (stream, " macro_locations:\n");
1540 for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1542 location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1543 location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1545 /* linemap_add_macro_token encodes token numbers in an expansion
1546 by putting them after MAP_START_LOCATION. */
1548 /* I'm typically seeing 4 uninitialized entries at the end of
1549 0xafafafaf.
1550 This appears to be due to macro.cc:replace_args
1551 adding 2 extra args for padding tokens; presumably there may
1552 be a leading and/or trailing padding token injected,
1553 each for 2 more location slots.
1554 This would explain there being up to 4 location_ts slots
1555 that may be uninitialized. */
1557 fprintf (stream, " %u: %u, %u\n",
1561 if (x == y)
1563 if (x < MAP_START_LOCATION (map))
1564 inform (x, "token %u has %<x-location == y-location == %u%>",
1565 i, x);
1566 else
1567 fprintf (stream,
1568 "x-location == y-location == %u encodes token # %u\n",
1569 x, x - MAP_START_LOCATION (map));
1571 else
1573 inform (x, "token %u has %<x-location == %u%>", i, x);
1574 inform (x, "token %u has %<y-location == %u%>", i, y);
1577 fprintf (stream, "\n");
1580 /* It appears that MAX_LOCATION_T itself is never assigned to a
1581 macro map, presumably due to an off-by-one error somewhere
1582 between the logic in linemap_enter_macro and
1583 LINEMAPS_MACRO_LOWEST_LOCATION. */
1584 dump_labelled_location_range (stream, "MAX_LOCATION_T",
1585 MAX_LOCATION_T,
1586 MAX_LOCATION_T + 1);
1588 /* Visualize ad-hoc values. */
1589 dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1590 MAX_LOCATION_T + 1, UINT_MAX);
1593 /* string_concat's constructor. */
1595 string_concat::string_concat (int num, location_t *locs)
1596 : m_num (num)
1598 m_locs = ggc_vec_alloc <location_t> (num);
1599 for (int i = 0; i < num; i++)
1600 m_locs[i] = locs[i];
1603 /* string_concat_db's constructor. */
1605 string_concat_db::string_concat_db ()
1607 m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1610 /* Record that a string concatenation occurred, covering NUM
1611 string literal tokens. LOCS is an array of size NUM, containing the
1612 locations of the tokens. A copy of LOCS is taken. */
1614 void
1615 string_concat_db::record_string_concatenation (int num, location_t *locs)
1617 gcc_assert (num > 1);
1618 gcc_assert (locs);
1620 location_t key_loc = get_key_loc (locs[0]);
1621 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1622 any data now recorded under key 'key_loc' would be overwritten by a
1623 subsequent call with the same key 'key_loc'. */
1624 if (RESERVED_LOCATION_P (key_loc))
1625 return;
1627 string_concat *concat
1628 = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1629 m_table->put (key_loc, concat);
1632 /* Determine if LOC was the location of the initial token of a
1633 concatenation of string literal tokens.
1634 If so, *OUT_NUM is written to with the number of tokens, and
1635 *OUT_LOCS with the location of an array of locations of the
1636 tokens, and return true. *OUT_LOCS is a borrowed pointer to
1637 storage owned by the string_concat_db.
1638 Otherwise, return false. */
1640 bool
1641 string_concat_db::get_string_concatenation (location_t loc,
1642 int *out_num,
1643 location_t **out_locs)
1645 gcc_assert (out_num);
1646 gcc_assert (out_locs);
1648 location_t key_loc = get_key_loc (loc);
1649 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1650 discussion in 'string_concat_db::record_string_concatenation'. */
1651 if (RESERVED_LOCATION_P (key_loc))
1652 return false;
1654 string_concat **concat = m_table->get (key_loc);
1655 if (!concat)
1656 return false;
1658 *out_num = (*concat)->m_num;
1659 *out_locs =(*concat)->m_locs;
1660 return true;
1663 /* Internal function. Canonicalize LOC into a form suitable for
1664 use as a key within the database, stripping away macro expansion,
1665 ad-hoc information, and range information, using the location of
1666 the start of LOC within an ordinary linemap. */
1668 location_t
1669 string_concat_db::get_key_loc (location_t loc)
1671 loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1672 NULL);
1674 loc = get_range_from_loc (line_table, loc).m_start;
1676 return loc;
1679 /* Helper class for use within get_substring_ranges_for_loc.
1680 An vec of cpp_string with responsibility for releasing all of the
1681 str->text for each str in the vector. */
1683 class auto_cpp_string_vec : public auto_vec <cpp_string>
1685 public:
1686 auto_cpp_string_vec (int alloc)
1687 : auto_vec <cpp_string> (alloc) {}
1689 ~auto_cpp_string_vec ()
1691 /* Clean up the copies within this vec. */
1692 int i;
1693 cpp_string *str;
1694 FOR_EACH_VEC_ELT (*this, i, str)
1695 free (const_cast <unsigned char *> (str->text));
1699 /* Attempt to populate RANGES with source location information on the
1700 individual characters within the string literal found at STRLOC.
1701 If CONCATS is non-NULL, then any string literals that the token at
1702 STRLOC was concatenated with are also added to RANGES.
1704 Return NULL if successful, or an error message if any errors occurred (in
1705 which case RANGES may be only partially populated and should not
1706 be used).
1708 This is implemented by re-parsing the relevant source line(s). */
1710 static const char *
1711 get_substring_ranges_for_loc (cpp_reader *pfile,
1712 string_concat_db *concats,
1713 location_t strloc,
1714 enum cpp_ttype type,
1715 cpp_substring_ranges &ranges)
1717 gcc_assert (pfile);
1719 if (strloc == UNKNOWN_LOCATION)
1720 return "unknown location";
1722 /* Reparsing the strings requires accurate location information.
1723 If -ftrack-macro-expansion has been overridden from its default
1724 of 2, then we might have a location of a macro expansion point,
1725 rather than the location of the literal itself.
1726 Avoid this by requiring that we have full macro expansion tracking
1727 for substring locations to be available. */
1728 if (cpp_get_options (pfile)->track_macro_expansion != 2)
1729 return "track_macro_expansion != 2";
1731 /* If #line or # 44 "file"-style directives are present, then there's
1732 no guarantee that the line numbers we have can be used to locate
1733 the strings. For example, we might have a .i file with # directives
1734 pointing back to lines within a .c file, but the .c file might
1735 have been edited since the .i file was created.
1736 In such a case, the safest course is to disable on-demand substring
1737 locations. */
1738 if (line_table->seen_line_directive)
1739 return "seen line directive";
1741 /* If string concatenation has occurred at STRLOC, get the locations
1742 of all of the literal tokens making up the compound string.
1743 Otherwise, just use STRLOC. */
1744 int num_locs = 1;
1745 location_t *strlocs = &strloc;
1746 if (concats)
1747 concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1749 auto_cpp_string_vec strs (num_locs);
1750 auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1751 for (int i = 0; i < num_locs; i++)
1753 /* Get range of strloc. We will use it to locate the start and finish
1754 of the literal token within the line. */
1755 source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1757 if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1759 /* If the string token was within a macro expansion, then we can
1760 cope with it for the simple case where we have a single token.
1761 Otherwise, bail out. */
1762 if (src_range.m_start != src_range.m_finish)
1763 return "macro expansion";
1765 else
1767 if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1768 /* If so, we can't reliably determine where the token started within
1769 its line. */
1770 return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1772 if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1773 /* If so, we can't reliably determine where the token finished
1774 within its line. */
1775 return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1778 expanded_location start
1779 = expand_location_to_spelling_point (src_range.m_start,
1780 LOCATION_ASPECT_START);
1781 expanded_location finish
1782 = expand_location_to_spelling_point (src_range.m_finish,
1783 LOCATION_ASPECT_FINISH);
1784 if (start.file != finish.file)
1785 return "range endpoints are in different files";
1786 if (start.line != finish.line)
1787 return "range endpoints are on different lines";
1788 if (start.column > finish.column)
1789 return "range endpoints are reversed";
1791 char_span line = location_get_source_line (start.file, start.line);
1792 if (!line)
1793 return "unable to read source line";
1795 /* Determine the location of the literal (including quotes
1796 and leading prefix chars, such as the 'u' in a u""
1797 token). */
1798 size_t literal_length = finish.column - start.column + 1;
1800 /* Ensure that we don't crash if we got the wrong location. */
1801 if (start.column < 1)
1802 return "zero start column";
1803 if (line.length () < (start.column - 1 + literal_length))
1804 return "line is not wide enough";
1806 char_span literal = line.subspan (start.column - 1, literal_length);
1808 cpp_string from;
1809 from.len = literal_length;
1810 /* Make a copy of the literal, to avoid having to rely on
1811 the lifetime of the copy of the line within the cache.
1812 This will be released by the auto_cpp_string_vec dtor. */
1813 from.text = (unsigned char *)literal.xstrdup ();
1814 strs.safe_push (from);
1816 /* For very long lines, a new linemap could have started
1817 halfway through the token.
1818 Ensure that the loc_reader uses the linemap of the
1819 *end* of the token for its start location. */
1820 const line_map_ordinary *start_ord_map;
1821 linemap_resolve_location (line_table, src_range.m_start,
1822 LRK_SPELLING_LOCATION, &start_ord_map);
1823 const line_map_ordinary *final_ord_map;
1824 linemap_resolve_location (line_table, src_range.m_finish,
1825 LRK_SPELLING_LOCATION, &final_ord_map);
1826 if (start_ord_map == NULL || final_ord_map == NULL)
1827 return "failed to get ordinary maps";
1828 /* Bulletproofing. We ought to only have different ordinary maps
1829 for start vs finish due to line-length jumps. */
1830 if (start_ord_map != final_ord_map
1831 && start_ord_map->to_file != final_ord_map->to_file)
1832 return "start and finish are spelled in different ordinary maps";
1833 /* The file from linemap_resolve_location ought to match that from
1834 expand_location_to_spelling_point. */
1835 if (start_ord_map->to_file != start.file)
1836 return "mismatching file after resolving linemap";
1838 location_t start_loc
1839 = linemap_position_for_line_and_column (line_table, final_ord_map,
1840 start.line, start.column);
1842 cpp_string_location_reader loc_reader (start_loc, line_table);
1843 loc_readers.safe_push (loc_reader);
1846 /* Rerun cpp_interpret_string, or rather, a modified version of it. */
1847 const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1848 loc_readers.address (),
1849 num_locs, &ranges, type);
1850 if (err)
1851 return err;
1853 /* Success: "ranges" should now contain information on the string. */
1854 return NULL;
1857 /* Attempt to populate *OUT_LOC with source location information on the
1858 given characters within the string literal found at STRLOC.
1859 CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1860 character set.
1862 For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7
1863 and string literal "012345\n789"
1864 *OUT_LOC is written to with:
1865 "012345\n789"
1866 ~^~~~~
1868 If CONCATS is non-NULL, then any string literals that the token at
1869 STRLOC was concatenated with are also considered.
1871 This is implemented by re-parsing the relevant source line(s).
1873 Return NULL if successful, or an error message if any errors occurred.
1874 Error messages are intended for GCC developers (to help debugging) rather
1875 than for end-users. */
1877 const char *
1878 get_location_within_string (cpp_reader *pfile,
1879 string_concat_db *concats,
1880 location_t strloc,
1881 enum cpp_ttype type,
1882 int caret_idx, int start_idx, int end_idx,
1883 location_t *out_loc)
1885 gcc_checking_assert (caret_idx >= 0);
1886 gcc_checking_assert (start_idx >= 0);
1887 gcc_checking_assert (end_idx >= 0);
1888 gcc_assert (out_loc);
1890 cpp_substring_ranges ranges;
1891 const char *err
1892 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1893 if (err)
1894 return err;
1896 if (caret_idx >= ranges.get_num_ranges ())
1897 return "caret_idx out of range";
1898 if (start_idx >= ranges.get_num_ranges ())
1899 return "start_idx out of range";
1900 if (end_idx >= ranges.get_num_ranges ())
1901 return "end_idx out of range";
1903 *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1904 ranges.get_range (start_idx).m_start,
1905 ranges.get_range (end_idx).m_finish);
1906 return NULL;
1909 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1911 location_t
1912 location_with_discriminator (location_t locus, int discriminator)
1914 tree block = LOCATION_BLOCK (locus);
1915 source_range src_range = get_range_from_loc (line_table, locus);
1916 locus = get_pure_location (locus);
1918 if (locus == UNKNOWN_LOCATION)
1919 return locus;
1921 return line_table->get_or_create_combined_loc (locus, src_range, block,
1922 discriminator);
1925 /* Return TRUE if LOCUS represents a location with a discriminator. */
1927 bool
1928 has_discriminator (location_t locus)
1930 return get_discriminator_from_loc (locus) != 0;
1933 /* Return the discriminator for LOCUS. */
1936 get_discriminator_from_loc (location_t locus)
1938 return get_discriminator_from_loc (line_table, locus);
1941 #if CHECKING_P
1943 namespace selftest {
1945 /* Selftests of location handling. */
1947 /* Attempt to populate *OUT_RANGE with source location information on the
1948 given character within the string literal found at STRLOC.
1949 CHAR_IDX refers to an offset within the execution character set.
1950 If CONCATS is non-NULL, then any string literals that the token at
1951 STRLOC was concatenated with are also considered.
1953 This is implemented by re-parsing the relevant source line(s).
1955 Return NULL if successful, or an error message if any errors occurred.
1956 Error messages are intended for GCC developers (to help debugging) rather
1957 than for end-users. */
1959 static const char *
1960 get_source_range_for_char (cpp_reader *pfile,
1961 string_concat_db *concats,
1962 location_t strloc,
1963 enum cpp_ttype type,
1964 int char_idx,
1965 source_range *out_range)
1967 gcc_checking_assert (char_idx >= 0);
1968 gcc_assert (out_range);
1970 cpp_substring_ranges ranges;
1971 const char *err
1972 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1973 if (err)
1974 return err;
1976 if (char_idx >= ranges.get_num_ranges ())
1977 return "char_idx out of range";
1979 *out_range = ranges.get_range (char_idx);
1980 return NULL;
1983 /* As get_source_range_for_char, but write to *OUT the number
1984 of ranges that are available. */
1986 static const char *
1987 get_num_source_ranges_for_substring (cpp_reader *pfile,
1988 string_concat_db *concats,
1989 location_t strloc,
1990 enum cpp_ttype type,
1991 int *out)
1993 gcc_assert (out);
1995 cpp_substring_ranges ranges;
1996 const char *err
1997 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1999 if (err)
2000 return err;
2002 *out = ranges.get_num_ranges ();
2003 return NULL;
2006 /* Selftests of location handling. */
2008 /* Verify that compare() on linenum_type handles comparisons over the full
2009 range of the type. */
2011 static void
2012 test_linenum_comparisons ()
2014 linenum_type min_line (0);
2015 linenum_type max_line (0xffffffff);
2016 ASSERT_EQ (0, compare (min_line, min_line));
2017 ASSERT_EQ (0, compare (max_line, max_line));
2019 ASSERT_GT (compare (max_line, min_line), 0);
2020 ASSERT_LT (compare (min_line, max_line), 0);
2023 /* Helper function for verifying location data: when location_t
2024 values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
2025 as having column 0. */
2027 static bool
2028 should_have_column_data_p (location_t loc)
2030 if (IS_ADHOC_LOC (loc))
2031 loc = get_location_from_adhoc_loc (line_table, loc);
2032 if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
2033 return false;
2034 return true;
2037 /* Selftest for should_have_column_data_p. */
2039 static void
2040 test_should_have_column_data_p ()
2042 ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
2043 ASSERT_TRUE
2044 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
2045 ASSERT_FALSE
2046 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
2049 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2050 on LOC. */
2052 static void
2053 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2054 location_t loc)
2056 ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2057 ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2058 /* If location_t values are sufficiently high, then column numbers
2059 will be unavailable and LOCATION_COLUMN (loc) will be 0.
2060 When close to the threshold, column numbers *may* be present: if
2061 the final linemap before the threshold contains a line that straddles
2062 the threshold, locations in that line have column information. */
2063 if (should_have_column_data_p (loc))
2064 ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
     and others use zero
   - the fallback modes within line-map.cc: there are various threshold
     values for location_t beyond which line-map.cc changes
     behavior (disabling of the range-packing optimization, disabling
     of column-tracking).  We can exercise these by starting the line_table
     at interesting values at or near these thresholds.

   The following class describes a particular case within our test
   matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
  : m_default_range_bits (default_range_bits),
    m_base_location (base_location)
  {}

  /* The value to use for the line table's default_range_bits.  */
  int m_default_range_bits;

  /* The location value at which the line table should start; zero
     means "leave the line table's starting point alone".  */
  int m_base_location;
};
2095 /* Constructor. Store the old value of line_table, and create a new
2096 one, using sane defaults. */
2098 line_table_test::line_table_test ()
2100 gcc_assert (saved_line_table == NULL);
2101 saved_line_table = line_table;
2102 line_table = ggc_alloc<line_maps> ();
2103 linemap_init (line_table, BUILTINS_LOCATION);
2104 gcc_assert (saved_line_table->m_reallocator);
2105 line_table->m_reallocator = saved_line_table->m_reallocator;
2106 gcc_assert (saved_line_table->m_round_alloc_size);
2107 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2108 line_table->default_range_bits = 0;
2111 /* Constructor. Store the old value of line_table, and create a new
2112 one, using the sitation described in CASE_. */
2114 line_table_test::line_table_test (const line_table_case &case_)
2116 gcc_assert (saved_line_table == NULL);
2117 saved_line_table = line_table;
2118 line_table = ggc_alloc<line_maps> ();
2119 linemap_init (line_table, BUILTINS_LOCATION);
2120 gcc_assert (saved_line_table->m_reallocator);
2121 line_table->m_reallocator = saved_line_table->m_reallocator;
2122 gcc_assert (saved_line_table->m_round_alloc_size);
2123 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2124 line_table->default_range_bits = case_.m_default_range_bits;
2125 if (case_.m_base_location)
2127 line_table->highest_location = case_.m_base_location;
2128 line_table->highest_line = case_.m_base_location;
2132 /* Destructor. Restore the old value of line_table. */
2134 line_table_test::~line_table_test ()
2136 gcc_assert (saved_line_table != NULL);
2137 line_table = saved_line_table;
2138 saved_line_table = NULL;
2141 /* Verify basic operation of ordinary linemaps. */
2143 static void
2144 test_accessing_ordinary_linemaps (const line_table_case &case_)
2146 line_table_test ltt (case_);
2148 /* Build a simple linemap describing some locations. */
2149 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
2151 linemap_line_start (line_table, 1, 100);
2152 location_t loc_a = linemap_position_for_column (line_table, 1);
2153 location_t loc_b = linemap_position_for_column (line_table, 23);
2155 linemap_line_start (line_table, 2, 100);
2156 location_t loc_c = linemap_position_for_column (line_table, 1);
2157 location_t loc_d = linemap_position_for_column (line_table, 17);
2159 /* Example of a very long line. */
2160 linemap_line_start (line_table, 3, 2000);
2161 location_t loc_e = linemap_position_for_column (line_table, 700);
2163 /* Transitioning back to a short line. */
2164 linemap_line_start (line_table, 4, 0);
2165 location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
2167 if (should_have_column_data_p (loc_back_to_short))
2169 /* Verify that we switched to short lines in the linemap. */
2170 line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
2171 ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2174 /* Example of a line that will eventually be seen to be longer
2175 than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2176 below that. */
2177 linemap_line_start (line_table, 5, 2000);
2179 location_t loc_start_of_very_long_line
2180 = linemap_position_for_column (line_table, 2000);
2181 location_t loc_too_wide
2182 = linemap_position_for_column (line_table, 4097);
2183 location_t loc_too_wide_2
2184 = linemap_position_for_column (line_table, 4098);
2186 /* ...and back to a sane line length. */
2187 linemap_line_start (line_table, 6, 100);
2188 location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2190 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2192 /* Multiple files. */
2193 linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
2194 linemap_line_start (line_table, 1, 200);
2195 location_t loc_f = linemap_position_for_column (line_table, 150);
2196 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2198 /* Verify that we can recover the location info. */
2199 assert_loceq ("foo.c", 1, 1, loc_a);
2200 assert_loceq ("foo.c", 1, 23, loc_b);
2201 assert_loceq ("foo.c", 2, 1, loc_c);
2202 assert_loceq ("foo.c", 2, 17, loc_d);
2203 assert_loceq ("foo.c", 3, 700, loc_e);
2204 assert_loceq ("foo.c", 4, 100, loc_back_to_short);
2206 /* In the very wide line, the initial location should be fully tracked. */
2207 assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
2208 /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2209 be disabled. */
2210 assert_loceq ("foo.c", 5, 0, loc_too_wide);
2211 assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2212 /*...and column-tracking should be re-enabled for subsequent lines. */
2213 assert_loceq ("foo.c", 6, 10, loc_sane_again);
2215 assert_loceq ("bar.c", 1, 150, loc_f);
2217 ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2218 ASSERT_TRUE (pure_location_p (line_table, loc_a));
2220 /* Verify using make_location to build a range, and extracting data
2221 back from it. */
2222 location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2223 ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2224 ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2225 source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2226 ASSERT_EQ (loc_b, src_range.m_start);
2227 ASSERT_EQ (loc_d, src_range.m_finish);
2230 /* Verify various properties of UNKNOWN_LOCATION. */
2232 static void
2233 test_unknown_location ()
2235 ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2236 ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2237 ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2240 /* Verify various properties of BUILTINS_LOCATION. */
2242 static void
2243 test_builtins ()
2245 assert_loceq (special_fname_builtin (), 0, 0, BUILTINS_LOCATION);
2246 ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2249 /* Regression test for make_location.
2250 Ensure that we use pure locations for the start/finish of the range,
2251 rather than storing a packed or ad-hoc range as the start/finish. */
2253 static void
2254 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2256 /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2257 with C++ frontend.
2258 ....................0000000001111111111222.
2259 ....................1234567890123456789012. */
2260 const char *content = " r += !aaa == bbb;\n";
2261 temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2262 line_table_test ltt (case_);
2263 linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2265 const location_t c11 = linemap_position_for_column (line_table, 11);
2266 const location_t c12 = linemap_position_for_column (line_table, 12);
2267 const location_t c13 = linemap_position_for_column (line_table, 13);
2268 const location_t c14 = linemap_position_for_column (line_table, 14);
2269 const location_t c21 = linemap_position_for_column (line_table, 21);
2271 if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2272 return;
2274 /* Use column 13 for the caret location, arbitrarily, to verify that we
2275 handle start != caret. */
2276 const location_t aaa = make_location (c13, c12, c14);
2277 ASSERT_EQ (c13, get_pure_location (aaa));
2278 ASSERT_EQ (c12, get_start (aaa));
2279 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2280 ASSERT_EQ (c14, get_finish (aaa));
2281 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2283 /* Make a location using a location with a range as the start-point. */
2284 const location_t not_aaa = make_location (c11, aaa, c14);
2285 ASSERT_EQ (c11, get_pure_location (not_aaa));
2286 /* It should use the start location of the range, not store the range
2287 itself. */
2288 ASSERT_EQ (c12, get_start (not_aaa));
2289 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2290 ASSERT_EQ (c14, get_finish (not_aaa));
2291 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2293 /* Similarly, make a location with a range as the end-point. */
2294 const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2295 ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2296 ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2297 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2298 ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2299 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2300 const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2301 /* It should use the finish location of the range, not store the range
2302 itself. */
2303 ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2304 ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2305 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2306 ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2307 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2310 /* Verify reading of input files (e.g. for caret-based diagnostics). */
2312 static void
2313 test_reading_source_line ()
2315 /* Create a tempfile and write some text to it. */
2316 temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2317 "01234567890123456789\n"
2318 "This is the test text\n"
2319 "This is the 3rd line");
2321 /* Read back a specific line from the tempfile. */
2322 char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2323 ASSERT_TRUE (source_line);
2324 ASSERT_TRUE (source_line.get_buffer () != NULL);
2325 ASSERT_EQ (20, source_line.length ());
2326 ASSERT_TRUE (!strncmp ("This is the 3rd line",
2327 source_line.get_buffer (), source_line.length ()));
2329 source_line = location_get_source_line (tmp.get_filename (), 2);
2330 ASSERT_TRUE (source_line);
2331 ASSERT_TRUE (source_line.get_buffer () != NULL);
2332 ASSERT_EQ (21, source_line.length ());
2333 ASSERT_TRUE (!strncmp ("This is the test text",
2334 source_line.get_buffer (), source_line.length ()));
2336 source_line = location_get_source_line (tmp.get_filename (), 4);
2337 ASSERT_FALSE (source_line);
2338 ASSERT_TRUE (source_line.get_buffer () == NULL);
2341 /* Tests of lexing. */
/* Assert that cpp_token_as_text, applied to token TOK from PARSER,
   gives a string equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
2352 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2353 and ranges from EXP_START_COL to EXP_FINISH_COL.
2354 Use LOC as the effective location of the selftest. */
2356 static void
2357 assert_token_loc_eq (const location &loc,
2358 const cpp_token *tok,
2359 const char *exp_filename, int exp_linenum,
2360 int exp_start_col, int exp_finish_col)
2362 location_t tok_loc = tok->src_loc;
2363 ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2364 ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2366 /* If location_t values are sufficiently high, then column numbers
2367 will be unavailable. */
2368 if (!should_have_column_data_p (tok_loc))
2369 return;
2371 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2372 source_range tok_range = get_range_from_loc (line_table, tok_loc);
2373 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2374 ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
/* Call assert_token_loc_eq to verify TOK->src_loc, passing
   SELFTEST_LOCATION as the effective location of the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
			    EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME),        \
		       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2385 /* Test of lexing a file using libcpp, verifying tokens and their
2386 location information. */
2388 static void
2389 test_lexer (const line_table_case &case_)
2391 /* Create a tempfile and write some text to it. */
2392 const char *content =
2393 /*00000000011111111112222222222333333.3333444444444.455555555556
2394 12345678901234567890123456789012345.6789012345678.901234567890. */
2395 ("test_name /* c-style comment */\n"
2396 " \"test literal\"\n"
2397 " // test c++-style comment\n"
2398 " 42\n");
2399 temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2401 line_table_test ltt (case_);
2403 cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2405 const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2406 ASSERT_NE (fname, NULL);
2408 /* Verify that we get the expected tokens back, with the correct
2409 location information. */
2411 location_t loc;
2412 const cpp_token *tok;
2413 tok = cpp_get_token_with_location (parser, &loc);
2414 ASSERT_NE (tok, NULL);
2415 ASSERT_EQ (tok->type, CPP_NAME);
2416 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2417 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2419 tok = cpp_get_token_with_location (parser, &loc);
2420 ASSERT_NE (tok, NULL);
2421 ASSERT_EQ (tok->type, CPP_STRING);
2422 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2423 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2425 tok = cpp_get_token_with_location (parser, &loc);
2426 ASSERT_NE (tok, NULL);
2427 ASSERT_EQ (tok->type, CPP_NUMBER);
2428 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2429 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2431 tok = cpp_get_token_with_location (parser, &loc);
2432 ASSERT_NE (tok, NULL);
2433 ASSERT_EQ (tok->type, CPP_EOF);
2435 cpp_finish (parser, NULL);
2436 cpp_destroy (parser);
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.  */

class lexer_test_options
{
public:
  /* This is a polymorphic base class, so give it a virtual destructor:
     destroying a subclass through a lexer_test_options pointer is then
     well-defined.  */
  virtual ~lexer_test_options () = default;

  /* Hook for subclasses to configure the given lexer_test.  */
  virtual void apply (lexer_test &) = 0;
};
2453 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2454 in its dtor.
2456 This is needed by struct lexer_test to ensure that the cleanup of the
2457 cpp_reader happens *after* the cleanup of the temp_source_file. */
2459 class cpp_reader_ptr
2461 public:
2462 cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2464 ~cpp_reader_ptr ()
2466 cpp_finish (m_ptr, NULL);
2467 cpp_destroy (m_ptr);
2470 operator cpp_reader * () const { return m_ptr; }
2472 private:
2473 cpp_reader *m_ptr;
2476 /* A struct for writing lexer tests. */
2478 class lexer_test
2480 public:
2481 lexer_test (const line_table_case &case_, const char *content,
2482 lexer_test_options *options);
2483 ~lexer_test ();
2485 const cpp_token *get_token ();
2487 /* The ordering of these fields matters.
2488 The line_table_test must be first, since the cpp_reader_ptr
2489 uses it.
2490 The cpp_reader must be cleaned up *after* the temp_source_file
2491 since the filenames in input.cc's input cache are owned by the
2492 cpp_reader; in particular, when ~temp_source_file evicts the
2493 filename the filenames must still be alive. */
2494 line_table_test m_ltt;
2495 cpp_reader_ptr m_parser;
2496 temp_source_file m_tempfile;
2497 string_concat_db m_concats;
2498 bool m_implicitly_expect_EOF;
2501 /* Use an EBCDIC encoding for the execution charset, specifically
2502 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2504 This exercises iconv integration within libcpp.
2505 Not every build of iconv supports the given charset,
2506 so we need to flag this error and handle it gracefully. */
2508 class ebcdic_execution_charset : public lexer_test_options
2510 public:
2511 ebcdic_execution_charset () : m_num_iconv_errors (0)
2513 gcc_assert (s_singleton == NULL);
2514 s_singleton = this;
2516 ~ebcdic_execution_charset ()
2518 gcc_assert (s_singleton == this);
2519 s_singleton = NULL;
2522 void apply (lexer_test &test) final override
2524 cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2525 cpp_opts->narrow_charset = "IBM1047";
2527 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2528 callbacks->diagnostic = on_diagnostic;
2531 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2532 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2533 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2534 rich_location *richloc ATTRIBUTE_UNUSED,
2535 const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2536 ATTRIBUTE_FPTR_PRINTF(5,0)
2538 gcc_assert (s_singleton);
2539 /* Avoid exgettext from picking this up, it is translated in libcpp. */
2540 const char *msg = "conversion from %s to %s not supported by iconv";
2541 #ifdef ENABLE_NLS
2542 msg = dgettext ("cpplib", msg);
2543 #endif
2544 /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2545 when the local iconv build doesn't support the conversion. */
2546 if (strcmp (msgid, msg) == 0)
2548 s_singleton->m_num_iconv_errors++;
2549 return true;
2552 /* Otherwise, we have an unexpected error. */
2553 abort ();
2556 bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2558 private:
2559 static ebcdic_execution_charset *s_singleton;
2560 int m_num_iconv_errors;
2563 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2565 /* A lexer_test_options subclass that records a list of diagnostic
2566 messages emitted by the lexer. */
2568 class lexer_diagnostic_sink : public lexer_test_options
2570 public:
2571 lexer_diagnostic_sink ()
2573 gcc_assert (s_singleton == NULL);
2574 s_singleton = this;
2576 ~lexer_diagnostic_sink ()
2578 gcc_assert (s_singleton == this);
2579 s_singleton = NULL;
2581 int i;
2582 char *str;
2583 FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2584 free (str);
2587 void apply (lexer_test &test) final override
2589 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2590 callbacks->diagnostic = on_diagnostic;
2593 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2594 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2595 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2596 rich_location *richloc ATTRIBUTE_UNUSED,
2597 const char *msgid, va_list *ap)
2598 ATTRIBUTE_FPTR_PRINTF(5,0)
2600 char *msg = xvasprintf (msgid, *ap);
2601 s_singleton->m_diagnostics.safe_push (msg);
2602 return true;
2605 auto_vec<char *> m_diagnostics;
2607 private:
2608 static lexer_diagnostic_sink *s_singleton;
2611 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2613 /* Constructor. Override line_table with a new instance based on CASE_,
2614 and write CONTENT to a tempfile. Create a cpp_reader, and use it to
2615 start parsing the tempfile. */
2617 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2618 lexer_test_options *options)
2619 : m_ltt (case_),
2620 m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2621 /* Create a tempfile and write the text to it. */
2622 m_tempfile (SELFTEST_LOCATION, ".c", content),
2623 m_concats (),
2624 m_implicitly_expect_EOF (true)
2626 if (options)
2627 options->apply (*this);
2629 cpp_init_iconv (m_parser);
2631 /* Parse the file. */
2632 const char *fname = cpp_read_main_file (m_parser,
2633 m_tempfile.get_filename ());
2634 ASSERT_NE (fname, NULL);
2637 /* Destructor. By default, verify that the next token in m_parser is EOF. */
2639 lexer_test::~lexer_test ()
2641 location_t loc;
2642 const cpp_token *tok;
2644 if (m_implicitly_expect_EOF)
2646 tok = cpp_get_token_with_location (m_parser, &loc);
2647 ASSERT_NE (tok, NULL);
2648 ASSERT_EQ (tok->type, CPP_EOF);
2652 /* Get the next token from m_parser. */
2654 const cpp_token *
2655 lexer_test::get_token ()
2657 location_t loc;
2658 const cpp_token *tok;
2660 tok = cpp_get_token_with_location (m_parser, &loc);
2661 ASSERT_NE (tok, NULL);
2662 return tok;
2665 /* Verify that locations within string literals are correctly handled. */
2667 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2668 using the string concatenation database for TEST.
2670 Assert that the character at index IDX is on EXPECTED_LINE,
2671 and that it begins at column EXPECTED_START_COL and ends at
2672 EXPECTED_FINISH_COL (unless the locations are beyond
2673 LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2674 columns). */
2676 static void
2677 assert_char_at_range (const location &loc,
2678 lexer_test& test,
2679 location_t strloc, enum cpp_ttype type, int idx,
2680 int expected_line, int expected_start_col,
2681 int expected_finish_col)
2683 cpp_reader *pfile = test.m_parser;
2684 string_concat_db *concats = &test.m_concats;
2686 source_range actual_range = source_range();
2687 const char *err
2688 = get_source_range_for_char (pfile, concats, strloc, type, idx,
2689 &actual_range);
2690 if (should_have_column_data_p (strloc))
2691 ASSERT_EQ_AT (loc, NULL, err);
2692 else
2694 ASSERT_STREQ_AT (loc,
2695 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2696 err);
2697 return;
2700 int actual_start_line = LOCATION_LINE (actual_range.m_start);
2701 ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2702 int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2703 ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2705 if (should_have_column_data_p (actual_range.m_start))
2707 int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2708 ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2710 if (should_have_column_data_p (actual_range.m_finish))
2712 int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2713 ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
/* Call assert_char_at_range, passing SELFTEST_LOCATION as the
   effective location of any errors.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
			     EXPECTED_START_COL, EXPECTED_FINISH_COL)      \
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
			(IDX), (EXPECTED_LINE), (EXPECTED_START_COL),      \
			(EXPECTED_FINISH_COL))
2726 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2727 using the string concatenation database for TEST.
2729 Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES. */
2731 static void
2732 assert_num_substring_ranges (const location &loc,
2733 lexer_test& test,
2734 location_t strloc,
2735 enum cpp_ttype type,
2736 int expected_num_ranges)
2738 cpp_reader *pfile = test.m_parser;
2739 string_concat_db *concats = &test.m_concats;
2741 int actual_num_ranges = -1;
2742 const char *err
2743 = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2744 &actual_num_ranges);
2745 if (should_have_column_data_p (strloc))
2746 ASSERT_EQ_AT (loc, NULL, err);
2747 else
2749 ASSERT_STREQ_AT (loc,
2750 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2751 err);
2752 return;
2754 ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
/* Call assert_num_substring_ranges, passing SELFTEST_LOCATION as the
   effective location of any errors.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,             \
				    EXPECTED_NUM_RANGES)                  \
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
			       (TYPE), (EXPECTED_NUM_RANGES))
2766 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2767 returns an error (using the string concatenation database for TEST). */
2769 static void
2770 assert_has_no_substring_ranges (const location &loc,
2771 lexer_test& test,
2772 location_t strloc,
2773 enum cpp_ttype type,
2774 const char *expected_err)
2776 cpp_reader *pfile = test.m_parser;
2777 string_concat_db *concats = &test.m_concats;
2778 cpp_substring_ranges ranges;
2779 const char *actual_err
2780 = get_substring_ranges_for_loc (pfile, concats, strloc,
2781 type, ranges);
2782 if (should_have_column_data_p (strloc))
2783 ASSERT_STREQ_AT (loc, expected_err, actual_err);
2784 else
2785 ASSERT_STREQ_AT (loc,
2786 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2787 actual_err);
/* Call assert_has_no_substring_ranges, passing SELFTEST_LOCATION as the
   effective location of any errors.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)   \
  assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST),      \
				  (STRLOC), (TYPE), (ERR))
2794 /* Lex a simple string literal. Verify the substring location data, before
2795 and after running cpp_interpret_string on it. */
2797 static void
2798 test_lexer_string_locations_simple (const line_table_case &case_)
2800 /* Digits 0-9 (with 0 at column 10), the simple way.
2801 ....................000000000.11111111112.2222222223333333333
2802 ....................123456789.01234567890.1234567890123456789
2803 We add a trailing comment to ensure that we correctly locate
2804 the end of the string literal token. */
2805 const char *content = " \"0123456789\" /* not a string */\n";
2806 lexer_test test (case_, content, NULL);
2808 /* Verify that we get the expected token back, with the correct
2809 location information. */
2810 const cpp_token *tok = test.get_token ();
2811 ASSERT_EQ (tok->type, CPP_STRING);
2812 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2813 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2815 /* At this point in lexing, the quote characters are treated as part of
2816 the string (they are stripped off by cpp_interpret_string). */
2818 ASSERT_EQ (tok->val.str.len, 12);
2820 /* Verify that cpp_interpret_string works. */
2821 cpp_string dst_string;
2822 const enum cpp_ttype type = CPP_STRING;
2823 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2824 &dst_string, type);
2825 ASSERT_TRUE (result);
2826 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2827 free (const_cast <unsigned char *> (dst_string.text));
2829 /* Verify ranges of individual characters. This no longer includes the
2830 opening quote, but does include the closing quote. */
2831 for (int i = 0; i <= 10; i++)
2832 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2833 10 + i, 10 + i);
2835 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2838 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2839 encoding. */
2841 static void
2842 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2844 /* EBCDIC support requires iconv. */
2845 if (!HAVE_ICONV)
2846 return;
2848 /* Digits 0-9 (with 0 at column 10), the simple way.
2849 ....................000000000.11111111112.2222222223333333333
2850 ....................123456789.01234567890.1234567890123456789
2851 We add a trailing comment to ensure that we correctly locate
2852 the end of the string literal token. */
2853 const char *content = " \"0123456789\" /* not a string */\n";
2854 ebcdic_execution_charset use_ebcdic;
2855 lexer_test test (case_, content, &use_ebcdic);
2857 /* Verify that we get the expected token back, with the correct
2858 location information. */
2859 const cpp_token *tok = test.get_token ();
2860 ASSERT_EQ (tok->type, CPP_STRING);
2861 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2862 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2864 /* At this point in lexing, the quote characters are treated as part of
2865 the string (they are stripped off by cpp_interpret_string). */
2867 ASSERT_EQ (tok->val.str.len, 12);
2869 /* The remainder of the test requires an iconv implementation that
2870 can convert from UTF-8 to the EBCDIC encoding requested above. */
2871 if (use_ebcdic.iconv_errors_occurred_p ())
2872 return;
2874 /* Verify that cpp_interpret_string works. */
2875 cpp_string dst_string;
2876 const enum cpp_ttype type = CPP_STRING;
2877 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2878 &dst_string, type);
2879 ASSERT_TRUE (result);
2880 /* We should now have EBCDIC-encoded text, specifically
2881 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2882 The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
2883 ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2884 (const char *)dst_string.text);
2885 free (const_cast <unsigned char *> (dst_string.text));
2887 /* Verify that we don't attempt to record substring location information
2888 for such cases. */
2889 ASSERT_HAS_NO_SUBSTRING_RANGES
2890 (test, tok->src_loc, type,
2891 "execution character set != source character set");
2894 /* Lex a string literal containing a hex-escaped character.
2895 Verify the substring location data, before and after running
2896 cpp_interpret_string on it. */
2898 static void
2899 test_lexer_string_locations_hex (const line_table_case &case_)
2901 /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2902 and with a space in place of digit 6, to terminate the escaped
2903 hex code.
2904 ....................000000000.111111.11112222.
2905 ....................123456789.012345.67890123. */
2906 const char *content = " \"01234\\x35 789\"\n";
2907 lexer_test test (case_, content, NULL);
2909 /* Verify that we get the expected token back, with the correct
2910 location information. */
2911 const cpp_token *tok = test.get_token ();
2912 ASSERT_EQ (tok->type, CPP_STRING);
2913 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2914 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2916 /* At this point in lexing, the quote characters are treated as part of
2917 the string (they are stripped off by cpp_interpret_string). */
2918 ASSERT_EQ (tok->val.str.len, 15);
2920 /* Verify that cpp_interpret_string works. */
2921 cpp_string dst_string;
2922 const enum cpp_ttype type = CPP_STRING;
2923 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2924 &dst_string, type);
2925 ASSERT_TRUE (result);
2926 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2927 free (const_cast <unsigned char *> (dst_string.text));
2929 /* Verify ranges of individual characters. This no longer includes the
2930 opening quote, but does include the closing quote. */
2931 for (int i = 0; i <= 4; i++)
2932 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2933 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2934 for (int i = 6; i <= 10; i++)
2935 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2937 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2940 /* Lex a string literal containing an octal-escaped character.
2941 Verify the substring location data after running cpp_interpret_string
2942 on it. */
2944 static void
2945 test_lexer_string_locations_oct (const line_table_case &case_)
2947 /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2948 and with a space in place of digit 6, to terminate the escaped
2949 octal code.
2950 ....................000000000.111111.11112222.2222223333333333444
2951 ....................123456789.012345.67890123.4567890123456789012 */
2952 const char *content = " \"01234\\065 789\" /* not a string */\n";
2953 lexer_test test (case_, content, NULL);
2955 /* Verify that we get the expected token back, with the correct
2956 location information. */
2957 const cpp_token *tok = test.get_token ();
2958 ASSERT_EQ (tok->type, CPP_STRING);
2959 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2961 /* Verify that cpp_interpret_string works. */
2962 cpp_string dst_string;
2963 const enum cpp_ttype type = CPP_STRING;
2964 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2965 &dst_string, type);
2966 ASSERT_TRUE (result);
2967 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2968 free (const_cast <unsigned char *> (dst_string.text));
2970 /* Verify ranges of individual characters. This no longer includes the
2971 opening quote, but does include the closing quote. */
2972 for (int i = 0; i < 5; i++)
2973 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2974 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2975 for (int i = 6; i <= 10; i++)
2976 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2978 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2981 /* Test of string literal containing letter escapes. */
2983 static void
2984 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2986 /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2987 .....................000000000.1.11111.1.1.11222.22222223333333
2988 .....................123456789.0.12345.6.7.89012.34567890123456. */
2989 const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2990 lexer_test test (case_, content, NULL);
2992 /* Verify that we get the expected tokens back. */
2993 const cpp_token *tok = test.get_token ();
2994 ASSERT_EQ (tok->type, CPP_STRING);
2995 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2997 /* Verify ranges of individual characters. */
2998 /* "\t". */
2999 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3000 0, 1, 10, 11);
3001 /* "foo". */
3002 for (int i = 1; i <= 3; i++)
3003 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3004 i, 1, 11 + i, 11 + i);
3005 /* "\\" and "\n". */
3006 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3007 4, 1, 15, 16);
3008 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3009 5, 1, 17, 18);
3011 /* "bar" and closing quote for nul-terminator. */
3012 for (int i = 6; i <= 9; i++)
3013 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3014 i, 1, 13 + i, 13 + i);
3016 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
3019 /* Another test of a string literal containing a letter escape.
3020 Based on string seen in
3021 printf ("%-%\n");
3022 in gcc.dg/format/c90-printf-1.c. */
3024 static void
3025 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
3027 /* .....................000000000.1111.11.1111.22222222223.
3028 .....................123456789.0123.45.6789.01234567890. */
3029 const char *content = (" \"%-%\\n\" /* non-str */\n");
3030 lexer_test test (case_, content, NULL);
3032 /* Verify that we get the expected tokens back. */
3033 const cpp_token *tok = test.get_token ();
3034 ASSERT_EQ (tok->type, CPP_STRING);
3035 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
3037 /* Verify ranges of individual characters. */
3038 /* "%-%". */
3039 for (int i = 0; i < 3; i++)
3040 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3041 i, 1, 10 + i, 10 + i);
3042 /* "\n". */
3043 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3044 3, 1, 13, 14);
3046 /* Closing quote for nul-terminator. */
3047 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3048 4, 1, 15, 15);
3050 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3053 /* Lex a string literal containing UCN 4 characters.
3054 Verify the substring location data after running cpp_interpret_string
3055 on it. */
3057 static void
3058 test_lexer_string_locations_ucn4 (const line_table_case &case_)
3060 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3061 as UCN 4.
3062 ....................000000000.111111.111122.222222223.33333333344444
3063 ....................123456789.012345.678901.234567890.12345678901234 */
3064 const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
3065 lexer_test test (case_, content, NULL);
3067 /* Verify that we get the expected token back, with the correct
3068 location information. */
3069 const cpp_token *tok = test.get_token ();
3070 ASSERT_EQ (tok->type, CPP_STRING);
3071 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3073 /* Verify that cpp_interpret_string works.
3074 The string should be encoded in the execution character
3075 set. Assuming that is UTF-8, we should have the following:
3076 ----------- ---- ----- ------- ----------------
3077 Byte offset Byte Octal Unicode Source Column(s)
3078 ----------- ---- ----- ------- ----------------
3079 0 0x30 '0' 10
3080 1 0x31 '1' 11
3081 2 0x32 '2' 12
3082 3 0x33 '3' 13
3083 4 0x34 '4' 14
3084 5 0xE2 \342 U+2174 15-20
3085 6 0x85 \205 (cont) 15-20
3086 7 0xB4 \264 (cont) 15-20
3087 8 0xE2 \342 U+2175 21-26
3088 9 0x85 \205 (cont) 21-26
3089 10 0xB5 \265 (cont) 21-26
3090 11 0x37 '7' 27
3091 12 0x38 '8' 28
3092 13 0x39 '9' 29
3093 14 0x00 30 (closing quote)
3094 ----------- ---- ----- ------- ---------------. */
3096 cpp_string dst_string;
3097 const enum cpp_ttype type = CPP_STRING;
3098 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3099 &dst_string, type);
3100 ASSERT_TRUE (result);
3101 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3102 (const char *)dst_string.text);
3103 free (const_cast <unsigned char *> (dst_string.text));
3105 /* Verify ranges of individual characters. This no longer includes the
3106 opening quote, but does include the closing quote.
3107 '01234'. */
3108 for (int i = 0; i <= 4; i++)
3109 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3110 /* U+2174. */
3111 for (int i = 5; i <= 7; i++)
3112 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3113 /* U+2175. */
3114 for (int i = 8; i <= 10; i++)
3115 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3116 /* '789' and nul terminator */
3117 for (int i = 11; i <= 14; i++)
3118 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
3120 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3123 /* Lex a string literal containing UCN 8 characters.
3124 Verify the substring location data after running cpp_interpret_string
3125 on it. */
3127 static void
3128 test_lexer_string_locations_ucn8 (const line_table_case &case_)
3130 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3131 ....................000000000.111111.1111222222.2222333333333.344444
3132 ....................123456789.012345.6789012345.6789012345678.901234 */
3133 const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
3134 lexer_test test (case_, content, NULL);
3136 /* Verify that we get the expected token back, with the correct
3137 location information. */
3138 const cpp_token *tok = test.get_token ();
3139 ASSERT_EQ (tok->type, CPP_STRING);
3140 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3141 "\"01234\\U00002174\\U00002175789\"");
3143 /* Verify that cpp_interpret_string works.
3144 The UTF-8 encoding of the string is identical to that from
3145 the ucn4 testcase above; the only difference is the column
3146 locations. */
3147 cpp_string dst_string;
3148 const enum cpp_ttype type = CPP_STRING;
3149 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3150 &dst_string, type);
3151 ASSERT_TRUE (result);
3152 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3153 (const char *)dst_string.text);
3154 free (const_cast <unsigned char *> (dst_string.text));
3156 /* Verify ranges of individual characters. This no longer includes the
3157 opening quote, but does include the closing quote.
3158 '01234'. */
3159 for (int i = 0; i <= 4; i++)
3160 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3161 /* U+2174. */
3162 for (int i = 5; i <= 7; i++)
3163 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3164 /* U+2175. */
3165 for (int i = 8; i <= 10; i++)
3166 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3167 /* '789' at columns 35-37 */
3168 for (int i = 11; i <= 13; i++)
3169 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3170 /* Closing quote/nul-terminator at column 38. */
3171 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3173 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
/* Read the four bytes at PTR_BE_VALUE as a big-endian (most significant
   byte first) 32-bit quantity and return it in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t host_value = 0;
  /* Fold in one byte at a time, most significant first.  */
  for (int idx = 0; idx < 4; idx++)
    host_value = (host_value << 8) | (uint32_t) bytes[idx];
  return host_value;
}
3188 /* Lex a wide string literal and verify that attempts to read substring
3189 location data from it fail gracefully. */
3191 static void
3192 test_lexer_string_locations_wide_string (const line_table_case &case_)
3194 /* Digits 0-9.
3195 ....................000000000.11111111112.22222222233333
3196 ....................123456789.01234567890.12345678901234 */
3197 const char *content = " L\"0123456789\" /* non-str */\n";
3198 lexer_test test (case_, content, NULL);
3200 /* Verify that we get the expected token back, with the correct
3201 location information. */
3202 const cpp_token *tok = test.get_token ();
3203 ASSERT_EQ (tok->type, CPP_WSTRING);
3204 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3206 /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
3207 cpp_string dst_string;
3208 const enum cpp_ttype type = CPP_WSTRING;
3209 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3210 &dst_string, type);
3211 ASSERT_TRUE (result);
3212 /* The cpp_reader defaults to big-endian with
3213 CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3214 now be encoded as UTF-32BE. */
3215 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3216 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3217 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3218 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3219 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3220 free (const_cast <unsigned char *> (dst_string.text));
3222 /* We don't yet support generating substring location information
3223 for L"" strings. */
3224 ASSERT_HAS_NO_SUBSTRING_RANGES
3225 (test, tok->src_loc, type,
3226 "execution character set != source character set");
/* Read the two bytes at PTR_BE_VALUE as a big-endian (most significant
   byte first) 16-bit quantity and return it in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const uint16_t high_octet = bytes[0];
  const uint16_t low_octet = bytes[1];
  return (high_octet << 8) | low_octet;
}
3238 /* Lex a u"" string literal and verify that attempts to read substring
3239 location data from it fail gracefully. */
3241 static void
3242 test_lexer_string_locations_string16 (const line_table_case &case_)
3244 /* Digits 0-9.
3245 ....................000000000.11111111112.22222222233333
3246 ....................123456789.01234567890.12345678901234 */
3247 const char *content = " u\"0123456789\" /* non-str */\n";
3248 lexer_test test (case_, content, NULL);
3250 /* Verify that we get the expected token back, with the correct
3251 location information. */
3252 const cpp_token *tok = test.get_token ();
3253 ASSERT_EQ (tok->type, CPP_STRING16);
3254 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3256 /* Verify that cpp_interpret_string works, using CPP_STRING16. */
3257 cpp_string dst_string;
3258 const enum cpp_ttype type = CPP_STRING16;
3259 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3260 &dst_string, type);
3261 ASSERT_TRUE (result);
3263 /* The cpp_reader defaults to big-endian, so dst_string should
3264 now be encoded as UTF-16BE. */
3265 const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3266 ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3267 ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3268 ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3269 ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3270 free (const_cast <unsigned char *> (dst_string.text));
3272 /* We don't yet support generating substring location information
3273 for L"" strings. */
3274 ASSERT_HAS_NO_SUBSTRING_RANGES
3275 (test, tok->src_loc, type,
3276 "execution character set != source character set");
3279 /* Lex a U"" string literal and verify that attempts to read substring
3280 location data from it fail gracefully. */
3282 static void
3283 test_lexer_string_locations_string32 (const line_table_case &case_)
3285 /* Digits 0-9.
3286 ....................000000000.11111111112.22222222233333
3287 ....................123456789.01234567890.12345678901234 */
3288 const char *content = " U\"0123456789\" /* non-str */\n";
3289 lexer_test test (case_, content, NULL);
3291 /* Verify that we get the expected token back, with the correct
3292 location information. */
3293 const cpp_token *tok = test.get_token ();
3294 ASSERT_EQ (tok->type, CPP_STRING32);
3295 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3297 /* Verify that cpp_interpret_string works, using CPP_STRING32. */
3298 cpp_string dst_string;
3299 const enum cpp_ttype type = CPP_STRING32;
3300 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3301 &dst_string, type);
3302 ASSERT_TRUE (result);
3304 /* The cpp_reader defaults to big-endian, so dst_string should
3305 now be encoded as UTF-32BE. */
3306 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3307 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3308 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3309 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3310 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3311 free (const_cast <unsigned char *> (dst_string.text));
3313 /* We don't yet support generating substring location information
3314 for L"" strings. */
3315 ASSERT_HAS_NO_SUBSTRING_RANGES
3316 (test, tok->src_loc, type,
3317 "execution character set != source character set");
3320 /* Lex a u8-string literal.
3321 Verify the substring location data after running cpp_interpret_string
3322 on it. */
3324 static void
3325 test_lexer_string_locations_u8 (const line_table_case &case_)
3327 /* Digits 0-9.
3328 ....................000000000.11111111112.22222222233333
3329 ....................123456789.01234567890.12345678901234 */
3330 const char *content = " u8\"0123456789\" /* non-str */\n";
3331 lexer_test test (case_, content, NULL);
3333 /* Verify that we get the expected token back, with the correct
3334 location information. */
3335 const cpp_token *tok = test.get_token ();
3336 ASSERT_EQ (tok->type, CPP_UTF8STRING);
3337 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3339 /* Verify that cpp_interpret_string works. */
3340 cpp_string dst_string;
3341 const enum cpp_ttype type = CPP_STRING;
3342 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3343 &dst_string, type);
3344 ASSERT_TRUE (result);
3345 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3346 free (const_cast <unsigned char *> (dst_string.text));
3348 /* Verify ranges of individual characters. This no longer includes the
3349 opening quote, but does include the closing quote. */
3350 for (int i = 0; i <= 10; i++)
3351 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3354 /* Lex a string literal containing UTF-8 source characters.
3355 Verify the substring location data after running cpp_interpret_string
3356 on it. */
3358 static void
3359 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3361 /* This string literal is written out to the source file as UTF-8,
3362 and is of the form "before mojibake after", where "mojibake"
3363 is written as the following four unicode code points:
3364 U+6587 CJK UNIFIED IDEOGRAPH-6587
3365 U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3366 U+5316 CJK UNIFIED IDEOGRAPH-5316
3367 U+3051 HIRAGANA LETTER KE.
3368 Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3369 "before" and "after" are 1 byte per unicode character.
3371 The numbering shown are "columns", which are *byte* numbers within
3372 the line, rather than unicode character numbers.
3374 .................... 000000000.1111111.
3375 .................... 123456789.0123456. */
3376 const char *content = (" \"before "
3377 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3378 UTF-8: 0xE6 0x96 0x87
3379 C octal escaped UTF-8: \346\226\207
3380 "column" numbers: 17-19. */
3381 "\346\226\207"
3383 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3384 UTF-8: 0xE5 0xAD 0x97
3385 C octal escaped UTF-8: \345\255\227
3386 "column" numbers: 20-22. */
3387 "\345\255\227"
3389 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3390 UTF-8: 0xE5 0x8C 0x96
3391 C octal escaped UTF-8: \345\214\226
3392 "column" numbers: 23-25. */
3393 "\345\214\226"
3395 /* U+3051 HIRAGANA LETTER KE
3396 UTF-8: 0xE3 0x81 0x91
3397 C octal escaped UTF-8: \343\201\221
3398 "column" numbers: 26-28. */
3399 "\343\201\221"
3401 /* column numbers 29 onwards
3402 2333333.33334444444444
3403 9012345.67890123456789. */
3404 " after\" /* non-str */\n");
3405 lexer_test test (case_, content, NULL);
3407 /* Verify that we get the expected token back, with the correct
3408 location information. */
3409 const cpp_token *tok = test.get_token ();
3410 ASSERT_EQ (tok->type, CPP_STRING);
3411 ASSERT_TOKEN_AS_TEXT_EQ
3412 (test.m_parser, tok,
3413 "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3415 /* Verify that cpp_interpret_string works. */
3416 cpp_string dst_string;
3417 const enum cpp_ttype type = CPP_STRING;
3418 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3419 &dst_string, type);
3420 ASSERT_TRUE (result);
3421 ASSERT_STREQ
3422 ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3423 (const char *)dst_string.text);
3424 free (const_cast <unsigned char *> (dst_string.text));
3426 /* Verify ranges of individual characters. This no longer includes the
3427 opening quote, but does include the closing quote.
3428 Assuming that both source and execution encodings are UTF-8, we have
3429 a run of 25 octets in each, plus the NUL terminator. */
3430 for (int i = 0; i < 25; i++)
3431 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3432 /* NUL-terminator should use the closing quote at column 35. */
3433 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3435 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3438 /* Test of string literal concatenation. */
3440 static void
3441 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3443 /* Digits 0-9.
3444 .....................000000000.111111.11112222222222
3445 .....................123456789.012345.67890123456789. */
3446 const char *content = (" \"01234\" /* non-str */\n"
3447 " \"56789\" /* non-str */\n");
3448 lexer_test test (case_, content, NULL);
3450 location_t input_locs[2];
3452 /* Verify that we get the expected tokens back. */
3453 auto_vec <cpp_string> input_strings;
3454 const cpp_token *tok_a = test.get_token ();
3455 ASSERT_EQ (tok_a->type, CPP_STRING);
3456 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3457 input_strings.safe_push (tok_a->val.str);
3458 input_locs[0] = tok_a->src_loc;
3460 const cpp_token *tok_b = test.get_token ();
3461 ASSERT_EQ (tok_b->type, CPP_STRING);
3462 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3463 input_strings.safe_push (tok_b->val.str);
3464 input_locs[1] = tok_b->src_loc;
3466 /* Verify that cpp_interpret_string works. */
3467 cpp_string dst_string;
3468 const enum cpp_ttype type = CPP_STRING;
3469 bool result = cpp_interpret_string (test.m_parser,
3470 input_strings.address (), 2,
3471 &dst_string, type);
3472 ASSERT_TRUE (result);
3473 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3474 free (const_cast <unsigned char *> (dst_string.text));
3476 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3477 test.m_concats.record_string_concatenation (2, input_locs);
3479 location_t initial_loc = input_locs[0];
3481 /* "01234" on line 1. */
3482 for (int i = 0; i <= 4; i++)
3483 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3484 /* "56789" in line 2, plus its closing quote for the nul terminator. */
3485 for (int i = 5; i <= 10; i++)
3486 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3488 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3491 /* Another test of string literal concatenation. */
3493 static void
3494 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3496 /* Digits 0-9.
3497 .....................000000000.111.11111112222222
3498 .....................123456789.012.34567890123456. */
3499 const char *content = (" \"01\" /* non-str */\n"
3500 " \"23\" /* non-str */\n"
3501 " \"45\" /* non-str */\n"
3502 " \"67\" /* non-str */\n"
3503 " \"89\" /* non-str */\n");
3504 lexer_test test (case_, content, NULL);
3506 auto_vec <cpp_string> input_strings;
3507 location_t input_locs[5];
3509 /* Verify that we get the expected tokens back. */
3510 for (int i = 0; i < 5; i++)
3512 const cpp_token *tok = test.get_token ();
3513 ASSERT_EQ (tok->type, CPP_STRING);
3514 input_strings.safe_push (tok->val.str);
3515 input_locs[i] = tok->src_loc;
3518 /* Verify that cpp_interpret_string works. */
3519 cpp_string dst_string;
3520 const enum cpp_ttype type = CPP_STRING;
3521 bool result = cpp_interpret_string (test.m_parser,
3522 input_strings.address (), 5,
3523 &dst_string, type);
3524 ASSERT_TRUE (result);
3525 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3526 free (const_cast <unsigned char *> (dst_string.text));
3528 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3529 test.m_concats.record_string_concatenation (5, input_locs);
3531 location_t initial_loc = input_locs[0];
3533 /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3534 detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3535 and expect get_source_range_for_substring to fail.
3536 However, for a string concatenation test, we can have a case
3537 where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3538 but subsequent strings can be after it.
3539 Attempting to detect this within assert_char_at_range
3540 would overcomplicate the logic for the common test cases, so
3541 we detect it here. */
3542 if (should_have_column_data_p (input_locs[0])
3543 && !should_have_column_data_p (input_locs[4]))
3545 /* Verify that get_source_range_for_substring gracefully rejects
3546 this case. */
3547 source_range actual_range;
3548 const char *err
3549 = get_source_range_for_char (test.m_parser, &test.m_concats,
3550 initial_loc, type, 0, &actual_range);
3551 ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3552 return;
3555 for (int i = 0; i < 5; i++)
3556 for (int j = 0; j < 2; j++)
3557 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3558 i + 1, 10 + j, 10 + j);
3560 /* NUL-terminator should use the final closing quote at line 5 column 12. */
3561 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3563 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3566 /* Another test of string literal concatenation, this time combined with
3567 various kinds of escaped characters. */
3569 static void
3570 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3572 /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3573 digit 6 in ASCII as octal "\066", concatenating multiple strings. */
3574 const char *content
3575 /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3576 .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3577 = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
3578 lexer_test test (case_, content, NULL);
3580 auto_vec <cpp_string> input_strings;
3581 location_t input_locs[4];
3583 /* Verify that we get the expected tokens back. */
3584 for (int i = 0; i < 4; i++)
3586 const cpp_token *tok = test.get_token ();
3587 ASSERT_EQ (tok->type, CPP_STRING);
3588 input_strings.safe_push (tok->val.str);
3589 input_locs[i] = tok->src_loc;
3592 /* Verify that cpp_interpret_string works. */
3593 cpp_string dst_string;
3594 const enum cpp_ttype type = CPP_STRING;
3595 bool result = cpp_interpret_string (test.m_parser,
3596 input_strings.address (), 4,
3597 &dst_string, type);
3598 ASSERT_TRUE (result);
3599 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3600 free (const_cast <unsigned char *> (dst_string.text));
3602 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3603 test.m_concats.record_string_concatenation (4, input_locs);
3605 location_t initial_loc = input_locs[0];
3607 for (int i = 0; i <= 4; i++)
3608 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3609 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3610 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3611 for (int i = 7; i <= 9; i++)
3612 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3614 /* NUL-terminator should use the location of the final closing quote. */
3615 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3617 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3620 /* Test of string literal in a macro. */
3622 static void
3623 test_lexer_string_locations_macro (const line_table_case &case_)
3625 /* Digits 0-9.
3626 .....................0000000001111111111.22222222223.
3627 .....................1234567890123456789.01234567890. */
3628 const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
3629 " MACRO");
3630 lexer_test test (case_, content, NULL);
3632 /* Verify that we get the expected tokens back. */
3633 const cpp_token *tok = test.get_token ();
3634 ASSERT_EQ (tok->type, CPP_PADDING);
3636 tok = test.get_token ();
3637 ASSERT_EQ (tok->type, CPP_STRING);
3638 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3640 /* Verify ranges of individual characters. We ought to
3641 see columns within the macro definition. */
3642 for (int i = 0; i <= 10; i++)
3643 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3644 i, 1, 20 + i, 20 + i);
3646 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3648 tok = test.get_token ();
3649 ASSERT_EQ (tok->type, CPP_PADDING);
3652 /* Test of stringification of a macro argument. */
3654 static void
3655 test_lexer_string_locations_stringified_macro_argument
3656 (const line_table_case &case_)
3658 /* .....................000000000111111111122222222223.
3659 .....................123456789012345678901234567890. */
3660 const char *content = ("#define MACRO(X) #X /* non-str */\n"
3661 "MACRO(foo)\n");
3662 lexer_test test (case_, content, NULL);
3664 /* Verify that we get the expected token back. */
3665 const cpp_token *tok = test.get_token ();
3666 ASSERT_EQ (tok->type, CPP_PADDING);
3668 tok = test.get_token ();
3669 ASSERT_EQ (tok->type, CPP_STRING);
3670 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3672 /* We don't support getting the location of a stringified macro
3673 argument. Verify that it fails gracefully. */
3674 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3675 "cpp_interpret_string_1 failed");
3677 tok = test.get_token ();
3678 ASSERT_EQ (tok->type, CPP_PADDING);
3680 tok = test.get_token ();
3681 ASSERT_EQ (tok->type, CPP_PADDING);
3684 /* Ensure that we are fail gracefully if something attempts to pass
3685 in a location that isn't a string literal token. Seen on this code:
3687 const char a[] = " %d ";
3688 __builtin_printf (a, 0.5);
3691 when c-format.cc erroneously used the indicated one-character
3692 location as the format string location, leading to a read past the
3693 end of a string buffer in cpp_interpret_string_1. */
3695 static void
3696 test_lexer_string_locations_non_string (const line_table_case &case_)
3698 /* .....................000000000111111111122222222223.
3699 .....................123456789012345678901234567890. */
3700 const char *content = (" a\n");
3701 lexer_test test (case_, content, NULL);
3703 /* Verify that we get the expected token back. */
3704 const cpp_token *tok = test.get_token ();
3705 ASSERT_EQ (tok->type, CPP_NAME);
3706 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3708 /* At this point, libcpp is attempting to interpret the name as a
3709 string literal, despite it not starting with a quote. We don't detect
3710 that, but we should at least fail gracefully. */
3711 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3712 "cpp_interpret_string_1 failed");
3715 /* Ensure that we can read substring information for a token which
3716 starts in one linemap and ends in another . Adapted from
3717 gcc.dg/cpp/pr69985.c. */
3719 static void
3720 test_lexer_string_locations_long_line (const line_table_case &case_)
3722 /* .....................000000.000111111111
3723 .....................123456.789012346789. */
3724 const char *content = ("/* A very long line, so that we start a new line map. */\n"
3725 " \"0123456789012345678901234567890123456789"
3726 "0123456789012345678901234567890123456789"
3727 "0123456789012345678901234567890123456789"
3728 "0123456789\"\n");
3730 lexer_test test (case_, content, NULL);
3732 /* Verify that we get the expected token back. */
3733 const cpp_token *tok = test.get_token ();
3734 ASSERT_EQ (tok->type, CPP_STRING);
3736 if (!should_have_column_data_p (line_table->highest_location))
3737 return;
3739 /* Verify ranges of individual characters. */
3740 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3741 for (int i = 0; i < 131; i++)
3742 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3743 i, 2, 7 + i, 7 + i);
3746 /* Test of locations within a raw string that doesn't contain a newline. */
3748 static void
3749 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3751 /* .....................00.0000000111111111122.
3752 .....................12.3456789012345678901. */
3753 const char *content = ("R\"foo(0123456789)foo\"\n");
3754 lexer_test test (case_, content, NULL);
3756 /* Verify that we get the expected token back. */
3757 const cpp_token *tok = test.get_token ();
3758 ASSERT_EQ (tok->type, CPP_STRING);
3760 /* Verify that cpp_interpret_string works. */
3761 cpp_string dst_string;
3762 const enum cpp_ttype type = CPP_STRING;
3763 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3764 &dst_string, type);
3765 ASSERT_TRUE (result);
3766 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3767 free (const_cast <unsigned char *> (dst_string.text));
3769 if (!should_have_column_data_p (line_table->highest_location))
3770 return;
3772 /* 0-9, plus the nil terminator. */
3773 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3774 for (int i = 0; i < 11; i++)
3775 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3776 i, 1, 7 + i, 7 + i);
3779 /* Test of locations within a raw string that contains a newline. */
3781 static void
3782 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3784 /* .....................00.0000.
3785 .....................12.3456. */
3786 const char *content = ("R\"foo(\n"
3787 /* .....................00000.
3788 .....................12345. */
3789 "hello\n"
3790 "world\n"
3791 /* .....................00000.
3792 .....................12345. */
3793 ")foo\"\n");
3794 lexer_test test (case_, content, NULL);
3796 /* Verify that we get the expected token back. */
3797 const cpp_token *tok = test.get_token ();
3798 ASSERT_EQ (tok->type, CPP_STRING);
3800 /* Verify that cpp_interpret_string works. */
3801 cpp_string dst_string;
3802 const enum cpp_ttype type = CPP_STRING;
3803 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3804 &dst_string, type);
3805 ASSERT_TRUE (result);
3806 ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3807 free (const_cast <unsigned char *> (dst_string.text));
3809 if (!should_have_column_data_p (line_table->highest_location))
3810 return;
3812 /* Currently we don't support locations within raw strings that
3813 contain newlines. */
3814 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3815 "range endpoints are on different lines");
3818 /* Test of parsing an unterminated raw string. */
3820 static void
3821 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3823 const char *content = "R\"ouch()ouCh\" /* etc */";
3825 lexer_diagnostic_sink diagnostics;
3826 lexer_test test (case_, content, &diagnostics);
3827 test.m_implicitly_expect_EOF = false;
3829 /* Attempt to parse the raw string. */
3830 const cpp_token *tok = test.get_token ();
3831 ASSERT_EQ (tok->type, CPP_EOF);
3833 ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3834 /* We expect the message "unterminated raw string"
3835 in the "cpplib" translation domain.
3836 It's not clear that dgettext is available on all supported hosts,
3837 so this assertion is commented-out for now.
3838 ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3839 diagnostics.m_diagnostics[0]);
3843 /* Test of lexing char constants. */
3845 static void
3846 test_lexer_char_constants (const line_table_case &case_)
3848 /* Various char constants.
3849 .....................0000000001111111111.22222222223.
3850 .....................1234567890123456789.01234567890. */
3851 const char *content = (" 'a'\n"
3852 " u'a'\n"
3853 " U'a'\n"
3854 " L'a'\n"
3855 " 'abc'\n");
3856 lexer_test test (case_, content, NULL);
3858 /* Verify that we get the expected tokens back. */
3859 /* 'a'. */
3860 const cpp_token *tok = test.get_token ();
3861 ASSERT_EQ (tok->type, CPP_CHAR);
3862 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3864 unsigned int chars_seen;
3865 int unsignedp;
3866 cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3867 &chars_seen, &unsignedp);
3868 ASSERT_EQ (cc, 'a');
3869 ASSERT_EQ (chars_seen, 1);
3871 /* u'a'. */
3872 tok = test.get_token ();
3873 ASSERT_EQ (tok->type, CPP_CHAR16);
3874 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3876 /* U'a'. */
3877 tok = test.get_token ();
3878 ASSERT_EQ (tok->type, CPP_CHAR32);
3879 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3881 /* L'a'. */
3882 tok = test.get_token ();
3883 ASSERT_EQ (tok->type, CPP_WCHAR);
3884 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3886 /* 'abc' (c-char-sequence). */
3887 tok = test.get_token ();
3888 ASSERT_EQ (tok->type, CPP_CHAR);
3889 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3891 /* A table of interesting location_t values, giving one axis of our test
3892 matrix. */
3894 static const location_t boundary_locations[] = {
3895 /* Zero means "don't override the default values for a new line_table". */
3898 /* An arbitrary non-zero value that isn't close to one of
3899 the boundary values below. */
3900 0x10000,
3902 /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES. */
3903 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3904 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3905 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3906 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3907 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3909 /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS. */
3910 LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3911 LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3912 LINE_MAP_MAX_LOCATION_WITH_COLS,
3913 LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3914 LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3917 /* Run TESTCASE multiple times, once for each case in our test matrix. */
3919 void
3920 for_each_line_table_case (void (*testcase) (const line_table_case &))
3922 /* As noted above in the description of struct line_table_case,
3923 we want to explore a test matrix of interesting line_table
3924 situations, running various selftests for each case within the
3925 matrix. */
3927 /* Run all tests with:
3928 (a) line_table->default_range_bits == 0, and
3929 (b) line_table->default_range_bits == 5. */
3930 int num_cases_tested = 0;
3931 for (int default_range_bits = 0; default_range_bits <= 5;
3932 default_range_bits += 5)
3934 /* ...and use each of the "interesting" location values as
3935 the starting location within line_table. */
3936 const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3937 for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3939 line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3941 testcase (c);
3943 num_cases_tested++;
3947 /* Verify that we fully covered the test matrix. */
3948 ASSERT_EQ (num_cases_tested, 2 * 12);
3951 /* Verify that when presented with a consecutive pair of locations with
3952 a very large line offset, we don't attempt to consolidate them into
3953 a single ordinary linemap where the line offsets within the line map
3954 would lead to overflow (PR lto/88147). */
3956 static void
3957 test_line_offset_overflow ()
3959 line_table_test ltt (line_table_case (5, 0));
3961 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3962 linemap_line_start (line_table, 1, 100);
3963 location_t loc_a = linemap_line_start (line_table, 2578, 255);
3964 assert_loceq ("foo.c", 2578, 0, loc_a);
3966 const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3967 ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3968 ASSERT_EQ (ordmap_a->m_range_bits, 5);
3970 location_t loc_b = linemap_line_start (line_table, 404198, 512);
3971 assert_loceq ("foo.c", 404198, 0, loc_b);
3973 /* We should have started a new linemap, rather than attempting to store
3974 a very large line offset. */
3975 const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3976 ASSERT_NE (ordmap_a, ordmap_b);
3979 void test_cpp_utf8 ()
3981 const int def_tabstop = 8;
3982 cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3984 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
3986 int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3987 ASSERT_EQ (8, w_bad);
3988 int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3989 ASSERT_EQ (5, w_ctrl);
3992 /* Verify that wcwidth of valid UTF-8 is as expected. */
3994 const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3995 ASSERT_EQ (1, w_pi);
3996 const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3997 ASSERT_EQ (2, w_emoji);
3998 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3999 policy);
4000 ASSERT_EQ (1, w_umlaut_precomposed);
4001 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
4002 policy);
4003 ASSERT_EQ (1, w_umlaut_combining);
4004 const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
4005 ASSERT_EQ (2, w_han);
4006 const int w_ascii = cpp_display_width ("GCC", 3, policy);
4007 ASSERT_EQ (3, w_ascii);
4008 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
4009 "\x9f! \xe4\xb8\xba y\xcc\x88",
4010 24, policy);
4011 ASSERT_EQ (18, w_mixed);
4014 /* Verify that display width properly expands tabs. */
4016 const char *tstr = "\tabc\td";
4017 ASSERT_EQ (6, cpp_display_width (tstr, 6,
4018 cpp_char_column_policy (1, cpp_wcwidth)));
4019 ASSERT_EQ (10, cpp_display_width (tstr, 6,
4020 cpp_char_column_policy (3, cpp_wcwidth)));
4021 ASSERT_EQ (17, cpp_display_width (tstr, 6,
4022 cpp_char_column_policy (8, cpp_wcwidth)));
4023 ASSERT_EQ (1,
4024 cpp_display_column_to_byte_column
4025 (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
4028 /* Verify that cpp_byte_column_to_display_column can go past the end,
4029 and similar edge cases. */
4031 const char *str
4032 /* Display columns.
4033 111111112345 */
4034 = "\xcf\x80 abc";
4035 /* 111122223456
4036 Byte columns. */
4038 ASSERT_EQ (5, cpp_display_width (str, 6, policy));
4039 ASSERT_EQ (105,
4040 cpp_byte_column_to_display_column (str, 6, 106, policy));
4041 ASSERT_EQ (10000,
4042 cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4043 ASSERT_EQ (0,
4044 cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4047 /* Verify that cpp_display_column_to_byte_column can go past the end,
4048 and similar edge cases, and check invertibility. */
4050 const char *str
4051 /* Display columns.
4052 000000000000000000000000000000000000011
4053 111111112222222234444444455555555678901 */
4054 = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4055 /* 000000000000000000000000000000000111111
4056 111122223333444456666777788889999012345
4057 Byte columns. */
4058 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4059 ASSERT_EQ (15,
4060 cpp_display_column_to_byte_column (str, 15, 11, policy));
4061 ASSERT_EQ (115,
4062 cpp_display_column_to_byte_column (str, 15, 111, policy));
4063 ASSERT_EQ (10000,
4064 cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4065 ASSERT_EQ (0,
4066 cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4068 /* Verify that we do not interrupt a UTF-8 sequence. */
4069 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4071 for (int byte_col = 1; byte_col <= 15; ++byte_col)
4073 const int disp_col
4074 = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
4075 const int byte_col2
4076 = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
4078 /* If we ask for the display column in the middle of a UTF-8
4079 sequence, it will return the length of the partial sequence,
4080 matching the behavior of GCC before display column support.
4081 Otherwise check the round trip was successful. */
4082 if (byte_col < 4)
4083 ASSERT_EQ (byte_col, disp_col);
4084 else if (byte_col >= 6 && byte_col < 9)
4085 ASSERT_EQ (3 + (byte_col - 5), disp_col);
4086 else
4087 ASSERT_EQ (byte_col2, byte_col);
4092 static bool
4093 check_cpp_valid_utf8_p (const char *str)
4095 return cpp_valid_utf8_p (str, strlen (str));
4098 /* Check that cpp_valid_utf8_p works as expected. */
4100 static void
4101 test_cpp_valid_utf8_p ()
4103 ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
4105 /* 2-byte char (pi). */
4106 ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
4108 /* 3-byte chars (the Japanese word "mojibake"). */
4109 ASSERT_TRUE (check_cpp_valid_utf8_p
4111 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
4112 UTF-8: 0xE6 0x96 0x87
4113 C octal escaped UTF-8: \346\226\207. */
4114 "\346\226\207"
4115 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
4116 UTF-8: 0xE5 0xAD 0x97
4117 C octal escaped UTF-8: \345\255\227. */
4118 "\345\255\227"
4119 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
4120 UTF-8: 0xE5 0x8C 0x96
4121 C octal escaped UTF-8: \345\214\226. */
4122 "\345\214\226"
4123 /* U+3051 HIRAGANA LETTER KE
4124 UTF-8: 0xE3 0x81 0x91
4125 C octal escaped UTF-8: \343\201\221. */
4126 "\343\201\221"));
4128 /* 4-byte char: an emoji. */
4129 ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
4131 /* Control codes, including the NUL byte. */
4132 ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
4134 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
4136 /* Unexpected continuation bytes. */
4137 for (unsigned char continuation_byte = 0x80;
4138 continuation_byte <= 0xbf;
4139 continuation_byte++)
4140 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
4142 /* "Lonely start characters" for 2-byte sequences. */
4144 unsigned char buf[2];
4145 buf[1] = ' ';
4146 for (buf[0] = 0xc0;
4147 buf[0] <= 0xdf;
4148 buf[0]++)
4149 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4152 /* "Lonely start characters" for 3-byte sequences. */
4154 unsigned char buf[2];
4155 buf[1] = ' ';
4156 for (buf[0] = 0xe0;
4157 buf[0] <= 0xef;
4158 buf[0]++)
4159 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4162 /* "Lonely start characters" for 4-byte sequences. */
4164 unsigned char buf[2];
4165 buf[1] = ' ';
4166 for (buf[0] = 0xf0;
4167 buf[0] <= 0xf4;
4168 buf[0]++)
4169 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4172 /* Invalid start characters (formerly valid for 5-byte and 6-byte
4173 sequences). */
4175 unsigned char buf[2];
4176 buf[1] = ' ';
4177 for (buf[0] = 0xf5;
4178 buf[0] <= 0xfd;
4179 buf[0]++)
4180 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4183 /* Impossible bytes. */
4184 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
4185 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
4186 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
4187 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
4190 /* Run all of the selftests within this file. */
4192 void
4193 input_cc_tests ()
4195 test_linenum_comparisons ();
4196 test_should_have_column_data_p ();
4197 test_unknown_location ();
4198 test_builtins ();
4199 for_each_line_table_case (test_make_location_nonpure_range_endpoints);
4201 for_each_line_table_case (test_accessing_ordinary_linemaps);
4202 for_each_line_table_case (test_lexer);
4203 for_each_line_table_case (test_lexer_string_locations_simple);
4204 for_each_line_table_case (test_lexer_string_locations_ebcdic);
4205 for_each_line_table_case (test_lexer_string_locations_hex);
4206 for_each_line_table_case (test_lexer_string_locations_oct);
4207 for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
4208 for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
4209 for_each_line_table_case (test_lexer_string_locations_ucn4);
4210 for_each_line_table_case (test_lexer_string_locations_ucn8);
4211 for_each_line_table_case (test_lexer_string_locations_wide_string);
4212 for_each_line_table_case (test_lexer_string_locations_string16);
4213 for_each_line_table_case (test_lexer_string_locations_string32);
4214 for_each_line_table_case (test_lexer_string_locations_u8);
4215 for_each_line_table_case (test_lexer_string_locations_utf8_source);
4216 for_each_line_table_case (test_lexer_string_locations_concatenation_1);
4217 for_each_line_table_case (test_lexer_string_locations_concatenation_2);
4218 for_each_line_table_case (test_lexer_string_locations_concatenation_3);
4219 for_each_line_table_case (test_lexer_string_locations_macro);
4220 for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
4221 for_each_line_table_case (test_lexer_string_locations_non_string);
4222 for_each_line_table_case (test_lexer_string_locations_long_line);
4223 for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
4224 for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
4225 for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
4226 for_each_line_table_case (test_lexer_char_constants);
4228 test_reading_source_line ();
4230 test_line_offset_overflow ();
4232 test_cpp_utf8 ();
4233 test_cpp_valid_utf8_p ();
4236 } // namespace selftest
4238 #endif /* CHECKING_P */