ada: Fix spurious -Wstringop-overflow with link time optimization
[official-gcc.git] / gcc / input.cc
blobf7c0163729136a58a7ce7511046f083f6355d7f8
1 /* Data and functions related to line maps and input files.
2 Copyright (C) 2004-2023 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "selftest.h"
26 #include "cpplib.h"
28 #ifndef HAVE_ICONV
29 #define HAVE_ICONV 0
30 #endif
32 const char *
33 special_fname_builtin ()
35 return _("<built-in>");
/* Input charset configuration.  */

/* Fallback charset callback, installed when the client supplies none.
   It ignores the file name it is given and always answers nullptr,
   which file_cache_slot::create treats as "no charset conversion".  */

static const char *
default_charset_callback (const char *)
{
  const char *const no_charset = nullptr;
  return no_charset;
}
44 void
45 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
46 bool should_skip_bom)
48 in_context.ccb = (ccb ? ccb : default_charset_callback);
49 in_context.should_skip_bom = should_skip_bom;
52 /* This is a cache used by get_next_line to store the content of a
53 file to be searched for file lines. */
54 class file_cache_slot
56 public:
57 file_cache_slot ();
58 ~file_cache_slot ();
60 bool read_line_num (size_t line_num,
61 char ** line, ssize_t *line_len);
63 /* Accessors. */
64 const char *get_file_path () const { return m_file_path; }
65 unsigned get_use_count () const { return m_use_count; }
66 bool missing_trailing_newline_p () const
68 return m_missing_trailing_newline;
70 char_span get_full_file_content ();
72 void inc_use_count () { m_use_count++; }
74 bool create (const file_cache::input_context &in_context,
75 const char *file_path, FILE *fp, unsigned highest_use_count);
76 void evict ();
78 private:
79 /* These are information used to store a line boundary. */
80 class line_info
82 public:
83 /* The line number. It starts from 1. */
84 size_t line_num;
86 /* The position (byte count) of the beginning of the line,
87 relative to the file data pointer. This starts at zero. */
88 size_t start_pos;
90 /* The position (byte count) of the last byte of the line. This
91 normally points to the '\n' character, or to one byte after the
92 last byte of the file, if the file doesn't contain a '\n'
93 character. */
94 size_t end_pos;
96 line_info (size_t l, size_t s, size_t e)
97 : line_num (l), start_pos (s), end_pos (e)
100 line_info ()
101 :line_num (0), start_pos (0), end_pos (0)
105 bool needs_read_p () const;
106 bool needs_grow_p () const;
107 void maybe_grow ();
108 bool read_data ();
109 bool maybe_read_data ();
110 bool get_next_line (char **line, ssize_t *line_len);
111 bool read_next_line (char ** line, ssize_t *line_len);
112 bool goto_next_line ();
114 static const size_t buffer_size = 4 * 1024;
115 static const size_t line_record_size = 100;
117 /* The number of time this file has been accessed. This is used
118 to designate which file cache to evict from the cache
119 array. */
120 unsigned m_use_count;
122 /* The file_path is the key for identifying a particular file in
123 the cache.
124 For libcpp-using code, the underlying buffer for this field is
125 owned by the corresponding _cpp_file within the cpp_reader. */
126 const char *m_file_path;
128 FILE *m_fp;
130 /* This points to the content of the file that we've read so
131 far. */
132 char *m_data;
134 /* The allocated buffer to be freed may start a little earlier than DATA,
135 e.g. if a UTF8 BOM was skipped at the beginning. */
136 int m_alloc_offset;
138 /* The size of the DATA array above.*/
139 size_t m_size;
141 /* The number of bytes read from the underlying file so far. This
142 must be less (or equal) than SIZE above. */
143 size_t m_nb_read;
145 /* The index of the beginning of the current line. */
146 size_t m_line_start_idx;
148 /* The number of the previous line read. This starts at 1. Zero
149 means we've read no line so far. */
150 size_t m_line_num;
152 /* This is the total number of lines of the current file. At the
153 moment, we try to get this information from the line map
154 subsystem. Note that this is just a hint. When using the C++
155 front-end, this hint is correct because the input file is then
156 completely tokenized before parsing starts; so the line map knows
157 the number of lines before compilation really starts. For e.g,
158 the C front-end, it can happen that we start emitting diagnostics
159 before the line map has seen the end of the file. */
160 size_t m_total_lines;
162 /* Could this file be missing a trailing newline on its final line?
163 Initially true (to cope with empty files), set to true/false
164 as each line is read. */
165 bool m_missing_trailing_newline;
167 /* This is a record of the beginning and end of the lines we've seen
168 while reading the file. This is useful to avoid walking the data
169 from the beginning when we are asked to read a line that is
170 before LINE_START_IDX above. Note that the maximum size of this
171 record is line_record_size, so that the memory consumption
172 doesn't explode. We thus scale total_lines down to
173 line_record_size. */
174 vec<line_info, va_heap> m_line_record;
176 void offset_buffer (int offset)
178 gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
179 : (size_t) offset <= m_size);
180 gcc_assert (m_data);
181 m_alloc_offset += offset;
182 m_data += offset;
183 m_size -= offset;
188 /* Current position in real source file. */
190 location_t input_location = UNKNOWN_LOCATION;
192 class line_maps *line_table;
194 /* A stashed copy of "line_table" for use by selftest::line_table_test.
195 This needs to be a global so that it can be a GC root, and thus
196 prevent the stashed copy from being garbage-collected if the GC runs
197 during a line_table_test. */
199 class line_maps *saved_line_table;
201 /* Expand the source location LOC into a human readable location. If
202 LOC resolves to a builtin location, the file name of the readable
203 location is set to the string "<built-in>". If EXPANSION_POINT_P is
204 TRUE and LOC is virtual, then it is resolved to the expansion
205 point of the involved macro. Otherwise, it is resolved to the
206 spelling location of the token.
208 When resolving to the spelling location of the token, if the
209 resulting location is for a built-in location (that is, it has no
210 associated line/column) in the context of a macro expansion, the
211 returned location is the first one (while unwinding the macro
212 location towards its expansion point) that is in real source
213 code.
215 ASPECT controls which part of the location to use. */
217 static expanded_location
218 expand_location_1 (location_t loc,
219 bool expansion_point_p,
220 enum location_aspect aspect)
222 expanded_location xloc;
223 const line_map_ordinary *map;
224 enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
225 tree block = NULL;
227 if (IS_ADHOC_LOC (loc))
229 block = LOCATION_BLOCK (loc);
230 loc = LOCATION_LOCUS (loc);
233 memset (&xloc, 0, sizeof (xloc));
235 if (loc >= RESERVED_LOCATION_COUNT)
237 if (!expansion_point_p)
239 /* We want to resolve LOC to its spelling location.
241 But if that spelling location is a reserved location that
242 appears in the context of a macro expansion (like for a
243 location for a built-in token), let's consider the first
244 location (toward the expansion point) that is not reserved;
245 that is, the first location that is in real source code. */
246 loc = linemap_unwind_to_first_non_reserved_loc (line_table,
247 loc, NULL);
248 lrk = LRK_SPELLING_LOCATION;
250 loc = linemap_resolve_location (line_table, loc, lrk, &map);
252 /* loc is now either in an ordinary map, or is a reserved location.
253 If it is a compound location, the caret is in a spelling location,
254 but the start/finish might still be a virtual location.
255 Depending of what the caller asked for, we may need to recurse
256 one level in order to resolve any virtual locations in the
257 end-points. */
258 switch (aspect)
260 default:
261 gcc_unreachable ();
262 /* Fall through. */
263 case LOCATION_ASPECT_CARET:
264 break;
265 case LOCATION_ASPECT_START:
267 location_t start = get_start (loc);
268 if (start != loc)
269 return expand_location_1 (start, expansion_point_p, aspect);
271 break;
272 case LOCATION_ASPECT_FINISH:
274 location_t finish = get_finish (loc);
275 if (finish != loc)
276 return expand_location_1 (finish, expansion_point_p, aspect);
278 break;
280 xloc = linemap_expand_location (line_table, map, loc);
283 xloc.data = block;
284 if (loc <= BUILTINS_LOCATION)
285 xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
287 return xloc;
290 /* Initialize the set of cache used for files accessed by caret
291 diagnostic. */
293 static void
294 diagnostic_file_cache_init (void)
296 gcc_assert (global_dc);
297 global_dc->file_cache_init ();
300 void
301 diagnostic_context::file_cache_init ()
303 if (m_file_cache == nullptr)
304 m_file_cache = new file_cache ();
307 /* Return the total lines number that have been read so far by the
308 line map (in the preprocessor) so far. For languages like C++ that
309 entirely preprocess the input file before starting to parse, this
310 equals the actual number of lines of the file. */
312 static size_t
313 total_lines_num (const char *file_path)
315 size_t r = 0;
316 location_t l = 0;
317 if (linemap_get_file_highest_location (line_table, file_path, &l))
319 gcc_assert (l >= RESERVED_LOCATION_COUNT);
320 expanded_location xloc = expand_location (l);
321 r = xloc.line;
323 return r;
326 /* Lookup the cache used for the content of a given file accessed by
327 caret diagnostic. Return the found cached file, or NULL if no
328 cached file was found. */
330 file_cache_slot *
331 file_cache::lookup_file (const char *file_path)
333 gcc_assert (file_path);
335 /* This will contain the found cached file. */
336 file_cache_slot *r = NULL;
337 for (unsigned i = 0; i < num_file_slots; ++i)
339 file_cache_slot *c = &m_file_slots[i];
340 if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
342 c->inc_use_count ();
343 r = c;
347 if (r)
348 r->inc_use_count ();
350 return r;
353 /* Purge any mention of FILENAME from the cache of files used for
354 printing source code. For use in selftests when working
355 with tempfiles. */
357 void
358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
360 gcc_assert (file_path);
362 auto file_cache = global_dc->get_file_cache ();
363 if (!file_cache)
364 return;
365 file_cache->forcibly_evict_file (file_path);
368 void
369 file_cache::forcibly_evict_file (const char *file_path)
371 gcc_assert (file_path);
373 file_cache_slot *r = lookup_file (file_path);
374 if (!r)
375 /* Not found. */
376 return;
378 r->evict ();
381 void
382 file_cache_slot::evict ()
384 m_file_path = NULL;
385 if (m_fp)
386 fclose (m_fp);
387 m_fp = NULL;
388 m_nb_read = 0;
389 m_line_start_idx = 0;
390 m_line_num = 0;
391 m_line_record.truncate (0);
392 m_use_count = 0;
393 m_total_lines = 0;
394 m_missing_trailing_newline = true;
397 /* Return the file cache that has been less used, recently, or the
398 first empty one. If HIGHEST_USE_COUNT is non-null,
399 *HIGHEST_USE_COUNT is set to the highest use count of the entries
400 in the cache table. */
402 file_cache_slot*
403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
405 diagnostic_file_cache_init ();
407 file_cache_slot *to_evict = &m_file_slots[0];
408 unsigned huc = to_evict->get_use_count ();
409 for (unsigned i = 1; i < num_file_slots; ++i)
411 file_cache_slot *c = &m_file_slots[i];
412 bool c_is_empty = (c->get_file_path () == NULL);
414 if (c->get_use_count () < to_evict->get_use_count ()
415 || (to_evict->get_file_path () && c_is_empty))
416 /* We evict C because it's either an entry with a lower use
417 count or one that is empty. */
418 to_evict = c;
420 if (huc < c->get_use_count ())
421 huc = c->get_use_count ();
423 if (c_is_empty)
424 /* We've reached the end of the cache; subsequent elements are
425 all empty. */
426 break;
429 if (highest_use_count)
430 *highest_use_count = huc;
432 return to_evict;
435 /* Create the cache used for the content of a given file to be
436 accessed by caret diagnostic. This cache is added to an array of
437 cache and can be retrieved by lookup_file_in_cache_tab. This
438 function returns the created cache. Note that only the last
439 num_file_slots files are cached.
441 This can return nullptr if the FILE_PATH can't be opened for
442 reading, or if the content can't be converted to the input_charset. */
444 file_cache_slot*
445 file_cache::add_file (const char *file_path)
448 FILE *fp = fopen (file_path, "r");
449 if (fp == NULL)
450 return NULL;
452 unsigned highest_use_count = 0;
453 file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
454 if (!r->create (in_context, file_path, fp, highest_use_count))
455 return NULL;
456 return r;
459 /* Get a borrowed char_span to the full content of this file
460 as decoded according to the input charset, encoded as UTF-8. */
462 char_span
463 file_cache_slot::get_full_file_content ()
465 char *line;
466 ssize_t line_len;
467 while (get_next_line (&line, &line_len))
470 return char_span (m_data, m_nb_read);
473 /* Populate this slot for use on FILE_PATH and FP, dropping any
474 existing cached content within it. */
476 bool
477 file_cache_slot::create (const file_cache::input_context &in_context,
478 const char *file_path, FILE *fp,
479 unsigned highest_use_count)
481 m_file_path = file_path;
482 if (m_fp)
483 fclose (m_fp);
484 m_fp = fp;
485 if (m_alloc_offset)
486 offset_buffer (-m_alloc_offset);
487 m_nb_read = 0;
488 m_line_start_idx = 0;
489 m_line_num = 0;
490 m_line_record.truncate (0);
491 /* Ensure that this cache entry doesn't get evicted next time
492 add_file_to_cache_tab is called. */
493 m_use_count = ++highest_use_count;
494 m_total_lines = total_lines_num (file_path);
495 m_missing_trailing_newline = true;
498 /* Check the input configuration to determine if we need to do any
499 transformations, such as charset conversion or BOM skipping. */
500 if (const char *input_charset = in_context.ccb (file_path))
502 /* Need a full-blown conversion of the input charset. */
503 fclose (m_fp);
504 m_fp = NULL;
505 const cpp_converted_source cs
506 = cpp_get_converted_source (file_path, input_charset);
507 if (!cs.data)
508 return false;
509 if (m_data)
510 XDELETEVEC (m_data);
511 m_data = cs.data;
512 m_nb_read = m_size = cs.len;
513 m_alloc_offset = cs.data - cs.to_free;
515 else if (in_context.should_skip_bom)
517 if (read_data ())
519 const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
520 offset_buffer (offset);
521 m_nb_read -= offset;
525 return true;
528 /* file_cache's ctor. */
530 file_cache::file_cache ()
531 : m_file_slots (new file_cache_slot[num_file_slots])
533 initialize_input_context (nullptr, false);
536 /* file_cache's dtor. */
538 file_cache::~file_cache ()
540 delete[] m_file_slots;
543 /* Lookup the cache used for the content of a given file accessed by
544 caret diagnostic. If no cached file was found, create a new cache
545 for this file, add it to the array of cached file and return
548 This can return nullptr on a cache miss if FILE_PATH can't be opened for
549 reading, or if the content can't be converted to the input_charset. */
551 file_cache_slot*
552 file_cache::lookup_or_add_file (const char *file_path)
554 file_cache_slot *r = lookup_file (file_path);
555 if (r == NULL)
556 r = add_file (file_path);
557 return r;
560 /* Default constructor for a cache of file used by caret
561 diagnostic. */
563 file_cache_slot::file_cache_slot ()
564 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
565 m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
566 m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
568 m_line_record.create (0);
571 /* Destructor for a cache of file used by caret diagnostic. */
573 file_cache_slot::~file_cache_slot ()
575 if (m_fp)
577 fclose (m_fp);
578 m_fp = NULL;
580 if (m_data)
582 offset_buffer (-m_alloc_offset);
583 XDELETEVEC (m_data);
584 m_data = 0;
586 m_line_record.release ();
589 /* Returns TRUE iff the cache would need to be filled with data coming
590 from the file. That is, either the cache is empty or full or the
591 current line is empty. Note that if the cache is full, it would
592 need to be extended and filled again. */
594 bool
595 file_cache_slot::needs_read_p () const
597 return m_fp && (m_nb_read == 0
598 || m_nb_read == m_size
599 || (m_line_start_idx >= m_nb_read - 1));
602 /* Return TRUE iff the cache is full and thus needs to be
603 extended. */
605 bool
606 file_cache_slot::needs_grow_p () const
608 return m_nb_read == m_size;
611 /* Grow the cache if it needs to be extended. */
613 void
614 file_cache_slot::maybe_grow ()
616 if (!needs_grow_p ())
617 return;
619 if (!m_data)
621 gcc_assert (m_size == 0 && m_alloc_offset == 0);
622 m_size = buffer_size;
623 m_data = XNEWVEC (char, m_size);
625 else
627 const int offset = m_alloc_offset;
628 offset_buffer (-offset);
629 m_size *= 2;
630 m_data = XRESIZEVEC (char, m_data, m_size);
631 offset_buffer (offset);
635 /* Read more data into the cache. Extends the cache if need be.
636 Returns TRUE iff new data could be read. */
638 bool
639 file_cache_slot::read_data ()
641 if (feof (m_fp) || ferror (m_fp))
642 return false;
644 maybe_grow ();
646 char * from = m_data + m_nb_read;
647 size_t to_read = m_size - m_nb_read;
648 size_t nb_read = fread (from, 1, to_read, m_fp);
650 if (ferror (m_fp))
651 return false;
653 m_nb_read += nb_read;
654 return !!nb_read;
657 /* Read new data iff the cache needs to be filled with more data
658 coming from the file FP. Return TRUE iff the cache was filled with
659 mode data. */
661 bool
662 file_cache_slot::maybe_read_data ()
664 if (!needs_read_p ())
665 return false;
666 return read_data ();
/* Helper function for file_cache_slot::get_next_line (), to find the end of
   the next line in the LEN bytes starting at S.  Returns with the memchr
   convention, i.e. nullptr if a line terminator was not found; otherwise
   a pointer to the last byte of the terminator.  We need to determine
   line endings in the same manner that libcpp does: any of \n, \r\n, or
   \r is a line ending.  */

static char *
find_end_of_line (char *s, size_t len)
{
  char *const limit = s + len;

  for (char *p = s; p != limit; ++p)
    switch (*p)
      {
      case '\n':
        return p;

      case '\r':
        /* Don't find the line ending if \r is the very last character
           in the buffer; we do not know if it's the end of the file or
           just the end of what has been read so far, and we wouldn't
           want to break in the middle of what's actually a \r\n
           sequence.  The caller deals with a file ending in \r.  */
        if (p + 1 == limit)
          return nullptr;
        return p[1] == '\n' ? p + 1 : p;

      default:
        break;
      }

  return nullptr;
}
700 /* Read a new line from file FP, using C as a cache for the data
701 coming from the file. Upon successful completion, *LINE is set to
702 the beginning of the line found. *LINE points directly in the
703 line cache and is only valid until the next call of get_next_line.
704 *LINE_LEN is set to the length of the line. Note that the line
705 does not contain any terminal delimiter. This function returns
706 true if some data was read or process from the cache, false
707 otherwise. Note that subsequent calls to get_next_line might
708 make the content of *LINE invalid. */
710 bool
711 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
713 /* Fill the cache with data to process. */
714 maybe_read_data ();
716 size_t remaining_size = m_nb_read - m_line_start_idx;
717 if (remaining_size == 0)
718 /* There is no more data to process. */
719 return false;
721 char *line_start = m_data + m_line_start_idx;
723 char *next_line_start = NULL;
724 size_t len = 0;
725 char *line_end = find_end_of_line (line_start, remaining_size);
726 if (line_end == NULL)
728 /* We haven't found an end-of-line delimiter in the cache.
729 Fill the cache with more data from the file and look again. */
730 while (maybe_read_data ())
732 line_start = m_data + m_line_start_idx;
733 remaining_size = m_nb_read - m_line_start_idx;
734 line_end = find_end_of_line (line_start, remaining_size);
735 if (line_end != NULL)
737 next_line_start = line_end + 1;
738 break;
741 if (line_end == NULL)
743 /* We've loaded all the file into the cache and still no
744 terminator. Let's say the line ends up at one byte past the
745 end of the file. This is to stay consistent with the case
746 of when the line ends up with a terminator and line_end points to
747 that. That consistency is useful below in the len calculation.
749 If the file ends in a \r, we didn't identify it as a line
750 terminator above, so do that now instead. */
751 line_end = m_data + m_nb_read;
752 if (m_nb_read && line_end[-1] == '\r')
754 --line_end;
755 m_missing_trailing_newline = false;
757 else
758 m_missing_trailing_newline = true;
760 else
761 m_missing_trailing_newline = false;
763 else
765 next_line_start = line_end + 1;
766 m_missing_trailing_newline = false;
769 if (m_fp && ferror (m_fp))
770 return false;
772 /* At this point, we've found the end of the of line. It either points to
773 the line terminator or to one byte after the last byte of the file. */
774 gcc_assert (line_end != NULL);
776 len = line_end - line_start;
778 if (m_line_start_idx < m_nb_read)
779 *line = line_start;
781 ++m_line_num;
783 /* Before we update our line record, make sure the hint about the
784 total number of lines of the file is correct. If it's not, then
785 we give up recording line boundaries from now on. */
786 bool update_line_record = true;
787 if (m_line_num > m_total_lines)
788 update_line_record = false;
790 /* Now update our line record so that re-reading lines from the
791 before m_line_start_idx is faster. */
792 if (update_line_record
793 && m_line_record.length () < line_record_size)
795 /* If the file lines fits in the line record, we just record all
796 its lines ...*/
797 if (m_total_lines <= line_record_size
798 && m_line_num > m_line_record.length ())
799 m_line_record.safe_push
800 (file_cache_slot::line_info (m_line_num,
801 m_line_start_idx,
802 line_end - m_data));
803 else if (m_total_lines > line_record_size)
805 /* ... otherwise, we just scale total_lines down to
806 (line_record_size lines. */
807 size_t n = (m_line_num * line_record_size) / m_total_lines;
808 if (m_line_record.length () == 0
809 || n >= m_line_record.length ())
810 m_line_record.safe_push
811 (file_cache_slot::line_info (m_line_num,
812 m_line_start_idx,
813 line_end - m_data));
817 /* Update m_line_start_idx so that it points to the next line to be
818 read. */
819 if (next_line_start)
820 m_line_start_idx = next_line_start - m_data;
821 else
822 /* We didn't find any terminal '\n'. Let's consider that the end
823 of line is the end of the data in the cache. The next
824 invocation of get_next_line will either read more data from the
825 underlying file or return false early because we've reached the
826 end of the file. */
827 m_line_start_idx = m_nb_read;
829 *line_len = len;
831 return true;
834 /* Consume the next bytes coming from the cache (or from its
835 underlying file if there are remaining unread bytes in the file)
836 until we reach the next end-of-line (or end-of-file). There is no
837 copying from the cache involved. Return TRUE upon successful
838 completion. */
840 bool
841 file_cache_slot::goto_next_line ()
843 char *l;
844 ssize_t len;
846 return get_next_line (&l, &len);
849 /* Read an arbitrary line number LINE_NUM from the file cached in C.
850 If the line was read successfully, *LINE points to the beginning
851 of the line in the file cache and *LINE_LEN is the length of the
852 line. *LINE is not nul-terminated, but may contain zero bytes.
853 *LINE is only valid until the next call of read_line_num.
854 This function returns bool if a line was read. */
856 bool
857 file_cache_slot::read_line_num (size_t line_num,
858 char ** line, ssize_t *line_len)
860 gcc_assert (line_num > 0);
862 if (line_num <= m_line_num)
864 /* We've been asked to read lines that are before m_line_num.
865 So lets use our line record (if it's not empty) to try to
866 avoid re-reading the file from the beginning again. */
868 if (m_line_record.is_empty ())
870 m_line_start_idx = 0;
871 m_line_num = 0;
873 else
875 file_cache_slot::line_info *i = NULL;
876 if (m_total_lines <= line_record_size)
878 /* In languages where the input file is not totally
879 preprocessed up front, the m_total_lines hint
880 can be smaller than the number of lines of the
881 file. In that case, only the first
882 m_total_lines have been recorded.
884 Otherwise, the first m_total_lines we've read have
885 their start/end recorded here. */
886 i = (line_num <= m_total_lines)
887 ? &m_line_record[line_num - 1]
888 : &m_line_record[m_total_lines - 1];
889 gcc_assert (i->line_num <= line_num);
891 else
893 /* So the file had more lines than our line record
894 size. Thus the number of lines we've recorded has
895 been scaled down to line_record_size. Let's
896 pick the start/end of the recorded line that is
897 closest to line_num. */
898 size_t n = (line_num <= m_total_lines)
899 ? line_num * line_record_size / m_total_lines
900 : m_line_record.length () - 1;
901 if (n < m_line_record.length ())
903 i = &m_line_record[n];
904 gcc_assert (i->line_num <= line_num);
908 if (i && i->line_num == line_num)
910 /* We have the start/end of the line. */
911 *line = m_data + i->start_pos;
912 *line_len = i->end_pos - i->start_pos;
913 return true;
916 if (i)
918 m_line_start_idx = i->start_pos;
919 m_line_num = i->line_num - 1;
921 else
923 m_line_start_idx = 0;
924 m_line_num = 0;
929 /* Let's walk from line m_line_num up to line_num - 1, without
930 copying any line. */
931 while (m_line_num < line_num - 1)
932 if (!goto_next_line ())
933 return false;
935 /* The line we want is the next one. Let's read and copy it back to
936 the caller. */
937 return get_next_line (line, line_len);
940 /* Return the physical source line that corresponds to FILE_PATH/LINE.
941 The line is not nul-terminated. The returned pointer is only
942 valid until the next call of location_get_source_line.
943 Note that the line can contain several null characters,
944 so the returned value's length has the actual length of the line.
945 If the function fails, a NULL char_span is returned. */
947 char_span
948 file_cache::get_source_line (const char *file_path, int line)
950 char *buffer = NULL;
951 ssize_t len;
953 if (line == 0)
954 return char_span (NULL, 0);
956 if (file_path == NULL)
957 return char_span (NULL, 0);
959 file_cache_slot *c = lookup_or_add_file (file_path);
960 if (c == NULL)
961 return char_span (NULL, 0);
963 bool read = c->read_line_num (line, &buffer, &len);
964 if (!read)
965 return char_span (NULL, 0);
967 return char_span (buffer, len);
970 char_span
971 location_get_source_line (const char *file_path, int line)
973 diagnostic_file_cache_init ();
974 return global_dc->get_file_cache ()->get_source_line (file_path, line);
977 /* Return a NUL-terminated copy of the source text between two locations, or
978 NULL if the arguments are invalid. The caller is responsible for freeing
979 the return value. */
981 char *
982 get_source_text_between (location_t start, location_t end)
984 expanded_location expstart =
985 expand_location_to_spelling_point (start, LOCATION_ASPECT_START);
986 expanded_location expend =
987 expand_location_to_spelling_point (end, LOCATION_ASPECT_FINISH);
989 /* If the locations are in different files or the end comes before the
990 start, give up and return nothing. */
991 if (!expstart.file || !expend.file)
992 return NULL;
993 if (strcmp (expstart.file, expend.file) != 0)
994 return NULL;
995 if (expstart.line > expend.line)
996 return NULL;
997 if (expstart.line == expend.line
998 && expstart.column > expend.column)
999 return NULL;
1000 /* These aren't real column numbers, give up. */
1001 if (expstart.column == 0 || expend.column == 0)
1002 return NULL;
1004 /* For a single line we need to trim both edges. */
1005 if (expstart.line == expend.line)
1007 char_span line = location_get_source_line (expstart.file, expstart.line);
1008 if (line.length () < 1)
1009 return NULL;
1010 int s = expstart.column - 1;
1011 int len = expend.column - s;
1012 if (line.length () < (size_t)expend.column)
1013 return NULL;
1014 return line.subspan (s, len).xstrdup ();
1017 struct obstack buf_obstack;
1018 obstack_init (&buf_obstack);
1020 /* Loop through all lines in the range and append each to buf; may trim
1021 parts of the start and end lines off depending on column values. */
1022 for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1024 char_span line = location_get_source_line (expstart.file, lnum);
1025 if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1026 continue;
1028 /* For the first line in the range, only start at expstart.column */
1029 if (lnum == expstart.line)
1031 unsigned off = expstart.column - 1;
1032 if (line.length () < off)
1033 return NULL;
1034 line = line.subspan (off, line.length() - off);
1036 /* For the last line, don't go past expend.column */
1037 else if (lnum == expend.line)
1039 if (line.length () < (size_t)expend.column)
1040 return NULL;
1041 line = line.subspan (0, expend.column);
1044 /* Combine spaces at the beginning of later lines. */
1045 if (lnum > expstart.line)
1047 unsigned off;
1048 for (off = 0; off < line.length(); ++off)
1049 if (line[off] != ' ' && line[off] != '\t')
1050 break;
1051 if (off > 0)
1053 obstack_1grow (&buf_obstack, ' ');
1054 line = line.subspan (off, line.length() - off);
1058 /* This does not include any trailing newlines. */
1059 obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1062 /* NUL-terminate and finish the buf obstack. */
1063 obstack_1grow (&buf_obstack, 0);
1064 const char *buf = (const char *) obstack_finish (&buf_obstack);
1066 return xstrdup (buf);
1070 char_span
1071 file_cache::get_source_file_content (const char *file_path)
1073 file_cache_slot *c = lookup_or_add_file (file_path);
1074 if (c == nullptr)
1075 return char_span (nullptr, 0);
1076 return c->get_full_file_content ();
1080 /* Get a borrowed char_span to the full content of FILE_PATH
1081 as decoded according to the input charset, encoded as UTF-8. */
1083 char_span
1084 get_source_file_content (const char *file_path)
1086 diagnostic_file_cache_init ();
1087 return global_dc->get_file_cache ()->get_source_file_content (file_path);
1090 /* Determine if FILE_PATH missing a trailing newline on its final line.
1091 Only valid to call once all of the file has been loaded, by
1092 requesting a line number beyond the end of the file. */
1094 bool
1095 location_missing_trailing_newline (const char *file_path)
1097 diagnostic_file_cache_init ();
1099 file_cache_slot *c = global_dc->get_file_cache ()->lookup_or_add_file (file_path);
1100 if (c == NULL)
1101 return false;
1103 return c->missing_trailing_newline_p ();
1106 /* Test if the location originates from the spelling location of a
1107 builtin-tokens. That is, return TRUE if LOC is a (possibly
1108 virtual) location of a built-in token that appears in the expansion
1109 list of a macro. Please note that this function also works on
1110 tokens that result from built-in tokens. For instance, the
1111 function would return true if passed a token "4" that is the result
1112 of the expansion of the built-in __LINE__ macro. */
1113 bool
1114 is_location_from_builtin_token (location_t loc)
1116 const line_map_ordinary *map = NULL;
1117 loc = linemap_resolve_location (line_table, loc,
1118 LRK_SPELLING_LOCATION, &map);
1119 return loc == BUILTINS_LOCATION;
1122 /* Expand the source location LOC into a human readable location. If
1123 LOC is virtual, it resolves to the expansion point of the involved
1124 macro. If LOC resolves to a builtin location, the file name of the
1125 readable location is set to the string "<built-in>". */
1127 expanded_location
1128 expand_location (location_t loc)
1130 return expand_location_1 (loc, /*expansion_point_p=*/true,
1131 LOCATION_ASPECT_CARET);
1134 /* Expand the source location LOC into a human readable location. If
1135 LOC is virtual, it resolves to the expansion location of the
1136 relevant macro. If LOC resolves to a builtin location, the file
1137 name of the readable location is set to the string
1138 "<built-in>". */
1140 expanded_location
1141 expand_location_to_spelling_point (location_t loc,
1142 enum location_aspect aspect)
1144 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1147 /* The rich_location class within libcpp requires a way to expand
1148 location_t instances, and relies on the client code
1149 providing a symbol named
1150 linemap_client_expand_location_to_spelling_point
1151 to do this.
1153 This is the implementation for libcommon.a (all host binaries),
1154 which simply calls into expand_location_1. */
1156 expanded_location
1157 linemap_client_expand_location_to_spelling_point (location_t loc,
1158 enum location_aspect aspect)
1160 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1164 /* If LOCATION is in a system header and if it is a virtual location
1165 for a token coming from the expansion of a macro, unwind it to
1166 the location of the expansion point of the macro. If the expansion
1167 point is also in a system header return the original LOCATION.
1168 Otherwise, return the location of the expansion point.
1170 This is used for instance when we want to emit diagnostics about a
1171 token that may be located in a macro that is itself defined in a
1172 system header, for example, for the NULL macro. In such a case, if
1173 LOCATION were passed directly to diagnostic functions such as
1174 warning_at, the diagnostic would be suppressed (unless
1175 -Wsystem-headers). */
1177 location_t
1178 expansion_point_location_if_in_system_header (location_t location)
1180 if (!in_system_header_at (location))
1181 return location;
1183 location_t xloc = linemap_resolve_location (line_table, location,
1184 LRK_MACRO_EXPANSION_POINT,
1185 NULL);
1186 return in_system_header_at (xloc) ? location : xloc;
1189 /* If LOCATION is a virtual location for a token coming from the expansion
1190 of a macro, unwind to the location of the expansion point of the macro. */
1192 location_t
1193 expansion_point_location (location_t location)
1195 return linemap_resolve_location (line_table, location,
1196 LRK_MACRO_EXPANSION_POINT, NULL);
1199 /* Construct a location with caret at CARET, ranging from START to
1200 FINISH.
1202 For example, consider:
1204 11111111112
1205 12345678901234567890
1207 523 return foo + bar;
1208 ~~~~^~~~~
1211 The location's caret is at the "+", line 523 column 15, but starts
1212 earlier, at the "f" of "foo" at column 11. The finish is at the "r"
1213 of "bar" at column 19. */
1215 location_t
1216 make_location (location_t caret, location_t start, location_t finish)
1218 return line_table->make_location (caret, start, finish);
1221 /* Same as above, but taking a source range rather than two locations. */
1223 location_t
1224 make_location (location_t caret, source_range src_range)
1226 location_t pure_loc = get_pure_location (caret);
1227 return line_table->get_or_create_combined_loc (pure_loc, src_range,
1228 nullptr, 0);
1231 /* An expanded_location stores the column in byte units. This function
1232 converts that column to display units. That requires reading the associated
1233 source line in order to calculate the display width. If that cannot be done
1234 for any reason, then returns the byte column as a fallback. */
1236 location_compute_display_column (expanded_location exploc,
1237 const cpp_char_column_policy &policy)
1239 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1240 return exploc.column;
1241 char_span line = location_get_source_line (exploc.file, exploc.line);
1242 /* If line is NULL, this function returns exploc.column which is the
1243 desired fallback. */
1244 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1245 exploc.column, policy);
1248 /* Dump statistics to stderr about the memory usage of the line_table
1249 set of line maps. This also displays some statistics about macro
1250 expansion. */
1252 void
1253 dump_line_table_statistics (void)
1255 struct linemap_stats s;
1256 long total_used_map_size,
1257 macro_maps_size,
1258 total_allocated_map_size;
1260 memset (&s, 0, sizeof (s));
1262 linemap_get_statistics (line_table, &s);
1264 macro_maps_size = s.macro_maps_used_size
1265 + s.macro_maps_locations_size;
1267 total_allocated_map_size = s.ordinary_maps_allocated_size
1268 + s.macro_maps_allocated_size
1269 + s.macro_maps_locations_size;
1271 total_used_map_size = s.ordinary_maps_used_size
1272 + s.macro_maps_used_size
1273 + s.macro_maps_locations_size;
1275 fprintf (stderr, "Number of expanded macros: %5ld\n",
1276 s.num_expanded_macros);
1277 if (s.num_expanded_macros != 0)
1278 fprintf (stderr, "Average number of tokens per macro expansion: %5ld\n",
1279 s.num_macro_tokens / s.num_expanded_macros);
1280 fprintf (stderr,
1281 "\nLine Table allocations during the "
1282 "compilation process\n");
1283 fprintf (stderr, "Number of ordinary maps used: " PRsa (5) "\n",
1284 SIZE_AMOUNT (s.num_ordinary_maps_used));
1285 fprintf (stderr, "Ordinary map used size: " PRsa (5) "\n",
1286 SIZE_AMOUNT (s.ordinary_maps_used_size));
1287 fprintf (stderr, "Number of ordinary maps allocated: " PRsa (5) "\n",
1288 SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1289 fprintf (stderr, "Ordinary maps allocated size: " PRsa (5) "\n",
1290 SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1291 fprintf (stderr, "Number of macro maps used: " PRsa (5) "\n",
1292 SIZE_AMOUNT (s.num_macro_maps_used));
1293 fprintf (stderr, "Macro maps used size: " PRsa (5) "\n",
1294 SIZE_AMOUNT (s.macro_maps_used_size));
1295 fprintf (stderr, "Macro maps locations size: " PRsa (5) "\n",
1296 SIZE_AMOUNT (s.macro_maps_locations_size));
1297 fprintf (stderr, "Macro maps size: " PRsa (5) "\n",
1298 SIZE_AMOUNT (macro_maps_size));
1299 fprintf (stderr, "Duplicated maps locations size: " PRsa (5) "\n",
1300 SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1301 fprintf (stderr, "Total allocated maps size: " PRsa (5) "\n",
1302 SIZE_AMOUNT (total_allocated_map_size));
1303 fprintf (stderr, "Total used maps size: " PRsa (5) "\n",
1304 SIZE_AMOUNT (total_used_map_size));
1305 fprintf (stderr, "Ad-hoc table size: " PRsa (5) "\n",
1306 SIZE_AMOUNT (s.adhoc_table_size));
1307 fprintf (stderr, "Ad-hoc table entries used: " PRsa (5) "\n",
1308 SIZE_AMOUNT (s.adhoc_table_entries_used));
1309 fprintf (stderr, "optimized_ranges: " PRsa (5) "\n",
1310 SIZE_AMOUNT (line_table->m_num_optimized_ranges));
1311 fprintf (stderr, "unoptimized_ranges: " PRsa (5) "\n",
1312 SIZE_AMOUNT (line_table->m_num_unoptimized_ranges));
1314 fprintf (stderr, "\n");
1317 /* Get location one beyond the final location in ordinary map IDX. */
1319 static location_t
1320 get_end_location (class line_maps *set, unsigned int idx)
1322 if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1323 return set->highest_location;
1325 struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1326 return MAP_START_LOCATION (next_map);
/* Helper function for write_digit_row.  Write to STREAM the character
   for the units digit of DIGIT, i.e. '0' plus DIGIT modulo 10.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1337 /* Helper function for dump_location_info.
1338 Write a row of numbers to STREAM, numbering a source line,
1339 giving the units, tens, hundreds etc of the column number. */
1341 static void
1342 write_digit_row (FILE *stream, int indent,
1343 const line_map_ordinary *map,
1344 location_t loc, int max_col, int divisor)
1346 fprintf (stream, "%*c", indent, ' ');
1347 fprintf (stream, "|");
1348 for (int column = 1; column < max_col; column++)
1350 location_t column_loc = loc + (column << map->m_range_bits);
1351 write_digit (stream, column_loc / divisor);
1353 fprintf (stream, "\n");
1356 /* Write a half-closed (START) / half-open (END) interval of
1357 location_t to STREAM. */
1359 static void
1360 dump_location_range (FILE *stream,
1361 location_t start, location_t end)
1363 fprintf (stream,
1364 " location_t interval: %u <= loc < %u\n",
1365 start, end);
1368 /* Write a labelled description of a half-closed (START) / half-open (END)
1369 interval of location_t to STREAM. */
1371 static void
1372 dump_labelled_location_range (FILE *stream,
1373 const char *name,
1374 location_t start, location_t end)
1376 fprintf (stream, "%s\n", name);
1377 dump_location_range (stream, start, end);
1378 fprintf (stream, "\n");
1381 /* Write a visualization of the locations in the line_table to STREAM. */
1383 void
1384 dump_location_info (FILE *stream)
1386 /* Visualize the reserved locations. */
1387 dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1388 0, RESERVED_LOCATION_COUNT);
1390 /* Visualize the ordinary line_map instances, rendering the sources. */
1391 for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1393 location_t end_location = get_end_location (line_table, idx);
1394 /* half-closed: doesn't include this one. */
1396 const line_map_ordinary *map
1397 = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1398 fprintf (stream, "ORDINARY MAP: %i\n", idx);
1399 dump_location_range (stream,
1400 MAP_START_LOCATION (map), end_location);
1401 fprintf (stream, " file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1402 fprintf (stream, " starting at line: %i\n",
1403 ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1404 fprintf (stream, " column and range bits: %i\n",
1405 map->m_column_and_range_bits);
1406 fprintf (stream, " column bits: %i\n",
1407 map->m_column_and_range_bits - map->m_range_bits);
1408 fprintf (stream, " range bits: %i\n",
1409 map->m_range_bits);
1410 const char * reason;
1411 switch (map->reason) {
1412 case LC_ENTER:
1413 reason = "LC_ENTER";
1414 break;
1415 case LC_LEAVE:
1416 reason = "LC_LEAVE";
1417 break;
1418 case LC_RENAME:
1419 reason = "LC_RENAME";
1420 break;
1421 case LC_RENAME_VERBATIM:
1422 reason = "LC_RENAME_VERBATIM";
1423 break;
1424 case LC_ENTER_MACRO:
1425 reason = "LC_RENAME_MACRO";
1426 break;
1427 default:
1428 reason = "Unknown";
1430 fprintf (stream, " reason: %d (%s)\n", map->reason, reason);
1432 const line_map_ordinary *includer_map
1433 = linemap_included_from_linemap (line_table, map);
1434 fprintf (stream, " included from location: %d",
1435 linemap_included_from (map));
1436 if (includer_map) {
1437 fprintf (stream, " (in ordinary map %d)",
1438 int (includer_map - line_table->info_ordinary.maps));
1440 fprintf (stream, "\n");
1442 /* Render the span of source lines that this "map" covers. */
1443 for (location_t loc = MAP_START_LOCATION (map);
1444 loc < end_location;
1445 loc += (1 << map->m_range_bits) )
1447 gcc_assert (pure_location_p (line_table, loc) );
1449 expanded_location exploc
1450 = linemap_expand_location (line_table, map, loc);
1452 if (exploc.column == 0)
1454 /* Beginning of a new source line: draw the line. */
1456 char_span line_text = location_get_source_line (exploc.file,
1457 exploc.line);
1458 if (!line_text)
1459 break;
1460 fprintf (stream,
1461 "%s:%3i|loc:%5i|%.*s\n",
1462 exploc.file, exploc.line,
1463 loc,
1464 (int)line_text.length (), line_text.get_buffer ());
1466 /* "loc" is at column 0, which means "the whole line".
1467 Render the locations *within* the line, by underlining
1468 it, showing the location_t numeric values
1469 at each column. */
1470 size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1471 if (max_col > line_text.length ())
1472 max_col = line_text.length () + 1;
1474 int len_lnum = num_digits (exploc.line);
1475 if (len_lnum < 3)
1476 len_lnum = 3;
1477 int len_loc = num_digits (loc);
1478 if (len_loc < 5)
1479 len_loc = 5;
1481 int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1483 /* Thousands. */
1484 if (end_location > 999)
1485 write_digit_row (stream, indent, map, loc, max_col, 1000);
1487 /* Hundreds. */
1488 if (end_location > 99)
1489 write_digit_row (stream, indent, map, loc, max_col, 100);
1491 /* Tens. */
1492 write_digit_row (stream, indent, map, loc, max_col, 10);
1494 /* Units. */
1495 write_digit_row (stream, indent, map, loc, max_col, 1);
1498 fprintf (stream, "\n");
1501 /* Visualize unallocated values. */
1502 dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1503 line_table->highest_location,
1504 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1506 /* Visualize the macro line_map instances, rendering the sources. */
1507 for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1509 /* Each macro map that is allocated owns location_t values
1510 that are *lower* that the one before them.
1511 Hence it's meaningful to view them either in order of ascending
1512 source locations, or in order of ascending macro map index. */
1513 const bool ascending_location_ts = true;
1514 unsigned int idx = (ascending_location_ts
1515 ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1516 : i);
1517 const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1518 fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1519 idx,
1520 linemap_map_get_macro_name (map),
1521 MACRO_MAP_NUM_MACRO_TOKENS (map));
1522 dump_location_range (stream,
1523 map->start_location,
1524 (map->start_location
1525 + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1526 inform (map->get_expansion_point_location (),
1527 "expansion point is location %i",
1528 map->get_expansion_point_location ());
1529 fprintf (stream, " map->start_location: %u\n",
1530 map->start_location);
1532 fprintf (stream, " macro_locations:\n");
1533 for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1535 location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1536 location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1538 /* linemap_add_macro_token encodes token numbers in an expansion
1539 by putting them after MAP_START_LOCATION. */
1541 /* I'm typically seeing 4 uninitialized entries at the end of
1542 0xafafafaf.
1543 This appears to be due to macro.cc:replace_args
1544 adding 2 extra args for padding tokens; presumably there may
1545 be a leading and/or trailing padding token injected,
1546 each for 2 more location slots.
1547 This would explain there being up to 4 location_ts slots
1548 that may be uninitialized. */
1550 fprintf (stream, " %u: %u, %u\n",
1554 if (x == y)
1556 if (x < MAP_START_LOCATION (map))
1557 inform (x, "token %u has %<x-location == y-location == %u%>",
1558 i, x);
1559 else
1560 fprintf (stream,
1561 "x-location == y-location == %u encodes token # %u\n",
1562 x, x - MAP_START_LOCATION (map));
1564 else
1566 inform (x, "token %u has %<x-location == %u%>", i, x);
1567 inform (x, "token %u has %<y-location == %u%>", i, y);
1570 fprintf (stream, "\n");
1573 /* It appears that MAX_LOCATION_T itself is never assigned to a
1574 macro map, presumably due to an off-by-one error somewhere
1575 between the logic in linemap_enter_macro and
1576 LINEMAPS_MACRO_LOWEST_LOCATION. */
1577 dump_labelled_location_range (stream, "MAX_LOCATION_T",
1578 MAX_LOCATION_T,
1579 MAX_LOCATION_T + 1);
1581 /* Visualize ad-hoc values. */
1582 dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1583 MAX_LOCATION_T + 1, UINT_MAX);
1586 /* string_concat's constructor. */
1588 string_concat::string_concat (int num, location_t *locs)
1589 : m_num (num)
1591 m_locs = ggc_vec_alloc <location_t> (num);
1592 for (int i = 0; i < num; i++)
1593 m_locs[i] = locs[i];
1596 /* string_concat_db's constructor. */
1598 string_concat_db::string_concat_db ()
1600 m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1603 /* Record that a string concatenation occurred, covering NUM
1604 string literal tokens. LOCS is an array of size NUM, containing the
1605 locations of the tokens. A copy of LOCS is taken. */
1607 void
1608 string_concat_db::record_string_concatenation (int num, location_t *locs)
1610 gcc_assert (num > 1);
1611 gcc_assert (locs);
1613 location_t key_loc = get_key_loc (locs[0]);
1614 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1615 any data now recorded under key 'key_loc' would be overwritten by a
1616 subsequent call with the same key 'key_loc'. */
1617 if (RESERVED_LOCATION_P (key_loc))
1618 return;
1620 string_concat *concat
1621 = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1622 m_table->put (key_loc, concat);
1625 /* Determine if LOC was the location of the initial token of a
1626 concatenation of string literal tokens.
1627 If so, *OUT_NUM is written to with the number of tokens, and
1628 *OUT_LOCS with the location of an array of locations of the
1629 tokens, and return true. *OUT_LOCS is a borrowed pointer to
1630 storage owned by the string_concat_db.
1631 Otherwise, return false. */
1633 bool
1634 string_concat_db::get_string_concatenation (location_t loc,
1635 int *out_num,
1636 location_t **out_locs)
1638 gcc_assert (out_num);
1639 gcc_assert (out_locs);
1641 location_t key_loc = get_key_loc (loc);
1642 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1643 discussion in 'string_concat_db::record_string_concatenation'. */
1644 if (RESERVED_LOCATION_P (key_loc))
1645 return false;
1647 string_concat **concat = m_table->get (key_loc);
1648 if (!concat)
1649 return false;
1651 *out_num = (*concat)->m_num;
1652 *out_locs =(*concat)->m_locs;
1653 return true;
1656 /* Internal function. Canonicalize LOC into a form suitable for
1657 use as a key within the database, stripping away macro expansion,
1658 ad-hoc information, and range information, using the location of
1659 the start of LOC within an ordinary linemap. */
1661 location_t
1662 string_concat_db::get_key_loc (location_t loc)
1664 loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1665 NULL);
1667 loc = get_range_from_loc (line_table, loc).m_start;
1669 return loc;
1672 /* Helper class for use within get_substring_ranges_for_loc.
1673 An vec of cpp_string with responsibility for releasing all of the
1674 str->text for each str in the vector. */
1676 class auto_cpp_string_vec : public auto_vec <cpp_string>
1678 public:
1679 auto_cpp_string_vec (int alloc)
1680 : auto_vec <cpp_string> (alloc) {}
1682 ~auto_cpp_string_vec ()
1684 /* Clean up the copies within this vec. */
1685 int i;
1686 cpp_string *str;
1687 FOR_EACH_VEC_ELT (*this, i, str)
1688 free (const_cast <unsigned char *> (str->text));
1692 /* Attempt to populate RANGES with source location information on the
1693 individual characters within the string literal found at STRLOC.
1694 If CONCATS is non-NULL, then any string literals that the token at
1695 STRLOC was concatenated with are also added to RANGES.
1697 Return NULL if successful, or an error message if any errors occurred (in
1698 which case RANGES may be only partially populated and should not
1699 be used).
1701 This is implemented by re-parsing the relevant source line(s). */
1703 static const char *
1704 get_substring_ranges_for_loc (cpp_reader *pfile,
1705 string_concat_db *concats,
1706 location_t strloc,
1707 enum cpp_ttype type,
1708 cpp_substring_ranges &ranges)
1710 gcc_assert (pfile);
1712 if (strloc == UNKNOWN_LOCATION)
1713 return "unknown location";
1715 /* Reparsing the strings requires accurate location information.
1716 If -ftrack-macro-expansion has been overridden from its default
1717 of 2, then we might have a location of a macro expansion point,
1718 rather than the location of the literal itself.
1719 Avoid this by requiring that we have full macro expansion tracking
1720 for substring locations to be available. */
1721 if (cpp_get_options (pfile)->track_macro_expansion != 2)
1722 return "track_macro_expansion != 2";
1724 /* If #line or # 44 "file"-style directives are present, then there's
1725 no guarantee that the line numbers we have can be used to locate
1726 the strings. For example, we might have a .i file with # directives
1727 pointing back to lines within a .c file, but the .c file might
1728 have been edited since the .i file was created.
1729 In such a case, the safest course is to disable on-demand substring
1730 locations. */
1731 if (line_table->seen_line_directive)
1732 return "seen line directive";
1734 /* If string concatenation has occurred at STRLOC, get the locations
1735 of all of the literal tokens making up the compound string.
1736 Otherwise, just use STRLOC. */
1737 int num_locs = 1;
1738 location_t *strlocs = &strloc;
1739 if (concats)
1740 concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1742 auto_cpp_string_vec strs (num_locs);
1743 auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1744 for (int i = 0; i < num_locs; i++)
1746 /* Get range of strloc. We will use it to locate the start and finish
1747 of the literal token within the line. */
1748 source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1750 if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1752 /* If the string token was within a macro expansion, then we can
1753 cope with it for the simple case where we have a single token.
1754 Otherwise, bail out. */
1755 if (src_range.m_start != src_range.m_finish)
1756 return "macro expansion";
1758 else
1760 if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1761 /* If so, we can't reliably determine where the token started within
1762 its line. */
1763 return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1765 if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1766 /* If so, we can't reliably determine where the token finished
1767 within its line. */
1768 return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1771 expanded_location start
1772 = expand_location_to_spelling_point (src_range.m_start,
1773 LOCATION_ASPECT_START);
1774 expanded_location finish
1775 = expand_location_to_spelling_point (src_range.m_finish,
1776 LOCATION_ASPECT_FINISH);
1777 if (start.file != finish.file)
1778 return "range endpoints are in different files";
1779 if (start.line != finish.line)
1780 return "range endpoints are on different lines";
1781 if (start.column > finish.column)
1782 return "range endpoints are reversed";
1784 char_span line = location_get_source_line (start.file, start.line);
1785 if (!line)
1786 return "unable to read source line";
1788 /* Determine the location of the literal (including quotes
1789 and leading prefix chars, such as the 'u' in a u""
1790 token). */
1791 size_t literal_length = finish.column - start.column + 1;
1793 /* Ensure that we don't crash if we got the wrong location. */
1794 if (start.column < 1)
1795 return "zero start column";
1796 if (line.length () < (start.column - 1 + literal_length))
1797 return "line is not wide enough";
1799 char_span literal = line.subspan (start.column - 1, literal_length);
1801 cpp_string from;
1802 from.len = literal_length;
1803 /* Make a copy of the literal, to avoid having to rely on
1804 the lifetime of the copy of the line within the cache.
1805 This will be released by the auto_cpp_string_vec dtor. */
1806 from.text = (unsigned char *)literal.xstrdup ();
1807 strs.safe_push (from);
1809 /* For very long lines, a new linemap could have started
1810 halfway through the token.
1811 Ensure that the loc_reader uses the linemap of the
1812 *end* of the token for its start location. */
1813 const line_map_ordinary *start_ord_map;
1814 linemap_resolve_location (line_table, src_range.m_start,
1815 LRK_SPELLING_LOCATION, &start_ord_map);
1816 const line_map_ordinary *final_ord_map;
1817 linemap_resolve_location (line_table, src_range.m_finish,
1818 LRK_SPELLING_LOCATION, &final_ord_map);
1819 if (start_ord_map == NULL || final_ord_map == NULL)
1820 return "failed to get ordinary maps";
1821 /* Bulletproofing. We ought to only have different ordinary maps
1822 for start vs finish due to line-length jumps. */
1823 if (start_ord_map != final_ord_map
1824 && start_ord_map->to_file != final_ord_map->to_file)
1825 return "start and finish are spelled in different ordinary maps";
1826 /* The file from linemap_resolve_location ought to match that from
1827 expand_location_to_spelling_point. */
1828 if (start_ord_map->to_file != start.file)
1829 return "mismatching file after resolving linemap";
1831 location_t start_loc
1832 = linemap_position_for_line_and_column (line_table, final_ord_map,
1833 start.line, start.column);
1835 cpp_string_location_reader loc_reader (start_loc, line_table);
1836 loc_readers.safe_push (loc_reader);
1839 /* Rerun cpp_interpret_string, or rather, a modified version of it. */
1840 const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1841 loc_readers.address (),
1842 num_locs, &ranges, type);
1843 if (err)
1844 return err;
1846 /* Success: "ranges" should now contain information on the string. */
1847 return NULL;
1850 /* Attempt to populate *OUT_LOC with source location information on the
1851 given characters within the string literal found at STRLOC.
1852 CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1853 character set.
1855 For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7
1856 and string literal "012345\n789"
1857 *OUT_LOC is written to with:
1858 "012345\n789"
1859 ~^~~~~
1861 If CONCATS is non-NULL, then any string literals that the token at
1862 STRLOC was concatenated with are also considered.
1864 This is implemented by re-parsing the relevant source line(s).
1866 Return NULL if successful, or an error message if any errors occurred.
1867 Error messages are intended for GCC developers (to help debugging) rather
1868 than for end-users. */
1870 const char *
1871 get_location_within_string (cpp_reader *pfile,
1872 string_concat_db *concats,
1873 location_t strloc,
1874 enum cpp_ttype type,
1875 int caret_idx, int start_idx, int end_idx,
1876 location_t *out_loc)
1878 gcc_checking_assert (caret_idx >= 0);
1879 gcc_checking_assert (start_idx >= 0);
1880 gcc_checking_assert (end_idx >= 0);
1881 gcc_assert (out_loc);
1883 cpp_substring_ranges ranges;
1884 const char *err
1885 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1886 if (err)
1887 return err;
1889 if (caret_idx >= ranges.get_num_ranges ())
1890 return "caret_idx out of range";
1891 if (start_idx >= ranges.get_num_ranges ())
1892 return "start_idx out of range";
1893 if (end_idx >= ranges.get_num_ranges ())
1894 return "end_idx out of range";
1896 *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1897 ranges.get_range (start_idx).m_start,
1898 ranges.get_range (end_idx).m_finish);
1899 return NULL;
1902 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1904 location_t
1905 location_with_discriminator (location_t locus, int discriminator)
1907 tree block = LOCATION_BLOCK (locus);
1908 source_range src_range = get_range_from_loc (line_table, locus);
1909 locus = get_pure_location (locus);
1911 if (locus == UNKNOWN_LOCATION)
1912 return locus;
1914 return line_table->get_or_create_combined_loc (locus, src_range, block,
1915 discriminator);
1918 /* Return TRUE if LOCUS represents a location with a discriminator. */
1920 bool
1921 has_discriminator (location_t locus)
1923 return get_discriminator_from_loc (locus) != 0;
1926 /* Return the discriminator for LOCUS. */
1929 get_discriminator_from_loc (location_t locus)
1931 return get_discriminator_from_loc (line_table, locus);
1934 #if CHECKING_P
1936 namespace selftest {
1938 /* Selftests of location handling. */
1940 /* Attempt to populate *OUT_RANGE with source location information on the
1941 given character within the string literal found at STRLOC.
1942 CHAR_IDX refers to an offset within the execution character set.
1943 If CONCATS is non-NULL, then any string literals that the token at
1944 STRLOC was concatenated with are also considered.
1946 This is implemented by re-parsing the relevant source line(s).
1948 Return NULL if successful, or an error message if any errors occurred.
1949 Error messages are intended for GCC developers (to help debugging) rather
1950 than for end-users. */
1952 static const char *
1953 get_source_range_for_char (cpp_reader *pfile,
1954 string_concat_db *concats,
1955 location_t strloc,
1956 enum cpp_ttype type,
1957 int char_idx,
1958 source_range *out_range)
1960 gcc_checking_assert (char_idx >= 0);
1961 gcc_assert (out_range);
1963 cpp_substring_ranges ranges;
1964 const char *err
1965 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1966 if (err)
1967 return err;
1969 if (char_idx >= ranges.get_num_ranges ())
1970 return "char_idx out of range";
1972 *out_range = ranges.get_range (char_idx);
1973 return NULL;
1976 /* As get_source_range_for_char, but write to *OUT the number
1977 of ranges that are available. */
1979 static const char *
1980 get_num_source_ranges_for_substring (cpp_reader *pfile,
1981 string_concat_db *concats,
1982 location_t strloc,
1983 enum cpp_ttype type,
1984 int *out)
1986 gcc_assert (out);
1988 cpp_substring_ranges ranges;
1989 const char *err
1990 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1992 if (err)
1993 return err;
1995 *out = ranges.get_num_ranges ();
1996 return NULL;
1999 /* Selftests of location handling. */
2001 /* Verify that compare() on linenum_type handles comparisons over the full
2002 range of the type. */
2004 static void
2005 test_linenum_comparisons ()
2007 linenum_type min_line (0);
2008 linenum_type max_line (0xffffffff);
2009 ASSERT_EQ (0, compare (min_line, min_line));
2010 ASSERT_EQ (0, compare (max_line, max_line));
2012 ASSERT_GT (compare (max_line, min_line), 0);
2013 ASSERT_LT (compare (min_line, max_line), 0);
2016 /* Helper function for verifying location data: when location_t
2017 values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
2018 as having column 0. */
2020 static bool
2021 should_have_column_data_p (location_t loc)
2023 if (IS_ADHOC_LOC (loc))
2024 loc = get_location_from_adhoc_loc (line_table, loc);
2025 if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
2026 return false;
2027 return true;
2030 /* Selftest for should_have_column_data_p. */
2032 static void
2033 test_should_have_column_data_p ()
2035 ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
2036 ASSERT_TRUE
2037 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
2038 ASSERT_FALSE
2039 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
2042 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2043 on LOC. */
2045 static void
2046 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2047 location_t loc)
2049 ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2050 ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2051 /* If location_t values are sufficiently high, then column numbers
2052 will be unavailable and LOCATION_COLUMN (loc) will be 0.
2053 When close to the threshold, column numbers *may* be present: if
2054 the final linemap before the threshold contains a line that straddles
2055 the threshold, locations in that line have column information. */
2056 if (should_have_column_data_p (loc))
2057 ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
/* Several selftests build a line table and one or more line maps
   within it.

   To maximize test coverage these tests should be run in a variety of
   situations:
   - line_table->default_range_bits: some frontends use a non-zero value
     and others use zero
   - the fallback modes within line-map.cc: there are various threshold
     values for location_t beyond which line-map.cc changes behavior
     (disabling of the range-packing optimization, disabling of
     column-tracking).  These can be exercised by starting the
     line_table at interesting values at or near these thresholds.

   The class below describes one particular case within our test
   matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
  {
    m_default_range_bits = default_range_bits;
    m_base_location = base_location;
  }

  /* Value for line_table->default_range_bits in this case.  */
  int m_default_range_bits;

  /* Location value at which the line_table starts in this case.  */
  int m_base_location;
};
2088 /* Constructor. Store the old value of line_table, and create a new
2089 one, using sane defaults. */
2091 line_table_test::line_table_test ()
2093 gcc_assert (saved_line_table == NULL);
2094 saved_line_table = line_table;
2095 line_table = ggc_alloc<line_maps> ();
2096 linemap_init (line_table, BUILTINS_LOCATION);
2097 gcc_assert (saved_line_table->m_reallocator);
2098 line_table->m_reallocator = saved_line_table->m_reallocator;
2099 gcc_assert (saved_line_table->m_round_alloc_size);
2100 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2101 line_table->default_range_bits = 0;
2104 /* Constructor. Store the old value of line_table, and create a new
2105 one, using the sitation described in CASE_. */
2107 line_table_test::line_table_test (const line_table_case &case_)
2109 gcc_assert (saved_line_table == NULL);
2110 saved_line_table = line_table;
2111 line_table = ggc_alloc<line_maps> ();
2112 linemap_init (line_table, BUILTINS_LOCATION);
2113 gcc_assert (saved_line_table->m_reallocator);
2114 line_table->m_reallocator = saved_line_table->m_reallocator;
2115 gcc_assert (saved_line_table->m_round_alloc_size);
2116 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2117 line_table->default_range_bits = case_.m_default_range_bits;
2118 if (case_.m_base_location)
2120 line_table->highest_location = case_.m_base_location;
2121 line_table->highest_line = case_.m_base_location;
2125 /* Destructor. Restore the old value of line_table. */
2127 line_table_test::~line_table_test ()
2129 gcc_assert (saved_line_table != NULL);
2130 line_table = saved_line_table;
2131 saved_line_table = NULL;
2134 /* Verify basic operation of ordinary linemaps. */
2136 static void
2137 test_accessing_ordinary_linemaps (const line_table_case &case_)
2139 line_table_test ltt (case_);
2141 /* Build a simple linemap describing some locations. */
2142 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
2144 linemap_line_start (line_table, 1, 100);
2145 location_t loc_a = linemap_position_for_column (line_table, 1);
2146 location_t loc_b = linemap_position_for_column (line_table, 23);
2148 linemap_line_start (line_table, 2, 100);
2149 location_t loc_c = linemap_position_for_column (line_table, 1);
2150 location_t loc_d = linemap_position_for_column (line_table, 17);
2152 /* Example of a very long line. */
2153 linemap_line_start (line_table, 3, 2000);
2154 location_t loc_e = linemap_position_for_column (line_table, 700);
2156 /* Transitioning back to a short line. */
2157 linemap_line_start (line_table, 4, 0);
2158 location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
2160 if (should_have_column_data_p (loc_back_to_short))
2162 /* Verify that we switched to short lines in the linemap. */
2163 line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
2164 ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2167 /* Example of a line that will eventually be seen to be longer
2168 than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2169 below that. */
2170 linemap_line_start (line_table, 5, 2000);
2172 location_t loc_start_of_very_long_line
2173 = linemap_position_for_column (line_table, 2000);
2174 location_t loc_too_wide
2175 = linemap_position_for_column (line_table, 4097);
2176 location_t loc_too_wide_2
2177 = linemap_position_for_column (line_table, 4098);
2179 /* ...and back to a sane line length. */
2180 linemap_line_start (line_table, 6, 100);
2181 location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2183 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2185 /* Multiple files. */
2186 linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
2187 linemap_line_start (line_table, 1, 200);
2188 location_t loc_f = linemap_position_for_column (line_table, 150);
2189 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2191 /* Verify that we can recover the location info. */
2192 assert_loceq ("foo.c", 1, 1, loc_a);
2193 assert_loceq ("foo.c", 1, 23, loc_b);
2194 assert_loceq ("foo.c", 2, 1, loc_c);
2195 assert_loceq ("foo.c", 2, 17, loc_d);
2196 assert_loceq ("foo.c", 3, 700, loc_e);
2197 assert_loceq ("foo.c", 4, 100, loc_back_to_short);
2199 /* In the very wide line, the initial location should be fully tracked. */
2200 assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
2201 /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2202 be disabled. */
2203 assert_loceq ("foo.c", 5, 0, loc_too_wide);
2204 assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2205 /*...and column-tracking should be re-enabled for subsequent lines. */
2206 assert_loceq ("foo.c", 6, 10, loc_sane_again);
2208 assert_loceq ("bar.c", 1, 150, loc_f);
2210 ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2211 ASSERT_TRUE (pure_location_p (line_table, loc_a));
2213 /* Verify using make_location to build a range, and extracting data
2214 back from it. */
2215 location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2216 ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2217 ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2218 source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2219 ASSERT_EQ (loc_b, src_range.m_start);
2220 ASSERT_EQ (loc_d, src_range.m_finish);
2223 /* Verify various properties of UNKNOWN_LOCATION. */
2225 static void
2226 test_unknown_location ()
2228 ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2229 ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2230 ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2233 /* Verify various properties of BUILTINS_LOCATION. */
2235 static void
2236 test_builtins ()
2238 assert_loceq (special_fname_builtin (), 0, 0, BUILTINS_LOCATION);
2239 ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2242 /* Regression test for make_location.
2243 Ensure that we use pure locations for the start/finish of the range,
2244 rather than storing a packed or ad-hoc range as the start/finish. */
2246 static void
2247 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2249 /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2250 with C++ frontend.
2251 ....................0000000001111111111222.
2252 ....................1234567890123456789012. */
2253 const char *content = " r += !aaa == bbb;\n";
2254 temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2255 line_table_test ltt (case_);
2256 linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2258 const location_t c11 = linemap_position_for_column (line_table, 11);
2259 const location_t c12 = linemap_position_for_column (line_table, 12);
2260 const location_t c13 = linemap_position_for_column (line_table, 13);
2261 const location_t c14 = linemap_position_for_column (line_table, 14);
2262 const location_t c21 = linemap_position_for_column (line_table, 21);
2264 if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2265 return;
2267 /* Use column 13 for the caret location, arbitrarily, to verify that we
2268 handle start != caret. */
2269 const location_t aaa = make_location (c13, c12, c14);
2270 ASSERT_EQ (c13, get_pure_location (aaa));
2271 ASSERT_EQ (c12, get_start (aaa));
2272 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2273 ASSERT_EQ (c14, get_finish (aaa));
2274 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2276 /* Make a location using a location with a range as the start-point. */
2277 const location_t not_aaa = make_location (c11, aaa, c14);
2278 ASSERT_EQ (c11, get_pure_location (not_aaa));
2279 /* It should use the start location of the range, not store the range
2280 itself. */
2281 ASSERT_EQ (c12, get_start (not_aaa));
2282 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2283 ASSERT_EQ (c14, get_finish (not_aaa));
2284 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2286 /* Similarly, make a location with a range as the end-point. */
2287 const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2288 ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2289 ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2290 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2291 ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2292 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2293 const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2294 /* It should use the finish location of the range, not store the range
2295 itself. */
2296 ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2297 ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2298 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2299 ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2300 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2303 /* Verify reading of input files (e.g. for caret-based diagnostics). */
2305 static void
2306 test_reading_source_line ()
2308 /* Create a tempfile and write some text to it. */
2309 temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2310 "01234567890123456789\n"
2311 "This is the test text\n"
2312 "This is the 3rd line");
2314 /* Read back a specific line from the tempfile. */
2315 char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2316 ASSERT_TRUE (source_line);
2317 ASSERT_TRUE (source_line.get_buffer () != NULL);
2318 ASSERT_EQ (20, source_line.length ());
2319 ASSERT_TRUE (!strncmp ("This is the 3rd line",
2320 source_line.get_buffer (), source_line.length ()));
2322 source_line = location_get_source_line (tmp.get_filename (), 2);
2323 ASSERT_TRUE (source_line);
2324 ASSERT_TRUE (source_line.get_buffer () != NULL);
2325 ASSERT_EQ (21, source_line.length ());
2326 ASSERT_TRUE (!strncmp ("This is the test text",
2327 source_line.get_buffer (), source_line.length ()));
2329 source_line = location_get_source_line (tmp.get_filename (), 4);
2330 ASSERT_FALSE (source_line);
2331 ASSERT_TRUE (source_line.get_buffer () == NULL);
/* Tests of lexing.  */

/* Check that spelling token TOK of PARSER with cpp_token_as_text
   gives exactly EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *spelling_ = cpp_token_as_text ((PARSER), (TOK));     \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)spelling_);            \
  SELFTEST_END_STMT
2345 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2346 and ranges from EXP_START_COL to EXP_FINISH_COL.
2347 Use LOC as the effective location of the selftest. */
2349 static void
2350 assert_token_loc_eq (const location &loc,
2351 const cpp_token *tok,
2352 const char *exp_filename, int exp_linenum,
2353 int exp_start_col, int exp_finish_col)
2355 location_t tok_loc = tok->src_loc;
2356 ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2357 ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2359 /* If location_t values are sufficiently high, then column numbers
2360 will be unavailable. */
2361 if (!should_have_column_data_p (tok_loc))
2362 return;
2364 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2365 source_range tok_range = get_range_from_loc (line_table, tok_loc);
2366 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2367 ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2370 /* Use assert_token_loc_eq to verify the TOK->src_loc, using
2371 SELFTEST_LOCATION as the effective location of the selftest. */
2373 #define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
2374 EXP_START_COL, EXP_FINISH_COL) \
2375 assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
2376 (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2378 /* Test of lexing a file using libcpp, verifying tokens and their
2379 location information. */
2381 static void
2382 test_lexer (const line_table_case &case_)
2384 /* Create a tempfile and write some text to it. */
2385 const char *content =
2386 /*00000000011111111112222222222333333.3333444444444.455555555556
2387 12345678901234567890123456789012345.6789012345678.901234567890. */
2388 ("test_name /* c-style comment */\n"
2389 " \"test literal\"\n"
2390 " // test c++-style comment\n"
2391 " 42\n");
2392 temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2394 line_table_test ltt (case_);
2396 cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2398 const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2399 ASSERT_NE (fname, NULL);
2401 /* Verify that we get the expected tokens back, with the correct
2402 location information. */
2404 location_t loc;
2405 const cpp_token *tok;
2406 tok = cpp_get_token_with_location (parser, &loc);
2407 ASSERT_NE (tok, NULL);
2408 ASSERT_EQ (tok->type, CPP_NAME);
2409 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2410 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2412 tok = cpp_get_token_with_location (parser, &loc);
2413 ASSERT_NE (tok, NULL);
2414 ASSERT_EQ (tok->type, CPP_STRING);
2415 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2416 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2418 tok = cpp_get_token_with_location (parser, &loc);
2419 ASSERT_NE (tok, NULL);
2420 ASSERT_EQ (tok->type, CPP_NUMBER);
2421 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2422 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2424 tok = cpp_get_token_with_location (parser, &loc);
2425 ASSERT_NE (tok, NULL);
2426 ASSERT_EQ (tok->type, CPP_EOF);
2428 cpp_finish (parser, NULL);
2429 cpp_destroy (parser);
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.  */

class lexer_test_options
{
 public:
  /* This is a polymorphic base class, so give it a virtual destructor:
     destroying a subclass through a lexer_test_options pointer is then
     well-defined.  */
  virtual ~lexer_test_options () {}

  /* Hook for subclasses to adjust the lexer_test being constructed.  */
  virtual void apply (lexer_test &) = 0;
};
2446 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2447 in its dtor.
2449 This is needed by struct lexer_test to ensure that the cleanup of the
2450 cpp_reader happens *after* the cleanup of the temp_source_file. */
2452 class cpp_reader_ptr
2454 public:
2455 cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2457 ~cpp_reader_ptr ()
2459 cpp_finish (m_ptr, NULL);
2460 cpp_destroy (m_ptr);
2463 operator cpp_reader * () const { return m_ptr; }
2465 private:
2466 cpp_reader *m_ptr;
2469 /* A struct for writing lexer tests. */
2471 class lexer_test
2473 public:
2474 lexer_test (const line_table_case &case_, const char *content,
2475 lexer_test_options *options);
2476 ~lexer_test ();
2478 const cpp_token *get_token ();
2480 /* The ordering of these fields matters.
2481 The line_table_test must be first, since the cpp_reader_ptr
2482 uses it.
2483 The cpp_reader must be cleaned up *after* the temp_source_file
2484 since the filenames in input.cc's input cache are owned by the
2485 cpp_reader; in particular, when ~temp_source_file evicts the
2486 filename the filenames must still be alive. */
2487 line_table_test m_ltt;
2488 cpp_reader_ptr m_parser;
2489 temp_source_file m_tempfile;
2490 string_concat_db m_concats;
2491 bool m_implicitly_expect_EOF;
2494 /* Use an EBCDIC encoding for the execution charset, specifically
2495 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2497 This exercises iconv integration within libcpp.
2498 Not every build of iconv supports the given charset,
2499 so we need to flag this error and handle it gracefully. */
2501 class ebcdic_execution_charset : public lexer_test_options
2503 public:
2504 ebcdic_execution_charset () : m_num_iconv_errors (0)
2506 gcc_assert (s_singleton == NULL);
2507 s_singleton = this;
2509 ~ebcdic_execution_charset ()
2511 gcc_assert (s_singleton == this);
2512 s_singleton = NULL;
2515 void apply (lexer_test &test) final override
2517 cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2518 cpp_opts->narrow_charset = "IBM1047";
2520 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2521 callbacks->diagnostic = on_diagnostic;
2524 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2525 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2526 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2527 rich_location *richloc ATTRIBUTE_UNUSED,
2528 const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2529 ATTRIBUTE_FPTR_PRINTF(5,0)
2531 gcc_assert (s_singleton);
2532 /* Avoid exgettext from picking this up, it is translated in libcpp. */
2533 const char *msg = "conversion from %s to %s not supported by iconv";
2534 #ifdef ENABLE_NLS
2535 msg = dgettext ("cpplib", msg);
2536 #endif
2537 /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2538 when the local iconv build doesn't support the conversion. */
2539 if (strcmp (msgid, msg) == 0)
2541 s_singleton->m_num_iconv_errors++;
2542 return true;
2545 /* Otherwise, we have an unexpected error. */
2546 abort ();
2549 bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2551 private:
2552 static ebcdic_execution_charset *s_singleton;
2553 int m_num_iconv_errors;
2556 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2558 /* A lexer_test_options subclass that records a list of diagnostic
2559 messages emitted by the lexer. */
2561 class lexer_diagnostic_sink : public lexer_test_options
2563 public:
2564 lexer_diagnostic_sink ()
2566 gcc_assert (s_singleton == NULL);
2567 s_singleton = this;
2569 ~lexer_diagnostic_sink ()
2571 gcc_assert (s_singleton == this);
2572 s_singleton = NULL;
2574 int i;
2575 char *str;
2576 FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2577 free (str);
2580 void apply (lexer_test &test) final override
2582 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2583 callbacks->diagnostic = on_diagnostic;
2586 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2587 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2588 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2589 rich_location *richloc ATTRIBUTE_UNUSED,
2590 const char *msgid, va_list *ap)
2591 ATTRIBUTE_FPTR_PRINTF(5,0)
2593 char *msg = xvasprintf (msgid, *ap);
2594 s_singleton->m_diagnostics.safe_push (msg);
2595 return true;
2598 auto_vec<char *> m_diagnostics;
2600 private:
2601 static lexer_diagnostic_sink *s_singleton;
2604 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2606 /* Constructor. Override line_table with a new instance based on CASE_,
2607 and write CONTENT to a tempfile. Create a cpp_reader, and use it to
2608 start parsing the tempfile. */
2610 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2611 lexer_test_options *options)
2612 : m_ltt (case_),
2613 m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2614 /* Create a tempfile and write the text to it. */
2615 m_tempfile (SELFTEST_LOCATION, ".c", content),
2616 m_concats (),
2617 m_implicitly_expect_EOF (true)
2619 if (options)
2620 options->apply (*this);
2622 cpp_init_iconv (m_parser);
2624 /* Parse the file. */
2625 const char *fname = cpp_read_main_file (m_parser,
2626 m_tempfile.get_filename ());
2627 ASSERT_NE (fname, NULL);
2630 /* Destructor. By default, verify that the next token in m_parser is EOF. */
2632 lexer_test::~lexer_test ()
2634 location_t loc;
2635 const cpp_token *tok;
2637 if (m_implicitly_expect_EOF)
2639 tok = cpp_get_token_with_location (m_parser, &loc);
2640 ASSERT_NE (tok, NULL);
2641 ASSERT_EQ (tok->type, CPP_EOF);
2645 /* Get the next token from m_parser. */
2647 const cpp_token *
2648 lexer_test::get_token ()
2650 location_t loc;
2651 const cpp_token *tok;
2653 tok = cpp_get_token_with_location (m_parser, &loc);
2654 ASSERT_NE (tok, NULL);
2655 return tok;
2658 /* Verify that locations within string literals are correctly handled. */
2660 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2661 using the string concatenation database for TEST.
2663 Assert that the character at index IDX is on EXPECTED_LINE,
2664 and that it begins at column EXPECTED_START_COL and ends at
2665 EXPECTED_FINISH_COL (unless the locations are beyond
2666 LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2667 columns). */
2669 static void
2670 assert_char_at_range (const location &loc,
2671 lexer_test& test,
2672 location_t strloc, enum cpp_ttype type, int idx,
2673 int expected_line, int expected_start_col,
2674 int expected_finish_col)
2676 cpp_reader *pfile = test.m_parser;
2677 string_concat_db *concats = &test.m_concats;
2679 source_range actual_range = source_range();
2680 const char *err
2681 = get_source_range_for_char (pfile, concats, strloc, type, idx,
2682 &actual_range);
2683 if (should_have_column_data_p (strloc))
2684 ASSERT_EQ_AT (loc, NULL, err);
2685 else
2687 ASSERT_STREQ_AT (loc,
2688 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2689 err);
2690 return;
2693 int actual_start_line = LOCATION_LINE (actual_range.m_start);
2694 ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2695 int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2696 ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2698 if (should_have_column_data_p (actual_range.m_start))
2700 int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2701 ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2703 if (should_have_column_data_p (actual_range.m_finish))
2705 int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2706 ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2710 /* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
2711 the effective location of any errors. */
2713 #define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
2714 EXPECTED_START_COL, EXPECTED_FINISH_COL) \
2715 assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
2716 (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
2717 (EXPECTED_FINISH_COL))
2719 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2720 using the string concatenation database for TEST.
2722 Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES. */
2724 static void
2725 assert_num_substring_ranges (const location &loc,
2726 lexer_test& test,
2727 location_t strloc,
2728 enum cpp_ttype type,
2729 int expected_num_ranges)
2731 cpp_reader *pfile = test.m_parser;
2732 string_concat_db *concats = &test.m_concats;
2734 int actual_num_ranges = -1;
2735 const char *err
2736 = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2737 &actual_num_ranges);
2738 if (should_have_column_data_p (strloc))
2739 ASSERT_EQ_AT (loc, NULL, err);
2740 else
2742 ASSERT_STREQ_AT (loc,
2743 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2744 err);
2745 return;
2747 ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2750 /* Macro for calling assert_num_substring_ranges, supplying
2751 SELFTEST_LOCATION for the effective location of any errors. */
2753 #define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
2754 EXPECTED_NUM_RANGES) \
2755 assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
2756 (TYPE), (EXPECTED_NUM_RANGES))
2759 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2760 returns an error (using the string concatenation database for TEST). */
2762 static void
2763 assert_has_no_substring_ranges (const location &loc,
2764 lexer_test& test,
2765 location_t strloc,
2766 enum cpp_ttype type,
2767 const char *expected_err)
2769 cpp_reader *pfile = test.m_parser;
2770 string_concat_db *concats = &test.m_concats;
2771 cpp_substring_ranges ranges;
2772 const char *actual_err
2773 = get_substring_ranges_for_loc (pfile, concats, strloc,
2774 type, ranges);
2775 if (should_have_column_data_p (strloc))
2776 ASSERT_STREQ_AT (loc, expected_err, actual_err);
2777 else
2778 ASSERT_STREQ_AT (loc,
2779 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2780 actual_err);
2783 #define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR) \
2784 assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
2785 (STRLOC), (TYPE), (ERR))
2787 /* Lex a simple string literal. Verify the substring location data, before
2788 and after running cpp_interpret_string on it. */
2790 static void
2791 test_lexer_string_locations_simple (const line_table_case &case_)
2793 /* Digits 0-9 (with 0 at column 10), the simple way.
2794 ....................000000000.11111111112.2222222223333333333
2795 ....................123456789.01234567890.1234567890123456789
2796 We add a trailing comment to ensure that we correctly locate
2797 the end of the string literal token. */
2798 const char *content = " \"0123456789\" /* not a string */\n";
2799 lexer_test test (case_, content, NULL);
2801 /* Verify that we get the expected token back, with the correct
2802 location information. */
2803 const cpp_token *tok = test.get_token ();
2804 ASSERT_EQ (tok->type, CPP_STRING);
2805 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2806 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2808 /* At this point in lexing, the quote characters are treated as part of
2809 the string (they are stripped off by cpp_interpret_string). */
2811 ASSERT_EQ (tok->val.str.len, 12);
2813 /* Verify that cpp_interpret_string works. */
2814 cpp_string dst_string;
2815 const enum cpp_ttype type = CPP_STRING;
2816 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2817 &dst_string, type);
2818 ASSERT_TRUE (result);
2819 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2820 free (const_cast <unsigned char *> (dst_string.text));
2822 /* Verify ranges of individual characters. This no longer includes the
2823 opening quote, but does include the closing quote. */
2824 for (int i = 0; i <= 10; i++)
2825 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2826 10 + i, 10 + i);
2828 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2831 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2832 encoding. */
2834 static void
2835 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2837 /* EBCDIC support requires iconv. */
2838 if (!HAVE_ICONV)
2839 return;
2841 /* Digits 0-9 (with 0 at column 10), the simple way.
2842 ....................000000000.11111111112.2222222223333333333
2843 ....................123456789.01234567890.1234567890123456789
2844 We add a trailing comment to ensure that we correctly locate
2845 the end of the string literal token. */
2846 const char *content = " \"0123456789\" /* not a string */\n";
2847 ebcdic_execution_charset use_ebcdic;
2848 lexer_test test (case_, content, &use_ebcdic);
2850 /* Verify that we get the expected token back, with the correct
2851 location information. */
2852 const cpp_token *tok = test.get_token ();
2853 ASSERT_EQ (tok->type, CPP_STRING);
2854 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2855 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2857 /* At this point in lexing, the quote characters are treated as part of
2858 the string (they are stripped off by cpp_interpret_string). */
2860 ASSERT_EQ (tok->val.str.len, 12);
2862 /* The remainder of the test requires an iconv implementation that
2863 can convert from UTF-8 to the EBCDIC encoding requested above. */
2864 if (use_ebcdic.iconv_errors_occurred_p ())
2865 return;
2867 /* Verify that cpp_interpret_string works. */
2868 cpp_string dst_string;
2869 const enum cpp_ttype type = CPP_STRING;
2870 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2871 &dst_string, type);
2872 ASSERT_TRUE (result);
2873 /* We should now have EBCDIC-encoded text, specifically
2874 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2875 The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
2876 ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2877 (const char *)dst_string.text);
2878 free (const_cast <unsigned char *> (dst_string.text));
2880 /* Verify that we don't attempt to record substring location information
2881 for such cases. */
2882 ASSERT_HAS_NO_SUBSTRING_RANGES
2883 (test, tok->src_loc, type,
2884 "execution character set != source character set");
2887 /* Lex a string literal containing a hex-escaped character.
2888 Verify the substring location data, before and after running
2889 cpp_interpret_string on it. */
2891 static void
2892 test_lexer_string_locations_hex (const line_table_case &case_)
2894 /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2895 and with a space in place of digit 6, to terminate the escaped
2896 hex code.
2897 ....................000000000.111111.11112222.
2898 ....................123456789.012345.67890123. */
2899 const char *content = " \"01234\\x35 789\"\n";
2900 lexer_test test (case_, content, NULL);
2902 /* Verify that we get the expected token back, with the correct
2903 location information. */
2904 const cpp_token *tok = test.get_token ();
2905 ASSERT_EQ (tok->type, CPP_STRING);
2906 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2907 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2909 /* At this point in lexing, the quote characters are treated as part of
2910 the string (they are stripped off by cpp_interpret_string). */
2911 ASSERT_EQ (tok->val.str.len, 15);
2913 /* Verify that cpp_interpret_string works. */
2914 cpp_string dst_string;
2915 const enum cpp_ttype type = CPP_STRING;
2916 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2917 &dst_string, type);
2918 ASSERT_TRUE (result);
2919 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2920 free (const_cast <unsigned char *> (dst_string.text));
2922 /* Verify ranges of individual characters. This no longer includes the
2923 opening quote, but does include the closing quote. */
2924 for (int i = 0; i <= 4; i++)
2925 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2926 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2927 for (int i = 6; i <= 10; i++)
2928 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2930 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2933 /* Lex a string literal containing an octal-escaped character.
2934 Verify the substring location data after running cpp_interpret_string
2935 on it. */
2937 static void
2938 test_lexer_string_locations_oct (const line_table_case &case_)
2940 /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2941 and with a space in place of digit 6, to terminate the escaped
2942 octal code.
2943 ....................000000000.111111.11112222.2222223333333333444
2944 ....................123456789.012345.67890123.4567890123456789012 */
2945 const char *content = " \"01234\\065 789\" /* not a string */\n";
2946 lexer_test test (case_, content, NULL);
2948 /* Verify that we get the expected token back, with the correct
2949 location information. */
2950 const cpp_token *tok = test.get_token ();
2951 ASSERT_EQ (tok->type, CPP_STRING);
2952 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2954 /* Verify that cpp_interpret_string works. */
2955 cpp_string dst_string;
2956 const enum cpp_ttype type = CPP_STRING;
2957 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2958 &dst_string, type);
2959 ASSERT_TRUE (result);
2960 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2961 free (const_cast <unsigned char *> (dst_string.text));
2963 /* Verify ranges of individual characters. This no longer includes the
2964 opening quote, but does include the closing quote. */
2965 for (int i = 0; i < 5; i++)
2966 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2967 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2968 for (int i = 6; i <= 10; i++)
2969 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2971 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2974 /* Test of string literal containing letter escapes. */
2976 static void
2977 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2979 /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2980 .....................000000000.1.11111.1.1.11222.22222223333333
2981 .....................123456789.0.12345.6.7.89012.34567890123456. */
2982 const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2983 lexer_test test (case_, content, NULL);
2985 /* Verify that we get the expected tokens back. */
2986 const cpp_token *tok = test.get_token ();
2987 ASSERT_EQ (tok->type, CPP_STRING);
2988 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2990 /* Verify ranges of individual characters. */
2991 /* "\t". */
2992 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2993 0, 1, 10, 11);
2994 /* "foo". */
2995 for (int i = 1; i <= 3; i++)
2996 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2997 i, 1, 11 + i, 11 + i);
2998 /* "\\" and "\n". */
2999 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3000 4, 1, 15, 16);
3001 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3002 5, 1, 17, 18);
3004 /* "bar" and closing quote for nul-terminator. */
3005 for (int i = 6; i <= 9; i++)
3006 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3007 i, 1, 13 + i, 13 + i);
3009 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
3012 /* Another test of a string literal containing a letter escape.
3013 Based on string seen in
3014 printf ("%-%\n");
3015 in gcc.dg/format/c90-printf-1.c. */
3017 static void
3018 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
3020 /* .....................000000000.1111.11.1111.22222222223.
3021 .....................123456789.0123.45.6789.01234567890. */
3022 const char *content = (" \"%-%\\n\" /* non-str */\n");
3023 lexer_test test (case_, content, NULL);
3025 /* Verify that we get the expected tokens back. */
3026 const cpp_token *tok = test.get_token ();
3027 ASSERT_EQ (tok->type, CPP_STRING);
3028 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
3030 /* Verify ranges of individual characters. */
3031 /* "%-%". */
3032 for (int i = 0; i < 3; i++)
3033 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3034 i, 1, 10 + i, 10 + i);
3035 /* "\n". */
3036 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3037 3, 1, 13, 14);
3039 /* Closing quote for nul-terminator. */
3040 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3041 4, 1, 15, 15);
3043 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3046 /* Lex a string literal containing UCN 4 characters.
3047 Verify the substring location data after running cpp_interpret_string
3048 on it. */
3050 static void
3051 test_lexer_string_locations_ucn4 (const line_table_case &case_)
3053 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3054 as UCN 4.
3055 ....................000000000.111111.111122.222222223.33333333344444
3056 ....................123456789.012345.678901.234567890.12345678901234 */
3057 const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
3058 lexer_test test (case_, content, NULL);
3060 /* Verify that we get the expected token back, with the correct
3061 location information. */
3062 const cpp_token *tok = test.get_token ();
3063 ASSERT_EQ (tok->type, CPP_STRING);
3064 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3066 /* Verify that cpp_interpret_string works.
3067 The string should be encoded in the execution character
3068 set. Assuming that is UTF-8, we should have the following:
3069 ----------- ---- ----- ------- ----------------
3070 Byte offset Byte Octal Unicode Source Column(s)
3071 ----------- ---- ----- ------- ----------------
3072 0 0x30 '0' 10
3073 1 0x31 '1' 11
3074 2 0x32 '2' 12
3075 3 0x33 '3' 13
3076 4 0x34 '4' 14
3077 5 0xE2 \342 U+2174 15-20
3078 6 0x85 \205 (cont) 15-20
3079 7 0xB4 \264 (cont) 15-20
3080 8 0xE2 \342 U+2175 21-26
3081 9 0x85 \205 (cont) 21-26
3082 10 0xB5 \265 (cont) 21-26
3083 11 0x37 '7' 27
3084 12 0x38 '8' 28
3085 13 0x39 '9' 29
3086 14 0x00 30 (closing quote)
3087 ----------- ---- ----- ------- ---------------. */
3089 cpp_string dst_string;
3090 const enum cpp_ttype type = CPP_STRING;
3091 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3092 &dst_string, type);
3093 ASSERT_TRUE (result);
3094 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3095 (const char *)dst_string.text);
3096 free (const_cast <unsigned char *> (dst_string.text));
3098 /* Verify ranges of individual characters. This no longer includes the
3099 opening quote, but does include the closing quote.
3100 '01234'. */
3101 for (int i = 0; i <= 4; i++)
3102 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3103 /* U+2174. */
3104 for (int i = 5; i <= 7; i++)
3105 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3106 /* U+2175. */
3107 for (int i = 8; i <= 10; i++)
3108 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3109 /* '789' and nul terminator */
3110 for (int i = 11; i <= 14; i++)
3111 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
3113 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3116 /* Lex a string literal containing UCN 8 characters.
3117 Verify the substring location data after running cpp_interpret_string
3118 on it. */
3120 static void
3121 test_lexer_string_locations_ucn8 (const line_table_case &case_)
3123 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3124 ....................000000000.111111.1111222222.2222333333333.344444
3125 ....................123456789.012345.6789012345.6789012345678.901234 */
3126 const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
3127 lexer_test test (case_, content, NULL);
3129 /* Verify that we get the expected token back, with the correct
3130 location information. */
3131 const cpp_token *tok = test.get_token ();
3132 ASSERT_EQ (tok->type, CPP_STRING);
3133 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3134 "\"01234\\U00002174\\U00002175789\"");
3136 /* Verify that cpp_interpret_string works.
3137 The UTF-8 encoding of the string is identical to that from
3138 the ucn4 testcase above; the only difference is the column
3139 locations. */
3140 cpp_string dst_string;
3141 const enum cpp_ttype type = CPP_STRING;
3142 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3143 &dst_string, type);
3144 ASSERT_TRUE (result);
3145 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3146 (const char *)dst_string.text);
3147 free (const_cast <unsigned char *> (dst_string.text));
3149 /* Verify ranges of individual characters. This no longer includes the
3150 opening quote, but does include the closing quote.
3151 '01234'. */
3152 for (int i = 0; i <= 4; i++)
3153 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3154 /* U+2174. */
3155 for (int i = 5; i <= 7; i++)
3156 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3157 /* U+2175. */
3158 for (int i = 8; i <= 10; i++)
3159 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3160 /* '789' at columns 35-37 */
3161 for (int i = 11; i <= 13; i++)
3162 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3163 /* Closing quote/nul-terminator at column 38. */
3164 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3166 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
/* Fetch a big-endian 32-bit value and convert to host endianness.

   PTR_BE_VALUE points at four octets stored most-significant first.
   The octets are read individually through an unsigned char pointer,
   so the result does not depend on the endianness of the host.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *buf = (const unsigned char *)ptr_be_value;
  /* Widen each octet to uint32_t before shifting so that the top byte
     is never shifted within a (signed) promoted int.  */
  return (((uint32_t) buf[0] << 24)
          | ((uint32_t) buf[1] << 16)
          | ((uint32_t) buf[2] << 8)
          | (uint32_t) buf[3]);
}
3181 /* Lex a wide string literal and verify that attempts to read substring
3182 location data from it fail gracefully. */
3184 static void
3185 test_lexer_string_locations_wide_string (const line_table_case &case_)
3187 /* Digits 0-9.
3188 ....................000000000.11111111112.22222222233333
3189 ....................123456789.01234567890.12345678901234 */
3190 const char *content = " L\"0123456789\" /* non-str */\n";
3191 lexer_test test (case_, content, NULL);
3193 /* Verify that we get the expected token back, with the correct
3194 location information. */
3195 const cpp_token *tok = test.get_token ();
3196 ASSERT_EQ (tok->type, CPP_WSTRING);
3197 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3199 /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
3200 cpp_string dst_string;
3201 const enum cpp_ttype type = CPP_WSTRING;
3202 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3203 &dst_string, type);
3204 ASSERT_TRUE (result);
3205 /* The cpp_reader defaults to big-endian with
3206 CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3207 now be encoded as UTF-32BE. */
3208 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3209 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3210 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3211 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3212 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3213 free (const_cast <unsigned char *> (dst_string.text));
3215 /* We don't yet support generating substring location information
3216 for L"" strings. */
3217 ASSERT_HAS_NO_SUBSTRING_RANGES
3218 (test, tok->src_loc, type,
3219 "execution character set != source character set");
/* Fetch a big-endian 16-bit value and convert to host endianness.

   PTR_BE_VALUE points at two octets stored most-significant first.
   The octets are read individually through an unsigned char pointer,
   so the result does not depend on the endianness of the host.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *buf = (const unsigned char *)ptr_be_value;
  /* The shift/or is evaluated in a promoted type wider than 16 bits;
     narrow the result back to uint16_t explicitly.  */
  return (uint16_t) (((unsigned) buf[0] << 8) | (unsigned) buf[1]);
}
3231 /* Lex a u"" string literal and verify that attempts to read substring
3232 location data from it fail gracefully. */
3234 static void
3235 test_lexer_string_locations_string16 (const line_table_case &case_)
3237 /* Digits 0-9.
3238 ....................000000000.11111111112.22222222233333
3239 ....................123456789.01234567890.12345678901234 */
3240 const char *content = " u\"0123456789\" /* non-str */\n";
3241 lexer_test test (case_, content, NULL);
3243 /* Verify that we get the expected token back, with the correct
3244 location information. */
3245 const cpp_token *tok = test.get_token ();
3246 ASSERT_EQ (tok->type, CPP_STRING16);
3247 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3249 /* Verify that cpp_interpret_string works, using CPP_STRING16. */
3250 cpp_string dst_string;
3251 const enum cpp_ttype type = CPP_STRING16;
3252 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3253 &dst_string, type);
3254 ASSERT_TRUE (result);
3256 /* The cpp_reader defaults to big-endian, so dst_string should
3257 now be encoded as UTF-16BE. */
3258 const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3259 ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3260 ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3261 ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3262 ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3263 free (const_cast <unsigned char *> (dst_string.text));
3265 /* We don't yet support generating substring location information
3266 for u"" strings. */
3267 ASSERT_HAS_NO_SUBSTRING_RANGES
3268 (test, tok->src_loc, type,
3269 "execution character set != source character set");
3272 /* Lex a U"" string literal and verify that attempts to read substring
3273 location data from it fail gracefully. */
3275 static void
3276 test_lexer_string_locations_string32 (const line_table_case &case_)
3278 /* Digits 0-9.
3279 ....................000000000.11111111112.22222222233333
3280 ....................123456789.01234567890.12345678901234 */
3281 const char *content = " U\"0123456789\" /* non-str */\n";
3282 lexer_test test (case_, content, NULL);
3284 /* Verify that we get the expected token back, with the correct
3285 location information. */
3286 const cpp_token *tok = test.get_token ();
3287 ASSERT_EQ (tok->type, CPP_STRING32);
3288 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3290 /* Verify that cpp_interpret_string works, using CPP_STRING32. */
3291 cpp_string dst_string;
3292 const enum cpp_ttype type = CPP_STRING32;
3293 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3294 &dst_string, type);
3295 ASSERT_TRUE (result);
3297 /* The cpp_reader defaults to big-endian, so dst_string should
3298 now be encoded as UTF-32BE. */
3299 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3300 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3301 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3302 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3303 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3304 free (const_cast <unsigned char *> (dst_string.text));
3306 /* We don't yet support generating substring location information
3307 for U"" strings. */
3308 ASSERT_HAS_NO_SUBSTRING_RANGES
3309 (test, tok->src_loc, type,
3310 "execution character set != source character set");
3313 /* Lex a u8-string literal.
3314 Verify the substring location data after running cpp_interpret_string
3315 on it. */
3317 static void
3318 test_lexer_string_locations_u8 (const line_table_case &case_)
3320 /* Digits 0-9.
3321 ....................000000000.11111111112.22222222233333
3322 ....................123456789.01234567890.12345678901234 */
3323 const char *content = " u8\"0123456789\" /* non-str */\n";
3324 lexer_test test (case_, content, NULL);
3326 /* Verify that we get the expected token back, with the correct
3327 location information. */
3328 const cpp_token *tok = test.get_token ();
3329 ASSERT_EQ (tok->type, CPP_UTF8STRING);
3330 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3332 /* Verify that cpp_interpret_string works. */
3333 cpp_string dst_string;
3334 const enum cpp_ttype type = CPP_STRING;
3335 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3336 &dst_string, type);
3337 ASSERT_TRUE (result);
3338 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3339 free (const_cast <unsigned char *> (dst_string.text));
3341 /* Verify ranges of individual characters. This no longer includes the
3342 opening quote, but does include the closing quote. */
3343 for (int i = 0; i <= 10; i++)
3344 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3347 /* Lex a string literal containing UTF-8 source characters.
3348 Verify the substring location data after running cpp_interpret_string
3349 on it. */
3351 static void
3352 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3354 /* This string literal is written out to the source file as UTF-8,
3355 and is of the form "before mojibake after", where "mojibake"
3356 is written as the following four unicode code points:
3357 U+6587 CJK UNIFIED IDEOGRAPH-6587
3358 U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3359 U+5316 CJK UNIFIED IDEOGRAPH-5316
3360 U+3051 HIRAGANA LETTER KE.
3361 Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3362 "before" and "after" are 1 byte per unicode character.
3364 The numbering shown are "columns", which are *byte* numbers within
3365 the line, rather than unicode character numbers.
3367 .................... 000000000.1111111.
3368 .................... 123456789.0123456. */
3369 const char *content = (" \"before "
3370 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3371 UTF-8: 0xE6 0x96 0x87
3372 C octal escaped UTF-8: \346\226\207
3373 "column" numbers: 17-19. */
3374 "\346\226\207"
3376 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3377 UTF-8: 0xE5 0xAD 0x97
3378 C octal escaped UTF-8: \345\255\227
3379 "column" numbers: 20-22. */
3380 "\345\255\227"
3382 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3383 UTF-8: 0xE5 0x8C 0x96
3384 C octal escaped UTF-8: \345\214\226
3385 "column" numbers: 23-25. */
3386 "\345\214\226"
3388 /* U+3051 HIRAGANA LETTER KE
3389 UTF-8: 0xE3 0x81 0x91
3390 C octal escaped UTF-8: \343\201\221
3391 "column" numbers: 26-28. */
3392 "\343\201\221"
3394 /* column numbers 29 onwards
3395 2333333.33334444444444
3396 9012345.67890123456789. */
3397 " after\" /* non-str */\n");
3398 lexer_test test (case_, content, NULL);
3400 /* Verify that we get the expected token back, with the correct
3401 location information. */
3402 const cpp_token *tok = test.get_token ();
3403 ASSERT_EQ (tok->type, CPP_STRING);
3404 ASSERT_TOKEN_AS_TEXT_EQ
3405 (test.m_parser, tok,
3406 "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3408 /* Verify that cpp_interpret_string works. */
3409 cpp_string dst_string;
3410 const enum cpp_ttype type = CPP_STRING;
3411 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3412 &dst_string, type);
3413 ASSERT_TRUE (result);
3414 ASSERT_STREQ
3415 ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3416 (const char *)dst_string.text);
3417 free (const_cast <unsigned char *> (dst_string.text));
3419 /* Verify ranges of individual characters. This no longer includes the
3420 opening quote, but does include the closing quote.
3421 Assuming that both source and execution encodings are UTF-8, we have
3422 a run of 25 octets in each, plus the NUL terminator. */
3423 for (int i = 0; i < 25; i++)
3424 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3425 /* NUL-terminator should use the closing quote at column 35. */
3426 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3428 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3431 /* Test of string literal concatenation. */
3433 static void
3434 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3436 /* Digits 0-9.
3437 .....................000000000.111111.11112222222222
3438 .....................123456789.012345.67890123456789. */
3439 const char *content = (" \"01234\" /* non-str */\n"
3440 " \"56789\" /* non-str */\n");
3441 lexer_test test (case_, content, NULL);
3443 location_t input_locs[2];
3445 /* Verify that we get the expected tokens back. */
3446 auto_vec <cpp_string> input_strings;
3447 const cpp_token *tok_a = test.get_token ();
3448 ASSERT_EQ (tok_a->type, CPP_STRING);
3449 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3450 input_strings.safe_push (tok_a->val.str);
3451 input_locs[0] = tok_a->src_loc;
3453 const cpp_token *tok_b = test.get_token ();
3454 ASSERT_EQ (tok_b->type, CPP_STRING);
3455 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3456 input_strings.safe_push (tok_b->val.str);
3457 input_locs[1] = tok_b->src_loc;
3459 /* Verify that cpp_interpret_string works. */
3460 cpp_string dst_string;
3461 const enum cpp_ttype type = CPP_STRING;
3462 bool result = cpp_interpret_string (test.m_parser,
3463 input_strings.address (), 2,
3464 &dst_string, type);
3465 ASSERT_TRUE (result);
3466 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3467 free (const_cast <unsigned char *> (dst_string.text));
3469 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3470 test.m_concats.record_string_concatenation (2, input_locs);
3472 location_t initial_loc = input_locs[0];
3474 /* "01234" on line 1. */
3475 for (int i = 0; i <= 4; i++)
3476 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3477 /* "56789" in line 2, plus its closing quote for the nul terminator. */
3478 for (int i = 5; i <= 10; i++)
3479 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3481 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3484 /* Another test of string literal concatenation. */
3486 static void
3487 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3489 /* Digits 0-9.
3490 .....................000000000.111.11111112222222
3491 .....................123456789.012.34567890123456. */
3492 const char *content = (" \"01\" /* non-str */\n"
3493 " \"23\" /* non-str */\n"
3494 " \"45\" /* non-str */\n"
3495 " \"67\" /* non-str */\n"
3496 " \"89\" /* non-str */\n");
3497 lexer_test test (case_, content, NULL);
3499 auto_vec <cpp_string> input_strings;
3500 location_t input_locs[5];
3502 /* Verify that we get the expected tokens back. */
3503 for (int i = 0; i < 5; i++)
3505 const cpp_token *tok = test.get_token ();
3506 ASSERT_EQ (tok->type, CPP_STRING);
3507 input_strings.safe_push (tok->val.str);
3508 input_locs[i] = tok->src_loc;
3511 /* Verify that cpp_interpret_string works. */
3512 cpp_string dst_string;
3513 const enum cpp_ttype type = CPP_STRING;
3514 bool result = cpp_interpret_string (test.m_parser,
3515 input_strings.address (), 5,
3516 &dst_string, type);
3517 ASSERT_TRUE (result);
3518 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3519 free (const_cast <unsigned char *> (dst_string.text));
3521 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3522 test.m_concats.record_string_concatenation (5, input_locs);
3524 location_t initial_loc = input_locs[0];
3526 /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3527 detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3528 and expect get_source_range_for_substring to fail.
3529 However, for a string concatenation test, we can have a case
3530 where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3531 but subsequent strings can be after it.
3532 Attempting to detect this within assert_char_at_range
3533 would overcomplicate the logic for the common test cases, so
3534 we detect it here. */
3535 if (should_have_column_data_p (input_locs[0])
3536 && !should_have_column_data_p (input_locs[4]))
3538 /* Verify that get_source_range_for_substring gracefully rejects
3539 this case. */
3540 source_range actual_range;
3541 const char *err
3542 = get_source_range_for_char (test.m_parser, &test.m_concats,
3543 initial_loc, type, 0, &actual_range);
3544 ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3545 return;
3548 for (int i = 0; i < 5; i++)
3549 for (int j = 0; j < 2; j++)
3550 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3551 i + 1, 10 + j, 10 + j);
3553 /* NUL-terminator should use the final closing quote at line 5 column 12. */
3554 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3556 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3559 /* Another test of string literal concatenation, this time combined with
3560 various kinds of escaped characters. */
3562 static void
3563 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3565 /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3566 digit 6 in ASCII as octal "\066", concatenating multiple strings. */
3567 const char *content
3568 /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3569 .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3570 = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
3571 lexer_test test (case_, content, NULL);
3573 auto_vec <cpp_string> input_strings;
3574 location_t input_locs[4];
3576 /* Verify that we get the expected tokens back. */
3577 for (int i = 0; i < 4; i++)
3579 const cpp_token *tok = test.get_token ();
3580 ASSERT_EQ (tok->type, CPP_STRING);
3581 input_strings.safe_push (tok->val.str);
3582 input_locs[i] = tok->src_loc;
3585 /* Verify that cpp_interpret_string works. */
3586 cpp_string dst_string;
3587 const enum cpp_ttype type = CPP_STRING;
3588 bool result = cpp_interpret_string (test.m_parser,
3589 input_strings.address (), 4,
3590 &dst_string, type);
3591 ASSERT_TRUE (result);
3592 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3593 free (const_cast <unsigned char *> (dst_string.text));
3595 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3596 test.m_concats.record_string_concatenation (4, input_locs);
3598 location_t initial_loc = input_locs[0];
3600 for (int i = 0; i <= 4; i++)
3601 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3602 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3603 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3604 for (int i = 7; i <= 9; i++)
3605 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3607 /* NUL-terminator should use the location of the final closing quote. */
3608 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3610 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3613 /* Test of string literal in a macro. */
3615 static void
3616 test_lexer_string_locations_macro (const line_table_case &case_)
3618 /* Digits 0-9.
3619 .....................0000000001111111111.22222222223.
3620 .....................1234567890123456789.01234567890. */
3621 const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
3622 " MACRO");
3623 lexer_test test (case_, content, NULL);
3625 /* Verify that we get the expected tokens back. */
3626 const cpp_token *tok = test.get_token ();
3627 ASSERT_EQ (tok->type, CPP_PADDING);
3629 tok = test.get_token ();
3630 ASSERT_EQ (tok->type, CPP_STRING);
3631 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3633 /* Verify ranges of individual characters. We ought to
3634 see columns within the macro definition. */
3635 for (int i = 0; i <= 10; i++)
3636 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3637 i, 1, 20 + i, 20 + i);
3639 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3641 tok = test.get_token ();
3642 ASSERT_EQ (tok->type, CPP_PADDING);
3645 /* Test of stringification of a macro argument. */
3647 static void
3648 test_lexer_string_locations_stringified_macro_argument
3649 (const line_table_case &case_)
3651 /* .....................000000000111111111122222222223.
3652 .....................123456789012345678901234567890. */
3653 const char *content = ("#define MACRO(X) #X /* non-str */\n"
3654 "MACRO(foo)\n");
3655 lexer_test test (case_, content, NULL);
3657 /* Verify that we get the expected token back. */
3658 const cpp_token *tok = test.get_token ();
3659 ASSERT_EQ (tok->type, CPP_PADDING);
3661 tok = test.get_token ();
3662 ASSERT_EQ (tok->type, CPP_STRING);
3663 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3665 /* We don't support getting the location of a stringified macro
3666 argument. Verify that it fails gracefully. */
3667 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3668 "cpp_interpret_string_1 failed");
3670 tok = test.get_token ();
3671 ASSERT_EQ (tok->type, CPP_PADDING);
3673 tok = test.get_token ();
3674 ASSERT_EQ (tok->type, CPP_PADDING);
3677 /* Ensure that we fail gracefully if something attempts to pass
3678 in a location that isn't a string literal token. Seen on this code:
3680 const char a[] = " %d ";
3681 __builtin_printf (a, 0.5);
3684 when c-format.cc erroneously used the indicated one-character
3685 location as the format string location, leading to a read past the
3686 end of a string buffer in cpp_interpret_string_1. */
3688 static void
3689 test_lexer_string_locations_non_string (const line_table_case &case_)
3691 /* .....................000000000111111111122222222223.
3692 .....................123456789012345678901234567890. */
3693 const char *content = (" a\n");
3694 lexer_test test (case_, content, NULL);
3696 /* Verify that we get the expected token back. */
3697 const cpp_token *tok = test.get_token ();
3698 ASSERT_EQ (tok->type, CPP_NAME);
3699 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3701 /* At this point, libcpp is attempting to interpret the name as a
3702 string literal, despite it not starting with a quote. We don't detect
3703 that, but we should at least fail gracefully. */
3704 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3705 "cpp_interpret_string_1 failed");
3708 /* Ensure that we can read substring information for a token which
3709 starts in one linemap and ends in another. Adapted from
3710 gcc.dg/cpp/pr69985.c. */
3712 static void
3713 test_lexer_string_locations_long_line (const line_table_case &case_)
3715 /* .....................000000.000111111111
3716 .....................123456.789012345678. */
3717 const char *content = ("/* A very long line, so that we start a new line map. */\n"
3718 " \"0123456789012345678901234567890123456789"
3719 "0123456789012345678901234567890123456789"
3720 "0123456789012345678901234567890123456789"
3721 "0123456789\"\n");
3723 lexer_test test (case_, content, NULL);
3725 /* Verify that we get the expected token back. */
3726 const cpp_token *tok = test.get_token ();
3727 ASSERT_EQ (tok->type, CPP_STRING);
3729 if (!should_have_column_data_p (line_table->highest_location))
3730 return;
3732 /* Verify ranges of individual characters. */
3733 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3734 for (int i = 0; i < 131; i++)
3735 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3736 i, 2, 7 + i, 7 + i);
3739 /* Test of locations within a raw string that doesn't contain a newline. */
3741 static void
3742 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3744 /* .....................00.0000000111111111122.
3745 .....................12.3456789012345678901. */
3746 const char *content = ("R\"foo(0123456789)foo\"\n");
3747 lexer_test test (case_, content, NULL);
3749 /* Verify that we get the expected token back. */
3750 const cpp_token *tok = test.get_token ();
3751 ASSERT_EQ (tok->type, CPP_STRING);
3753 /* Verify that cpp_interpret_string works. */
3754 cpp_string dst_string;
3755 const enum cpp_ttype type = CPP_STRING;
3756 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3757 &dst_string, type);
3758 ASSERT_TRUE (result);
3759 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3760 free (const_cast <unsigned char *> (dst_string.text));
3762 if (!should_have_column_data_p (line_table->highest_location))
3763 return;
3765 /* 0-9, plus the nul terminator. */
3766 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3767 for (int i = 0; i < 11; i++)
3768 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3769 i, 1, 7 + i, 7 + i);
3772 /* Test of locations within a raw string that contains a newline. */
3774 static void
3775 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3777 /* .....................00.0000.
3778 .....................12.3456. */
3779 const char *content = ("R\"foo(\n"
3780 /* .....................00000.
3781 .....................12345. */
3782 "hello\n"
3783 "world\n"
3784 /* .....................00000.
3785 .....................12345. */
3786 ")foo\"\n");
3787 lexer_test test (case_, content, NULL);
3789 /* Verify that we get the expected token back. */
3790 const cpp_token *tok = test.get_token ();
3791 ASSERT_EQ (tok->type, CPP_STRING);
3793 /* Verify that cpp_interpret_string works. */
3794 cpp_string dst_string;
3795 const enum cpp_ttype type = CPP_STRING;
3796 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3797 &dst_string, type);
3798 ASSERT_TRUE (result);
3799 ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3800 free (const_cast <unsigned char *> (dst_string.text));
3802 if (!should_have_column_data_p (line_table->highest_location))
3803 return;
3805 /* Currently we don't support locations within raw strings that
3806 contain newlines. */
3807 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3808 "range endpoints are on different lines");
3811 /* Test of parsing an unterminated raw string. */
3813 static void
3814 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3816 const char *content = "R\"ouch()ouCh\" /* etc */";
3818 lexer_diagnostic_sink diagnostics;
3819 lexer_test test (case_, content, &diagnostics);
3820 test.m_implicitly_expect_EOF = false;
3822 /* Attempt to parse the raw string. */
3823 const cpp_token *tok = test.get_token ();
3824 ASSERT_EQ (tok->type, CPP_EOF);
3826 ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3827 /* We expect the message "unterminated raw string"
3828 in the "cpplib" translation domain.
3829 It's not clear that dgettext is available on all supported hosts,
3830 so this assertion is commented-out for now.
3831 ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3832 diagnostics.m_diagnostics[0]);
3836 /* Test of lexing char constants. */
3838 static void
3839 test_lexer_char_constants (const line_table_case &case_)
3841 /* Various char constants.
3842 .....................0000000001111111111.22222222223.
3843 .....................1234567890123456789.01234567890. */
3844 const char *content = (" 'a'\n"
3845 " u'a'\n"
3846 " U'a'\n"
3847 " L'a'\n"
3848 " 'abc'\n");
3849 lexer_test test (case_, content, NULL);
3851 /* Verify that we get the expected tokens back. */
3852 /* 'a'. */
3853 const cpp_token *tok = test.get_token ();
3854 ASSERT_EQ (tok->type, CPP_CHAR);
3855 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3857 unsigned int chars_seen;
3858 int unsignedp;
3859 cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3860 &chars_seen, &unsignedp);
3861 ASSERT_EQ (cc, 'a');
3862 ASSERT_EQ (chars_seen, 1);
3864 /* u'a'. */
3865 tok = test.get_token ();
3866 ASSERT_EQ (tok->type, CPP_CHAR16);
3867 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3869 /* U'a'. */
3870 tok = test.get_token ();
3871 ASSERT_EQ (tok->type, CPP_CHAR32);
3872 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3874 /* L'a'. */
3875 tok = test.get_token ();
3876 ASSERT_EQ (tok->type, CPP_WCHAR);
3877 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3879 /* 'abc' (c-char-sequence). */
3880 tok = test.get_token ();
3881 ASSERT_EQ (tok->type, CPP_CHAR);
3882 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3884 /* A table of interesting location_t values, giving one axis of our test
3885 matrix. */
3887 static const location_t boundary_locations[] = {
3888 /* Zero means "don't override the default values for a new line_table". */
3891 /* An arbitrary non-zero value that isn't close to one of
3892 the boundary values below. */
3893 0x10000,
3895 /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES. */
3896 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3897 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3898 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3899 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3900 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3902 /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS. */
3903 LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3904 LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3905 LINE_MAP_MAX_LOCATION_WITH_COLS,
3906 LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3907 LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3910 /* Run TESTCASE multiple times, once for each case in our test matrix. */
3912 void
3913 for_each_line_table_case (void (*testcase) (const line_table_case &))
3915 /* As noted above in the description of struct line_table_case,
3916 we want to explore a test matrix of interesting line_table
3917 situations, running various selftests for each case within the
3918 matrix. */
3920 /* Run all tests with:
3921 (a) line_table->default_range_bits == 0, and
3922 (b) line_table->default_range_bits == 5. */
3923 int num_cases_tested = 0;
3924 for (int default_range_bits = 0; default_range_bits <= 5;
3925 default_range_bits += 5)
3927 /* ...and use each of the "interesting" location values as
3928 the starting location within line_table. */
3929 const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3930 for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3932 line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3934 testcase (c);
3936 num_cases_tested++;
3940 /* Verify that we fully covered the test matrix. */
3941 ASSERT_EQ (num_cases_tested, 2 * 12);
3944 /* Verify that when presented with a consecutive pair of locations with
3945 a very large line offset, we don't attempt to consolidate them into
3946 a single ordinary linemap where the line offsets within the line map
3947 would lead to overflow (PR lto/88147). */
3949 static void
3950 test_line_offset_overflow ()
3952 line_table_test ltt (line_table_case (5, 0));
3954 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3955 linemap_line_start (line_table, 1, 100);
3956 location_t loc_a = linemap_line_start (line_table, 2578, 255);
3957 assert_loceq ("foo.c", 2578, 0, loc_a);
3959 const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3960 ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3961 ASSERT_EQ (ordmap_a->m_range_bits, 5);
3963 location_t loc_b = linemap_line_start (line_table, 404198, 512);
3964 assert_loceq ("foo.c", 404198, 0, loc_b);
3966 /* We should have started a new linemap, rather than attempting to store
3967 a very large line offset. */
3968 const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3969 ASSERT_NE (ordmap_a, ordmap_b);
3972 void test_cpp_utf8 ()
3974 const int def_tabstop = 8;
3975 cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3977 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
3979 int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3980 ASSERT_EQ (8, w_bad);
3981 int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3982 ASSERT_EQ (5, w_ctrl);
3985 /* Verify that wcwidth of valid UTF-8 is as expected. */
3987 const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3988 ASSERT_EQ (1, w_pi);
3989 const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3990 ASSERT_EQ (2, w_emoji);
3991 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3992 policy);
3993 ASSERT_EQ (1, w_umlaut_precomposed);
3994 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3995 policy);
3996 ASSERT_EQ (1, w_umlaut_combining);
3997 const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3998 ASSERT_EQ (2, w_han);
3999 const int w_ascii = cpp_display_width ("GCC", 3, policy);
4000 ASSERT_EQ (3, w_ascii);
4001 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
4002 "\x9f! \xe4\xb8\xba y\xcc\x88",
4003 24, policy);
4004 ASSERT_EQ (18, w_mixed);
4007 /* Verify that display width properly expands tabs. */
4009 const char *tstr = "\tabc\td";
4010 ASSERT_EQ (6, cpp_display_width (tstr, 6,
4011 cpp_char_column_policy (1, cpp_wcwidth)));
4012 ASSERT_EQ (10, cpp_display_width (tstr, 6,
4013 cpp_char_column_policy (3, cpp_wcwidth)));
4014 ASSERT_EQ (17, cpp_display_width (tstr, 6,
4015 cpp_char_column_policy (8, cpp_wcwidth)));
4016 ASSERT_EQ (1,
4017 cpp_display_column_to_byte_column
4018 (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
4021 /* Verify that cpp_byte_column_to_display_column can go past the end,
4022 and similar edge cases. */
4024 const char *str
4025 /* Display columns.
4026 111111112345 */
4027 = "\xcf\x80 abc";
4028 /* 111122223456
4029 Byte columns. */
4031 ASSERT_EQ (5, cpp_display_width (str, 6, policy));
4032 ASSERT_EQ (105,
4033 cpp_byte_column_to_display_column (str, 6, 106, policy));
4034 ASSERT_EQ (10000,
4035 cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4036 ASSERT_EQ (0,
4037 cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4040 /* Verify that cpp_display_column_to_byte_column can go past the end,
4041 and similar edge cases, and check invertibility. */
4043 const char *str
4044 /* Display columns.
4045 000000000000000000000000000000000000011
4046 111111112222222234444444455555555678901 */
4047 = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4048 /* 000000000000000000000000000000000111111
4049 111122223333444456666777788889999012345
4050 Byte columns. */
4051 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4052 ASSERT_EQ (15,
4053 cpp_display_column_to_byte_column (str, 15, 11, policy));
4054 ASSERT_EQ (115,
4055 cpp_display_column_to_byte_column (str, 15, 111, policy));
4056 ASSERT_EQ (10000,
4057 cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4058 ASSERT_EQ (0,
4059 cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4061 /* Verify that we do not interrupt a UTF-8 sequence. */
4062 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4064 for (int byte_col = 1; byte_col <= 15; ++byte_col)
4066 const int disp_col
4067 = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
4068 const int byte_col2
4069 = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
4071 /* If we ask for the display column in the middle of a UTF-8
4072 sequence, it will return the length of the partial sequence,
4073 matching the behavior of GCC before display column support.
4074 Otherwise check the round trip was successful. */
4075 if (byte_col < 4)
4076 ASSERT_EQ (byte_col, disp_col);
4077 else if (byte_col >= 6 && byte_col < 9)
4078 ASSERT_EQ (3 + (byte_col - 5), disp_col);
4079 else
4080 ASSERT_EQ (byte_col2, byte_col);
4085 static bool
4086 check_cpp_valid_utf8_p (const char *str)
4088 return cpp_valid_utf8_p (str, strlen (str));
4091 /* Check that cpp_valid_utf8_p works as expected. */
4093 static void
4094 test_cpp_valid_utf8_p ()
4096 ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
4098 /* 2-byte char (pi). */
4099 ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
4101 /* 3-byte chars (the Japanese word "mojibake"). */
4102 ASSERT_TRUE (check_cpp_valid_utf8_p
4104 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
4105 UTF-8: 0xE6 0x96 0x87
4106 C octal escaped UTF-8: \346\226\207. */
4107 "\346\226\207"
4108 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
4109 UTF-8: 0xE5 0xAD 0x97
4110 C octal escaped UTF-8: \345\255\227. */
4111 "\345\255\227"
4112 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
4113 UTF-8: 0xE5 0x8C 0x96
4114 C octal escaped UTF-8: \345\214\226. */
4115 "\345\214\226"
4116 /* U+3051 HIRAGANA LETTER KE
4117 UTF-8: 0xE3 0x81 0x91
4118 C octal escaped UTF-8: \343\201\221. */
4119 "\343\201\221"));
4121 /* 4-byte char: an emoji. */
4122 ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
4124 /* Control codes, including the NUL byte. */
4125 ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
4127 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
4129 /* Unexpected continuation bytes. */
4130 for (unsigned char continuation_byte = 0x80;
4131 continuation_byte <= 0xbf;
4132 continuation_byte++)
4133 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
4135 /* "Lonely start characters" for 2-byte sequences. */
4137 unsigned char buf[2];
4138 buf[1] = ' ';
4139 for (buf[0] = 0xc0;
4140 buf[0] <= 0xdf;
4141 buf[0]++)
4142 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4145 /* "Lonely start characters" for 3-byte sequences. */
4147 unsigned char buf[2];
4148 buf[1] = ' ';
4149 for (buf[0] = 0xe0;
4150 buf[0] <= 0xef;
4151 buf[0]++)
4152 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4155 /* "Lonely start characters" for 4-byte sequences. */
4157 unsigned char buf[2];
4158 buf[1] = ' ';
4159 for (buf[0] = 0xf0;
4160 buf[0] <= 0xf4;
4161 buf[0]++)
4162 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4165 /* Invalid start characters (formerly valid for 5-byte and 6-byte
4166 sequences). */
4168 unsigned char buf[2];
4169 buf[1] = ' ';
4170 for (buf[0] = 0xf5;
4171 buf[0] <= 0xfd;
4172 buf[0]++)
4173 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4176 /* Impossible bytes. */
4177 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
4178 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
4179 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
4180 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
4183 /* Run all of the selftests within this file. */
4185 void
4186 input_cc_tests ()
4188 test_linenum_comparisons ();
4189 test_should_have_column_data_p ();
4190 test_unknown_location ();
4191 test_builtins ();
4192 for_each_line_table_case (test_make_location_nonpure_range_endpoints);
4194 for_each_line_table_case (test_accessing_ordinary_linemaps);
4195 for_each_line_table_case (test_lexer);
4196 for_each_line_table_case (test_lexer_string_locations_simple);
4197 for_each_line_table_case (test_lexer_string_locations_ebcdic);
4198 for_each_line_table_case (test_lexer_string_locations_hex);
4199 for_each_line_table_case (test_lexer_string_locations_oct);
4200 for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
4201 for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
4202 for_each_line_table_case (test_lexer_string_locations_ucn4);
4203 for_each_line_table_case (test_lexer_string_locations_ucn8);
4204 for_each_line_table_case (test_lexer_string_locations_wide_string);
4205 for_each_line_table_case (test_lexer_string_locations_string16);
4206 for_each_line_table_case (test_lexer_string_locations_string32);
4207 for_each_line_table_case (test_lexer_string_locations_u8);
4208 for_each_line_table_case (test_lexer_string_locations_utf8_source);
4209 for_each_line_table_case (test_lexer_string_locations_concatenation_1);
4210 for_each_line_table_case (test_lexer_string_locations_concatenation_2);
4211 for_each_line_table_case (test_lexer_string_locations_concatenation_3);
4212 for_each_line_table_case (test_lexer_string_locations_macro);
4213 for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
4214 for_each_line_table_case (test_lexer_string_locations_non_string);
4215 for_each_line_table_case (test_lexer_string_locations_long_line);
4216 for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
4217 for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
4218 for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
4219 for_each_line_table_case (test_lexer_char_constants);
4221 test_reading_source_line ();
4223 test_line_offset_overflow ();
4225 test_cpp_utf8 ();
4226 test_cpp_valid_utf8_p ();
4229 } // namespace selftest
4231 #endif /* CHECKING_P */