ada: Reorder components in Ada.Containers.Bounded_Doubly_Linked_Lists
[official-gcc.git] / gcc / input.cc
blobeaf301ec7c15075d53f801e85180a91bcacc05d2
1 /* Data and functions related to line maps and input files.
2 Copyright (C) 2004-2023 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "selftest.h"
26 #include "cpplib.h"
28 #ifndef HAVE_ICONV
29 #define HAVE_ICONV 0
30 #endif
32 const char *
33 special_fname_builtin ()
35 return _("<built-in>");
/* Input charset configuration.  */

/* Fallback charset callback: ignores the file path it is handed and
   always reports a null charset name.  */
static const char *
default_charset_callback (const char * /* file_path */)
{
  const char *const no_charset = nullptr;
  return no_charset;
}
44 void
45 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
46 bool should_skip_bom)
48 in_context.ccb = (ccb ? ccb : default_charset_callback);
49 in_context.should_skip_bom = should_skip_bom;
52 /* This is a cache used by get_next_line to store the content of a
53 file to be searched for file lines. */
54 class file_cache_slot
56 public:
57 file_cache_slot ();
58 ~file_cache_slot ();
60 bool read_line_num (size_t line_num,
61 char ** line, ssize_t *line_len);
63 /* Accessors. */
64 const char *get_file_path () const { return m_file_path; }
65 unsigned get_use_count () const { return m_use_count; }
66 bool missing_trailing_newline_p () const
68 return m_missing_trailing_newline;
70 char_span get_full_file_content ();
72 void inc_use_count () { m_use_count++; }
74 bool create (const file_cache::input_context &in_context,
75 const char *file_path, FILE *fp, unsigned highest_use_count);
76 void evict ();
78 private:
79 /* These are information used to store a line boundary. */
80 class line_info
82 public:
83 /* The line number. It starts from 1. */
84 size_t line_num;
86 /* The position (byte count) of the beginning of the line,
87 relative to the file data pointer. This starts at zero. */
88 size_t start_pos;
90 /* The position (byte count) of the last byte of the line. This
91 normally points to the '\n' character, or to one byte after the
92 last byte of the file, if the file doesn't contain a '\n'
93 character. */
94 size_t end_pos;
96 line_info (size_t l, size_t s, size_t e)
97 : line_num (l), start_pos (s), end_pos (e)
100 line_info ()
101 :line_num (0), start_pos (0), end_pos (0)
105 bool needs_read_p () const;
106 bool needs_grow_p () const;
107 void maybe_grow ();
108 bool read_data ();
109 bool maybe_read_data ();
110 bool get_next_line (char **line, ssize_t *line_len);
111 bool read_next_line (char ** line, ssize_t *line_len);
112 bool goto_next_line ();
114 static const size_t buffer_size = 4 * 1024;
115 static const size_t line_record_size = 100;
117 /* The number of time this file has been accessed. This is used
118 to designate which file cache to evict from the cache
119 array. */
120 unsigned m_use_count;
122 /* The file_path is the key for identifying a particular file in
123 the cache.
124 For libcpp-using code, the underlying buffer for this field is
125 owned by the corresponding _cpp_file within the cpp_reader. */
126 const char *m_file_path;
128 FILE *m_fp;
130 /* This points to the content of the file that we've read so
131 far. */
132 char *m_data;
134 /* The allocated buffer to be freed may start a little earlier than DATA,
135 e.g. if a UTF8 BOM was skipped at the beginning. */
136 int m_alloc_offset;
138 /* The size of the DATA array above.*/
139 size_t m_size;
141 /* The number of bytes read from the underlying file so far. This
142 must be less (or equal) than SIZE above. */
143 size_t m_nb_read;
145 /* The index of the beginning of the current line. */
146 size_t m_line_start_idx;
148 /* The number of the previous line read. This starts at 1. Zero
149 means we've read no line so far. */
150 size_t m_line_num;
152 /* This is the total number of lines of the current file. At the
153 moment, we try to get this information from the line map
154 subsystem. Note that this is just a hint. When using the C++
155 front-end, this hint is correct because the input file is then
156 completely tokenized before parsing starts; so the line map knows
157 the number of lines before compilation really starts. For e.g,
158 the C front-end, it can happen that we start emitting diagnostics
159 before the line map has seen the end of the file. */
160 size_t m_total_lines;
162 /* Could this file be missing a trailing newline on its final line?
163 Initially true (to cope with empty files), set to true/false
164 as each line is read. */
165 bool m_missing_trailing_newline;
167 /* This is a record of the beginning and end of the lines we've seen
168 while reading the file. This is useful to avoid walking the data
169 from the beginning when we are asked to read a line that is
170 before LINE_START_IDX above. Note that the maximum size of this
171 record is line_record_size, so that the memory consumption
172 doesn't explode. We thus scale total_lines down to
173 line_record_size. */
174 vec<line_info, va_heap> m_line_record;
176 void offset_buffer (int offset)
178 gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
179 : (size_t) offset <= m_size);
180 gcc_assert (m_data);
181 m_alloc_offset += offset;
182 m_data += offset;
183 m_size -= offset;
188 /* Current position in real source file. */
190 location_t input_location = UNKNOWN_LOCATION;
192 class line_maps *line_table;
194 /* A stashed copy of "line_table" for use by selftest::line_table_test.
195 This needs to be a global so that it can be a GC root, and thus
196 prevent the stashed copy from being garbage-collected if the GC runs
197 during a line_table_test. */
199 class line_maps *saved_line_table;
201 /* Expand the source location LOC into a human readable location. If
202 LOC resolves to a builtin location, the file name of the readable
203 location is set to the string "<built-in>". If EXPANSION_POINT_P is
204 TRUE and LOC is virtual, then it is resolved to the expansion
205 point of the involved macro. Otherwise, it is resolved to the
206 spelling location of the token.
208 When resolving to the spelling location of the token, if the
209 resulting location is for a built-in location (that is, it has no
210 associated line/column) in the context of a macro expansion, the
211 returned location is the first one (while unwinding the macro
212 location towards its expansion point) that is in real source
213 code.
215 ASPECT controls which part of the location to use. */
217 static expanded_location
218 expand_location_1 (location_t loc,
219 bool expansion_point_p,
220 enum location_aspect aspect)
222 expanded_location xloc;
223 const line_map_ordinary *map;
224 enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
225 tree block = NULL;
227 if (IS_ADHOC_LOC (loc))
229 block = LOCATION_BLOCK (loc);
230 loc = LOCATION_LOCUS (loc);
233 memset (&xloc, 0, sizeof (xloc));
235 if (loc >= RESERVED_LOCATION_COUNT)
237 if (!expansion_point_p)
239 /* We want to resolve LOC to its spelling location.
241 But if that spelling location is a reserved location that
242 appears in the context of a macro expansion (like for a
243 location for a built-in token), let's consider the first
244 location (toward the expansion point) that is not reserved;
245 that is, the first location that is in real source code. */
246 loc = linemap_unwind_to_first_non_reserved_loc (line_table,
247 loc, NULL);
248 lrk = LRK_SPELLING_LOCATION;
250 loc = linemap_resolve_location (line_table, loc, lrk, &map);
252 /* loc is now either in an ordinary map, or is a reserved location.
253 If it is a compound location, the caret is in a spelling location,
254 but the start/finish might still be a virtual location.
255 Depending of what the caller asked for, we may need to recurse
256 one level in order to resolve any virtual locations in the
257 end-points. */
258 switch (aspect)
260 default:
261 gcc_unreachable ();
262 /* Fall through. */
263 case LOCATION_ASPECT_CARET:
264 break;
265 case LOCATION_ASPECT_START:
267 location_t start = get_start (loc);
268 if (start != loc)
269 return expand_location_1 (start, expansion_point_p, aspect);
271 break;
272 case LOCATION_ASPECT_FINISH:
274 location_t finish = get_finish (loc);
275 if (finish != loc)
276 return expand_location_1 (finish, expansion_point_p, aspect);
278 break;
280 xloc = linemap_expand_location (line_table, map, loc);
283 xloc.data = block;
284 if (loc <= BUILTINS_LOCATION)
285 xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
287 return xloc;
290 /* Initialize the set of cache used for files accessed by caret
291 diagnostic. */
293 static void
294 diagnostic_file_cache_init (void)
296 gcc_assert (global_dc);
297 if (global_dc->m_file_cache == NULL)
298 global_dc->m_file_cache = new file_cache ();
301 /* Free the resources used by the set of cache used for files accessed
302 by caret diagnostic. */
304 void
305 diagnostic_file_cache_fini (void)
307 if (global_dc->m_file_cache)
309 delete global_dc->m_file_cache;
310 global_dc->m_file_cache = NULL;
314 /* Return the total lines number that have been read so far by the
315 line map (in the preprocessor) so far. For languages like C++ that
316 entirely preprocess the input file before starting to parse, this
317 equals the actual number of lines of the file. */
319 static size_t
320 total_lines_num (const char *file_path)
322 size_t r = 0;
323 location_t l = 0;
324 if (linemap_get_file_highest_location (line_table, file_path, &l))
326 gcc_assert (l >= RESERVED_LOCATION_COUNT);
327 expanded_location xloc = expand_location (l);
328 r = xloc.line;
330 return r;
333 /* Lookup the cache used for the content of a given file accessed by
334 caret diagnostic. Return the found cached file, or NULL if no
335 cached file was found. */
337 file_cache_slot *
338 file_cache::lookup_file (const char *file_path)
340 gcc_assert (file_path);
342 /* This will contain the found cached file. */
343 file_cache_slot *r = NULL;
344 for (unsigned i = 0; i < num_file_slots; ++i)
346 file_cache_slot *c = &m_file_slots[i];
347 if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
349 c->inc_use_count ();
350 r = c;
354 if (r)
355 r->inc_use_count ();
357 return r;
360 /* Purge any mention of FILENAME from the cache of files used for
361 printing source code. For use in selftests when working
362 with tempfiles. */
364 void
365 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
367 gcc_assert (file_path);
369 if (!global_dc->m_file_cache)
370 return;
372 global_dc->m_file_cache->forcibly_evict_file (file_path);
375 void
376 file_cache::forcibly_evict_file (const char *file_path)
378 gcc_assert (file_path);
380 file_cache_slot *r = lookup_file (file_path);
381 if (!r)
382 /* Not found. */
383 return;
385 r->evict ();
388 void
389 file_cache_slot::evict ()
391 m_file_path = NULL;
392 if (m_fp)
393 fclose (m_fp);
394 m_fp = NULL;
395 m_nb_read = 0;
396 m_line_start_idx = 0;
397 m_line_num = 0;
398 m_line_record.truncate (0);
399 m_use_count = 0;
400 m_total_lines = 0;
401 m_missing_trailing_newline = true;
404 /* Return the file cache that has been less used, recently, or the
405 first empty one. If HIGHEST_USE_COUNT is non-null,
406 *HIGHEST_USE_COUNT is set to the highest use count of the entries
407 in the cache table. */
409 file_cache_slot*
410 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
412 diagnostic_file_cache_init ();
414 file_cache_slot *to_evict = &m_file_slots[0];
415 unsigned huc = to_evict->get_use_count ();
416 for (unsigned i = 1; i < num_file_slots; ++i)
418 file_cache_slot *c = &m_file_slots[i];
419 bool c_is_empty = (c->get_file_path () == NULL);
421 if (c->get_use_count () < to_evict->get_use_count ()
422 || (to_evict->get_file_path () && c_is_empty))
423 /* We evict C because it's either an entry with a lower use
424 count or one that is empty. */
425 to_evict = c;
427 if (huc < c->get_use_count ())
428 huc = c->get_use_count ();
430 if (c_is_empty)
431 /* We've reached the end of the cache; subsequent elements are
432 all empty. */
433 break;
436 if (highest_use_count)
437 *highest_use_count = huc;
439 return to_evict;
442 /* Create the cache used for the content of a given file to be
443 accessed by caret diagnostic. This cache is added to an array of
444 cache and can be retrieved by lookup_file_in_cache_tab. This
445 function returns the created cache. Note that only the last
446 num_file_slots files are cached. */
448 file_cache_slot*
449 file_cache::add_file (const char *file_path)
452 FILE *fp = fopen (file_path, "r");
453 if (fp == NULL)
454 return NULL;
456 unsigned highest_use_count = 0;
457 file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
458 if (!r->create (in_context, file_path, fp, highest_use_count))
459 return NULL;
460 return r;
463 /* Get a borrowed char_span to the full content of this file
464 as decoded according to the input charset, encoded as UTF-8. */
466 char_span
467 file_cache_slot::get_full_file_content ()
469 char *line;
470 ssize_t line_len;
471 while (get_next_line (&line, &line_len))
474 return char_span (m_data, m_nb_read);
477 /* Populate this slot for use on FILE_PATH and FP, dropping any
478 existing cached content within it. */
480 bool
481 file_cache_slot::create (const file_cache::input_context &in_context,
482 const char *file_path, FILE *fp,
483 unsigned highest_use_count)
485 m_file_path = file_path;
486 if (m_fp)
487 fclose (m_fp);
488 m_fp = fp;
489 if (m_alloc_offset)
490 offset_buffer (-m_alloc_offset);
491 m_nb_read = 0;
492 m_line_start_idx = 0;
493 m_line_num = 0;
494 m_line_record.truncate (0);
495 /* Ensure that this cache entry doesn't get evicted next time
496 add_file_to_cache_tab is called. */
497 m_use_count = ++highest_use_count;
498 m_total_lines = total_lines_num (file_path);
499 m_missing_trailing_newline = true;
502 /* Check the input configuration to determine if we need to do any
503 transformations, such as charset conversion or BOM skipping. */
504 if (const char *input_charset = in_context.ccb (file_path))
506 /* Need a full-blown conversion of the input charset. */
507 fclose (m_fp);
508 m_fp = NULL;
509 const cpp_converted_source cs
510 = cpp_get_converted_source (file_path, input_charset);
511 if (!cs.data)
512 return false;
513 if (m_data)
514 XDELETEVEC (m_data);
515 m_data = cs.data;
516 m_nb_read = m_size = cs.len;
517 m_alloc_offset = cs.data - cs.to_free;
519 else if (in_context.should_skip_bom)
521 if (read_data ())
523 const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
524 offset_buffer (offset);
525 m_nb_read -= offset;
529 return true;
532 /* file_cache's ctor. */
534 file_cache::file_cache ()
535 : m_file_slots (new file_cache_slot[num_file_slots])
537 initialize_input_context (nullptr, false);
540 /* file_cache's dtor. */
542 file_cache::~file_cache ()
544 delete[] m_file_slots;
547 /* Lookup the cache used for the content of a given file accessed by
548 caret diagnostic. If no cached file was found, create a new cache
549 for this file, add it to the array of cached file and return
550 it. */
552 file_cache_slot*
553 file_cache::lookup_or_add_file (const char *file_path)
555 file_cache_slot *r = lookup_file (file_path);
556 if (r == NULL)
557 r = add_file (file_path);
558 return r;
561 /* Default constructor for a cache of file used by caret
562 diagnostic. */
564 file_cache_slot::file_cache_slot ()
565 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
566 m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
567 m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
569 m_line_record.create (0);
572 /* Destructor for a cache of file used by caret diagnostic. */
574 file_cache_slot::~file_cache_slot ()
576 if (m_fp)
578 fclose (m_fp);
579 m_fp = NULL;
581 if (m_data)
583 offset_buffer (-m_alloc_offset);
584 XDELETEVEC (m_data);
585 m_data = 0;
587 m_line_record.release ();
590 /* Returns TRUE iff the cache would need to be filled with data coming
591 from the file. That is, either the cache is empty or full or the
592 current line is empty. Note that if the cache is full, it would
593 need to be extended and filled again. */
595 bool
596 file_cache_slot::needs_read_p () const
598 return m_fp && (m_nb_read == 0
599 || m_nb_read == m_size
600 || (m_line_start_idx >= m_nb_read - 1));
603 /* Return TRUE iff the cache is full and thus needs to be
604 extended. */
606 bool
607 file_cache_slot::needs_grow_p () const
609 return m_nb_read == m_size;
612 /* Grow the cache if it needs to be extended. */
614 void
615 file_cache_slot::maybe_grow ()
617 if (!needs_grow_p ())
618 return;
620 if (!m_data)
622 gcc_assert (m_size == 0 && m_alloc_offset == 0);
623 m_size = buffer_size;
624 m_data = XNEWVEC (char, m_size);
626 else
628 const int offset = m_alloc_offset;
629 offset_buffer (-offset);
630 m_size *= 2;
631 m_data = XRESIZEVEC (char, m_data, m_size);
632 offset_buffer (offset);
636 /* Read more data into the cache. Extends the cache if need be.
637 Returns TRUE iff new data could be read. */
639 bool
640 file_cache_slot::read_data ()
642 if (feof (m_fp) || ferror (m_fp))
643 return false;
645 maybe_grow ();
647 char * from = m_data + m_nb_read;
648 size_t to_read = m_size - m_nb_read;
649 size_t nb_read = fread (from, 1, to_read, m_fp);
651 if (ferror (m_fp))
652 return false;
654 m_nb_read += nb_read;
655 return !!nb_read;
658 /* Read new data iff the cache needs to be filled with more data
659 coming from the file FP. Return TRUE iff the cache was filled with
660 mode data. */
662 bool
663 file_cache_slot::maybe_read_data ()
665 if (!needs_read_p ())
666 return false;
667 return read_data ();
/* Helper for file_cache_slot::get_next_line (): locate the end of the
   line starting at S within the LEN bytes available.  Follows the
   memchr convention, i.e. returns nullptr when no line terminator is
   found.  Line endings are recognised the way libcpp does it: any of
   \n, \r\n, or \r ends a line.  For \r\n the returned pointer is that
   of the \n.  */

static char *
find_end_of_line (char *s, size_t len)
{
  char *const limit = s + len;

  while (s != limit)
    {
      switch (*s)
        {
        case '\n':
          return s;

        case '\r':
          /* A \r sitting on the very last byte is not reported: it may
             be the end of the file, or merely the end of what has been
             read so far with the \n of a \r\n pair still to come.  The
             caller deals with a file ending in \r on its own.  */
          if (s + 1 == limit)
            return nullptr;
          return s[1] == '\n' ? s + 1 : s;

        default:
          break;
        }
      ++s;
    }

  return nullptr;
}
701 /* Read a new line from file FP, using C as a cache for the data
702 coming from the file. Upon successful completion, *LINE is set to
703 the beginning of the line found. *LINE points directly in the
704 line cache and is only valid until the next call of get_next_line.
705 *LINE_LEN is set to the length of the line. Note that the line
706 does not contain any terminal delimiter. This function returns
707 true if some data was read or process from the cache, false
708 otherwise. Note that subsequent calls to get_next_line might
709 make the content of *LINE invalid. */
711 bool
712 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
714 /* Fill the cache with data to process. */
715 maybe_read_data ();
717 size_t remaining_size = m_nb_read - m_line_start_idx;
718 if (remaining_size == 0)
719 /* There is no more data to process. */
720 return false;
722 char *line_start = m_data + m_line_start_idx;
724 char *next_line_start = NULL;
725 size_t len = 0;
726 char *line_end = find_end_of_line (line_start, remaining_size);
727 if (line_end == NULL)
729 /* We haven't found an end-of-line delimiter in the cache.
730 Fill the cache with more data from the file and look again. */
731 while (maybe_read_data ())
733 line_start = m_data + m_line_start_idx;
734 remaining_size = m_nb_read - m_line_start_idx;
735 line_end = find_end_of_line (line_start, remaining_size);
736 if (line_end != NULL)
738 next_line_start = line_end + 1;
739 break;
742 if (line_end == NULL)
744 /* We've loaded all the file into the cache and still no
745 terminator. Let's say the line ends up at one byte past the
746 end of the file. This is to stay consistent with the case
747 of when the line ends up with a terminator and line_end points to
748 that. That consistency is useful below in the len calculation.
750 If the file ends in a \r, we didn't identify it as a line
751 terminator above, so do that now instead. */
752 line_end = m_data + m_nb_read;
753 if (m_nb_read && line_end[-1] == '\r')
755 --line_end;
756 m_missing_trailing_newline = false;
758 else
759 m_missing_trailing_newline = true;
761 else
762 m_missing_trailing_newline = false;
764 else
766 next_line_start = line_end + 1;
767 m_missing_trailing_newline = false;
770 if (m_fp && ferror (m_fp))
771 return false;
773 /* At this point, we've found the end of the of line. It either points to
774 the line terminator or to one byte after the last byte of the file. */
775 gcc_assert (line_end != NULL);
777 len = line_end - line_start;
779 if (m_line_start_idx < m_nb_read)
780 *line = line_start;
782 ++m_line_num;
784 /* Before we update our line record, make sure the hint about the
785 total number of lines of the file is correct. If it's not, then
786 we give up recording line boundaries from now on. */
787 bool update_line_record = true;
788 if (m_line_num > m_total_lines)
789 update_line_record = false;
791 /* Now update our line record so that re-reading lines from the
792 before m_line_start_idx is faster. */
793 if (update_line_record
794 && m_line_record.length () < line_record_size)
796 /* If the file lines fits in the line record, we just record all
797 its lines ...*/
798 if (m_total_lines <= line_record_size
799 && m_line_num > m_line_record.length ())
800 m_line_record.safe_push
801 (file_cache_slot::line_info (m_line_num,
802 m_line_start_idx,
803 line_end - m_data));
804 else if (m_total_lines > line_record_size)
806 /* ... otherwise, we just scale total_lines down to
807 (line_record_size lines. */
808 size_t n = (m_line_num * line_record_size) / m_total_lines;
809 if (m_line_record.length () == 0
810 || n >= m_line_record.length ())
811 m_line_record.safe_push
812 (file_cache_slot::line_info (m_line_num,
813 m_line_start_idx,
814 line_end - m_data));
818 /* Update m_line_start_idx so that it points to the next line to be
819 read. */
820 if (next_line_start)
821 m_line_start_idx = next_line_start - m_data;
822 else
823 /* We didn't find any terminal '\n'. Let's consider that the end
824 of line is the end of the data in the cache. The next
825 invocation of get_next_line will either read more data from the
826 underlying file or return false early because we've reached the
827 end of the file. */
828 m_line_start_idx = m_nb_read;
830 *line_len = len;
832 return true;
835 /* Consume the next bytes coming from the cache (or from its
836 underlying file if there are remaining unread bytes in the file)
837 until we reach the next end-of-line (or end-of-file). There is no
838 copying from the cache involved. Return TRUE upon successful
839 completion. */
841 bool
842 file_cache_slot::goto_next_line ()
844 char *l;
845 ssize_t len;
847 return get_next_line (&l, &len);
850 /* Read an arbitrary line number LINE_NUM from the file cached in C.
851 If the line was read successfully, *LINE points to the beginning
852 of the line in the file cache and *LINE_LEN is the length of the
853 line. *LINE is not nul-terminated, but may contain zero bytes.
854 *LINE is only valid until the next call of read_line_num.
855 This function returns bool if a line was read. */
857 bool
858 file_cache_slot::read_line_num (size_t line_num,
859 char ** line, ssize_t *line_len)
861 gcc_assert (line_num > 0);
863 if (line_num <= m_line_num)
865 /* We've been asked to read lines that are before m_line_num.
866 So lets use our line record (if it's not empty) to try to
867 avoid re-reading the file from the beginning again. */
869 if (m_line_record.is_empty ())
871 m_line_start_idx = 0;
872 m_line_num = 0;
874 else
876 file_cache_slot::line_info *i = NULL;
877 if (m_total_lines <= line_record_size)
879 /* In languages where the input file is not totally
880 preprocessed up front, the m_total_lines hint
881 can be smaller than the number of lines of the
882 file. In that case, only the first
883 m_total_lines have been recorded.
885 Otherwise, the first m_total_lines we've read have
886 their start/end recorded here. */
887 i = (line_num <= m_total_lines)
888 ? &m_line_record[line_num - 1]
889 : &m_line_record[m_total_lines - 1];
890 gcc_assert (i->line_num <= line_num);
892 else
894 /* So the file had more lines than our line record
895 size. Thus the number of lines we've recorded has
896 been scaled down to line_record_size. Let's
897 pick the start/end of the recorded line that is
898 closest to line_num. */
899 size_t n = (line_num <= m_total_lines)
900 ? line_num * line_record_size / m_total_lines
901 : m_line_record.length () - 1;
902 if (n < m_line_record.length ())
904 i = &m_line_record[n];
905 gcc_assert (i->line_num <= line_num);
909 if (i && i->line_num == line_num)
911 /* We have the start/end of the line. */
912 *line = m_data + i->start_pos;
913 *line_len = i->end_pos - i->start_pos;
914 return true;
917 if (i)
919 m_line_start_idx = i->start_pos;
920 m_line_num = i->line_num - 1;
922 else
924 m_line_start_idx = 0;
925 m_line_num = 0;
930 /* Let's walk from line m_line_num up to line_num - 1, without
931 copying any line. */
932 while (m_line_num < line_num - 1)
933 if (!goto_next_line ())
934 return false;
936 /* The line we want is the next one. Let's read and copy it back to
937 the caller. */
938 return get_next_line (line, line_len);
941 /* Return the physical source line that corresponds to FILE_PATH/LINE.
942 The line is not nul-terminated. The returned pointer is only
943 valid until the next call of location_get_source_line.
944 Note that the line can contain several null characters,
945 so the returned value's length has the actual length of the line.
946 If the function fails, a NULL char_span is returned. */
948 char_span
949 location_get_source_line (const char *file_path, int line)
951 char *buffer = NULL;
952 ssize_t len;
954 if (line == 0)
955 return char_span (NULL, 0);
957 if (file_path == NULL)
958 return char_span (NULL, 0);
960 diagnostic_file_cache_init ();
962 file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
963 if (c == NULL)
964 return char_span (NULL, 0);
966 bool read = c->read_line_num (line, &buffer, &len);
967 if (!read)
968 return char_span (NULL, 0);
970 return char_span (buffer, len);
973 /* Return a NUL-terminated copy of the source text between two locations, or
974 NULL if the arguments are invalid. The caller is responsible for freeing
975 the return value. */
977 char *
978 get_source_text_between (location_t start, location_t end)
980 expanded_location expstart =
981 expand_location_to_spelling_point (start, LOCATION_ASPECT_START);
982 expanded_location expend =
983 expand_location_to_spelling_point (end, LOCATION_ASPECT_FINISH);
985 /* If the locations are in different files or the end comes before the
986 start, give up and return nothing. */
987 if (!expstart.file || !expend.file)
988 return NULL;
989 if (strcmp (expstart.file, expend.file) != 0)
990 return NULL;
991 if (expstart.line > expend.line)
992 return NULL;
993 if (expstart.line == expend.line
994 && expstart.column > expend.column)
995 return NULL;
996 /* These aren't real column numbers, give up. */
997 if (expstart.column == 0 || expend.column == 0)
998 return NULL;
1000 /* For a single line we need to trim both edges. */
1001 if (expstart.line == expend.line)
1003 char_span line = location_get_source_line (expstart.file, expstart.line);
1004 if (line.length () < 1)
1005 return NULL;
1006 int s = expstart.column - 1;
1007 int len = expend.column - s;
1008 if (line.length () < (size_t)expend.column)
1009 return NULL;
1010 return line.subspan (s, len).xstrdup ();
1013 struct obstack buf_obstack;
1014 obstack_init (&buf_obstack);
1016 /* Loop through all lines in the range and append each to buf; may trim
1017 parts of the start and end lines off depending on column values. */
1018 for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1020 char_span line = location_get_source_line (expstart.file, lnum);
1021 if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1022 continue;
1024 /* For the first line in the range, only start at expstart.column */
1025 if (lnum == expstart.line)
1027 unsigned off = expstart.column - 1;
1028 if (line.length () < off)
1029 return NULL;
1030 line = line.subspan (off, line.length() - off);
1032 /* For the last line, don't go past expend.column */
1033 else if (lnum == expend.line)
1035 if (line.length () < (size_t)expend.column)
1036 return NULL;
1037 line = line.subspan (0, expend.column);
1040 /* Combine spaces at the beginning of later lines. */
1041 if (lnum > expstart.line)
1043 unsigned off;
1044 for (off = 0; off < line.length(); ++off)
1045 if (line[off] != ' ' && line[off] != '\t')
1046 break;
1047 if (off > 0)
1049 obstack_1grow (&buf_obstack, ' ');
1050 line = line.subspan (off, line.length() - off);
1054 /* This does not include any trailing newlines. */
1055 obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1058 /* NUL-terminate and finish the buf obstack. */
1059 obstack_1grow (&buf_obstack, 0);
1060 const char *buf = (const char *) obstack_finish (&buf_obstack);
1062 return xstrdup (buf);
1065 /* Get a borrowed char_span to the full content of FILE_PATH
1066 as decoded according to the input charset, encoded as UTF-8. */
1068 char_span
1069 get_source_file_content (const char *file_path)
1071 diagnostic_file_cache_init ();
1073 file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
1074 return c->get_full_file_content ();
1077 /* Determine if FILE_PATH missing a trailing newline on its final line.
1078 Only valid to call once all of the file has been loaded, by
1079 requesting a line number beyond the end of the file. */
1081 bool
1082 location_missing_trailing_newline (const char *file_path)
1084 diagnostic_file_cache_init ();
1086 file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
1087 if (c == NULL)
1088 return false;
1090 return c->missing_trailing_newline_p ();
1093 /* Test if the location originates from the spelling location of a
1094 builtin-tokens. That is, return TRUE if LOC is a (possibly
1095 virtual) location of a built-in token that appears in the expansion
1096 list of a macro. Please note that this function also works on
1097 tokens that result from built-in tokens. For instance, the
1098 function would return true if passed a token "4" that is the result
1099 of the expansion of the built-in __LINE__ macro. */
1100 bool
1101 is_location_from_builtin_token (location_t loc)
1103 const line_map_ordinary *map = NULL;
1104 loc = linemap_resolve_location (line_table, loc,
1105 LRK_SPELLING_LOCATION, &map);
1106 return loc == BUILTINS_LOCATION;
1109 /* Expand the source location LOC into a human readable location. If
1110 LOC is virtual, it resolves to the expansion point of the involved
1111 macro. If LOC resolves to a builtin location, the file name of the
1112 readable location is set to the string "<built-in>". */
1114 expanded_location
1115 expand_location (location_t loc)
1117 return expand_location_1 (loc, /*expansion_point_p=*/true,
1118 LOCATION_ASPECT_CARET);
1121 /* Expand the source location LOC into a human readable location. If
1122 LOC is virtual, it resolves to the expansion location of the
1123 relevant macro. If LOC resolves to a builtin location, the file
1124 name of the readable location is set to the string
1125 "<built-in>". */
1127 expanded_location
1128 expand_location_to_spelling_point (location_t loc,
1129 enum location_aspect aspect)
1131 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1134 /* The rich_location class within libcpp requires a way to expand
1135 location_t instances, and relies on the client code
1136 providing a symbol named
1137 linemap_client_expand_location_to_spelling_point
1138 to do this.
1140 This is the implementation for libcommon.a (all host binaries),
1141 which simply calls into expand_location_1. */
1143 expanded_location
1144 linemap_client_expand_location_to_spelling_point (location_t loc,
1145 enum location_aspect aspect)
1147 return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1151 /* If LOCATION is in a system header and if it is a virtual location
1152 for a token coming from the expansion of a macro, unwind it to
1153 the location of the expansion point of the macro. If the expansion
1154 point is also in a system header return the original LOCATION.
1155 Otherwise, return the location of the expansion point.
1157 This is used for instance when we want to emit diagnostics about a
1158 token that may be located in a macro that is itself defined in a
1159 system header, for example, for the NULL macro. In such a case, if
1160 LOCATION were passed directly to diagnostic functions such as
1161 warning_at, the diagnostic would be suppressed (unless
1162 -Wsystem-headers). */
1164 location_t
1165 expansion_point_location_if_in_system_header (location_t location)
1167 if (!in_system_header_at (location))
1168 return location;
1170 location_t xloc = linemap_resolve_location (line_table, location,
1171 LRK_MACRO_EXPANSION_POINT,
1172 NULL);
1173 return in_system_header_at (xloc) ? location : xloc;
1176 /* If LOCATION is a virtual location for a token coming from the expansion
1177 of a macro, unwind to the location of the expansion point of the macro. */
1179 location_t
1180 expansion_point_location (location_t location)
1182 return linemap_resolve_location (line_table, location,
1183 LRK_MACRO_EXPANSION_POINT, NULL);
1186 /* Construct a location with caret at CARET, ranging from START to
1187 finish e.g.
1189 11111111112
1190 12345678901234567890
1192 523 return foo + bar;
1193 ~~~~^~~~~
1196 The location's caret is at the "+", line 523 column 15, but starts
1197 earlier, at the "f" of "foo" at column 11. The finish is at the "r"
1198 of "bar" at column 19. */
1200 location_t
1201 make_location (location_t caret, location_t start, location_t finish)
1203 location_t pure_loc = get_pure_location (caret);
1204 source_range src_range;
1205 src_range.m_start = get_start (start);
1206 src_range.m_finish = get_finish (finish);
1207 location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1208 pure_loc,
1209 src_range,
1210 NULL,
1212 return combined_loc;
1215 /* Same as above, but taking a source range rather than two locations. */
1217 location_t
1218 make_location (location_t caret, source_range src_range)
1220 location_t pure_loc = get_pure_location (caret);
1221 return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL, 0);
1224 /* An expanded_location stores the column in byte units. This function
1225 converts that column to display units. That requires reading the associated
1226 source line in order to calculate the display width. If that cannot be done
1227 for any reason, then returns the byte column as a fallback. */
1229 location_compute_display_column (expanded_location exploc,
1230 const cpp_char_column_policy &policy)
1232 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1233 return exploc.column;
1234 char_span line = location_get_source_line (exploc.file, exploc.line);
1235 /* If line is NULL, this function returns exploc.column which is the
1236 desired fallback. */
1237 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1238 exploc.column, policy);
1241 /* Dump statistics to stderr about the memory usage of the line_table
1242 set of line maps. This also displays some statistics about macro
1243 expansion. */
1245 void
1246 dump_line_table_statistics (void)
1248 struct linemap_stats s;
1249 long total_used_map_size,
1250 macro_maps_size,
1251 total_allocated_map_size;
1253 memset (&s, 0, sizeof (s));
1255 linemap_get_statistics (line_table, &s);
1257 macro_maps_size = s.macro_maps_used_size
1258 + s.macro_maps_locations_size;
1260 total_allocated_map_size = s.ordinary_maps_allocated_size
1261 + s.macro_maps_allocated_size
1262 + s.macro_maps_locations_size;
1264 total_used_map_size = s.ordinary_maps_used_size
1265 + s.macro_maps_used_size
1266 + s.macro_maps_locations_size;
1268 fprintf (stderr, "Number of expanded macros: %5ld\n",
1269 s.num_expanded_macros);
1270 if (s.num_expanded_macros != 0)
1271 fprintf (stderr, "Average number of tokens per macro expansion: %5ld\n",
1272 s.num_macro_tokens / s.num_expanded_macros);
1273 fprintf (stderr,
1274 "\nLine Table allocations during the "
1275 "compilation process\n");
1276 fprintf (stderr, "Number of ordinary maps used: " PRsa (5) "\n",
1277 SIZE_AMOUNT (s.num_ordinary_maps_used));
1278 fprintf (stderr, "Ordinary map used size: " PRsa (5) "\n",
1279 SIZE_AMOUNT (s.ordinary_maps_used_size));
1280 fprintf (stderr, "Number of ordinary maps allocated: " PRsa (5) "\n",
1281 SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1282 fprintf (stderr, "Ordinary maps allocated size: " PRsa (5) "\n",
1283 SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1284 fprintf (stderr, "Number of macro maps used: " PRsa (5) "\n",
1285 SIZE_AMOUNT (s.num_macro_maps_used));
1286 fprintf (stderr, "Macro maps used size: " PRsa (5) "\n",
1287 SIZE_AMOUNT (s.macro_maps_used_size));
1288 fprintf (stderr, "Macro maps locations size: " PRsa (5) "\n",
1289 SIZE_AMOUNT (s.macro_maps_locations_size));
1290 fprintf (stderr, "Macro maps size: " PRsa (5) "\n",
1291 SIZE_AMOUNT (macro_maps_size));
1292 fprintf (stderr, "Duplicated maps locations size: " PRsa (5) "\n",
1293 SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1294 fprintf (stderr, "Total allocated maps size: " PRsa (5) "\n",
1295 SIZE_AMOUNT (total_allocated_map_size));
1296 fprintf (stderr, "Total used maps size: " PRsa (5) "\n",
1297 SIZE_AMOUNT (total_used_map_size));
1298 fprintf (stderr, "Ad-hoc table size: " PRsa (5) "\n",
1299 SIZE_AMOUNT (s.adhoc_table_size));
1300 fprintf (stderr, "Ad-hoc table entries used: " PRsa (5) "\n",
1301 SIZE_AMOUNT (s.adhoc_table_entries_used));
1302 fprintf (stderr, "optimized_ranges: " PRsa (5) "\n",
1303 SIZE_AMOUNT (line_table->num_optimized_ranges));
1304 fprintf (stderr, "unoptimized_ranges: " PRsa (5) "\n",
1305 SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1307 fprintf (stderr, "\n");
1310 /* Get location one beyond the final location in ordinary map IDX. */
1312 static location_t
1313 get_end_location (class line_maps *set, unsigned int idx)
1315 if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1316 return set->highest_location;
1318 struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1319 return MAP_START_LOCATION (next_map);
/* Helper for write_digit_row: write the decimal units digit of DIGIT
   to STREAM as a single character.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1330 /* Helper function for dump_location_info.
1331 Write a row of numbers to STREAM, numbering a source line,
1332 giving the units, tens, hundreds etc of the column number. */
1334 static void
1335 write_digit_row (FILE *stream, int indent,
1336 const line_map_ordinary *map,
1337 location_t loc, int max_col, int divisor)
1339 fprintf (stream, "%*c", indent, ' ');
1340 fprintf (stream, "|");
1341 for (int column = 1; column < max_col; column++)
1343 location_t column_loc = loc + (column << map->m_range_bits);
1344 write_digit (stream, column_loc / divisor);
1346 fprintf (stream, "\n");
1349 /* Write a half-closed (START) / half-open (END) interval of
1350 location_t to STREAM. */
1352 static void
1353 dump_location_range (FILE *stream,
1354 location_t start, location_t end)
1356 fprintf (stream,
1357 " location_t interval: %u <= loc < %u\n",
1358 start, end);
1361 /* Write a labelled description of a half-closed (START) / half-open (END)
1362 interval of location_t to STREAM. */
1364 static void
1365 dump_labelled_location_range (FILE *stream,
1366 const char *name,
1367 location_t start, location_t end)
1369 fprintf (stream, "%s\n", name);
1370 dump_location_range (stream, start, end);
1371 fprintf (stream, "\n");
1374 /* Write a visualization of the locations in the line_table to STREAM. */
1376 void
1377 dump_location_info (FILE *stream)
1379 /* Visualize the reserved locations. */
1380 dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1381 0, RESERVED_LOCATION_COUNT);
1383 /* Visualize the ordinary line_map instances, rendering the sources. */
1384 for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1386 location_t end_location = get_end_location (line_table, idx);
1387 /* half-closed: doesn't include this one. */
1389 const line_map_ordinary *map
1390 = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1391 fprintf (stream, "ORDINARY MAP: %i\n", idx);
1392 dump_location_range (stream,
1393 MAP_START_LOCATION (map), end_location);
1394 fprintf (stream, " file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1395 fprintf (stream, " starting at line: %i\n",
1396 ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1397 fprintf (stream, " column and range bits: %i\n",
1398 map->m_column_and_range_bits);
1399 fprintf (stream, " column bits: %i\n",
1400 map->m_column_and_range_bits - map->m_range_bits);
1401 fprintf (stream, " range bits: %i\n",
1402 map->m_range_bits);
1403 const char * reason;
1404 switch (map->reason) {
1405 case LC_ENTER:
1406 reason = "LC_ENTER";
1407 break;
1408 case LC_LEAVE:
1409 reason = "LC_LEAVE";
1410 break;
1411 case LC_RENAME:
1412 reason = "LC_RENAME";
1413 break;
1414 case LC_RENAME_VERBATIM:
1415 reason = "LC_RENAME_VERBATIM";
1416 break;
1417 case LC_ENTER_MACRO:
1418 reason = "LC_RENAME_MACRO";
1419 break;
1420 default:
1421 reason = "Unknown";
1423 fprintf (stream, " reason: %d (%s)\n", map->reason, reason);
1425 const line_map_ordinary *includer_map
1426 = linemap_included_from_linemap (line_table, map);
1427 fprintf (stream, " included from location: %d",
1428 linemap_included_from (map));
1429 if (includer_map) {
1430 fprintf (stream, " (in ordinary map %d)",
1431 int (includer_map - line_table->info_ordinary.maps));
1433 fprintf (stream, "\n");
1435 /* Render the span of source lines that this "map" covers. */
1436 for (location_t loc = MAP_START_LOCATION (map);
1437 loc < end_location;
1438 loc += (1 << map->m_range_bits) )
1440 gcc_assert (pure_location_p (line_table, loc) );
1442 expanded_location exploc
1443 = linemap_expand_location (line_table, map, loc);
1445 if (exploc.column == 0)
1447 /* Beginning of a new source line: draw the line. */
1449 char_span line_text = location_get_source_line (exploc.file,
1450 exploc.line);
1451 if (!line_text)
1452 break;
1453 fprintf (stream,
1454 "%s:%3i|loc:%5i|%.*s\n",
1455 exploc.file, exploc.line,
1456 loc,
1457 (int)line_text.length (), line_text.get_buffer ());
1459 /* "loc" is at column 0, which means "the whole line".
1460 Render the locations *within* the line, by underlining
1461 it, showing the location_t numeric values
1462 at each column. */
1463 size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1464 if (max_col > line_text.length ())
1465 max_col = line_text.length () + 1;
1467 int len_lnum = num_digits (exploc.line);
1468 if (len_lnum < 3)
1469 len_lnum = 3;
1470 int len_loc = num_digits (loc);
1471 if (len_loc < 5)
1472 len_loc = 5;
1474 int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1476 /* Thousands. */
1477 if (end_location > 999)
1478 write_digit_row (stream, indent, map, loc, max_col, 1000);
1480 /* Hundreds. */
1481 if (end_location > 99)
1482 write_digit_row (stream, indent, map, loc, max_col, 100);
1484 /* Tens. */
1485 write_digit_row (stream, indent, map, loc, max_col, 10);
1487 /* Units. */
1488 write_digit_row (stream, indent, map, loc, max_col, 1);
1491 fprintf (stream, "\n");
1494 /* Visualize unallocated values. */
1495 dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1496 line_table->highest_location,
1497 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1499 /* Visualize the macro line_map instances, rendering the sources. */
1500 for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1502 /* Each macro map that is allocated owns location_t values
1503 that are *lower* that the one before them.
1504 Hence it's meaningful to view them either in order of ascending
1505 source locations, or in order of ascending macro map index. */
1506 const bool ascending_location_ts = true;
1507 unsigned int idx = (ascending_location_ts
1508 ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1509 : i);
1510 const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1511 fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1512 idx,
1513 linemap_map_get_macro_name (map),
1514 MACRO_MAP_NUM_MACRO_TOKENS (map));
1515 dump_location_range (stream,
1516 map->start_location,
1517 (map->start_location
1518 + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1519 inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1520 "expansion point is location %i",
1521 MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1522 fprintf (stream, " map->start_location: %u\n",
1523 map->start_location);
1525 fprintf (stream, " macro_locations:\n");
1526 for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1528 location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1529 location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1531 /* linemap_add_macro_token encodes token numbers in an expansion
1532 by putting them after MAP_START_LOCATION. */
1534 /* I'm typically seeing 4 uninitialized entries at the end of
1535 0xafafafaf.
1536 This appears to be due to macro.cc:replace_args
1537 adding 2 extra args for padding tokens; presumably there may
1538 be a leading and/or trailing padding token injected,
1539 each for 2 more location slots.
1540 This would explain there being up to 4 location_ts slots
1541 that may be uninitialized. */
1543 fprintf (stream, " %u: %u, %u\n",
1547 if (x == y)
1549 if (x < MAP_START_LOCATION (map))
1550 inform (x, "token %u has %<x-location == y-location == %u%>",
1551 i, x);
1552 else
1553 fprintf (stream,
1554 "x-location == y-location == %u encodes token # %u\n",
1555 x, x - MAP_START_LOCATION (map));
1557 else
1559 inform (x, "token %u has %<x-location == %u%>", i, x);
1560 inform (x, "token %u has %<y-location == %u%>", i, y);
1563 fprintf (stream, "\n");
1566 /* It appears that MAX_LOCATION_T itself is never assigned to a
1567 macro map, presumably due to an off-by-one error somewhere
1568 between the logic in linemap_enter_macro and
1569 LINEMAPS_MACRO_LOWEST_LOCATION. */
1570 dump_labelled_location_range (stream, "MAX_LOCATION_T",
1571 MAX_LOCATION_T,
1572 MAX_LOCATION_T + 1);
1574 /* Visualize ad-hoc values. */
1575 dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1576 MAX_LOCATION_T + 1, UINT_MAX);
1579 /* string_concat's constructor. */
1581 string_concat::string_concat (int num, location_t *locs)
1582 : m_num (num)
1584 m_locs = ggc_vec_alloc <location_t> (num);
1585 for (int i = 0; i < num; i++)
1586 m_locs[i] = locs[i];
1589 /* string_concat_db's constructor. */
1591 string_concat_db::string_concat_db ()
1593 m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1596 /* Record that a string concatenation occurred, covering NUM
1597 string literal tokens. LOCS is an array of size NUM, containing the
1598 locations of the tokens. A copy of LOCS is taken. */
1600 void
1601 string_concat_db::record_string_concatenation (int num, location_t *locs)
1603 gcc_assert (num > 1);
1604 gcc_assert (locs);
1606 location_t key_loc = get_key_loc (locs[0]);
1607 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1608 any data now recorded under key 'key_loc' would be overwritten by a
1609 subsequent call with the same key 'key_loc'. */
1610 if (RESERVED_LOCATION_P (key_loc))
1611 return;
1613 string_concat *concat
1614 = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1615 m_table->put (key_loc, concat);
1618 /* Determine if LOC was the location of the initial token of a
1619 concatenation of string literal tokens.
1620 If so, *OUT_NUM is written to with the number of tokens, and
1621 *OUT_LOCS with the location of an array of locations of the
1622 tokens, and return true. *OUT_LOCS is a borrowed pointer to
1623 storage owned by the string_concat_db.
1624 Otherwise, return false. */
1626 bool
1627 string_concat_db::get_string_concatenation (location_t loc,
1628 int *out_num,
1629 location_t **out_locs)
1631 gcc_assert (out_num);
1632 gcc_assert (out_locs);
1634 location_t key_loc = get_key_loc (loc);
1635 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1636 discussion in 'string_concat_db::record_string_concatenation'. */
1637 if (RESERVED_LOCATION_P (key_loc))
1638 return false;
1640 string_concat **concat = m_table->get (key_loc);
1641 if (!concat)
1642 return false;
1644 *out_num = (*concat)->m_num;
1645 *out_locs =(*concat)->m_locs;
1646 return true;
1649 /* Internal function. Canonicalize LOC into a form suitable for
1650 use as a key within the database, stripping away macro expansion,
1651 ad-hoc information, and range information, using the location of
1652 the start of LOC within an ordinary linemap. */
1654 location_t
1655 string_concat_db::get_key_loc (location_t loc)
1657 loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1658 NULL);
1660 loc = get_range_from_loc (line_table, loc).m_start;
1662 return loc;
1665 /* Helper class for use within get_substring_ranges_for_loc.
1666 An vec of cpp_string with responsibility for releasing all of the
1667 str->text for each str in the vector. */
1669 class auto_cpp_string_vec : public auto_vec <cpp_string>
1671 public:
1672 auto_cpp_string_vec (int alloc)
1673 : auto_vec <cpp_string> (alloc) {}
1675 ~auto_cpp_string_vec ()
1677 /* Clean up the copies within this vec. */
1678 int i;
1679 cpp_string *str;
1680 FOR_EACH_VEC_ELT (*this, i, str)
1681 free (const_cast <unsigned char *> (str->text));
1685 /* Attempt to populate RANGES with source location information on the
1686 individual characters within the string literal found at STRLOC.
1687 If CONCATS is non-NULL, then any string literals that the token at
1688 STRLOC was concatenated with are also added to RANGES.
1690 Return NULL if successful, or an error message if any errors occurred (in
1691 which case RANGES may be only partially populated and should not
1692 be used).
1694 This is implemented by re-parsing the relevant source line(s). */
1696 static const char *
1697 get_substring_ranges_for_loc (cpp_reader *pfile,
1698 string_concat_db *concats,
1699 location_t strloc,
1700 enum cpp_ttype type,
1701 cpp_substring_ranges &ranges)
1703 gcc_assert (pfile);
1705 if (strloc == UNKNOWN_LOCATION)
1706 return "unknown location";
1708 /* Reparsing the strings requires accurate location information.
1709 If -ftrack-macro-expansion has been overridden from its default
1710 of 2, then we might have a location of a macro expansion point,
1711 rather than the location of the literal itself.
1712 Avoid this by requiring that we have full macro expansion tracking
1713 for substring locations to be available. */
1714 if (cpp_get_options (pfile)->track_macro_expansion != 2)
1715 return "track_macro_expansion != 2";
1717 /* If #line or # 44 "file"-style directives are present, then there's
1718 no guarantee that the line numbers we have can be used to locate
1719 the strings. For example, we might have a .i file with # directives
1720 pointing back to lines within a .c file, but the .c file might
1721 have been edited since the .i file was created.
1722 In such a case, the safest course is to disable on-demand substring
1723 locations. */
1724 if (line_table->seen_line_directive)
1725 return "seen line directive";
1727 /* If string concatenation has occurred at STRLOC, get the locations
1728 of all of the literal tokens making up the compound string.
1729 Otherwise, just use STRLOC. */
1730 int num_locs = 1;
1731 location_t *strlocs = &strloc;
1732 if (concats)
1733 concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1735 auto_cpp_string_vec strs (num_locs);
1736 auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1737 for (int i = 0; i < num_locs; i++)
1739 /* Get range of strloc. We will use it to locate the start and finish
1740 of the literal token within the line. */
1741 source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1743 if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1745 /* If the string token was within a macro expansion, then we can
1746 cope with it for the simple case where we have a single token.
1747 Otherwise, bail out. */
1748 if (src_range.m_start != src_range.m_finish)
1749 return "macro expansion";
1751 else
1753 if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1754 /* If so, we can't reliably determine where the token started within
1755 its line. */
1756 return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1758 if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1759 /* If so, we can't reliably determine where the token finished
1760 within its line. */
1761 return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1764 expanded_location start
1765 = expand_location_to_spelling_point (src_range.m_start,
1766 LOCATION_ASPECT_START);
1767 expanded_location finish
1768 = expand_location_to_spelling_point (src_range.m_finish,
1769 LOCATION_ASPECT_FINISH);
1770 if (start.file != finish.file)
1771 return "range endpoints are in different files";
1772 if (start.line != finish.line)
1773 return "range endpoints are on different lines";
1774 if (start.column > finish.column)
1775 return "range endpoints are reversed";
1777 char_span line = location_get_source_line (start.file, start.line);
1778 if (!line)
1779 return "unable to read source line";
1781 /* Determine the location of the literal (including quotes
1782 and leading prefix chars, such as the 'u' in a u""
1783 token). */
1784 size_t literal_length = finish.column - start.column + 1;
1786 /* Ensure that we don't crash if we got the wrong location. */
1787 if (start.column < 1)
1788 return "zero start column";
1789 if (line.length () < (start.column - 1 + literal_length))
1790 return "line is not wide enough";
1792 char_span literal = line.subspan (start.column - 1, literal_length);
1794 cpp_string from;
1795 from.len = literal_length;
1796 /* Make a copy of the literal, to avoid having to rely on
1797 the lifetime of the copy of the line within the cache.
1798 This will be released by the auto_cpp_string_vec dtor. */
1799 from.text = (unsigned char *)literal.xstrdup ();
1800 strs.safe_push (from);
1802 /* For very long lines, a new linemap could have started
1803 halfway through the token.
1804 Ensure that the loc_reader uses the linemap of the
1805 *end* of the token for its start location. */
1806 const line_map_ordinary *start_ord_map;
1807 linemap_resolve_location (line_table, src_range.m_start,
1808 LRK_SPELLING_LOCATION, &start_ord_map);
1809 const line_map_ordinary *final_ord_map;
1810 linemap_resolve_location (line_table, src_range.m_finish,
1811 LRK_SPELLING_LOCATION, &final_ord_map);
1812 if (start_ord_map == NULL || final_ord_map == NULL)
1813 return "failed to get ordinary maps";
1814 /* Bulletproofing. We ought to only have different ordinary maps
1815 for start vs finish due to line-length jumps. */
1816 if (start_ord_map != final_ord_map
1817 && start_ord_map->to_file != final_ord_map->to_file)
1818 return "start and finish are spelled in different ordinary maps";
1819 /* The file from linemap_resolve_location ought to match that from
1820 expand_location_to_spelling_point. */
1821 if (start_ord_map->to_file != start.file)
1822 return "mismatching file after resolving linemap";
1824 location_t start_loc
1825 = linemap_position_for_line_and_column (line_table, final_ord_map,
1826 start.line, start.column);
1828 cpp_string_location_reader loc_reader (start_loc, line_table);
1829 loc_readers.safe_push (loc_reader);
1832 /* Rerun cpp_interpret_string, or rather, a modified version of it. */
1833 const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1834 loc_readers.address (),
1835 num_locs, &ranges, type);
1836 if (err)
1837 return err;
1839 /* Success: "ranges" should now contain information on the string. */
1840 return NULL;
1843 /* Attempt to populate *OUT_LOC with source location information on the
1844 given characters within the string literal found at STRLOC.
1845 CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1846 character set.
1848 For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7
1849 and string literal "012345\n789"
1850 *OUT_LOC is written to with:
1851 "012345\n789"
1852 ~^~~~~
1854 If CONCATS is non-NULL, then any string literals that the token at
1855 STRLOC was concatenated with are also considered.
1857 This is implemented by re-parsing the relevant source line(s).
1859 Return NULL if successful, or an error message if any errors occurred.
1860 Error messages are intended for GCC developers (to help debugging) rather
1861 than for end-users. */
1863 const char *
1864 get_location_within_string (cpp_reader *pfile,
1865 string_concat_db *concats,
1866 location_t strloc,
1867 enum cpp_ttype type,
1868 int caret_idx, int start_idx, int end_idx,
1869 location_t *out_loc)
1871 gcc_checking_assert (caret_idx >= 0);
1872 gcc_checking_assert (start_idx >= 0);
1873 gcc_checking_assert (end_idx >= 0);
1874 gcc_assert (out_loc);
1876 cpp_substring_ranges ranges;
1877 const char *err
1878 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1879 if (err)
1880 return err;
1882 if (caret_idx >= ranges.get_num_ranges ())
1883 return "caret_idx out of range";
1884 if (start_idx >= ranges.get_num_ranges ())
1885 return "start_idx out of range";
1886 if (end_idx >= ranges.get_num_ranges ())
1887 return "end_idx out of range";
1889 *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1890 ranges.get_range (start_idx).m_start,
1891 ranges.get_range (end_idx).m_finish);
1892 return NULL;
1895 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1897 location_t
1898 location_with_discriminator (location_t locus, int discriminator)
1900 tree block = LOCATION_BLOCK (locus);
1901 source_range src_range = get_range_from_loc (line_table, locus);
1902 locus = get_pure_location (locus);
1904 if (locus == UNKNOWN_LOCATION)
1905 return locus;
1907 return COMBINE_LOCATION_DATA (line_table, locus, src_range, block, discriminator);
1910 /* Return TRUE if LOCUS represents a location with a discriminator. */
1912 bool
1913 has_discriminator (location_t locus)
1915 return get_discriminator_from_loc (locus) != 0;
1918 /* Return the discriminator for LOCUS. */
1921 get_discriminator_from_loc (location_t locus)
1923 return get_discriminator_from_loc (line_table, locus);
1926 #if CHECKING_P
1928 namespace selftest {
1930 /* Selftests of location handling. */
1932 /* Attempt to populate *OUT_RANGE with source location information on the
1933 given character within the string literal found at STRLOC.
1934 CHAR_IDX refers to an offset within the execution character set.
1935 If CONCATS is non-NULL, then any string literals that the token at
1936 STRLOC was concatenated with are also considered.
1938 This is implemented by re-parsing the relevant source line(s).
1940 Return NULL if successful, or an error message if any errors occurred.
1941 Error messages are intended for GCC developers (to help debugging) rather
1942 than for end-users. */
1944 static const char *
1945 get_source_range_for_char (cpp_reader *pfile,
1946 string_concat_db *concats,
1947 location_t strloc,
1948 enum cpp_ttype type,
1949 int char_idx,
1950 source_range *out_range)
1952 gcc_checking_assert (char_idx >= 0);
1953 gcc_assert (out_range);
1955 cpp_substring_ranges ranges;
1956 const char *err
1957 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1958 if (err)
1959 return err;
1961 if (char_idx >= ranges.get_num_ranges ())
1962 return "char_idx out of range";
1964 *out_range = ranges.get_range (char_idx);
1965 return NULL;
1968 /* As get_source_range_for_char, but write to *OUT the number
1969 of ranges that are available. */
1971 static const char *
1972 get_num_source_ranges_for_substring (cpp_reader *pfile,
1973 string_concat_db *concats,
1974 location_t strloc,
1975 enum cpp_ttype type,
1976 int *out)
1978 gcc_assert (out);
1980 cpp_substring_ranges ranges;
1981 const char *err
1982 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1984 if (err)
1985 return err;
1987 *out = ranges.get_num_ranges ();
1988 return NULL;
1991 /* Selftests of location handling. */
1993 /* Verify that compare() on linenum_type handles comparisons over the full
1994 range of the type. */
1996 static void
1997 test_linenum_comparisons ()
1999 linenum_type min_line (0);
2000 linenum_type max_line (0xffffffff);
2001 ASSERT_EQ (0, compare (min_line, min_line));
2002 ASSERT_EQ (0, compare (max_line, max_line));
2004 ASSERT_GT (compare (max_line, min_line), 0);
2005 ASSERT_LT (compare (min_line, max_line), 0);
2008 /* Helper function for verifying location data: when location_t
2009 values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
2010 as having column 0. */
2012 static bool
2013 should_have_column_data_p (location_t loc)
2015 if (IS_ADHOC_LOC (loc))
2016 loc = get_location_from_adhoc_loc (line_table, loc);
2017 if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
2018 return false;
2019 return true;
2022 /* Selftest for should_have_column_data_p. */
2024 static void
2025 test_should_have_column_data_p ()
2027 ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
2028 ASSERT_TRUE
2029 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
2030 ASSERT_FALSE
2031 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
2034 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2035 on LOC. */
2037 static void
2038 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2039 location_t loc)
2041 ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2042 ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2043 /* If location_t values are sufficiently high, then column numbers
2044 will be unavailable and LOCATION_COLUMN (loc) will be 0.
2045 When close to the threshold, column numbers *may* be present: if
2046 the final linemap before the threshold contains a line that straddles
2047 the threshold, locations in that line have column information. */
2048 if (should_have_column_data_p (loc))
2049 ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
/* Several selftests build a line table and one or more line maps
   within it.

   To maximize coverage these tests are run in a variety of situations:
   - different values of line_table->default_range_bits: some frontends
     use a non-zero value and others use zero
   - the fallback modes of line-map.cc: there are threshold values of
     location_t beyond which line-map.cc changes behavior (the
     range-packing optimization is disabled, column-tracking is
     disabled).  These are exercised by starting the line_table at
     interesting values at or near the thresholds.

   An instance of the following class describes one case of that test
   matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* Value at which to start the line_table; zero leaves it alone.  */
  int m_base_location;
};
2080 /* Constructor. Store the old value of line_table, and create a new
2081 one, using sane defaults.

   Asserts that no other line_table_test is active (saved_line_table must
   be NULL).  The new table reuses the reallocator and round_alloc_size
   callbacks of the saved table and gets default_range_bits of 0.  */
2083 line_table_test::line_table_test ()
2085 gcc_assert (saved_line_table == NULL);
2086 saved_line_table = line_table;
2087 line_table = ggc_alloc<line_maps> ();
2088 linemap_init (line_table, BUILTINS_LOCATION);
/* Both callbacks must be set on the saved table before being copied.  */
2089 gcc_assert (saved_line_table->reallocator);
2090 line_table->reallocator = saved_line_table->reallocator;
2091 gcc_assert (saved_line_table->round_alloc_size);
2092 line_table->round_alloc_size = saved_line_table->round_alloc_size;
2093 line_table->default_range_bits = 0;
2096 /* Constructor. Store the old value of line_table, and create a new
2097 one, using the situation described in CASE_.

   Asserts that no other line_table_test is active (saved_line_table must
   be NULL).  The new table reuses the reallocator and round_alloc_size
   callbacks of the saved table, takes its default_range_bits from CASE_
   and, if CASE_ has a non-zero base location, starts with that value as
   both highest_location and highest_line.  */
2099 line_table_test::line_table_test (const line_table_case &case_)
2101 gcc_assert (saved_line_table == NULL);
2102 saved_line_table = line_table;
2103 line_table = ggc_alloc<line_maps> ();
2104 linemap_init (line_table, BUILTINS_LOCATION);
/* Both callbacks must be set on the saved table before being copied.  */
2105 gcc_assert (saved_line_table->reallocator);
2106 line_table->reallocator = saved_line_table->reallocator;
2107 gcc_assert (saved_line_table->round_alloc_size);
2108 line_table->round_alloc_size = saved_line_table->round_alloc_size;
2109 line_table->default_range_bits = case_.m_default_range_bits;
/* A base location of zero leaves the freshly-initialized values alone.  */
2110 if (case_.m_base_location)
2112 line_table->highest_location = case_.m_base_location;
2113 line_table->highest_line = case_.m_base_location;
2117 /* Destructor. Restore the old value of line_table.

   Also resets saved_line_table to NULL, which is what the constructors
   assert on entry.  */
2119 line_table_test::~line_table_test ()
2121 gcc_assert (saved_line_table != NULL);
2122 line_table = saved_line_table;
2123 saved_line_table = NULL;
2126 /* Verify basic operation of ordinary linemaps.

   Using a line_table configured by CASE_, build linemaps for two files
   ("foo.c" and "bar.c"), record locations on short, long and over-wide
   lines, then check that file/line/column can be recovered from each
   location and that a location built by make_location yields back its
   caret, start and finish.  */
2128 static void
2129 test_accessing_ordinary_linemaps (const line_table_case &case_)
2131 line_table_test ltt (case_);
2133 /* Build a simple linemap describing some locations. */
2134 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
2136 linemap_line_start (line_table, 1, 100);
2137 location_t loc_a = linemap_position_for_column (line_table, 1);
2138 location_t loc_b = linemap_position_for_column (line_table, 23);
2140 linemap_line_start (line_table, 2, 100);
2141 location_t loc_c = linemap_position_for_column (line_table, 1);
2142 location_t loc_d = linemap_position_for_column (line_table, 17);
2144 /* Example of a very long line. */
2145 linemap_line_start (line_table, 3, 2000);
2146 location_t loc_e = linemap_position_for_column (line_table, 700);
2148 /* Transitioning back to a short line. */
2149 linemap_line_start (line_table, 4, 0);
2150 location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
2152 if (should_have_column_data_p (loc_back_to_short))
2154 /* Verify that we switched to short lines in the linemap. */
2155 line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
/* The difference is the number of bits left for the column alone.  */
2156 ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2159 /* Example of a line that will eventually be seen to be longer
2160 than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2161 below that. */
2162 linemap_line_start (line_table, 5, 2000);
2164 location_t loc_start_of_very_long_line
2165 = linemap_position_for_column (line_table, 2000);
2166 location_t loc_too_wide
2167 = linemap_position_for_column (line_table, 4097);
2168 location_t loc_too_wide_2
2169 = linemap_position_for_column (line_table, 4098);
2171 /* ...and back to a sane line length. */
2172 linemap_line_start (line_table, 6, 100);
2173 location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2175 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2177 /* Multiple files. */
2178 linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
2179 linemap_line_start (line_table, 1, 200);
2180 location_t loc_f = linemap_position_for_column (line_table, 150);
2181 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2183 /* Verify that we can recover the location info. */
2184 assert_loceq ("foo.c", 1, 1, loc_a);
2185 assert_loceq ("foo.c", 1, 23, loc_b);
2186 assert_loceq ("foo.c", 2, 1, loc_c);
2187 assert_loceq ("foo.c", 2, 17, loc_d);
2188 assert_loceq ("foo.c", 3, 700, loc_e);
2189 assert_loceq ("foo.c", 4, 100, loc_back_to_short);
2191 /* In the very wide line, the initial location should be fully tracked. */
2192 assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
2193 /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2194 be disabled. */
2195 assert_loceq ("foo.c", 5, 0, loc_too_wide);
2196 assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2197 /*...and column-tracking should be re-enabled for subsequent lines. */
2198 assert_loceq ("foo.c", 6, 10, loc_sane_again);
2200 assert_loceq ("bar.c", 1, 150, loc_f);
/* A location taken directly from the linemap is neither a builtin token
   location nor a non-pure location.  */
2202 ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2203 ASSERT_TRUE (pure_location_p (line_table, loc_a));
2205 /* Verify using make_location to build a range, and extracting data
2206 back from it. */
2207 location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
/* The combined location is not pure; its caret is LOC_C and its range
   runs from LOC_B to LOC_D.  */
2208 ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2209 ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2210 source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2211 ASSERT_EQ (loc_b, src_range.m_start);
2212 ASSERT_EQ (loc_d, src_range.m_finish);
2215 /* Verify various properties of UNKNOWN_LOCATION.

   It is expected to have a NULL file, line 0 and column 0.  */
2217 static void
2218 test_unknown_location ()
2220 ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2221 ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2222 ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2225 /* Verify various properties of BUILTINS_LOCATION.

   Its file is expected to be the string returned by
   special_fname_builtin, with line 0 and column 0, and
   is_location_from_builtin_token is expected to hold for it.  */
2227 static void
2228 test_builtins ()
2230 assert_loceq (special_fname_builtin (), 0, 0, BUILTINS_LOCATION);
2231 ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2234 /* Regression test for make_location.
2235 Ensure that we use pure locations for the start/finish of the range,
2236 rather than storing a packed or ad-hoc range as the start/finish.

   CASE_ selects the line_table configuration to run under.  */
2238 static void
2239 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2241 /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2242 with C++ frontend.
2243 ....................0000000001111111111222.
2244 ....................1234567890123456789012. */
2245 const char *content = " r += !aaa == bbb;\n";
2246 temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2247 line_table_test ltt (case_);
2248 linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
/* Locations for the individual columns used as caret/start/finish below.  */
2250 const location_t c11 = linemap_position_for_column (line_table, 11);
2251 const location_t c12 = linemap_position_for_column (line_table, 12);
2252 const location_t c13 = linemap_position_for_column (line_table, 13);
2253 const location_t c14 = linemap_position_for_column (line_table, 14);
2254 const location_t c21 = linemap_position_for_column (line_table, 21);
/* Skip the remaining checks when the highest location used is past the
   column-tracking threshold (presumably because the exact-location
   comparisons below are not meaningful without column data).  */
2256 if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2257 return;
2259 /* Use column 13 for the caret location, arbitrarily, to verify that we
2260 handle start != caret. */
2261 const location_t aaa = make_location (c13, c12, c14);
2262 ASSERT_EQ (c13, get_pure_location (aaa));
2263 ASSERT_EQ (c12, get_start (aaa));
2264 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2265 ASSERT_EQ (c14, get_finish (aaa));
2266 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2268 /* Make a location using a location with a range as the start-point. */
2269 const location_t not_aaa = make_location (c11, aaa, c14);
2270 ASSERT_EQ (c11, get_pure_location (not_aaa));
2271 /* It should use the start location of the range, not store the range
2272 itself. */
2273 ASSERT_EQ (c12, get_start (not_aaa));
2274 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2275 ASSERT_EQ (c14, get_finish (not_aaa));
2276 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2278 /* Similarly, make a location with a range as the end-point. */
2279 const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2280 ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2281 ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2282 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2283 ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2284 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
/* Here the range-carrying location AAA_EQ_BBB is passed as the finish.  */
2285 const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2286 /* It should use the finish location of the range, not store the range
2287 itself. */
2288 ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2289 ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2290 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2291 ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2292 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2295 /* Verify reading of input files (e.g. for caret-based diagnostics).

   Write a three-line temporary file whose last line has no trailing
   newline, then check location_get_source_line for line 3, line 2 and
   the non-existent line 4.  */
2297 static void
2298 test_reading_source_line ()
2300 /* Create a tempfile and write some text to it. */
2301 temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2302 "01234567890123456789\n"
2303 "This is the test text\n"
2304 "This is the 3rd line");
2306 /* Read back a specific line from the tempfile. */
2307 char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2308 ASSERT_TRUE (source_line);
2309 ASSERT_TRUE (source_line.get_buffer () != NULL);
2310 ASSERT_EQ (20, source_line.length ());
2311 ASSERT_TRUE (!strncmp ("This is the 3rd line",
2312 source_line.get_buffer (), source_line.length ()));
/* Line 2: the expected length of 21 does not count the newline.  */
2314 source_line = location_get_source_line (tmp.get_filename (), 2);
2315 ASSERT_TRUE (source_line);
2316 ASSERT_TRUE (source_line.get_buffer () != NULL);
2317 ASSERT_EQ (21, source_line.length ());
2318 ASSERT_TRUE (!strncmp ("This is the test text",
2319 source_line.get_buffer (), source_line.length ()));
/* Line 4 does not exist: expect a false span with a NULL buffer.  */
2321 source_line = location_get_source_line (tmp.get_filename (), 4);
2322 ASSERT_FALSE (source_line);
2323 ASSERT_TRUE (source_line.get_buffer () == NULL);
2326 /* Tests of lexing. */
2328 /* Verify that token TOK from PARSER has cpp_token_as_text
2329 equal to EXPECTED_TEXT.

   The result of cpp_token_as_text is compared with EXPECTED_TEXT as a
   C string via ASSERT_STREQ.  */
2331 #define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT) \
2332 SELFTEST_BEGIN_STMT \
2333 unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK)); \
2334 ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt); \
2335 SELFTEST_END_STMT
2337 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2338 and ranges from EXP_START_COL to EXP_FINISH_COL.
2339 Use LOC as the effective location of the selftest.

   The file and line are always checked.  The column checks (caret
   column, range start column and range finish column) are skipped when
   should_have_column_data_p is false for TOK's src_loc.  */
2341 static void
2342 assert_token_loc_eq (const location &loc,
2343 const cpp_token *tok,
2344 const char *exp_filename, int exp_linenum,
2345 int exp_start_col, int exp_finish_col)
2347 location_t tok_loc = tok->src_loc;
2348 ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2349 ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2351 /* If location_t values are sufficiently high, then column numbers
2352 will be unavailable. */
2353 if (!should_have_column_data_p (tok_loc))
2354 return;
/* The token's own column is expected to equal the start of its range.  */
2356 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2357 source_range tok_range = get_range_from_loc (line_table, tok_loc);
2358 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2359 ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2362 /* Use assert_token_loc_eq to verify the TOK->src_loc, using
2363 SELFTEST_LOCATION as the effective location of the selftest.

   TOK is the token to check; EXP_FILENAME, EXP_LINENUM, EXP_START_COL
   and EXP_FINISH_COL are forwarded unchanged to assert_token_loc_eq.  */
2365 #define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
2366 EXP_START_COL, EXP_FINISH_COL) \
2367 assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
2368 (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2370 /* Test of lexing a file using libcpp, verifying tokens and their
2371 location information.

   CASE_ selects the line_table configuration.  The input contains a
   name, a C-style comment, a string literal, a C++-style comment and a
   number; the tokens expected back are CPP_NAME, CPP_STRING, CPP_NUMBER
   and then CPP_EOF.  */
2373 static void
2374 test_lexer (const line_table_case &case_)
2376 /* Create a tempfile and write some text to it. */
2377 const char *content =
2378 /*00000000011111111112222222222333333.3333444444444.455555555556
2379 12345678901234567890123456789012345.6789012345678.901234567890. */
2380 ("test_name /* c-style comment */\n"
2381 " \"test literal\"\n"
2382 " // test c++-style comment\n"
2383 " 42\n");
2384 temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2386 line_table_test ltt (case_);
/* The reader is created against the line_table installed by LTT.  */
2388 cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2390 const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2391 ASSERT_NE (fname, NULL);
2393 /* Verify that we get the expected tokens back, with the correct
2394 location information. */
2396 location_t loc;
2397 const cpp_token *tok;
2398 tok = cpp_get_token_with_location (parser, &loc);
2399 ASSERT_NE (tok, NULL);
2400 ASSERT_EQ (tok->type, CPP_NAME);
2401 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2402 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2404 tok = cpp_get_token_with_location (parser, &loc);
2405 ASSERT_NE (tok, NULL);
2406 ASSERT_EQ (tok->type, CPP_STRING);
2407 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2408 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2410 tok = cpp_get_token_with_location (parser, &loc);
2411 ASSERT_NE (tok, NULL);
2412 ASSERT_EQ (tok->type, CPP_NUMBER);
2413 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2414 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2416 tok = cpp_get_token_with_location (parser, &loc);
2417 ASSERT_NE (tok, NULL);
2418 ASSERT_EQ (tok->type, CPP_EOF);
/* Tear the reader down explicitly; this test does not use
   cpp_reader_ptr.  */
2420 cpp_finish (parser, NULL);
2421 cpp_destroy (parser);
2424 /* Forward decls.  lexer_test_options::apply takes a lexer_test &, and
   lexer_test's constructor takes a lexer_test_options *, so each class
   refers to the other.  */
2426 class lexer_test;
2427 class lexer_test_options;
2429 /* A class for specifying options of a lexer_test.
2430 The "apply" vfunc is called during the lexer_test constructor.

   This is an abstract interface: subclasses implement "apply" to
   adjust the lexer_test they are given.  */
2432 class lexer_test_options
2434 public:
/* Hook invoked with the lexer_test being constructed.  */
2435 virtual void apply (lexer_test &) = 0;
2438 /* Wrapper around a cpp_reader *, which calls cpp_finish and cpp_destroy
2439 in its dtor.
2441 This is needed by struct lexer_test to ensure that the cleanup of the
2442 cpp_reader happens *after* the cleanup of the temp_source_file. */
2444 class cpp_reader_ptr
2446 public:
/* Wrap PTR; the destructor will finish and destroy it.  */
2447 cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
/* Call cpp_finish and then cpp_destroy on the wrapped reader.  */
2449 ~cpp_reader_ptr ()
2451 cpp_finish (m_ptr, NULL);
2452 cpp_destroy (m_ptr);
/* Allow the wrapper to be passed wherever a cpp_reader * is expected.  */
2455 operator cpp_reader * () const { return m_ptr; }
2457 private:
/* The wrapped reader.  */
2458 cpp_reader *m_ptr;
2461 /* A class for writing lexer tests.

   Bundles the line_table override, the cpp_reader, the temporary source
   file and a string concatenation database used by a single test.  */
2463 class lexer_test
2465 public:
2466 lexer_test (const line_table_case &case_, const char *content,
2467 lexer_test_options *options);
2468 ~lexer_test ();
/* Return the next token from m_parser.  */
2470 const cpp_token *get_token ();
2472 /* The ordering of these fields matters.
2473 The line_table_test must be first, since the cpp_reader_ptr
2474 uses it.
2475 The cpp_reader must be cleaned up *after* the temp_source_file
2476 since the filenames in input.cc's input cache are owned by the
2477 cpp_reader; in particular, when ~temp_source_file evicts the
2478 filename the filenames must still be alive. */
2479 line_table_test m_ltt;
2480 cpp_reader_ptr m_parser;
2481 temp_source_file m_tempfile;
/* Passed to the substring-range queries made by the assert_* helpers.  */
2482 string_concat_db m_concats;
/* If true, the destructor asserts that the next token is CPP_EOF.  */
2483 bool m_implicitly_expect_EOF;
2486 /* Use an EBCDIC encoding for the execution charset, specifically
2487 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2489 This exercises iconv integration within libcpp.
2490 Not every build of iconv supports the given charset,
2491 so we need to flag this error and handle it gracefully. */
2493 class ebcdic_execution_charset : public lexer_test_options
2495 public:
/* Register this instance as the singleton used by on_diagnostic.  At most
   one instance may exist at a time.  */
2496 ebcdic_execution_charset () : m_num_iconv_errors (0)
2498 gcc_assert (s_singleton == NULL);
2499 s_singleton = this;
/* Unregister the singleton.  */
2501 ~ebcdic_execution_charset ()
2503 gcc_assert (s_singleton == this);
2504 s_singleton = NULL;
/* Set the narrow charset of TEST's parser to "IBM1047" and route the
   parser's diagnostics to on_diagnostic.  */
2507 void apply (lexer_test &test) final override
2509 cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2510 cpp_opts->narrow_charset = "IBM1047";
2512 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2513 callbacks->diagnostic = on_diagnostic;
/* Diagnostic callback.  If MSGID is the "conversion ... not supported by
   iconv" message, count it on the singleton and return true; any other
   diagnostic aborts.  */
2516 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2517 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2518 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2519 rich_location *richloc ATTRIBUTE_UNUSED,
2520 const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2521 ATTRIBUTE_FPTR_PRINTF(5,0)
2523 gcc_assert (s_singleton);
2524 /* Avoid exgettext from picking this up, it is translated in libcpp. */
2525 const char *msg = "conversion from %s to %s not supported by iconv";
2526 #ifdef ENABLE_NLS
2527 msg = dgettext ("cpplib", msg);
2528 #endif
2529 /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2530 when the local iconv build doesn't support the conversion. */
2531 if (strcmp (msgid, msg) == 0)
2533 s_singleton->m_num_iconv_errors++;
2534 return true;
2537 /* Otherwise, we have an unexpected error. */
2538 abort ();
/* Return true if on_diagnostic recorded at least one iconv error.  */
2541 bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2543 private:
/* The live instance, or NULL; needed because on_diagnostic is static.  */
2544 static ebcdic_execution_charset *s_singleton;
/* Number of iconv "not supported" diagnostics seen so far.  */
2545 int m_num_iconv_errors;
/* Definition of the singleton pointer declared in the class.  */
2548 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2550 /* A lexer_test_options subclass that records a list of diagnostic
2551 messages emitted by the lexer. */
2553 class lexer_diagnostic_sink : public lexer_test_options
2555 public:
/* Register this instance as the singleton used by on_diagnostic.  At most
   one instance may exist at a time.  */
2556 lexer_diagnostic_sink ()
2558 gcc_assert (s_singleton == NULL);
2559 s_singleton = this;
/* Unregister the singleton and free every recorded message.  */
2561 ~lexer_diagnostic_sink ()
2563 gcc_assert (s_singleton == this);
2564 s_singleton = NULL;
2566 int i;
2567 char *str;
2568 FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2569 free (str);
/* Route the diagnostics of TEST's parser to on_diagnostic.  */
2572 void apply (lexer_test &test) final override
2574 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2575 callbacks->diagnostic = on_diagnostic;
/* Diagnostic callback.  Format MSGID with the arguments in *AP, append the
   resulting string to the singleton's m_diagnostics, and return true.  */
2578 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2579 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2580 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2581 rich_location *richloc ATTRIBUTE_UNUSED,
2582 const char *msgid, va_list *ap)
2583 ATTRIBUTE_FPTR_PRINTF(5,0)
2585 char *msg = xvasprintf (msgid, *ap);
2586 s_singleton->m_diagnostics.safe_push (msg);
2587 return true;
/* The formatted messages recorded so far; each is freed by the
   destructor.  */
2590 auto_vec<char *> m_diagnostics;
2592 private:
/* The live instance, or NULL; needed because on_diagnostic is static.  */
2593 static lexer_diagnostic_sink *s_singleton;
/* Definition of the singleton pointer declared in the class.  */
2596 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2598 /* Constructor. Override line_table with a new instance based on CASE_,
2599 and write CONTENT to a tempfile. Create a cpp_reader, and use it to
2600 start parsing the tempfile.

   If OPTIONS is non-NULL, its "apply" hook is run on this object before
   cpp_init_iconv is called and before the tempfile is read.  */
2602 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2603 lexer_test_options *options)
2604 : m_ltt (case_),
2605 m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2606 /* Create a tempfile and write the text to it. */
2607 m_tempfile (SELFTEST_LOCATION, ".c", content),
2608 m_concats (),
2609 m_implicitly_expect_EOF (true)
2611 if (options)
2612 options->apply (*this);
2614 cpp_init_iconv (m_parser);
2616 /* Parse the file. */
2617 const char *fname = cpp_read_main_file (m_parser,
2618 m_tempfile.get_filename ());
2619 ASSERT_NE (fname, NULL);
2622 /* Destructor. By default, verify that the next token in m_parser is EOF.

   The check is skipped when m_implicitly_expect_EOF has been set to
   false.  */
2624 lexer_test::~lexer_test ()
2626 location_t loc;
2627 const cpp_token *tok;
2629 if (m_implicitly_expect_EOF)
2631 tok = cpp_get_token_with_location (m_parser, &loc);
2632 ASSERT_NE (tok, NULL);
2633 ASSERT_EQ (tok->type, CPP_EOF);
2637 /* Get the next token from m_parser.

   Asserts that the token is non-NULL before returning it; the location
   written by cpp_get_token_with_location is discarded.  */
2639 const cpp_token *
2640 lexer_test::get_token ()
2642 location_t loc;
2643 const cpp_token *tok;
2645 tok = cpp_get_token_with_location (m_parser, &loc);
2646 ASSERT_NE (tok, NULL);
2647 return tok;
2650 /* Verify that locations within string literals are correctly handled. */
2652 /* Verify get_source_range_for_char for token(s) at STRLOC of type TYPE,
2653 using the string concatenation database for TEST.
2655 Assert that the character at index IDX is on EXPECTED_LINE,
2656 and that it begins at column EXPECTED_START_COL and ends at
2657 EXPECTED_FINISH_COL (unless the locations are beyond
2658 LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2659 columns).

   LOC is the location reported for any assertion failure.  If STRLOC
   itself is too high to have column data, instead assert that the
   lookup fails with the error
   "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS".  */
2661 static void
2662 assert_char_at_range (const location &loc,
2663 lexer_test& test,
2664 location_t strloc, enum cpp_ttype type, int idx,
2665 int expected_line, int expected_start_col,
2666 int expected_finish_col)
2668 cpp_reader *pfile = test.m_parser;
2669 string_concat_db *concats = &test.m_concats;
2671 source_range actual_range = source_range();
2672 const char *err
2673 = get_source_range_for_char (pfile, concats, strloc, type, idx,
2674 &actual_range);
/* A NULL error string means the lookup succeeded.  */
2675 if (should_have_column_data_p (strloc))
2676 ASSERT_EQ_AT (loc, NULL, err);
2677 else
2679 ASSERT_STREQ_AT (loc,
2680 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2681 err);
2682 return;
/* Both ends of the range are expected to be on EXPECTED_LINE.  */
2685 int actual_start_line = LOCATION_LINE (actual_range.m_start);
2686 ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2687 int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2688 ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
/* Columns are checked independently for each end of the range.  */
2690 if (should_have_column_data_p (actual_range.m_start))
2692 int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2693 ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2695 if (should_have_column_data_p (actual_range.m_finish))
2697 int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2698 ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2702 /* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
2703 the effective location of any errors.

   All other arguments are forwarded unchanged to assert_char_at_range.  */
2705 #define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
2706 EXPECTED_START_COL, EXPECTED_FINISH_COL) \
2707 assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
2708 (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
2709 (EXPECTED_FINISH_COL))
2711 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2712 using the string concatenation database for TEST.
2714 Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.

   LOC is the location reported for any assertion failure and TYPE is
   passed through to get_num_source_ranges_for_substring.  If STRLOC is
   too high to have column data, instead assert that the lookup fails
   with the error "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS".  */
2716 static void
2717 assert_num_substring_ranges (const location &loc,
2718 lexer_test& test,
2719 location_t strloc,
2720 enum cpp_ttype type,
2721 int expected_num_ranges)
2723 cpp_reader *pfile = test.m_parser;
2724 string_concat_db *concats = &test.m_concats;
2726 int actual_num_ranges = -1;
2727 const char *err
2728 = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2729 &actual_num_ranges);
/* A NULL error string means the lookup succeeded.  */
2730 if (should_have_column_data_p (strloc))
2731 ASSERT_EQ_AT (loc, NULL, err);
2732 else
2734 ASSERT_STREQ_AT (loc,
2735 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2736 err);
2737 return;
2739 ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2742 /* Macro for calling assert_num_substring_ranges, supplying
2743 SELFTEST_LOCATION for the effective location of any errors.

   All other arguments are forwarded unchanged to
   assert_num_substring_ranges.  */
2745 #define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
2746 EXPECTED_NUM_RANGES) \
2747 assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
2748 (TYPE), (EXPECTED_NUM_RANGES))
2751 /* Verify that get_substring_ranges_for_loc for token(s) at STRLOC
2752 returns an error (using the string concatenation database for TEST).

   The error is expected to be EXPECTED_ERR when STRLOC has column data,
   and "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS" otherwise.
   LOC is the location reported for any assertion failure and TYPE is
   passed through to get_substring_ranges_for_loc.  */
2754 static void
2755 assert_has_no_substring_ranges (const location &loc,
2756 lexer_test& test,
2757 location_t strloc,
2758 enum cpp_ttype type,
2759 const char *expected_err)
2761 cpp_reader *pfile = test.m_parser;
2762 string_concat_db *concats = &test.m_concats;
/* Receives any ranges computed; its contents are not inspected here.  */
2763 cpp_substring_ranges ranges;
2764 const char *actual_err
2765 = get_substring_ranges_for_loc (pfile, concats, strloc,
2766 type, ranges);
2767 if (should_have_column_data_p (strloc))
2768 ASSERT_STREQ_AT (loc, expected_err, actual_err);
2769 else
2770 ASSERT_STREQ_AT (loc,
2771 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2772 actual_err);
/* Macro for calling assert_has_no_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.  ERR is
   the error string expected when STRLOC has column data.  */
2775 #define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR) \
2776 assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
2777 (STRLOC), (TYPE), (ERR))
2779 /* Lex a simple string literal. Verify the substring location data, before
2780 and after running cpp_interpret_string on it.

   CASE_ selects the line_table configuration.  */
2782 static void
2783 test_lexer_string_locations_simple (const line_table_case &case_)
2785 /* Digits 0-9 (with 0 at column 10), the simple way.
2786 ....................000000000.11111111112.2222222223333333333
2787 ....................123456789.01234567890.1234567890123456789
2788 We add a trailing comment to ensure that we correctly locate
2789 the end of the string literal token. */
2790 const char *content = " \"0123456789\" /* not a string */\n";
2791 lexer_test test (case_, content, NULL);
2793 /* Verify that we get the expected token back, with the correct
2794 location information. */
2795 const cpp_token *tok = test.get_token ();
2796 ASSERT_EQ (tok->type, CPP_STRING);
2797 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2798 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2800 /* At this point in lexing, the quote characters are treated as part of
2801 the string (they are stripped off by cpp_interpret_string). */
2803 ASSERT_EQ (tok->val.str.len, 12);
2805 /* Verify that cpp_interpret_string works. */
2806 cpp_string dst_string;
2807 const enum cpp_ttype type = CPP_STRING;
2808 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2809 &dst_string, type);
2810 ASSERT_TRUE (result);
2811 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
/* The interpreted text is freed here by the caller.  */
2812 free (const_cast <unsigned char *> (dst_string.text));
2814 /* Verify ranges of individual characters. This no longer includes the
2815 opening quote, but does include the closing quote. */
2816 for (int i = 0; i <= 10; i++)
2817 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2818 10 + i, 10 + i);
/* Ten digits plus the closing quote give eleven ranges.  */
2820 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2823 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2824 encoding.

   CASE_ selects the line_table configuration.  The test does nothing
   when HAVE_ICONV is zero, and stops after the token-level checks when
   the ebcdic_execution_charset option recorded an iconv error.  */
2826 static void
2827 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2829 /* EBCDIC support requires iconv. */
2830 if (!HAVE_ICONV)
2831 return;
2833 /* Digits 0-9 (with 0 at column 10), the simple way.
2834 ....................000000000.11111111112.2222222223333333333
2835 ....................123456789.01234567890.1234567890123456789
2836 We add a trailing comment to ensure that we correctly locate
2837 the end of the string literal token. */
2838 const char *content = " \"0123456789\" /* not a string */\n";
2839 ebcdic_execution_charset use_ebcdic;
2840 lexer_test test (case_, content, &use_ebcdic);
2842 /* Verify that we get the expected token back, with the correct
2843 location information. */
2844 const cpp_token *tok = test.get_token ();
2845 ASSERT_EQ (tok->type, CPP_STRING);
2846 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2847 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2849 /* At this point in lexing, the quote characters are treated as part of
2850 the string (they are stripped off by cpp_interpret_string). */
2852 ASSERT_EQ (tok->val.str.len, 12);
2854 /* The remainder of the test requires an iconv implementation that
2855 can convert from UTF-8 to the EBCDIC encoding requested above. */
2856 if (use_ebcdic.iconv_errors_occurred_p ())
2857 return;
2859 /* Verify that cpp_interpret_string works. */
2860 cpp_string dst_string;
2861 const enum cpp_ttype type = CPP_STRING;
2862 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2863 &dst_string, type);
2864 ASSERT_TRUE (result);
2865 /* We should now have EBCDIC-encoded text, specifically
2866 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2867 The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
2868 ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2869 (const char *)dst_string.text);
/* The interpreted text is freed here by the caller.  */
2870 free (const_cast <unsigned char *> (dst_string.text));
2872 /* Verify that we don't attempt to record substring location information
2873 for such cases. */
2874 ASSERT_HAS_NO_SUBSTRING_RANGES
2875 (test, tok->src_loc, type,
2876 "execution character set != source character set");
2879 /* Lex a string literal containing a hex-escaped character.
2880 Verify the substring location data, before and after running
2881 cpp_interpret_string on it.

   CASE_ selects the line_table configuration.  */
2883 static void
2884 test_lexer_string_locations_hex (const line_table_case &case_)
2886 /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2887 and with a space in place of digit 6, to terminate the escaped
2888 hex code.
2889 ....................000000000.111111.11112222.
2890 ....................123456789.012345.67890123. */
2891 const char *content = " \"01234\\x35 789\"\n";
2892 lexer_test test (case_, content, NULL);
2894 /* Verify that we get the expected token back, with the correct
2895 location information. */
2896 const cpp_token *tok = test.get_token ();
2897 ASSERT_EQ (tok->type, CPP_STRING);
2898 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2899 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2901 /* At this point in lexing, the quote characters are treated as part of
2902 the string (they are stripped off by cpp_interpret_string). */
2903 ASSERT_EQ (tok->val.str.len, 15);
2905 /* Verify that cpp_interpret_string works. */
2906 cpp_string dst_string;
2907 const enum cpp_ttype type = CPP_STRING;
2908 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2909 &dst_string, type);
2910 ASSERT_TRUE (result);
2911 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
/* The interpreted text is freed here by the caller.  */
2912 free (const_cast <unsigned char *> (dst_string.text));
2914 /* Verify ranges of individual characters. This no longer includes the
2915 opening quote, but does include the closing quote. */
2916 for (int i = 0; i <= 4; i++)
2917 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
/* The single character produced by the escape covers the four source
   columns 15-18, so later characters are shifted by three columns.  */
2918 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2919 for (int i = 6; i <= 10; i++)
2920 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2922 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2925 /* Lex a string literal containing an octal-escaped character.
2926 Verify the substring location data after running cpp_interpret_string
2927 on it.

   CASE_ selects the line_table configuration.  */
2929 static void
2930 test_lexer_string_locations_oct (const line_table_case &case_)
2932 /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2933 and with a space in place of digit 6, to terminate the escaped
2934 octal code.
2935 ....................000000000.111111.11112222.2222223333333333444
2936 ....................123456789.012345.67890123.4567890123456789012 */
2937 const char *content = " \"01234\\065 789\" /* not a string */\n";
2938 lexer_test test (case_, content, NULL);
2940 /* Verify that we get the expected token back, with the correct
2941 location information. */
2942 const cpp_token *tok = test.get_token ();
2943 ASSERT_EQ (tok->type, CPP_STRING);
2944 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2946 /* Verify that cpp_interpret_string works. */
2947 cpp_string dst_string;
2948 const enum cpp_ttype type = CPP_STRING;
2949 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2950 &dst_string, type);
2951 ASSERT_TRUE (result);
2952 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
/* The interpreted text is freed here by the caller.  */
2953 free (const_cast <unsigned char *> (dst_string.text));
2955 /* Verify ranges of individual characters. This no longer includes the
2956 opening quote, but does include the closing quote. */
2957 for (int i = 0; i < 5; i++)
2958 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
/* The single character produced by the escape covers the four source
   columns 15-18, so later characters are shifted by three columns.  */
2959 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2960 for (int i = 6; i <= 10; i++)
2961 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2963 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2966 /* Test of string literal containing letter escapes.
Each two-column escape (tab, backslash, newline) is expected to map to
one character whose range spans both columns of the escape. */
2968 static void
2969 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2971 /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2972 .....................000000000.1.11111.1.1.11222.22222223333333
2973 .....................123456789.0.12345.6.7.89012.34567890123456. */
2974 const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2975 lexer_test test (case_, content, NULL);
2977 /* Verify that we get the expected tokens back. */
2978 const cpp_token *tok = test.get_token ();
2979 ASSERT_EQ (tok->type, CPP_STRING);
2980 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2982 /* Verify ranges of individual characters. */
2983 /* "\t". */
2984 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2985 0, 1, 10, 11);
2986 /* "foo". */
2987 for (int i = 1; i <= 3; i++)
2988 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2989 i, 1, 11 + i, 11 + i);
2990 /* "\\" and "\n". */
2991 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2992 4, 1, 15, 16);
2993 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2994 5, 1, 17, 18);
2996 /* "bar" and closing quote for nul-terminator. */
2997 for (int i = 6; i <= 9; i++)
2998 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2999 i, 1, 13 + i, 13 + i);
/* Ten ranges in total: nine characters plus the terminator. */
3001 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
3004 /* Another test of a string literal containing a letter escape.
3005 Based on string seen in
3006 printf ("%-%\n");
3007 in gcc.dg/format/c90-printf-1.c.
Only the trailing newline escape is two columns wide; it is expected at
columns 13-14, with the closing quote (used for the nul terminator) at
column 15. */
3009 static void
3010 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
3012 /* .....................000000000.1111.11.1111.22222222223.
3013 .....................123456789.0123.45.6789.01234567890. */
3014 const char *content = (" \"%-%\\n\" /* non-str */\n");
3015 lexer_test test (case_, content, NULL);
3017 /* Verify that we get the expected tokens back. */
3018 const cpp_token *tok = test.get_token ();
3019 ASSERT_EQ (tok->type, CPP_STRING);
3020 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
3022 /* Verify ranges of individual characters. */
3023 /* "%-%". */
3024 for (int i = 0; i < 3; i++)
3025 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3026 i, 1, 10 + i, 10 + i);
3027 /* "\n". */
3028 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3029 3, 1, 13, 14);
3031 /* Closing quote for nul-terminator. */
3032 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3033 4, 1, 15, 15);
3035 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3038 /* Lex a string literal containing UCN 4 characters.
3039 Verify the substring location data after running cpp_interpret_string
3040 on it.
Each six-column UCN is expected to expand to three execution bytes, and
every one of those bytes is expected to report the range of the whole
escape. */
3042 static void
3043 test_lexer_string_locations_ucn4 (const line_table_case &case_)
3045 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3046 as UCN 4.
3047 ....................000000000.111111.111122.222222223.33333333344444
3048 ....................123456789.012345.678901.234567890.12345678901234 */
3049 const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
3050 lexer_test test (case_, content, NULL);
3052 /* Verify that we get the expected token back, with the correct
3053 location information. */
3054 const cpp_token *tok = test.get_token ();
3055 ASSERT_EQ (tok->type, CPP_STRING);
3056 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3058 /* Verify that cpp_interpret_string works.
3059 The string should be encoded in the execution character
3060 set. Assuming that is UTF-8, we should have the following:
3061 ----------- ---- ----- ------- ----------------
3062 Byte offset Byte Octal Unicode Source Column(s)
3063 ----------- ---- ----- ------- ----------------
3064 0 0x30 '0' 10
3065 1 0x31 '1' 11
3066 2 0x32 '2' 12
3067 3 0x33 '3' 13
3068 4 0x34 '4' 14
3069 5 0xE2 \342 U+2174 15-20
3070 6 0x85 \205 (cont) 15-20
3071 7 0xB4 \264 (cont) 15-20
3072 8 0xE2 \342 U+2175 21-26
3073 9 0x85 \205 (cont) 21-26
3074 10 0xB5 \265 (cont) 21-26
3075 11 0x37 '7' 27
3076 12 0x38 '8' 28
3077 13 0x39 '9' 29
3078 14 0x00 30 (closing quote)
3079 ----------- ---- ----- ------- ---------------. */
3081 cpp_string dst_string;
3082 const enum cpp_ttype type = CPP_STRING;
3083 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3084 &dst_string, type);
3085 ASSERT_TRUE (result);
3086 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3087 (const char *)dst_string.text);
3088 free (const_cast <unsigned char *> (dst_string.text));
3090 /* Verify ranges of individual characters. This no longer includes the
3091 opening quote, but does include the closing quote.
3092 '01234'. */
3093 for (int i = 0; i <= 4; i++)
3094 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3095 /* U+2174. */
3096 for (int i = 5; i <= 7; i++)
3097 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3098 /* U+2175. */
3099 for (int i = 8; i <= 10; i++)
3100 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3101 /* '789' and nul terminator: byte 11 is at column 27, hence 16 + i. */
3102 for (int i = 11; i <= 14; i++)
3103 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
3105 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3108 /* Lex a string literal containing UCN 8 characters.
3109 Verify the substring location data after running cpp_interpret_string
3110 on it.
Each ten-column UCN 8 escape is expected to expand to three execution
bytes that all report the range of the whole escape. */
3112 static void
3113 test_lexer_string_locations_ucn8 (const line_table_case &case_)
3115 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3116 ....................000000000.111111.1111222222.2222333333333.344444
3117 ....................123456789.012345.6789012345.6789012345678.901234 */
3118 const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
3119 lexer_test test (case_, content, NULL);
3121 /* Verify that we get the expected token back, with the correct
3122 location information. */
3123 const cpp_token *tok = test.get_token ();
3124 ASSERT_EQ (tok->type, CPP_STRING);
3125 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3126 "\"01234\\U00002174\\U00002175789\"");
3128 /* Verify that cpp_interpret_string works.
3129 The UTF-8 encoding of the string is identical to that from
3130 the ucn4 testcase above; the only difference is the column
3131 locations. */
3132 cpp_string dst_string;
3133 const enum cpp_ttype type = CPP_STRING;
3134 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3135 &dst_string, type);
3136 ASSERT_TRUE (result);
3137 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3138 (const char *)dst_string.text);
3139 free (const_cast <unsigned char *> (dst_string.text));
3141 /* Verify ranges of individual characters. This no longer includes the
3142 opening quote, but does include the closing quote.
3143 '01234'. */
3144 for (int i = 0; i <= 4; i++)
3145 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3146 /* U+2174. */
3147 for (int i = 5; i <= 7; i++)
3148 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3149 /* U+2175. */
3150 for (int i = 8; i <= 10; i++)
3151 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3152 /* '789' at columns 35-37, i.e. byte i at column 24 + i. */
3153 for (int i = 11; i <= 13; i++)
3154 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3155 /* Closing quote/nul-terminator at column 38. */
3156 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3158 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
/* Fetch a big-endian 32-bit value and convert to host endianness.

   PTR_BE_VALUE points at four bytes that hold the value with the most
   significant byte first.  The bytes are read one at a time through an
   unsigned char pointer and recombined with shifts, so the result does
   not depend on the byte order of the host.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *buf = (const unsigned char *)ptr_be_value;
  /* Widen each byte to uint32_t before shifting so that the top byte
     cannot be shifted into the sign bit of a promoted int.  */
  return (((uint32_t) buf[0] << 24)
          | ((uint32_t) buf[1] << 16)
          | ((uint32_t) buf[2] << 8)
          | (uint32_t) buf[3]);
}
3173 /* Lex a wide string literal and verify that attempts to read substring
3174 location data from it fail gracefully.
Interpreting the string itself is still expected to succeed; only the
substring range lookup is expected to be rejected. */
3176 static void
3177 test_lexer_string_locations_wide_string (const line_table_case &case_)
3179 /* Digits 0-9.
3180 ....................000000000.11111111112.22222222233333
3181 ....................123456789.01234567890.12345678901234 */
3182 const char *content = " L\"0123456789\" /* non-str */\n";
3183 lexer_test test (case_, content, NULL);
3185 /* Verify that we get the expected token back, with the correct
3186 location information. */
3187 const cpp_token *tok = test.get_token ();
3188 ASSERT_EQ (tok->type, CPP_WSTRING);
3189 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3191 /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
3192 cpp_string dst_string;
3193 const enum cpp_ttype type = CPP_WSTRING;
3194 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3195 &dst_string, type);
3196 ASSERT_TRUE (result);
3197 /* The cpp_reader defaults to big-endian with
3198 CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3199 now be encoded as UTF-32BE. */
3200 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
/* Spot-check the first, a middle and the last digit, then the
terminator. */
3201 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3202 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3203 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3204 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3205 free (const_cast <unsigned char *> (dst_string.text));
3207 /* We don't yet support generating substring location information
3208 for L"" strings. */
3209 ASSERT_HAS_NO_SUBSTRING_RANGES
3210 (test, tok->src_loc, type,
3211 "execution character set != source character set");
/* Fetch a big-endian 16-bit value and convert to host endianness.

   PTR_BE_VALUE points at two bytes that hold the value with the most
   significant byte first.  The bytes are read one at a time through an
   unsigned char pointer, so the result does not depend on the byte
   order of the host.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *buf = (const unsigned char *)ptr_be_value;
  /* The shift and OR are evaluated in a promoted type; combine in
     unsigned and narrow back explicitly instead of relying on an
     implicit conversion in the return statement.  */
  return (uint16_t) (((unsigned) buf[0] << 8) | (unsigned) buf[1]);
}
3223 /* Lex a u"" string literal and verify that attempts to read substring
3224 location data from it fail gracefully.
Interpreting the string itself is still expected to succeed; only the
substring range lookup is expected to be rejected. */
3226 static void
3227 test_lexer_string_locations_string16 (const line_table_case &case_)
3229 /* Digits 0-9.
3230 ....................000000000.11111111112.22222222233333
3231 ....................123456789.01234567890.12345678901234 */
3232 const char *content = " u\"0123456789\" /* non-str */\n";
3233 lexer_test test (case_, content, NULL);
3235 /* Verify that we get the expected token back, with the correct
3236 location information. */
3237 const cpp_token *tok = test.get_token ();
3238 ASSERT_EQ (tok->type, CPP_STRING16);
3239 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3241 /* Verify that cpp_interpret_string works, using CPP_STRING16. */
3242 cpp_string dst_string;
3243 const enum cpp_ttype type = CPP_STRING16;
3244 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3245 &dst_string, type);
3246 ASSERT_TRUE (result);
3248 /* The cpp_reader defaults to big-endian, so dst_string should
3249 now be encoded as UTF-16BE. */
3250 const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
/* Spot-check the first, a middle and the last digit, then the
terminator. */
3251 ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3252 ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3253 ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3254 ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3255 free (const_cast <unsigned char *> (dst_string.text));
3257 /* We don't yet support generating substring location information
3258 for u"" strings. */
3259 ASSERT_HAS_NO_SUBSTRING_RANGES
3260 (test, tok->src_loc, type,
3261 "execution character set != source character set");
3264 /* Lex a U"" string literal and verify that attempts to read substring
3265 location data from it fail gracefully.
Interpreting the string itself is still expected to succeed; only the
substring range lookup is expected to be rejected. */
3267 static void
3268 test_lexer_string_locations_string32 (const line_table_case &case_)
3270 /* Digits 0-9.
3271 ....................000000000.11111111112.22222222233333
3272 ....................123456789.01234567890.12345678901234 */
3273 const char *content = " U\"0123456789\" /* non-str */\n";
3274 lexer_test test (case_, content, NULL);
3276 /* Verify that we get the expected token back, with the correct
3277 location information. */
3278 const cpp_token *tok = test.get_token ();
3279 ASSERT_EQ (tok->type, CPP_STRING32);
3280 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3282 /* Verify that cpp_interpret_string works, using CPP_STRING32. */
3283 cpp_string dst_string;
3284 const enum cpp_ttype type = CPP_STRING32;
3285 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3286 &dst_string, type);
3287 ASSERT_TRUE (result);
3289 /* The cpp_reader defaults to big-endian, so dst_string should
3290 now be encoded as UTF-32BE. */
3291 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
/* Spot-check the first, a middle and the last digit, then the
terminator. */
3292 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3293 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3294 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3295 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3296 free (const_cast <unsigned char *> (dst_string.text));
3298 /* We don't yet support generating substring location information
3299 for U"" strings. */
3300 ASSERT_HAS_NO_SUBSTRING_RANGES
3301 (test, tok->src_loc, type,
3302 "execution character set != source character set");
3305 /* Lex a u8-string literal.
3306 Verify the substring location data after running cpp_interpret_string
3307 on it. */
3309 static void
3310 test_lexer_string_locations_u8 (const line_table_case &case_)
3312 /* Digits 0-9.
3313 ....................000000000.11111111112.22222222233333
3314 ....................123456789.01234567890.12345678901234 */
3315 const char *content = " u8\"0123456789\" /* non-str */\n";
3316 lexer_test test (case_, content, NULL);
3318 /* Verify that we get the expected token back, with the correct
3319 location information. */
3320 const cpp_token *tok = test.get_token ();
3321 ASSERT_EQ (tok->type, CPP_UTF8STRING);
3322 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3324 /* Verify that cpp_interpret_string works. */
/* NOTE(review): the token is CPP_UTF8STRING, but the string is
interpreted and its ranges are checked as CPP_STRING. Confirm that
this is intentional. */
3325 cpp_string dst_string;
3326 const enum cpp_ttype type = CPP_STRING;
3327 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3328 &dst_string, type);
3329 ASSERT_TRUE (result);
3330 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3331 free (const_cast <unsigned char *> (dst_string.text));
3333 /* Verify ranges of individual characters. This no longer includes the
3334 opening quote, but does include the closing quote.
Unlike the sibling tests, the total number of ranges is not asserted
here. */
3335 for (int i = 0; i <= 10; i++)
3336 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3339 /* Lex a string literal containing UTF-8 source characters.
3340 Verify the substring location data after running cpp_interpret_string
3341 on it.
Every execution byte is expected to map to exactly one source column,
including each of the three bytes of a multi-byte character. */
3343 static void
3344 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3346 /* This string literal is written out to the source file as UTF-8,
3347 and is of the form "before mojibake after", where "mojibake"
3348 is written as the following four unicode code points:
3349 U+6587 CJK UNIFIED IDEOGRAPH-6587
3350 U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3351 U+5316 CJK UNIFIED IDEOGRAPH-5316
3352 U+3051 HIRAGANA LETTER KE.
3353 Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3354 "before" and "after" are 1 byte per unicode character.
3356 The numbering shown are "columns", which are *byte* numbers within
3357 the line, rather than unicode character numbers.
3359 .................... 000000000.1111111.
3360 .................... 123456789.0123456. */
3361 const char *content = (" \"before "
3362 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3363 UTF-8: 0xE6 0x96 0x87
3364 C octal escaped UTF-8: \346\226\207
3365 "column" numbers: 17-19. */
3366 "\346\226\207"
3368 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3369 UTF-8: 0xE5 0xAD 0x97
3370 C octal escaped UTF-8: \345\255\227
3371 "column" numbers: 20-22. */
3372 "\345\255\227"
3374 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3375 UTF-8: 0xE5 0x8C 0x96
3376 C octal escaped UTF-8: \345\214\226
3377 "column" numbers: 23-25. */
3378 "\345\214\226"
3380 /* U+3051 HIRAGANA LETTER KE
3381 UTF-8: 0xE3 0x81 0x91
3382 C octal escaped UTF-8: \343\201\221
3383 "column" numbers: 26-28. */
3384 "\343\201\221"
3386 /* column numbers 29 onwards
3387 2333333.33334444444444
3388 9012345.67890123456789. */
3389 " after\" /* non-str */\n");
3390 lexer_test test (case_, content, NULL);
3392 /* Verify that we get the expected token back, with the correct
3393 location information. */
3394 const cpp_token *tok = test.get_token ();
3395 ASSERT_EQ (tok->type, CPP_STRING);
3396 ASSERT_TOKEN_AS_TEXT_EQ
3397 (test.m_parser, tok,
3398 "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3400 /* Verify that cpp_interpret_string works. */
3401 cpp_string dst_string;
3402 const enum cpp_ttype type = CPP_STRING;
3403 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3404 &dst_string, type);
3405 ASSERT_TRUE (result);
3406 ASSERT_STREQ
3407 ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3408 (const char *)dst_string.text);
3409 free (const_cast <unsigned char *> (dst_string.text));
3411 /* Verify ranges of individual characters. This no longer includes the
3412 opening quote, but does include the closing quote.
3413 Assuming that both source and execution encodings are UTF-8, we have
3414 a run of 25 octets in each, plus the NUL terminator.
The 25 octets are the 7 of "before ", 4 * 3 for the multi-byte
characters, and the 6 of " after". */
3415 for (int i = 0; i < 25; i++)
3416 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3417 /* NUL-terminator should use the closing quote at column 35. */
3418 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3420 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3423 /* Test of string literal concatenation.
Two literals on consecutive lines are interpreted as one string; after
the concatenation has been recorded, ranges are looked up through the
location of the first literal. */
3425 static void
3426 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3428 /* Digits 0-9.
3429 .....................000000000.111111.11112222222222
3430 .....................123456789.012345.67890123456789. */
3431 const char *content = (" \"01234\" /* non-str */\n"
3432 " \"56789\" /* non-str */\n");
3433 lexer_test test (case_, content, NULL);
3435 location_t input_locs[2];
3437 /* Verify that we get the expected tokens back. */
3438 auto_vec <cpp_string> input_strings;
3439 const cpp_token *tok_a = test.get_token ();
3440 ASSERT_EQ (tok_a->type, CPP_STRING);
3441 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3442 input_strings.safe_push (tok_a->val.str);
3443 input_locs[0] = tok_a->src_loc;
3445 const cpp_token *tok_b = test.get_token ();
3446 ASSERT_EQ (tok_b->type, CPP_STRING);
3447 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3448 input_strings.safe_push (tok_b->val.str);
3449 input_locs[1] = tok_b->src_loc;
3451 /* Verify that cpp_interpret_string works. */
3452 cpp_string dst_string;
3453 const enum cpp_ttype type = CPP_STRING;
3454 bool result = cpp_interpret_string (test.m_parser,
3455 input_strings.address (), 2,
3456 &dst_string, type);
3457 ASSERT_TRUE (result);
3458 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3459 free (const_cast <unsigned char *> (dst_string.text));
3461 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3462 test.m_concats.record_string_concatenation (2, input_locs);
3464 location_t initial_loc = input_locs[0];
3466 /* "01234" on line 1. */
3467 for (int i = 0; i <= 4; i++)
3468 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3469 /* "56789" in line 2, plus its closing quote for the nul terminator.
Character 5 is expected at column 10 of line 2, hence 5 + i. */
3470 for (int i = 5; i <= 10; i++)
3471 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3473 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3476 /* Another test of string literal concatenation.
Five two-character literals, one per line, are interpreted as a single
string. */
3478 static void
3479 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3481 /* Digits 0-9.
3482 .....................000000000.111.11111112222222
3483 .....................123456789.012.34567890123456. */
3484 const char *content = (" \"01\" /* non-str */\n"
3485 " \"23\" /* non-str */\n"
3486 " \"45\" /* non-str */\n"
3487 " \"67\" /* non-str */\n"
3488 " \"89\" /* non-str */\n");
3489 lexer_test test (case_, content, NULL);
3491 auto_vec <cpp_string> input_strings;
3492 location_t input_locs[5];
3494 /* Verify that we get the expected tokens back. */
3495 for (int i = 0; i < 5; i++)
3497 const cpp_token *tok = test.get_token ();
3498 ASSERT_EQ (tok->type, CPP_STRING);
3499 input_strings.safe_push (tok->val.str);
3500 input_locs[i] = tok->src_loc;
3503 /* Verify that cpp_interpret_string works. */
3504 cpp_string dst_string;
3505 const enum cpp_ttype type = CPP_STRING;
3506 bool result = cpp_interpret_string (test.m_parser,
3507 input_strings.address (), 5,
3508 &dst_string, type);
3509 ASSERT_TRUE (result);
3510 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3511 free (const_cast <unsigned char *> (dst_string.text));
3513 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3514 test.m_concats.record_string_concatenation (5, input_locs);
3516 location_t initial_loc = input_locs[0];
3518 /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3519 detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3520 and expect get_source_range_for_substring to fail.
3521 However, for a string concatenation test, we can have a case
3522 where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3523 but subsequent strings can be after it.
3524 Attempting to detect this within assert_char_at_range
3525 would overcomplicate the logic for the common test cases, so
3526 we detect it here. */
3527 if (should_have_column_data_p (input_locs[0])
3528 && !should_have_column_data_p (input_locs[4]))
3530 /* Verify that get_source_range_for_substring gracefully rejects
3531 this case. */
3532 source_range actual_range;
3533 const char *err
3534 = get_source_range_for_char (test.m_parser, &test.m_concats,
3535 initial_loc, type, 0, &actual_range);
3536 ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3537 return;
/* Literal i is on line i + 1 and supplies characters 2 * i and
2 * i + 1, expected at columns 10 and 11 of that line. */
3540 for (int i = 0; i < 5; i++)
3541 for (int j = 0; j < 2; j++)
3542 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3543 i + 1, 10 + j, 10 + j);
3545 /* NUL-terminator should use the final closing quote at line 5 column 12. */
3546 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3548 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3551 /* Another test of string literal concatenation, this time combined with
3552 various kinds of escaped characters.
All four literals are on one line, so every range is expected on
line 1. */
3554 static void
3555 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3557 /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3558 digit 6 in ASCII as octal "\066", concatenating multiple strings. */
3559 const char *content
3560 /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3561 .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3562 = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
3563 lexer_test test (case_, content, NULL);
3565 auto_vec <cpp_string> input_strings;
3566 location_t input_locs[4];
3568 /* Verify that we get the expected tokens back. */
3569 for (int i = 0; i < 4; i++)
3571 const cpp_token *tok = test.get_token ();
3572 ASSERT_EQ (tok->type, CPP_STRING);
3573 input_strings.safe_push (tok->val.str);
3574 input_locs[i] = tok->src_loc;
3577 /* Verify that cpp_interpret_string works. */
3578 cpp_string dst_string;
3579 const enum cpp_ttype type = CPP_STRING;
3580 bool result = cpp_interpret_string (test.m_parser,
3581 input_strings.address (), 4,
3582 &dst_string, type);
3583 ASSERT_TRUE (result);
3584 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3585 free (const_cast <unsigned char *> (dst_string.text));
3587 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3588 test.m_concats.record_string_concatenation (4, input_locs);
3590 location_t initial_loc = input_locs[0];
/* "01234" from the first literal. */
3592 for (int i = 0; i <= 4; i++)
3593 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
/* The hex and the octal escape each yield one character whose range
covers the four columns of the escape. */
3594 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3595 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
/* "789" from the last literal, expected at columns 35-37. */
3596 for (int i = 7; i <= 9; i++)
3597 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3599 /* NUL-terminator should use the location of the final closing quote. */
3600 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3602 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3605 /* Test of string literal in a macro.
The ranges are expected to point into the line of the macro definition
(line 1), not at the line of the expansion. */
3607 static void
3608 test_lexer_string_locations_macro (const line_table_case &case_)
3610 /* Digits 0-9.
3611 .....................0000000001111111111.22222222223.
3612 .....................1234567890123456789.01234567890. */
3613 const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
3614 " MACRO");
3615 lexer_test test (case_, content, NULL);
3617 /* Verify that we get the expected tokens back: the string token is
expected between two CPP_PADDING tokens. */
3618 const cpp_token *tok = test.get_token ();
3619 ASSERT_EQ (tok->type, CPP_PADDING);
3621 tok = test.get_token ();
3622 ASSERT_EQ (tok->type, CPP_STRING);
3623 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3625 /* Verify ranges of individual characters. We ought to
3626 see columns within the macro definition. */
3627 for (int i = 0; i <= 10; i++)
3628 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3629 i, 1, 20 + i, 20 + i);
3631 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3633 tok = test.get_token ();
3634 ASSERT_EQ (tok->type, CPP_PADDING);
3637 /* Test of stringification of a macro argument.
The resulting string token has no literal of its own in the source, so
the substring range lookup is expected to fail rather than return
ranges. */
3639 static void
3640 test_lexer_string_locations_stringified_macro_argument
3641 (const line_table_case &case_)
3643 /* .....................000000000111111111122222222223.
3644 .....................123456789012345678901234567890. */
3645 const char *content = ("#define MACRO(X) #X /* non-str */\n"
3646 "MACRO(foo)\n");
3647 lexer_test test (case_, content, NULL);
3649 /* Verify that we get the expected token back, after a leading
CPP_PADDING token. */
3650 const cpp_token *tok = test.get_token ();
3651 ASSERT_EQ (tok->type, CPP_PADDING);
3653 tok = test.get_token ();
3654 ASSERT_EQ (tok->type, CPP_STRING);
3655 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3657 /* We don't support getting the location of a stringified macro
3658 argument. Verify that it fails gracefully. */
3659 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3660 "cpp_interpret_string_1 failed");
/* Two further CPP_PADDING tokens are expected after the string. */
3662 tok = test.get_token ();
3663 ASSERT_EQ (tok->type, CPP_PADDING);
3665 tok = test.get_token ();
3666 ASSERT_EQ (tok->type, CPP_PADDING);
3669 /* Ensure that we fail gracefully if something attempts to pass
3670 in a location that isn't a string literal token. Seen on this code:
3672 const char a[] = " %d ";
3673 __builtin_printf (a, 0.5);
3676 when c-format.cc erroneously used the indicated one-character
3677 location as the format string location, leading to a read past the
3678 end of a string buffer in cpp_interpret_string_1. */
3680 static void
3681 test_lexer_string_locations_non_string (const line_table_case &case_)
3683 /* .....................000000000111111111122222222223.
3684 .....................123456789012345678901234567890. */
3685 const char *content = (" a\n");
3686 lexer_test test (case_, content, NULL);
3688 /* Verify that we get the expected token back. */
3689 const cpp_token *tok = test.get_token ();
3690 ASSERT_EQ (tok->type, CPP_NAME);
3691 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3693 /* At this point, libcpp is attempting to interpret the name as a
3694 string literal, despite it not starting with a quote. We don't detect
3695 that, but we should at least fail gracefully. */
3696 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3697 "cpp_interpret_string_1 failed");
3700 /* Ensure that we can read substring information for a token which
3701 starts in one linemap and ends in another. Adapted from
3702 gcc.dg/cpp/pr69985.c. */
3704 static void
3705 test_lexer_string_locations_long_line (const line_table_case &case_)
3707 /* .....................000000.000111111111
3708 .....................123456.789012346789. */
3709 const char *content = ("/* A very long line, so that we start a new line map. */\n"
3710 " \"0123456789012345678901234567890123456789"
3711 "0123456789012345678901234567890123456789"
3712 "0123456789012345678901234567890123456789"
3713 "0123456789\"\n");
3715 lexer_test test (case_, content, NULL);
3717 /* Verify that we get the expected token back. */
3718 const cpp_token *tok = test.get_token ();
3719 ASSERT_EQ (tok->type, CPP_STRING);
/* Nothing further can be checked when column data is unavailable. */
3721 if (!should_have_column_data_p (line_table->highest_location))
3722 return;
3724 /* Verify ranges of individual characters: the 130 digits plus one
more range for the terminator, all on line 2 starting at column 7. */
3725 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3726 for (int i = 0; i < 131; i++)
3727 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3728 i, 2, 7 + i, 7 + i);
3731 /* Test of locations within a raw string that doesn't contain a newline.
The ranges are expected to start after the opening delimiter, at
column 7. */
3733 static void
3734 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3736 /* .....................00.0000000111111111122.
3737 .....................12.3456789012345678901. */
3738 const char *content = ("R\"foo(0123456789)foo\"\n");
3739 lexer_test test (case_, content, NULL);
3741 /* Verify that we get the expected token back. */
3742 const cpp_token *tok = test.get_token ();
3743 ASSERT_EQ (tok->type, CPP_STRING);
3745 /* Verify that cpp_interpret_string works. */
3746 cpp_string dst_string;
3747 const enum cpp_ttype type = CPP_STRING;
3748 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3749 &dst_string, type);
3750 ASSERT_TRUE (result);
3751 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3752 free (const_cast <unsigned char *> (dst_string.text));
/* Nothing further can be checked when column data is unavailable. */
3754 if (!should_have_column_data_p (line_table->highest_location))
3755 return;
3757 /* 0-9, plus the nul terminator. */
3758 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3759 for (int i = 0; i < 11; i++)
3760 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3761 i, 1, 7 + i, 7 + i);
3764 /* Test of locations within a raw string that contains a newline.
Interpreting the string is still expected to succeed; only the
substring range lookup is expected to be rejected. */
3766 static void
3767 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3769 /* .....................00.0000.
3770 .....................12.3456. */
3771 const char *content = ("R\"foo(\n"
3772 /* .....................00000.
3773 .....................12345. */
3774 "hello\n"
3775 "world\n"
3776 /* .....................00000.
3777 .....................12345. */
3778 ")foo\"\n");
3779 lexer_test test (case_, content, NULL);
3781 /* Verify that we get the expected token back. */
3782 const cpp_token *tok = test.get_token ();
3783 ASSERT_EQ (tok->type, CPP_STRING);
3785 /* Verify that cpp_interpret_string works. */
3786 cpp_string dst_string;
3787 const enum cpp_ttype type = CPP_STRING;
3788 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3789 &dst_string, type);
3790 ASSERT_TRUE (result);
3791 ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3792 free (const_cast <unsigned char *> (dst_string.text));
/* Nothing further can be checked when column data is unavailable. */
3794 if (!should_have_column_data_p (line_table->highest_location))
3795 return;
3797 /* Currently we don't support locations within raw strings that
3798 contain newlines. */
3799 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3800 "range endpoints are on different lines");
3803 /* Test of parsing an unterminated raw string.
The closing delimiter in the input differs in case from the opening
one, so the raw string never ends. The lexer is expected to return
CPP_EOF and to have reported exactly one diagnostic. */
3805 static void
3806 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3808 const char *content = "R\"ouch()ouCh\" /* etc */";
3810 lexer_diagnostic_sink diagnostics;
3811 lexer_test test (case_, content, &diagnostics);
/* NOTE(review): presumably this stops lexer_test from checking for a
trailing EOF token itself, since the test consumes it below; confirm
against lexer_test. */
3812 test.m_implicitly_expect_EOF = false;
3814 /* Attempt to parse the raw string. */
3815 const cpp_token *tok = test.get_token ();
3816 ASSERT_EQ (tok->type, CPP_EOF);
3818 ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3819 /* We expect the message "unterminated raw string"
3820 in the "cpplib" translation domain.
3821 It's not clear that dgettext is available on all supported hosts,
3822 so this assertion is commented-out for now.
3823 ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3824 diagnostics.m_diagnostics[0]);
3828 /* Test of lexing char constants. */
3830 static void
3831 test_lexer_char_constants (const line_table_case &case_)
3833 /* Various char constants.
3834 .....................0000000001111111111.22222222223.
3835 .....................1234567890123456789.01234567890. */
3836 const char *content = (" 'a'\n"
3837 " u'a'\n"
3838 " U'a'\n"
3839 " L'a'\n"
3840 " 'abc'\n");
3841 lexer_test test (case_, content, NULL);
3843 /* Verify that we get the expected tokens back. */
3844 /* 'a'. */
3845 const cpp_token *tok = test.get_token ();
3846 ASSERT_EQ (tok->type, CPP_CHAR);
3847 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3849 unsigned int chars_seen;
3850 int unsignedp;
3851 cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3852 &chars_seen, &unsignedp);
3853 ASSERT_EQ (cc, 'a');
3854 ASSERT_EQ (chars_seen, 1);
3856 /* u'a'. */
3857 tok = test.get_token ();
3858 ASSERT_EQ (tok->type, CPP_CHAR16);
3859 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3861 /* U'a'. */
3862 tok = test.get_token ();
3863 ASSERT_EQ (tok->type, CPP_CHAR32);
3864 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3866 /* L'a'. */
3867 tok = test.get_token ();
3868 ASSERT_EQ (tok->type, CPP_WCHAR);
3869 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3871 /* 'abc' (c-char-sequence). */
3872 tok = test.get_token ();
3873 ASSERT_EQ (tok->type, CPP_CHAR);
3874 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3876 /* A table of interesting location_t values, giving one axis of our test
3877 matrix. */
3879 static const location_t boundary_locations[] = {
3880 /* Zero means "don't override the default values for a new line_table". */
3883 /* An arbitrary non-zero value that isn't close to one of
3884 the boundary values below. */
3885 0x10000,
3887 /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES. */
3888 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3889 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3890 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3891 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3892 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3894 /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS. */
3895 LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3896 LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3897 LINE_MAP_MAX_LOCATION_WITH_COLS,
3898 LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3899 LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3902 /* Run TESTCASE multiple times, once for each case in our test matrix. */
3904 void
3905 for_each_line_table_case (void (*testcase) (const line_table_case &))
3907 /* As noted above in the description of struct line_table_case,
3908 we want to explore a test matrix of interesting line_table
3909 situations, running various selftests for each case within the
3910 matrix. */
3912 /* Run all tests with:
3913 (a) line_table->default_range_bits == 0, and
3914 (b) line_table->default_range_bits == 5. */
3915 int num_cases_tested = 0;
3916 for (int default_range_bits = 0; default_range_bits <= 5;
3917 default_range_bits += 5)
3919 /* ...and use each of the "interesting" location values as
3920 the starting location within line_table. */
3921 const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3922 for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3924 line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3926 testcase (c);
3928 num_cases_tested++;
3932 /* Verify that we fully covered the test matrix. */
3933 ASSERT_EQ (num_cases_tested, 2 * 12);
3936 /* Verify that when presented with a consecutive pair of locations with
3937 a very large line offset, we don't attempt to consolidate them into
3938 a single ordinary linemap where the line offsets within the line map
3939 would lead to overflow (PR lto/88147). */
3941 static void
3942 test_line_offset_overflow ()
3944 line_table_test ltt (line_table_case (5, 0));
3946 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3947 linemap_line_start (line_table, 1, 100);
3948 location_t loc_a = linemap_line_start (line_table, 2578, 255);
3949 assert_loceq ("foo.c", 2578, 0, loc_a);
3951 const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3952 ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3953 ASSERT_EQ (ordmap_a->m_range_bits, 5);
3955 location_t loc_b = linemap_line_start (line_table, 404198, 512);
3956 assert_loceq ("foo.c", 404198, 0, loc_b);
3958 /* We should have started a new linemap, rather than attempting to store
3959 a very large line offset. */
3960 const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3961 ASSERT_NE (ordmap_a, ordmap_b);
3964 void test_cpp_utf8 ()
3966 const int def_tabstop = 8;
3967 cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3969 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
3971 int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3972 ASSERT_EQ (8, w_bad);
3973 int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3974 ASSERT_EQ (5, w_ctrl);
3977 /* Verify that wcwidth of valid UTF-8 is as expected. */
3979 const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3980 ASSERT_EQ (1, w_pi);
3981 const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3982 ASSERT_EQ (2, w_emoji);
3983 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3984 policy);
3985 ASSERT_EQ (1, w_umlaut_precomposed);
3986 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3987 policy);
3988 ASSERT_EQ (1, w_umlaut_combining);
3989 const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3990 ASSERT_EQ (2, w_han);
3991 const int w_ascii = cpp_display_width ("GCC", 3, policy);
3992 ASSERT_EQ (3, w_ascii);
3993 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3994 "\x9f! \xe4\xb8\xba y\xcc\x88",
3995 24, policy);
3996 ASSERT_EQ (18, w_mixed);
3999 /* Verify that display width properly expands tabs. */
4001 const char *tstr = "\tabc\td";
4002 ASSERT_EQ (6, cpp_display_width (tstr, 6,
4003 cpp_char_column_policy (1, cpp_wcwidth)));
4004 ASSERT_EQ (10, cpp_display_width (tstr, 6,
4005 cpp_char_column_policy (3, cpp_wcwidth)));
4006 ASSERT_EQ (17, cpp_display_width (tstr, 6,
4007 cpp_char_column_policy (8, cpp_wcwidth)));
4008 ASSERT_EQ (1,
4009 cpp_display_column_to_byte_column
4010 (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
4013 /* Verify that cpp_byte_column_to_display_column can go past the end,
4014 and similar edge cases. */
4016 const char *str
4017 /* Display columns.
4018 111111112345 */
4019 = "\xcf\x80 abc";
4020 /* 111122223456
4021 Byte columns. */
4023 ASSERT_EQ (5, cpp_display_width (str, 6, policy));
4024 ASSERT_EQ (105,
4025 cpp_byte_column_to_display_column (str, 6, 106, policy));
4026 ASSERT_EQ (10000,
4027 cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4028 ASSERT_EQ (0,
4029 cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4032 /* Verify that cpp_display_column_to_byte_column can go past the end,
4033 and similar edge cases, and check invertibility. */
4035 const char *str
4036 /* Display columns.
4037 000000000000000000000000000000000000011
4038 111111112222222234444444455555555678901 */
4039 = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4040 /* 000000000000000000000000000000000111111
4041 111122223333444456666777788889999012345
4042 Byte columns. */
4043 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4044 ASSERT_EQ (15,
4045 cpp_display_column_to_byte_column (str, 15, 11, policy));
4046 ASSERT_EQ (115,
4047 cpp_display_column_to_byte_column (str, 15, 111, policy));
4048 ASSERT_EQ (10000,
4049 cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4050 ASSERT_EQ (0,
4051 cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4053 /* Verify that we do not interrupt a UTF-8 sequence. */
4054 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4056 for (int byte_col = 1; byte_col <= 15; ++byte_col)
4058 const int disp_col
4059 = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
4060 const int byte_col2
4061 = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
4063 /* If we ask for the display column in the middle of a UTF-8
4064 sequence, it will return the length of the partial sequence,
4065 matching the behavior of GCC before display column support.
4066 Otherwise check the round trip was successful. */
4067 if (byte_col < 4)
4068 ASSERT_EQ (byte_col, disp_col);
4069 else if (byte_col >= 6 && byte_col < 9)
4070 ASSERT_EQ (3 + (byte_col - 5), disp_col);
4071 else
4072 ASSERT_EQ (byte_col2, byte_col);
4077 static bool
4078 check_cpp_valid_utf8_p (const char *str)
4080 return cpp_valid_utf8_p (str, strlen (str));
4083 /* Check that cpp_valid_utf8_p works as expected. */
4085 static void
4086 test_cpp_valid_utf8_p ()
4088 ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
4090 /* 2-byte char (pi). */
4091 ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
4093 /* 3-byte chars (the Japanese word "mojibake"). */
4094 ASSERT_TRUE (check_cpp_valid_utf8_p
4096 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
4097 UTF-8: 0xE6 0x96 0x87
4098 C octal escaped UTF-8: \346\226\207. */
4099 "\346\226\207"
4100 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
4101 UTF-8: 0xE5 0xAD 0x97
4102 C octal escaped UTF-8: \345\255\227. */
4103 "\345\255\227"
4104 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
4105 UTF-8: 0xE5 0x8C 0x96
4106 C octal escaped UTF-8: \345\214\226. */
4107 "\345\214\226"
4108 /* U+3051 HIRAGANA LETTER KE
4109 UTF-8: 0xE3 0x81 0x91
4110 C octal escaped UTF-8: \343\201\221. */
4111 "\343\201\221"));
4113 /* 4-byte char: an emoji. */
4114 ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
4116 /* Control codes, including the NUL byte. */
4117 ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
4119 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
4121 /* Unexpected continuation bytes. */
4122 for (unsigned char continuation_byte = 0x80;
4123 continuation_byte <= 0xbf;
4124 continuation_byte++)
4125 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
4127 /* "Lonely start characters" for 2-byte sequences. */
4129 unsigned char buf[2];
4130 buf[1] = ' ';
4131 for (buf[0] = 0xc0;
4132 buf[0] <= 0xdf;
4133 buf[0]++)
4134 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4137 /* "Lonely start characters" for 3-byte sequences. */
4139 unsigned char buf[2];
4140 buf[1] = ' ';
4141 for (buf[0] = 0xe0;
4142 buf[0] <= 0xef;
4143 buf[0]++)
4144 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4147 /* "Lonely start characters" for 4-byte sequences. */
4149 unsigned char buf[2];
4150 buf[1] = ' ';
4151 for (buf[0] = 0xf0;
4152 buf[0] <= 0xf4;
4153 buf[0]++)
4154 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4157 /* Invalid start characters (formerly valid for 5-byte and 6-byte
4158 sequences). */
4160 unsigned char buf[2];
4161 buf[1] = ' ';
4162 for (buf[0] = 0xf5;
4163 buf[0] <= 0xfd;
4164 buf[0]++)
4165 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4168 /* Impossible bytes. */
4169 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
4170 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
4171 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
4172 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
4175 /* Run all of the selftests within this file. */
4177 void
4178 input_cc_tests ()
4180 test_linenum_comparisons ();
4181 test_should_have_column_data_p ();
4182 test_unknown_location ();
4183 test_builtins ();
4184 for_each_line_table_case (test_make_location_nonpure_range_endpoints);
4186 for_each_line_table_case (test_accessing_ordinary_linemaps);
4187 for_each_line_table_case (test_lexer);
4188 for_each_line_table_case (test_lexer_string_locations_simple);
4189 for_each_line_table_case (test_lexer_string_locations_ebcdic);
4190 for_each_line_table_case (test_lexer_string_locations_hex);
4191 for_each_line_table_case (test_lexer_string_locations_oct);
4192 for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
4193 for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
4194 for_each_line_table_case (test_lexer_string_locations_ucn4);
4195 for_each_line_table_case (test_lexer_string_locations_ucn8);
4196 for_each_line_table_case (test_lexer_string_locations_wide_string);
4197 for_each_line_table_case (test_lexer_string_locations_string16);
4198 for_each_line_table_case (test_lexer_string_locations_string32);
4199 for_each_line_table_case (test_lexer_string_locations_u8);
4200 for_each_line_table_case (test_lexer_string_locations_utf8_source);
4201 for_each_line_table_case (test_lexer_string_locations_concatenation_1);
4202 for_each_line_table_case (test_lexer_string_locations_concatenation_2);
4203 for_each_line_table_case (test_lexer_string_locations_concatenation_3);
4204 for_each_line_table_case (test_lexer_string_locations_macro);
4205 for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
4206 for_each_line_table_case (test_lexer_string_locations_non_string);
4207 for_each_line_table_case (test_lexer_string_locations_long_line);
4208 for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
4209 for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
4210 for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
4211 for_each_line_table_case (test_lexer_char_constants);
4213 test_reading_source_line ();
4215 test_line_offset_overflow ();
4217 test_cpp_utf8 ();
4218 test_cpp_valid_utf8_p ();
4221 } // namespace selftest
4223 #endif /* CHECKING_P */