c++: Implement C++26 P2573R2 - = delete("should have a reason"); [PR114458]
[official-gcc.git] / gcc / input.cc
blob9f5228d255c0aecb63058c5f50aa22507f481d10
1 /* Data and functions related to line maps and input files.
2 Copyright (C) 2004-2024 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "selftest.h"
26 #include "cpplib.h"
/* Guarantee that HAVE_ICONV is defined: when nothing included above has
   provided a value, fall back to 0.  */
#if !defined (HAVE_ICONV)
# define HAVE_ICONV 0
#endif
32 const char *
33 special_fname_builtin ()
35 return _("<built-in>");
/* Input charset configuration.  */

/* Charset callback installed when the client supplies none: it ignores
   its argument and always answers nullptr.  */

static const char *
default_charset_callback (const char *)
{
  return nullptr;
}
44 void
45 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
46 bool should_skip_bom)
48 in_context.ccb = (ccb ? ccb : default_charset_callback);
49 in_context.should_skip_bom = should_skip_bom;
52 /* This is a cache used by get_next_line to store the content of a
53 file to be searched for file lines. */
54 class file_cache_slot
56 public:
57 file_cache_slot ();
58 ~file_cache_slot ();
60 bool read_line_num (size_t line_num,
61 char ** line, ssize_t *line_len);
63 /* Accessors. */
64 const char *get_file_path () const { return m_file_path; }
65 unsigned get_use_count () const { return m_use_count; }
66 bool missing_trailing_newline_p () const
68 return m_missing_trailing_newline;
70 char_span get_full_file_content ();
72 void inc_use_count () { m_use_count++; }
74 bool create (const file_cache::input_context &in_context,
75 const char *file_path, FILE *fp, unsigned highest_use_count);
76 void evict ();
78 private:
79 /* These are information used to store a line boundary. */
80 class line_info
82 public:
83 /* The line number. It starts from 1. */
84 size_t line_num;
86 /* The position (byte count) of the beginning of the line,
87 relative to the file data pointer. This starts at zero. */
88 size_t start_pos;
90 /* The position (byte count) of the last byte of the line. This
91 normally points to the '\n' character, or to one byte after the
92 last byte of the file, if the file doesn't contain a '\n'
93 character. */
94 size_t end_pos;
96 line_info (size_t l, size_t s, size_t e)
97 : line_num (l), start_pos (s), end_pos (e)
100 line_info ()
101 :line_num (0), start_pos (0), end_pos (0)
105 bool needs_read_p () const;
106 bool needs_grow_p () const;
107 void maybe_grow ();
108 bool read_data ();
109 bool maybe_read_data ();
110 bool get_next_line (char **line, ssize_t *line_len);
111 bool read_next_line (char ** line, ssize_t *line_len);
112 bool goto_next_line ();
114 static const size_t buffer_size = 4 * 1024;
115 static const size_t line_record_size = 100;
117 /* The number of time this file has been accessed. This is used
118 to designate which file cache to evict from the cache
119 array. */
120 unsigned m_use_count;
122 /* The file_path is the key for identifying a particular file in
123 the cache.
124 For libcpp-using code, the underlying buffer for this field is
125 owned by the corresponding _cpp_file within the cpp_reader. */
126 const char *m_file_path;
128 FILE *m_fp;
130 /* This points to the content of the file that we've read so
131 far. */
132 char *m_data;
134 /* The allocated buffer to be freed may start a little earlier than DATA,
135 e.g. if a UTF8 BOM was skipped at the beginning. */
136 int m_alloc_offset;
138 /* The size of the DATA array above.*/
139 size_t m_size;
141 /* The number of bytes read from the underlying file so far. This
142 must be less (or equal) than SIZE above. */
143 size_t m_nb_read;
145 /* The index of the beginning of the current line. */
146 size_t m_line_start_idx;
148 /* The number of the previous line read. This starts at 1. Zero
149 means we've read no line so far. */
150 size_t m_line_num;
152 /* This is the total number of lines of the current file. At the
153 moment, we try to get this information from the line map
154 subsystem. Note that this is just a hint. When using the C++
155 front-end, this hint is correct because the input file is then
156 completely tokenized before parsing starts; so the line map knows
157 the number of lines before compilation really starts. For e.g,
158 the C front-end, it can happen that we start emitting diagnostics
159 before the line map has seen the end of the file. */
160 size_t m_total_lines;
162 /* Could this file be missing a trailing newline on its final line?
163 Initially true (to cope with empty files), set to true/false
164 as each line is read. */
165 bool m_missing_trailing_newline;
167 /* This is a record of the beginning and end of the lines we've seen
168 while reading the file. This is useful to avoid walking the data
169 from the beginning when we are asked to read a line that is
170 before LINE_START_IDX above. Note that the maximum size of this
171 record is line_record_size, so that the memory consumption
172 doesn't explode. We thus scale total_lines down to
173 line_record_size. */
174 vec<line_info, va_heap> m_line_record;
176 void offset_buffer (int offset)
178 gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
179 : (size_t) offset <= m_size);
180 gcc_assert (m_data);
181 m_alloc_offset += offset;
182 m_data += offset;
183 m_size -= offset;
188 /* Current position in real source file. */
190 location_t input_location = UNKNOWN_LOCATION;
192 class line_maps *line_table;
194 /* A stashed copy of "line_table" for use by selftest::line_table_test.
195 This needs to be a global so that it can be a GC root, and thus
196 prevent the stashed copy from being garbage-collected if the GC runs
197 during a line_table_test. */
199 class line_maps *saved_line_table;
201 /* Expand the source location LOC into a human readable location. If
202 LOC resolves to a builtin location, the file name of the readable
203 location is set to the string "<built-in>". If EXPANSION_POINT_P is
204 TRUE and LOC is virtual, then it is resolved to the expansion
205 point of the involved macro. Otherwise, it is resolved to the
206 spelling location of the token.
208 When resolving to the spelling location of the token, if the
209 resulting location is for a built-in location (that is, it has no
210 associated line/column) in the context of a macro expansion, the
211 returned location is the first one (while unwinding the macro
212 location towards its expansion point) that is in real source
213 code.
215 ASPECT controls which part of the location to use. */
217 static expanded_location
218 expand_location_1 (const line_maps *set,
219 location_t loc,
220 bool expansion_point_p,
221 enum location_aspect aspect)
223 expanded_location xloc;
224 const line_map_ordinary *map;
225 enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
226 tree block = NULL;
228 if (IS_ADHOC_LOC (loc))
230 block = LOCATION_BLOCK (loc);
231 loc = LOCATION_LOCUS (loc);
234 memset (&xloc, 0, sizeof (xloc));
236 if (loc >= RESERVED_LOCATION_COUNT)
238 if (!expansion_point_p)
240 /* We want to resolve LOC to its spelling location.
242 But if that spelling location is a reserved location that
243 appears in the context of a macro expansion (like for a
244 location for a built-in token), let's consider the first
245 location (toward the expansion point) that is not reserved;
246 that is, the first location that is in real source code. */
247 loc = linemap_unwind_to_first_non_reserved_loc (set,
248 loc, NULL);
249 lrk = LRK_SPELLING_LOCATION;
251 loc = linemap_resolve_location (set, loc, lrk, &map);
253 /* loc is now either in an ordinary map, or is a reserved location.
254 If it is a compound location, the caret is in a spelling location,
255 but the start/finish might still be a virtual location.
256 Depending of what the caller asked for, we may need to recurse
257 one level in order to resolve any virtual locations in the
258 end-points. */
259 switch (aspect)
261 default:
262 gcc_unreachable ();
263 /* Fall through. */
264 case LOCATION_ASPECT_CARET:
265 break;
266 case LOCATION_ASPECT_START:
268 location_t start = get_start (loc);
269 if (start != loc)
270 return expand_location_1 (set, start, expansion_point_p, aspect);
272 break;
273 case LOCATION_ASPECT_FINISH:
275 location_t finish = get_finish (loc);
276 if (finish != loc)
277 return expand_location_1 (set, finish, expansion_point_p, aspect);
279 break;
281 xloc = linemap_expand_location (set, map, loc);
284 xloc.data = block;
285 if (loc <= BUILTINS_LOCATION)
286 xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
288 return xloc;
291 /* Return the total lines number that have been read so far by the
292 line map (in the preprocessor) so far. For languages like C++ that
293 entirely preprocess the input file before starting to parse, this
294 equals the actual number of lines of the file. */
296 static size_t
297 total_lines_num (const char *file_path)
299 size_t r = 0;
300 location_t l = 0;
301 if (linemap_get_file_highest_location (line_table, file_path, &l))
303 gcc_assert (l >= RESERVED_LOCATION_COUNT);
304 expanded_location xloc = expand_location (l);
305 r = xloc.line;
307 return r;
310 /* Lookup the cache used for the content of a given file accessed by
311 caret diagnostic. Return the found cached file, or NULL if no
312 cached file was found. */
314 file_cache_slot *
315 file_cache::lookup_file (const char *file_path)
317 gcc_assert (file_path);
319 /* This will contain the found cached file. */
320 file_cache_slot *r = NULL;
321 for (unsigned i = 0; i < num_file_slots; ++i)
323 file_cache_slot *c = &m_file_slots[i];
324 if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
326 c->inc_use_count ();
327 r = c;
331 if (r)
332 r->inc_use_count ();
334 return r;
337 /* Purge any mention of FILENAME from the cache of files used for
338 printing source code. For use in selftests when working
339 with tempfiles. */
341 void
342 file_cache::forcibly_evict_file (const char *file_path)
344 gcc_assert (file_path);
346 file_cache_slot *r = lookup_file (file_path);
347 if (!r)
348 /* Not found. */
349 return;
351 r->evict ();
354 /* Determine if FILE_PATH missing a trailing newline on its final line.
355 Only valid to call once all of the file has been loaded, by
356 requesting a line number beyond the end of the file. */
358 bool
359 file_cache::missing_trailing_newline_p (const char *file_path)
361 gcc_assert (file_path);
363 file_cache_slot *r = lookup_or_add_file (file_path);
364 return r->missing_trailing_newline_p ();
367 void
368 file_cache_slot::evict ()
370 m_file_path = NULL;
371 if (m_fp)
372 fclose (m_fp);
373 m_fp = NULL;
374 m_nb_read = 0;
375 m_line_start_idx = 0;
376 m_line_num = 0;
377 m_line_record.truncate (0);
378 m_use_count = 0;
379 m_total_lines = 0;
380 m_missing_trailing_newline = true;
383 /* Return the file cache that has been less used, recently, or the
384 first empty one. If HIGHEST_USE_COUNT is non-null,
385 *HIGHEST_USE_COUNT is set to the highest use count of the entries
386 in the cache table. */
388 file_cache_slot*
389 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
391 file_cache_slot *to_evict = &m_file_slots[0];
392 unsigned huc = to_evict->get_use_count ();
393 for (unsigned i = 1; i < num_file_slots; ++i)
395 file_cache_slot *c = &m_file_slots[i];
396 bool c_is_empty = (c->get_file_path () == NULL);
398 if (c->get_use_count () < to_evict->get_use_count ()
399 || (to_evict->get_file_path () && c_is_empty))
400 /* We evict C because it's either an entry with a lower use
401 count or one that is empty. */
402 to_evict = c;
404 if (huc < c->get_use_count ())
405 huc = c->get_use_count ();
407 if (c_is_empty)
408 /* We've reached the end of the cache; subsequent elements are
409 all empty. */
410 break;
413 if (highest_use_count)
414 *highest_use_count = huc;
416 return to_evict;
419 /* Create the cache used for the content of a given file to be
420 accessed by caret diagnostic. This cache is added to an array of
421 cache and can be retrieved by lookup_file_in_cache_tab. This
422 function returns the created cache. Note that only the last
423 num_file_slots files are cached.
425 This can return nullptr if the FILE_PATH can't be opened for
426 reading, or if the content can't be converted to the input_charset. */
428 file_cache_slot*
429 file_cache::add_file (const char *file_path)
432 FILE *fp = fopen (file_path, "r");
433 if (fp == NULL)
434 return NULL;
436 unsigned highest_use_count = 0;
437 file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
438 if (!r->create (in_context, file_path, fp, highest_use_count))
439 return NULL;
440 return r;
443 /* Get a borrowed char_span to the full content of this file
444 as decoded according to the input charset, encoded as UTF-8. */
446 char_span
447 file_cache_slot::get_full_file_content ()
449 char *line;
450 ssize_t line_len;
451 while (get_next_line (&line, &line_len))
454 return char_span (m_data, m_nb_read);
457 /* Populate this slot for use on FILE_PATH and FP, dropping any
458 existing cached content within it. */
460 bool
461 file_cache_slot::create (const file_cache::input_context &in_context,
462 const char *file_path, FILE *fp,
463 unsigned highest_use_count)
465 m_file_path = file_path;
466 if (m_fp)
467 fclose (m_fp);
468 m_fp = fp;
469 if (m_alloc_offset)
470 offset_buffer (-m_alloc_offset);
471 m_nb_read = 0;
472 m_line_start_idx = 0;
473 m_line_num = 0;
474 m_line_record.truncate (0);
475 /* Ensure that this cache entry doesn't get evicted next time
476 add_file_to_cache_tab is called. */
477 m_use_count = ++highest_use_count;
478 m_total_lines = total_lines_num (file_path);
479 m_missing_trailing_newline = true;
482 /* Check the input configuration to determine if we need to do any
483 transformations, such as charset conversion or BOM skipping. */
484 if (const char *input_charset = in_context.ccb (file_path))
486 /* Need a full-blown conversion of the input charset. */
487 fclose (m_fp);
488 m_fp = NULL;
489 const cpp_converted_source cs
490 = cpp_get_converted_source (file_path, input_charset);
491 if (!cs.data)
492 return false;
493 if (m_data)
494 XDELETEVEC (m_data);
495 m_data = cs.data;
496 m_nb_read = m_size = cs.len;
497 m_alloc_offset = cs.data - cs.to_free;
499 else if (in_context.should_skip_bom)
501 if (read_data ())
503 const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
504 offset_buffer (offset);
505 m_nb_read -= offset;
509 return true;
512 /* file_cache's ctor. */
514 file_cache::file_cache ()
515 : m_file_slots (new file_cache_slot[num_file_slots])
517 initialize_input_context (nullptr, false);
520 /* file_cache's dtor. */
522 file_cache::~file_cache ()
524 delete[] m_file_slots;
527 /* Lookup the cache used for the content of a given file accessed by
528 caret diagnostic. If no cached file was found, create a new cache
529 for this file, add it to the array of cached file and return
532 This can return nullptr on a cache miss if FILE_PATH can't be opened for
533 reading, or if the content can't be converted to the input_charset. */
535 file_cache_slot*
536 file_cache::lookup_or_add_file (const char *file_path)
538 file_cache_slot *r = lookup_file (file_path);
539 if (r == NULL)
540 r = add_file (file_path);
541 return r;
544 /* Default constructor for a cache of file used by caret
545 diagnostic. */
547 file_cache_slot::file_cache_slot ()
548 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
549 m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
550 m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
552 m_line_record.create (0);
555 /* Destructor for a cache of file used by caret diagnostic. */
557 file_cache_slot::~file_cache_slot ()
559 if (m_fp)
561 fclose (m_fp);
562 m_fp = NULL;
564 if (m_data)
566 offset_buffer (-m_alloc_offset);
567 XDELETEVEC (m_data);
568 m_data = 0;
570 m_line_record.release ();
573 /* Returns TRUE iff the cache would need to be filled with data coming
574 from the file. That is, either the cache is empty or full or the
575 current line is empty. Note that if the cache is full, it would
576 need to be extended and filled again. */
578 bool
579 file_cache_slot::needs_read_p () const
581 return m_fp && (m_nb_read == 0
582 || m_nb_read == m_size
583 || (m_line_start_idx >= m_nb_read - 1));
586 /* Return TRUE iff the cache is full and thus needs to be
587 extended. */
589 bool
590 file_cache_slot::needs_grow_p () const
592 return m_nb_read == m_size;
595 /* Grow the cache if it needs to be extended. */
597 void
598 file_cache_slot::maybe_grow ()
600 if (!needs_grow_p ())
601 return;
603 if (!m_data)
605 gcc_assert (m_size == 0 && m_alloc_offset == 0);
606 m_size = buffer_size;
607 m_data = XNEWVEC (char, m_size);
609 else
611 const int offset = m_alloc_offset;
612 offset_buffer (-offset);
613 m_size *= 2;
614 m_data = XRESIZEVEC (char, m_data, m_size);
615 offset_buffer (offset);
619 /* Read more data into the cache. Extends the cache if need be.
620 Returns TRUE iff new data could be read. */
622 bool
623 file_cache_slot::read_data ()
625 if (feof (m_fp) || ferror (m_fp))
626 return false;
628 maybe_grow ();
630 char * from = m_data + m_nb_read;
631 size_t to_read = m_size - m_nb_read;
632 size_t nb_read = fread (from, 1, to_read, m_fp);
634 if (ferror (m_fp))
635 return false;
637 m_nb_read += nb_read;
638 return !!nb_read;
641 /* Read new data iff the cache needs to be filled with more data
642 coming from the file FP. Return TRUE iff the cache was filled with
643 mode data. */
645 bool
646 file_cache_slot::maybe_read_data ()
648 if (!needs_read_p ())
649 return false;
650 return read_data ();
/* Helper for file_cache_slot::get_next_line (): locate the end of the
   line starting at S within the LEN bytes available.  The result follows
   the memchr convention: a pointer to the last byte of the terminator,
   or nullptr when no terminator was found.  Line endings are recognised
   the way libcpp does it: any of \n, \r\n, or \r ends a line.  */

static char *
find_end_of_line (char *s, size_t len)
{
  char *const limit = s + len;
  for (char *p = s; p != limit; ++p)
    switch (*p)
      {
      case '\n':
        return p;

      case '\r':
        /* A \r in the very last position is not reported: we cannot tell
           whether this is the end of the file or merely the end of what
           has been read so far, and we must not split what may really be
           a \r\n pair.  A file ending in \r is dealt with by the
           caller.  */
        if (p + 1 == limit)
          return nullptr;
        return p[1] == '\n' ? p + 1 : p;

      default:
        break;
      }

  return nullptr;
}
684 /* Read a new line from file FP, using C as a cache for the data
685 coming from the file. Upon successful completion, *LINE is set to
686 the beginning of the line found. *LINE points directly in the
687 line cache and is only valid until the next call of get_next_line.
688 *LINE_LEN is set to the length of the line. Note that the line
689 does not contain any terminal delimiter. This function returns
690 true if some data was read or process from the cache, false
691 otherwise. Note that subsequent calls to get_next_line might
692 make the content of *LINE invalid. */
694 bool
695 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
697 /* Fill the cache with data to process. */
698 maybe_read_data ();
700 size_t remaining_size = m_nb_read - m_line_start_idx;
701 if (remaining_size == 0)
702 /* There is no more data to process. */
703 return false;
705 char *line_start = m_data + m_line_start_idx;
707 char *next_line_start = NULL;
708 size_t len = 0;
709 char *line_end = find_end_of_line (line_start, remaining_size);
710 if (line_end == NULL)
712 /* We haven't found an end-of-line delimiter in the cache.
713 Fill the cache with more data from the file and look again. */
714 while (maybe_read_data ())
716 line_start = m_data + m_line_start_idx;
717 remaining_size = m_nb_read - m_line_start_idx;
718 line_end = find_end_of_line (line_start, remaining_size);
719 if (line_end != NULL)
721 next_line_start = line_end + 1;
722 break;
725 if (line_end == NULL)
727 /* We've loaded all the file into the cache and still no
728 terminator. Let's say the line ends up at one byte past the
729 end of the file. This is to stay consistent with the case
730 of when the line ends up with a terminator and line_end points to
731 that. That consistency is useful below in the len calculation.
733 If the file ends in a \r, we didn't identify it as a line
734 terminator above, so do that now instead. */
735 line_end = m_data + m_nb_read;
736 if (m_nb_read && line_end[-1] == '\r')
738 --line_end;
739 m_missing_trailing_newline = false;
741 else
742 m_missing_trailing_newline = true;
744 else
745 m_missing_trailing_newline = false;
747 else
749 next_line_start = line_end + 1;
750 m_missing_trailing_newline = false;
753 if (m_fp && ferror (m_fp))
754 return false;
756 /* At this point, we've found the end of the of line. It either points to
757 the line terminator or to one byte after the last byte of the file. */
758 gcc_assert (line_end != NULL);
760 len = line_end - line_start;
762 if (m_line_start_idx < m_nb_read)
763 *line = line_start;
765 ++m_line_num;
767 /* Before we update our line record, make sure the hint about the
768 total number of lines of the file is correct. If it's not, then
769 we give up recording line boundaries from now on. */
770 bool update_line_record = true;
771 if (m_line_num > m_total_lines)
772 update_line_record = false;
774 /* Now update our line record so that re-reading lines from the
775 before m_line_start_idx is faster. */
776 if (update_line_record
777 && m_line_record.length () < line_record_size)
779 /* If the file lines fits in the line record, we just record all
780 its lines ...*/
781 if (m_total_lines <= line_record_size
782 && m_line_num > m_line_record.length ())
783 m_line_record.safe_push
784 (file_cache_slot::line_info (m_line_num,
785 m_line_start_idx,
786 line_end - m_data));
787 else if (m_total_lines > line_record_size)
789 /* ... otherwise, we just scale total_lines down to
790 (line_record_size lines. */
791 size_t n = (m_line_num * line_record_size) / m_total_lines;
792 if (m_line_record.length () == 0
793 || n >= m_line_record.length ())
794 m_line_record.safe_push
795 (file_cache_slot::line_info (m_line_num,
796 m_line_start_idx,
797 line_end - m_data));
801 /* Update m_line_start_idx so that it points to the next line to be
802 read. */
803 if (next_line_start)
804 m_line_start_idx = next_line_start - m_data;
805 else
806 /* We didn't find any terminal '\n'. Let's consider that the end
807 of line is the end of the data in the cache. The next
808 invocation of get_next_line will either read more data from the
809 underlying file or return false early because we've reached the
810 end of the file. */
811 m_line_start_idx = m_nb_read;
813 *line_len = len;
815 return true;
818 /* Consume the next bytes coming from the cache (or from its
819 underlying file if there are remaining unread bytes in the file)
820 until we reach the next end-of-line (or end-of-file). There is no
821 copying from the cache involved. Return TRUE upon successful
822 completion. */
824 bool
825 file_cache_slot::goto_next_line ()
827 char *l;
828 ssize_t len;
830 return get_next_line (&l, &len);
833 /* Read an arbitrary line number LINE_NUM from the file cached in C.
834 If the line was read successfully, *LINE points to the beginning
835 of the line in the file cache and *LINE_LEN is the length of the
836 line. *LINE is not nul-terminated, but may contain zero bytes.
837 *LINE is only valid until the next call of read_line_num.
838 This function returns bool if a line was read. */
840 bool
841 file_cache_slot::read_line_num (size_t line_num,
842 char ** line, ssize_t *line_len)
844 gcc_assert (line_num > 0);
846 if (line_num <= m_line_num)
848 /* We've been asked to read lines that are before m_line_num.
849 So lets use our line record (if it's not empty) to try to
850 avoid re-reading the file from the beginning again. */
852 if (m_line_record.is_empty ())
854 m_line_start_idx = 0;
855 m_line_num = 0;
857 else
859 file_cache_slot::line_info *i = NULL;
860 if (m_total_lines <= line_record_size)
862 /* In languages where the input file is not totally
863 preprocessed up front, the m_total_lines hint
864 can be smaller than the number of lines of the
865 file. In that case, only the first
866 m_total_lines have been recorded.
868 Otherwise, the first m_total_lines we've read have
869 their start/end recorded here. */
870 i = (line_num <= m_total_lines)
871 ? &m_line_record[line_num - 1]
872 : &m_line_record[m_total_lines - 1];
873 gcc_assert (i->line_num <= line_num);
875 else
877 /* So the file had more lines than our line record
878 size. Thus the number of lines we've recorded has
879 been scaled down to line_record_size. Let's
880 pick the start/end of the recorded line that is
881 closest to line_num. */
882 size_t n = (line_num <= m_total_lines)
883 ? line_num * line_record_size / m_total_lines
884 : m_line_record.length () - 1;
885 if (n < m_line_record.length ())
887 i = &m_line_record[n];
888 gcc_assert (i->line_num <= line_num);
892 if (i && i->line_num == line_num)
894 /* We have the start/end of the line. */
895 *line = m_data + i->start_pos;
896 *line_len = i->end_pos - i->start_pos;
897 return true;
900 if (i)
902 m_line_start_idx = i->start_pos;
903 m_line_num = i->line_num - 1;
905 else
907 m_line_start_idx = 0;
908 m_line_num = 0;
913 /* Let's walk from line m_line_num up to line_num - 1, without
914 copying any line. */
915 while (m_line_num < line_num - 1)
916 if (!goto_next_line ())
917 return false;
919 /* The line we want is the next one. Let's read and copy it back to
920 the caller. */
921 return get_next_line (line, line_len);
924 /* Return the physical source line that corresponds to FILE_PATH/LINE.
925 The line is not nul-terminated. The returned pointer is only
926 valid until the next call of location_get_source_line.
927 Note that the line can contain several null characters,
928 so the returned value's length has the actual length of the line.
929 If the function fails, a NULL char_span is returned. */
931 char_span
932 file_cache::get_source_line (const char *file_path, int line)
934 char *buffer = NULL;
935 ssize_t len;
937 if (line == 0)
938 return char_span (NULL, 0);
940 if (file_path == NULL)
941 return char_span (NULL, 0);
943 file_cache_slot *c = lookup_or_add_file (file_path);
944 if (c == NULL)
945 return char_span (NULL, 0);
947 bool read = c->read_line_num (line, &buffer, &len);
948 if (!read)
949 return char_span (NULL, 0);
951 return char_span (buffer, len);
954 /* Return a NUL-terminated copy of the source text between two locations, or
955 NULL if the arguments are invalid. The caller is responsible for freeing
956 the return value. */
958 char *
959 get_source_text_between (file_cache &fc, location_t start, location_t end)
961 expanded_location expstart =
962 expand_location_to_spelling_point (start, LOCATION_ASPECT_START);
963 expanded_location expend =
964 expand_location_to_spelling_point (end, LOCATION_ASPECT_FINISH);
966 /* If the locations are in different files or the end comes before the
967 start, give up and return nothing. */
968 if (!expstart.file || !expend.file)
969 return NULL;
970 if (strcmp (expstart.file, expend.file) != 0)
971 return NULL;
972 if (expstart.line > expend.line)
973 return NULL;
974 if (expstart.line == expend.line
975 && expstart.column > expend.column)
976 return NULL;
977 /* These aren't real column numbers, give up. */
978 if (expstart.column == 0 || expend.column == 0)
979 return NULL;
981 /* For a single line we need to trim both edges. */
982 if (expstart.line == expend.line)
984 char_span line = fc.get_source_line (expstart.file, expstart.line);
985 if (line.length () < 1)
986 return NULL;
987 int s = expstart.column - 1;
988 int len = expend.column - s;
989 if (line.length () < (size_t)expend.column)
990 return NULL;
991 return line.subspan (s, len).xstrdup ();
994 struct obstack buf_obstack;
995 obstack_init (&buf_obstack);
997 /* Loop through all lines in the range and append each to buf; may trim
998 parts of the start and end lines off depending on column values. */
999 for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1001 char_span line = fc.get_source_line (expstart.file, lnum);
1002 if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1003 continue;
1005 /* For the first line in the range, only start at expstart.column */
1006 if (lnum == expstart.line)
1008 unsigned off = expstart.column - 1;
1009 if (line.length () < off)
1010 return NULL;
1011 line = line.subspan (off, line.length() - off);
1013 /* For the last line, don't go past expend.column */
1014 else if (lnum == expend.line)
1016 if (line.length () < (size_t)expend.column)
1017 return NULL;
1018 line = line.subspan (0, expend.column);
1021 /* Combine spaces at the beginning of later lines. */
1022 if (lnum > expstart.line)
1024 unsigned off;
1025 for (off = 0; off < line.length(); ++off)
1026 if (line[off] != ' ' && line[off] != '\t')
1027 break;
1028 if (off > 0)
1030 obstack_1grow (&buf_obstack, ' ');
1031 line = line.subspan (off, line.length() - off);
1035 /* This does not include any trailing newlines. */
1036 obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1039 /* NUL-terminate and finish the buf obstack. */
1040 obstack_1grow (&buf_obstack, 0);
1041 const char *buf = (const char *) obstack_finish (&buf_obstack);
1043 return xstrdup (buf);
1047 char_span
1048 file_cache::get_source_file_content (const char *file_path)
1050 file_cache_slot *c = lookup_or_add_file (file_path);
1051 if (c == nullptr)
1052 return char_span (nullptr, 0);
1053 return c->get_full_file_content ();
1056 /* Test if the location originates from the spelling location of a
1057 builtin-tokens. That is, return TRUE if LOC is a (possibly
1058 virtual) location of a built-in token that appears in the expansion
1059 list of a macro. Please note that this function also works on
1060 tokens that result from built-in tokens. For instance, the
1061 function would return true if passed a token "4" that is the result
1062 of the expansion of the built-in __LINE__ macro. */
1063 bool
1064 is_location_from_builtin_token (location_t loc)
1066 const line_map_ordinary *map = NULL;
1067 loc = linemap_resolve_location (line_table, loc,
1068 LRK_SPELLING_LOCATION, &map);
1069 return loc == BUILTINS_LOCATION;
1072 /* Expand the source location LOC into a human readable location. If
1073 LOC is virtual, it resolves to the expansion point of the involved
1074 macro. If LOC resolves to a builtin location, the file name of the
1075 readable location is set to the string "<built-in>". */
1077 expanded_location
1078 expand_location (location_t loc)
1080 return expand_location_1 (line_table, loc, /*expansion_point_p=*/true,
1081 LOCATION_ASPECT_CARET);
1084 /* Expand the source location LOC into a human readable location. If
1085 LOC is virtual, it resolves to the expansion location of the
1086 relevant macro. If LOC resolves to a builtin location, the file
1087 name of the readable location is set to the string
1088 "<built-in>". */
1090 expanded_location
1091 expand_location_to_spelling_point (location_t loc,
1092 enum location_aspect aspect)
1094 return expand_location_1 (line_table, loc, /*expansion_point_p=*/false,
1095 aspect);
1098 /* The rich_location class within libcpp requires a way to expand
1099 location_t instances, and relies on the client code
1100 providing a symbol named
1101 linemap_client_expand_location_to_spelling_point
1102 to do this.
1104 This is the implementation for libcommon.a (all host binaries),
1105 which simply calls into expand_location_1. */
1107 expanded_location
1108 linemap_client_expand_location_to_spelling_point (const line_maps *set,
1109 location_t loc,
1110 enum location_aspect aspect)
1112 return expand_location_1 (set, loc, /*expansion_point_p=*/false, aspect);
1116 /* If LOCATION is in a system header and if it is a virtual location
1117 for a token coming from the expansion of a macro, unwind it to
1118 the location of the expansion point of the macro. If the expansion
1119 point is also in a system header return the original LOCATION.
1120 Otherwise, return the location of the expansion point.
1122 This is used for instance when we want to emit diagnostics about a
1123 token that may be located in a macro that is itself defined in a
1124 system header, for example, for the NULL macro. In such a case, if
1125 LOCATION were passed directly to diagnostic functions such as
1126 warning_at, the diagnostic would be suppressed (unless
1127 -Wsystem-headers). */
1129 location_t
1130 expansion_point_location_if_in_system_header (location_t location)
1132 if (!in_system_header_at (location))
1133 return location;
1135 location_t xloc = linemap_resolve_location (line_table, location,
1136 LRK_MACRO_EXPANSION_POINT,
1137 NULL);
1138 return in_system_header_at (xloc) ? location : xloc;
1141 /* If LOCATION is a virtual location for a token coming from the expansion
1142 of a macro, unwind to the location of the expansion point of the macro. */
1144 location_t
1145 expansion_point_location (location_t location)
1147 return linemap_resolve_location (line_table, location,
1148 LRK_MACRO_EXPANSION_POINT, NULL);
1151 /* Construct a location with caret at CARET, ranging from START to
1152 FINISH.
1154 For example, consider:
1156 11111111112
1157 12345678901234567890
1159 523 return foo + bar;
1160 ~~~~^~~~~
1163 The location's caret is at the "+", line 523 column 15, but starts
1164 earlier, at the "f" of "foo" at column 11. The finish is at the "r"
1165 of "bar" at column 19. */
1167 location_t
1168 make_location (location_t caret, location_t start, location_t finish)
1170 return line_table->make_location (caret, start, finish);
1173 /* Same as above, but taking a source range rather than two locations. */
1175 location_t
1176 make_location (location_t caret, source_range src_range)
1178 location_t pure_loc = get_pure_location (caret);
1179 return line_table->get_or_create_combined_loc (pure_loc, src_range,
1180 nullptr, 0);
1183 /* An expanded_location stores the column in byte units. This function
1184 converts that column to display units. That requires reading the associated
1185 source line in order to calculate the display width. If that cannot be done
1186 for any reason, then returns the byte column as a fallback. */
1188 location_compute_display_column (file_cache &fc,
1189 expanded_location exploc,
1190 const cpp_char_column_policy &policy)
1192 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1193 return exploc.column;
1194 char_span line = fc.get_source_line (exploc.file, exploc.line);
1195 /* If line is NULL, this function returns exploc.column which is the
1196 desired fallback. */
1197 return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1198 exploc.column, policy);
1201 /* Dump statistics to stderr about the memory usage of the line_table
1202 set of line maps. This also displays some statistics about macro
1203 expansion. */
1205 void
1206 dump_line_table_statistics (void)
1208 struct linemap_stats s;
1209 long total_used_map_size,
1210 macro_maps_size,
1211 total_allocated_map_size;
1213 memset (&s, 0, sizeof (s));
1215 linemap_get_statistics (line_table, &s);
1217 macro_maps_size = s.macro_maps_used_size
1218 + s.macro_maps_locations_size;
1220 total_allocated_map_size = s.ordinary_maps_allocated_size
1221 + s.macro_maps_allocated_size
1222 + s.macro_maps_locations_size;
1224 total_used_map_size = s.ordinary_maps_used_size
1225 + s.macro_maps_used_size
1226 + s.macro_maps_locations_size;
1228 fprintf (stderr, "Number of expanded macros: %5ld\n",
1229 s.num_expanded_macros);
1230 if (s.num_expanded_macros != 0)
1231 fprintf (stderr, "Average number of tokens per macro expansion: %5ld\n",
1232 s.num_macro_tokens / s.num_expanded_macros);
1233 fprintf (stderr,
1234 "\nLine Table allocations during the "
1235 "compilation process\n");
1236 fprintf (stderr, "Number of ordinary maps used: " PRsa (5) "\n",
1237 SIZE_AMOUNT (s.num_ordinary_maps_used));
1238 fprintf (stderr, "Ordinary map used size: " PRsa (5) "\n",
1239 SIZE_AMOUNT (s.ordinary_maps_used_size));
1240 fprintf (stderr, "Number of ordinary maps allocated: " PRsa (5) "\n",
1241 SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1242 fprintf (stderr, "Ordinary maps allocated size: " PRsa (5) "\n",
1243 SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1244 fprintf (stderr, "Number of macro maps used: " PRsa (5) "\n",
1245 SIZE_AMOUNT (s.num_macro_maps_used));
1246 fprintf (stderr, "Macro maps used size: " PRsa (5) "\n",
1247 SIZE_AMOUNT (s.macro_maps_used_size));
1248 fprintf (stderr, "Macro maps locations size: " PRsa (5) "\n",
1249 SIZE_AMOUNT (s.macro_maps_locations_size));
1250 fprintf (stderr, "Macro maps size: " PRsa (5) "\n",
1251 SIZE_AMOUNT (macro_maps_size));
1252 fprintf (stderr, "Duplicated maps locations size: " PRsa (5) "\n",
1253 SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1254 fprintf (stderr, "Total allocated maps size: " PRsa (5) "\n",
1255 SIZE_AMOUNT (total_allocated_map_size));
1256 fprintf (stderr, "Total used maps size: " PRsa (5) "\n",
1257 SIZE_AMOUNT (total_used_map_size));
1258 fprintf (stderr, "Ad-hoc table size: " PRsa (5) "\n",
1259 SIZE_AMOUNT (s.adhoc_table_size));
1260 fprintf (stderr, "Ad-hoc table entries used: " PRsa (5) "\n",
1261 SIZE_AMOUNT (s.adhoc_table_entries_used));
1262 fprintf (stderr, "optimized_ranges: " PRsa (5) "\n",
1263 SIZE_AMOUNT (line_table->m_num_optimized_ranges));
1264 fprintf (stderr, "unoptimized_ranges: " PRsa (5) "\n",
1265 SIZE_AMOUNT (line_table->m_num_unoptimized_ranges));
1267 fprintf (stderr, "\n");
1270 /* Get location one beyond the final location in ordinary map IDX. */
1272 static location_t
1273 get_end_location (class line_maps *set, unsigned int idx)
1275 if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1276 return set->highest_location;
1278 struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1279 return MAP_START_LOCATION (next_map);
/* Helper function for write_digit_row.
   Write to STREAM the character for DIGIT modulo 10.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1290 /* Helper function for dump_location_info.
1291 Write a row of numbers to STREAM, numbering a source line,
1292 giving the units, tens, hundreds etc of the column number. */
1294 static void
1295 write_digit_row (FILE *stream, int indent,
1296 const line_map_ordinary *map,
1297 location_t loc, int max_col, int divisor)
1299 fprintf (stream, "%*c", indent, ' ');
1300 fprintf (stream, "|");
1301 for (int column = 1; column < max_col; column++)
1303 location_t column_loc = loc + (column << map->m_range_bits);
1304 write_digit (stream, column_loc / divisor);
1306 fprintf (stream, "\n");
1309 /* Write a half-closed (START) / half-open (END) interval of
1310 location_t to STREAM. */
1312 static void
1313 dump_location_range (FILE *stream,
1314 location_t start, location_t end)
1316 fprintf (stream,
1317 " location_t interval: %u <= loc < %u\n",
1318 start, end);
1321 /* Write a labelled description of a half-closed (START) / half-open (END)
1322 interval of location_t to STREAM. */
1324 static void
1325 dump_labelled_location_range (FILE *stream,
1326 const char *name,
1327 location_t start, location_t end)
1329 fprintf (stream, "%s\n", name);
1330 dump_location_range (stream, start, end);
1331 fprintf (stream, "\n");
1334 /* Write a visualization of the locations in the line_table to STREAM. */
1336 void
1337 dump_location_info (FILE *stream)
1339 file_cache fc;
1341 /* Visualize the reserved locations. */
1342 dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1343 0, RESERVED_LOCATION_COUNT);
1345 /* Visualize the ordinary line_map instances, rendering the sources. */
1346 for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1348 location_t end_location = get_end_location (line_table, idx);
1349 /* half-closed: doesn't include this one. */
1351 const line_map_ordinary *map
1352 = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1353 fprintf (stream, "ORDINARY MAP: %i\n", idx);
1354 dump_location_range (stream,
1355 MAP_START_LOCATION (map), end_location);
1356 fprintf (stream, " file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1357 fprintf (stream, " starting at line: %i\n",
1358 ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1359 fprintf (stream, " column and range bits: %i\n",
1360 map->m_column_and_range_bits);
1361 fprintf (stream, " column bits: %i\n",
1362 map->m_column_and_range_bits - map->m_range_bits);
1363 fprintf (stream, " range bits: %i\n",
1364 map->m_range_bits);
1365 const char * reason;
1366 switch (map->reason) {
1367 case LC_ENTER:
1368 reason = "LC_ENTER";
1369 break;
1370 case LC_LEAVE:
1371 reason = "LC_LEAVE";
1372 break;
1373 case LC_RENAME:
1374 reason = "LC_RENAME";
1375 break;
1376 case LC_RENAME_VERBATIM:
1377 reason = "LC_RENAME_VERBATIM";
1378 break;
1379 case LC_ENTER_MACRO:
1380 reason = "LC_RENAME_MACRO";
1381 break;
1382 default:
1383 reason = "Unknown";
1385 fprintf (stream, " reason: %d (%s)\n", map->reason, reason);
1387 const line_map_ordinary *includer_map
1388 = linemap_included_from_linemap (line_table, map);
1389 fprintf (stream, " included from location: %d",
1390 linemap_included_from (map));
1391 if (includer_map) {
1392 fprintf (stream, " (in ordinary map %d)",
1393 int (includer_map - line_table->info_ordinary.maps));
1395 fprintf (stream, "\n");
1397 /* Render the span of source lines that this "map" covers. */
1398 for (location_t loc = MAP_START_LOCATION (map);
1399 loc < end_location;
1400 loc += (1 << map->m_range_bits) )
1402 gcc_assert (pure_location_p (line_table, loc) );
1404 expanded_location exploc
1405 = linemap_expand_location (line_table, map, loc);
1407 if (exploc.column == 0)
1409 /* Beginning of a new source line: draw the line. */
1411 char_span line_text = fc.get_source_line (exploc.file,
1412 exploc.line);
1413 if (!line_text)
1414 break;
1415 fprintf (stream,
1416 "%s:%3i|loc:%5i|%.*s\n",
1417 exploc.file, exploc.line,
1418 loc,
1419 (int)line_text.length (), line_text.get_buffer ());
1421 /* "loc" is at column 0, which means "the whole line".
1422 Render the locations *within* the line, by underlining
1423 it, showing the location_t numeric values
1424 at each column. */
1425 size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1426 if (max_col > line_text.length ())
1427 max_col = line_text.length () + 1;
1429 int len_lnum = num_digits (exploc.line);
1430 if (len_lnum < 3)
1431 len_lnum = 3;
1432 int len_loc = num_digits (loc);
1433 if (len_loc < 5)
1434 len_loc = 5;
1436 int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1438 /* Thousands. */
1439 if (end_location > 999)
1440 write_digit_row (stream, indent, map, loc, max_col, 1000);
1442 /* Hundreds. */
1443 if (end_location > 99)
1444 write_digit_row (stream, indent, map, loc, max_col, 100);
1446 /* Tens. */
1447 write_digit_row (stream, indent, map, loc, max_col, 10);
1449 /* Units. */
1450 write_digit_row (stream, indent, map, loc, max_col, 1);
1453 fprintf (stream, "\n");
1456 /* Visualize unallocated values. */
1457 dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1458 line_table->highest_location,
1459 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1461 /* Visualize the macro line_map instances, rendering the sources. */
1462 for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1464 /* Each macro map that is allocated owns location_t values
1465 that are *lower* that the one before them.
1466 Hence it's meaningful to view them either in order of ascending
1467 source locations, or in order of ascending macro map index. */
1468 const bool ascending_location_ts = true;
1469 unsigned int idx = (ascending_location_ts
1470 ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1471 : i);
1472 const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1473 fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1474 idx,
1475 linemap_map_get_macro_name (map),
1476 MACRO_MAP_NUM_MACRO_TOKENS (map));
1477 dump_location_range (stream,
1478 map->start_location,
1479 (map->start_location
1480 + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1481 inform (map->get_expansion_point_location (),
1482 "expansion point is location %i",
1483 map->get_expansion_point_location ());
1484 fprintf (stream, " map->start_location: %u\n",
1485 map->start_location);
1487 fprintf (stream, " macro_locations:\n");
1488 for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1490 location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1491 location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1493 /* linemap_add_macro_token encodes token numbers in an expansion
1494 by putting them after MAP_START_LOCATION. */
1496 /* I'm typically seeing 4 uninitialized entries at the end of
1497 0xafafafaf.
1498 This appears to be due to macro.cc:replace_args
1499 adding 2 extra args for padding tokens; presumably there may
1500 be a leading and/or trailing padding token injected,
1501 each for 2 more location slots.
1502 This would explain there being up to 4 location_ts slots
1503 that may be uninitialized. */
1505 fprintf (stream, " %u: %u, %u\n",
1509 if (x == y)
1511 if (x < MAP_START_LOCATION (map))
1512 inform (x, "token %u has %<x-location == y-location == %u%>",
1513 i, x);
1514 else
1515 fprintf (stream,
1516 "x-location == y-location == %u encodes token # %u\n",
1517 x, x - MAP_START_LOCATION (map));
1519 else
1521 inform (x, "token %u has %<x-location == %u%>", i, x);
1522 inform (x, "token %u has %<y-location == %u%>", i, y);
1525 fprintf (stream, "\n");
1528 /* It appears that MAX_LOCATION_T itself is never assigned to a
1529 macro map, presumably due to an off-by-one error somewhere
1530 between the logic in linemap_enter_macro and
1531 LINEMAPS_MACRO_LOWEST_LOCATION. */
1532 dump_labelled_location_range (stream, "MAX_LOCATION_T",
1533 MAX_LOCATION_T,
1534 MAX_LOCATION_T + 1);
1536 /* Visualize ad-hoc values. */
1537 dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1538 MAX_LOCATION_T + 1, UINT_MAX);
1541 /* string_concat's constructor. */
1543 string_concat::string_concat (int num, location_t *locs)
1544 : m_num (num)
1546 m_locs = ggc_vec_alloc <location_t> (num);
1547 for (int i = 0; i < num; i++)
1548 m_locs[i] = locs[i];
1551 /* string_concat_db's constructor. */
1553 string_concat_db::string_concat_db ()
1555 m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1558 /* Record that a string concatenation occurred, covering NUM
1559 string literal tokens. LOCS is an array of size NUM, containing the
1560 locations of the tokens. A copy of LOCS is taken. */
1562 void
1563 string_concat_db::record_string_concatenation (int num, location_t *locs)
1565 gcc_assert (num > 1);
1566 gcc_assert (locs);
1568 location_t key_loc = get_key_loc (locs[0]);
1569 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1570 any data now recorded under key 'key_loc' would be overwritten by a
1571 subsequent call with the same key 'key_loc'. */
1572 if (RESERVED_LOCATION_P (key_loc))
1573 return;
1575 string_concat *concat
1576 = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1577 m_table->put (key_loc, concat);
1580 /* Determine if LOC was the location of the initial token of a
1581 concatenation of string literal tokens.
1582 If so, *OUT_NUM is written to with the number of tokens, and
1583 *OUT_LOCS with the location of an array of locations of the
1584 tokens, and return true. *OUT_LOCS is a borrowed pointer to
1585 storage owned by the string_concat_db.
1586 Otherwise, return false. */
1588 bool
1589 string_concat_db::get_string_concatenation (location_t loc,
1590 int *out_num,
1591 location_t **out_locs)
1593 gcc_assert (out_num);
1594 gcc_assert (out_locs);
1596 location_t key_loc = get_key_loc (loc);
1597 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1598 discussion in 'string_concat_db::record_string_concatenation'. */
1599 if (RESERVED_LOCATION_P (key_loc))
1600 return false;
1602 string_concat **concat = m_table->get (key_loc);
1603 if (!concat)
1604 return false;
1606 *out_num = (*concat)->m_num;
1607 *out_locs =(*concat)->m_locs;
1608 return true;
1611 /* Internal function. Canonicalize LOC into a form suitable for
1612 use as a key within the database, stripping away macro expansion,
1613 ad-hoc information, and range information, using the location of
1614 the start of LOC within an ordinary linemap. */
1616 location_t
1617 string_concat_db::get_key_loc (location_t loc)
1619 loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1620 NULL);
1622 loc = get_range_from_loc (line_table, loc).m_start;
1624 return loc;
1627 /* Helper class for use within get_substring_ranges_for_loc.
1628 An vec of cpp_string with responsibility for releasing all of the
1629 str->text for each str in the vector. */
1631 class auto_cpp_string_vec : public auto_vec <cpp_string>
1633 public:
1634 auto_cpp_string_vec (int alloc)
1635 : auto_vec <cpp_string> (alloc) {}
1637 ~auto_cpp_string_vec ()
1639 /* Clean up the copies within this vec. */
1640 int i;
1641 cpp_string *str;
1642 FOR_EACH_VEC_ELT (*this, i, str)
1643 free (const_cast <unsigned char *> (str->text));
1647 /* Attempt to populate RANGES with source location information on the
1648 individual characters within the string literal found at STRLOC.
1649 If CONCATS is non-NULL, then any string literals that the token at
1650 STRLOC was concatenated with are also added to RANGES.
1652 Return NULL if successful, or an error message if any errors occurred (in
1653 which case RANGES may be only partially populated and should not
1654 be used).
1656 This is implemented by re-parsing the relevant source line(s). */
1658 static const char *
1659 get_substring_ranges_for_loc (cpp_reader *pfile,
1660 file_cache &fc,
1661 string_concat_db *concats,
1662 location_t strloc,
1663 enum cpp_ttype type,
1664 cpp_substring_ranges &ranges)
1666 gcc_assert (pfile);
1668 if (strloc == UNKNOWN_LOCATION)
1669 return "unknown location";
1671 /* Reparsing the strings requires accurate location information.
1672 If -ftrack-macro-expansion has been overridden from its default
1673 of 2, then we might have a location of a macro expansion point,
1674 rather than the location of the literal itself.
1675 Avoid this by requiring that we have full macro expansion tracking
1676 for substring locations to be available. */
1677 if (cpp_get_options (pfile)->track_macro_expansion != 2)
1678 return "track_macro_expansion != 2";
1680 /* If #line or # 44 "file"-style directives are present, then there's
1681 no guarantee that the line numbers we have can be used to locate
1682 the strings. For example, we might have a .i file with # directives
1683 pointing back to lines within a .c file, but the .c file might
1684 have been edited since the .i file was created.
1685 In such a case, the safest course is to disable on-demand substring
1686 locations. */
1687 if (line_table->seen_line_directive)
1688 return "seen line directive";
1690 /* If string concatenation has occurred at STRLOC, get the locations
1691 of all of the literal tokens making up the compound string.
1692 Otherwise, just use STRLOC. */
1693 int num_locs = 1;
1694 location_t *strlocs = &strloc;
1695 if (concats)
1696 concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1698 auto_cpp_string_vec strs (num_locs);
1699 auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1700 for (int i = 0; i < num_locs; i++)
1702 /* Get range of strloc. We will use it to locate the start and finish
1703 of the literal token within the line. */
1704 source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1706 if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1708 /* If the string token was within a macro expansion, then we can
1709 cope with it for the simple case where we have a single token.
1710 Otherwise, bail out. */
1711 if (src_range.m_start != src_range.m_finish)
1712 return "macro expansion";
1714 else
1716 if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1717 /* If so, we can't reliably determine where the token started within
1718 its line. */
1719 return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1721 if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1722 /* If so, we can't reliably determine where the token finished
1723 within its line. */
1724 return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1727 expanded_location start
1728 = expand_location_to_spelling_point (src_range.m_start,
1729 LOCATION_ASPECT_START);
1730 expanded_location finish
1731 = expand_location_to_spelling_point (src_range.m_finish,
1732 LOCATION_ASPECT_FINISH);
1733 if (start.file != finish.file)
1734 return "range endpoints are in different files";
1735 if (start.line != finish.line)
1736 return "range endpoints are on different lines";
1737 if (start.column > finish.column)
1738 return "range endpoints are reversed";
1740 char_span line = fc.get_source_line (start.file, start.line);
1741 if (!line)
1742 return "unable to read source line";
1744 /* Determine the location of the literal (including quotes
1745 and leading prefix chars, such as the 'u' in a u""
1746 token). */
1747 size_t literal_length = finish.column - start.column + 1;
1749 /* Ensure that we don't crash if we got the wrong location. */
1750 if (start.column < 1)
1751 return "zero start column";
1752 if (line.length () < (start.column - 1 + literal_length))
1753 return "line is not wide enough";
1755 char_span literal = line.subspan (start.column - 1, literal_length);
1757 cpp_string from;
1758 from.len = literal_length;
1759 /* Make a copy of the literal, to avoid having to rely on
1760 the lifetime of the copy of the line within the cache.
1761 This will be released by the auto_cpp_string_vec dtor. */
1762 from.text = (unsigned char *)literal.xstrdup ();
1763 strs.safe_push (from);
1765 /* For very long lines, a new linemap could have started
1766 halfway through the token.
1767 Ensure that the loc_reader uses the linemap of the
1768 *end* of the token for its start location. */
1769 const line_map_ordinary *start_ord_map;
1770 linemap_resolve_location (line_table, src_range.m_start,
1771 LRK_SPELLING_LOCATION, &start_ord_map);
1772 const line_map_ordinary *final_ord_map;
1773 linemap_resolve_location (line_table, src_range.m_finish,
1774 LRK_SPELLING_LOCATION, &final_ord_map);
1775 if (start_ord_map == NULL || final_ord_map == NULL)
1776 return "failed to get ordinary maps";
1777 /* Bulletproofing. We ought to only have different ordinary maps
1778 for start vs finish due to line-length jumps. */
1779 if (start_ord_map != final_ord_map
1780 && start_ord_map->to_file != final_ord_map->to_file)
1781 return "start and finish are spelled in different ordinary maps";
1782 /* The file from linemap_resolve_location ought to match that from
1783 expand_location_to_spelling_point. */
1784 if (start_ord_map->to_file != start.file)
1785 return "mismatching file after resolving linemap";
1787 location_t start_loc
1788 = linemap_position_for_line_and_column (line_table, final_ord_map,
1789 start.line, start.column);
1791 cpp_string_location_reader loc_reader (start_loc, line_table);
1792 loc_readers.safe_push (loc_reader);
1795 /* Rerun cpp_interpret_string, or rather, a modified version of it. */
1796 const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1797 loc_readers.address (),
1798 num_locs, &ranges, type);
1799 if (err)
1800 return err;
1802 /* Success: "ranges" should now contain information on the string. */
1803 return NULL;
1806 /* Attempt to populate *OUT_LOC with source location information on the
1807 given characters within the string literal found at STRLOC.
1808 CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1809 character set.
1811 For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7
1812 and string literal "012345\n789"
1813 *OUT_LOC is written to with:
1814 "012345\n789"
1815 ~^~~~~
1817 If CONCATS is non-NULL, then any string literals that the token at
1818 STRLOC was concatenated with are also considered.
1820 This is implemented by re-parsing the relevant source line(s).
1822 Return NULL if successful, or an error message if any errors occurred.
1823 Error messages are intended for GCC developers (to help debugging) rather
1824 than for end-users. */
1826 const char *
1827 get_location_within_string (cpp_reader *pfile,
1828 file_cache &fc,
1829 string_concat_db *concats,
1830 location_t strloc,
1831 enum cpp_ttype type,
1832 int caret_idx, int start_idx, int end_idx,
1833 location_t *out_loc)
1835 gcc_checking_assert (caret_idx >= 0);
1836 gcc_checking_assert (start_idx >= 0);
1837 gcc_checking_assert (end_idx >= 0);
1838 gcc_assert (out_loc);
1840 cpp_substring_ranges ranges;
1841 const char *err
1842 = get_substring_ranges_for_loc (pfile, fc, concats, strloc, type, ranges);
1843 if (err)
1844 return err;
1846 if (caret_idx >= ranges.get_num_ranges ())
1847 return "caret_idx out of range";
1848 if (start_idx >= ranges.get_num_ranges ())
1849 return "start_idx out of range";
1850 if (end_idx >= ranges.get_num_ranges ())
1851 return "end_idx out of range";
1853 *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1854 ranges.get_range (start_idx).m_start,
1855 ranges.get_range (end_idx).m_finish);
1856 return NULL;
1859 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1861 location_t
1862 location_with_discriminator (location_t locus, int discriminator)
1864 tree block = LOCATION_BLOCK (locus);
1865 source_range src_range = get_range_from_loc (line_table, locus);
1866 locus = get_pure_location (locus);
1868 if (locus == UNKNOWN_LOCATION)
1869 return locus;
1871 return line_table->get_or_create_combined_loc (locus, src_range, block,
1872 discriminator);
1875 /* Return TRUE if LOCUS represents a location with a discriminator. */
1877 bool
1878 has_discriminator (location_t locus)
1880 return get_discriminator_from_loc (locus) != 0;
1883 /* Return the discriminator for LOCUS. */
1886 get_discriminator_from_loc (location_t locus)
1888 return get_discriminator_from_loc (line_table, locus);
1891 #if CHECKING_P
1893 namespace selftest {
1895 /* Selftests of location handling. */
1897 /* Attempt to populate *OUT_RANGE with source location information on the
1898 given character within the string literal found at STRLOC.
1899 CHAR_IDX refers to an offset within the execution character set.
1900 If CONCATS is non-NULL, then any string literals that the token at
1901 STRLOC was concatenated with are also considered.
1903 This is implemented by re-parsing the relevant source line(s).
1905 Return NULL if successful, or an error message if any errors occurred.
1906 Error messages are intended for GCC developers (to help debugging) rather
1907 than for end-users. */
1909 static const char *
1910 get_source_range_for_char (cpp_reader *pfile,
1911 file_cache &fc,
1912 string_concat_db *concats,
1913 location_t strloc,
1914 enum cpp_ttype type,
1915 int char_idx,
1916 source_range *out_range)
1918 gcc_checking_assert (char_idx >= 0);
1919 gcc_assert (out_range);
1921 cpp_substring_ranges ranges;
1922 const char *err
1923 = get_substring_ranges_for_loc (pfile, fc, concats, strloc, type, ranges);
1924 if (err)
1925 return err;
1927 if (char_idx >= ranges.get_num_ranges ())
1928 return "char_idx out of range";
1930 *out_range = ranges.get_range (char_idx);
1931 return NULL;
1934 /* As get_source_range_for_char, but write to *OUT the number
1935 of ranges that are available. */
1937 static const char *
1938 get_num_source_ranges_for_substring (cpp_reader *pfile,
1939 file_cache &fc,
1940 string_concat_db *concats,
1941 location_t strloc,
1942 enum cpp_ttype type,
1943 int *out)
1945 gcc_assert (out);
1947 cpp_substring_ranges ranges;
1948 const char *err
1949 = get_substring_ranges_for_loc (pfile, fc, concats, strloc, type, ranges);
1951 if (err)
1952 return err;
1954 *out = ranges.get_num_ranges ();
1955 return NULL;
1958 /* Selftests of location handling. */
1960 /* Verify that compare() on linenum_type handles comparisons over the full
1961 range of the type. */
1963 static void
1964 test_linenum_comparisons ()
1966 linenum_type min_line (0);
1967 linenum_type max_line (0xffffffff);
1968 ASSERT_EQ (0, compare (min_line, min_line));
1969 ASSERT_EQ (0, compare (max_line, max_line));
1971 ASSERT_GT (compare (max_line, min_line), 0);
1972 ASSERT_LT (compare (min_line, max_line), 0);
1975 /* Helper function for verifying location data: when location_t
1976 values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1977 as having column 0. */
1979 static bool
1980 should_have_column_data_p (location_t loc)
1982 if (IS_ADHOC_LOC (loc))
1983 loc = get_location_from_adhoc_loc (line_table, loc);
1984 if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1985 return false;
1986 return true;
1989 /* Selftest for should_have_column_data_p. */
1991 static void
1992 test_should_have_column_data_p ()
1994 ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1995 ASSERT_TRUE
1996 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1997 ASSERT_FALSE
1998 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
2001 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2002 on LOC. */
2004 static void
2005 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2006 location_t loc)
2008 ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2009 ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2010 /* If location_t values are sufficiently high, then column numbers
2011 will be unavailable and LOCATION_COLUMN (loc) will be 0.
2012 When close to the threshold, column numbers *may* be present: if
2013 the final linemap before the threshold contains a line that straddles
2014 the threshold, locations in that line have column information. */
2015 if (should_have_column_data_p (loc))
2016 ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.cc: there are various threshold
   values for location_t beyond which line-map.cc changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following class describes a particular case within our test
   matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
  {
    m_default_range_bits = default_range_bits;
    m_base_location = base_location;
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If nonzero, the location at which the line_table starts.  */
  int m_base_location;
};
2047 /* Constructor. Store the old value of line_table, and create a new
2048 one, using sane defaults. */
2050 line_table_test::line_table_test ()
2052 gcc_assert (saved_line_table == NULL);
2053 saved_line_table = line_table;
2054 line_table = ggc_alloc<line_maps> ();
2055 linemap_init (line_table, BUILTINS_LOCATION);
2056 gcc_assert (saved_line_table->m_reallocator);
2057 line_table->m_reallocator = saved_line_table->m_reallocator;
2058 gcc_assert (saved_line_table->m_round_alloc_size);
2059 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2060 line_table->default_range_bits = 0;
2063 /* Constructor. Store the old value of line_table, and create a new
2064 one, using the sitation described in CASE_. */
2066 line_table_test::line_table_test (const line_table_case &case_)
2068 gcc_assert (saved_line_table == NULL);
2069 saved_line_table = line_table;
2070 line_table = ggc_alloc<line_maps> ();
2071 linemap_init (line_table, BUILTINS_LOCATION);
2072 gcc_assert (saved_line_table->m_reallocator);
2073 line_table->m_reallocator = saved_line_table->m_reallocator;
2074 gcc_assert (saved_line_table->m_round_alloc_size);
2075 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2076 line_table->default_range_bits = case_.m_default_range_bits;
2077 if (case_.m_base_location)
2079 line_table->highest_location = case_.m_base_location;
2080 line_table->highest_line = case_.m_base_location;
2084 /* Destructor. Restore the old value of line_table. */
2086 line_table_test::~line_table_test ()
2088 gcc_assert (saved_line_table != NULL);
2089 line_table = saved_line_table;
2090 saved_line_table = NULL;
2093 /* Verify basic operation of ordinary linemaps. */
2095 static void
2096 test_accessing_ordinary_linemaps (const line_table_case &case_)
2098 line_table_test ltt (case_);
2100 /* Build a simple linemap describing some locations. */
2101 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
2103 linemap_line_start (line_table, 1, 100);
2104 location_t loc_a = linemap_position_for_column (line_table, 1);
2105 location_t loc_b = linemap_position_for_column (line_table, 23);
2107 linemap_line_start (line_table, 2, 100);
2108 location_t loc_c = linemap_position_for_column (line_table, 1);
2109 location_t loc_d = linemap_position_for_column (line_table, 17);
2111 /* Example of a very long line. */
2112 linemap_line_start (line_table, 3, 2000);
2113 location_t loc_e = linemap_position_for_column (line_table, 700);
2115 /* Transitioning back to a short line. */
2116 linemap_line_start (line_table, 4, 0);
2117 location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
2119 if (should_have_column_data_p (loc_back_to_short))
2121 /* Verify that we switched to short lines in the linemap. */
2122 line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
2123 ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2126 /* Example of a line that will eventually be seen to be longer
2127 than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2128 below that. */
2129 linemap_line_start (line_table, 5, 2000);
2131 location_t loc_start_of_very_long_line
2132 = linemap_position_for_column (line_table, 2000);
2133 location_t loc_too_wide
2134 = linemap_position_for_column (line_table, 4097);
2135 location_t loc_too_wide_2
2136 = linemap_position_for_column (line_table, 4098);
2138 /* ...and back to a sane line length. */
2139 linemap_line_start (line_table, 6, 100);
2140 location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2142 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2144 /* Multiple files. */
2145 linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
2146 linemap_line_start (line_table, 1, 200);
2147 location_t loc_f = linemap_position_for_column (line_table, 150);
2148 linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2150 /* Verify that we can recover the location info. */
2151 assert_loceq ("foo.c", 1, 1, loc_a);
2152 assert_loceq ("foo.c", 1, 23, loc_b);
2153 assert_loceq ("foo.c", 2, 1, loc_c);
2154 assert_loceq ("foo.c", 2, 17, loc_d);
2155 assert_loceq ("foo.c", 3, 700, loc_e);
2156 assert_loceq ("foo.c", 4, 100, loc_back_to_short);
2158 /* In the very wide line, the initial location should be fully tracked. */
2159 assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
2160 /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2161 be disabled. */
2162 assert_loceq ("foo.c", 5, 0, loc_too_wide);
2163 assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2164 /*...and column-tracking should be re-enabled for subsequent lines. */
2165 assert_loceq ("foo.c", 6, 10, loc_sane_again);
2167 assert_loceq ("bar.c", 1, 150, loc_f);
2169 ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2170 ASSERT_TRUE (pure_location_p (line_table, loc_a));
2172 /* Verify using make_location to build a range, and extracting data
2173 back from it. */
2174 location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2175 ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2176 ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2177 source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2178 ASSERT_EQ (loc_b, src_range.m_start);
2179 ASSERT_EQ (loc_d, src_range.m_finish);
2182 /* Verify various properties of UNKNOWN_LOCATION. */
2184 static void
2185 test_unknown_location ()
2187 ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2188 ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2189 ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2192 /* Verify various properties of BUILTINS_LOCATION. */
2194 static void
2195 test_builtins ()
2197 assert_loceq (special_fname_builtin (), 0, 0, BUILTINS_LOCATION);
2198 ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2201 /* Regression test for make_location.
2202 Ensure that we use pure locations for the start/finish of the range,
2203 rather than storing a packed or ad-hoc range as the start/finish. */
2205 static void
2206 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2208 /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2209 with C++ frontend.
2210 ....................0000000001111111111222.
2211 ....................1234567890123456789012. */
2212 const char *content = " r += !aaa == bbb;\n";
2213 temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2214 line_table_test ltt (case_);
2215 linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2217 const location_t c11 = linemap_position_for_column (line_table, 11);
2218 const location_t c12 = linemap_position_for_column (line_table, 12);
2219 const location_t c13 = linemap_position_for_column (line_table, 13);
2220 const location_t c14 = linemap_position_for_column (line_table, 14);
2221 const location_t c21 = linemap_position_for_column (line_table, 21);
2223 if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2224 return;
2226 /* Use column 13 for the caret location, arbitrarily, to verify that we
2227 handle start != caret. */
2228 const location_t aaa = make_location (c13, c12, c14);
2229 ASSERT_EQ (c13, get_pure_location (aaa));
2230 ASSERT_EQ (c12, get_start (aaa));
2231 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2232 ASSERT_EQ (c14, get_finish (aaa));
2233 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2235 /* Make a location using a location with a range as the start-point. */
2236 const location_t not_aaa = make_location (c11, aaa, c14);
2237 ASSERT_EQ (c11, get_pure_location (not_aaa));
2238 /* It should use the start location of the range, not store the range
2239 itself. */
2240 ASSERT_EQ (c12, get_start (not_aaa));
2241 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2242 ASSERT_EQ (c14, get_finish (not_aaa));
2243 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2245 /* Similarly, make a location with a range as the end-point. */
2246 const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2247 ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2248 ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2249 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2250 ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2251 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2252 const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2253 /* It should use the finish location of the range, not store the range
2254 itself. */
2255 ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2256 ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2257 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2258 ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2259 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2262 /* Verify reading of input files (e.g. for caret-based diagnostics). */
2264 static void
2265 test_reading_source_line ()
2267 /* Create a tempfile and write some text to it. */
2268 temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2269 "01234567890123456789\n"
2270 "This is the test text\n"
2271 "This is the 3rd line");
2272 file_cache fc;
2274 /* Read back a specific line from the tempfile. */
2275 char_span source_line = fc.get_source_line (tmp.get_filename (), 3);
2276 ASSERT_TRUE (source_line);
2277 ASSERT_TRUE (source_line.get_buffer () != NULL);
2278 ASSERT_EQ (20, source_line.length ());
2279 ASSERT_TRUE (!strncmp ("This is the 3rd line",
2280 source_line.get_buffer (), source_line.length ()));
2282 source_line = fc.get_source_line (tmp.get_filename (), 2);
2283 ASSERT_TRUE (source_line);
2284 ASSERT_TRUE (source_line.get_buffer () != NULL);
2285 ASSERT_EQ (21, source_line.length ());
2286 ASSERT_TRUE (!strncmp ("This is the test text",
2287 source_line.get_buffer (), source_line.length ()));
2289 source_line = fc.get_source_line (tmp.get_filename (), 4);
2290 ASSERT_FALSE (source_line);
2291 ASSERT_TRUE (source_line.get_buffer () == NULL);
/* Tests of lexing.  */

/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
2305 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2306 and ranges from EXP_START_COL to EXP_FINISH_COL.
2307 Use LOC as the effective location of the selftest. */
2309 static void
2310 assert_token_loc_eq (const location &loc,
2311 const cpp_token *tok,
2312 const char *exp_filename, int exp_linenum,
2313 int exp_start_col, int exp_finish_col)
2315 location_t tok_loc = tok->src_loc;
2316 ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2317 ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2319 /* If location_t values are sufficiently high, then column numbers
2320 will be unavailable. */
2321 if (!should_have_column_data_p (tok_loc))
2322 return;
2324 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2325 source_range tok_range = get_range_from_loc (line_table, tok_loc);
2326 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2327 ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2330 /* Use assert_token_loc_eq to verify the TOK->src_loc, using
2331 SELFTEST_LOCATION as the effective location of the selftest. */
2333 #define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
2334 EXP_START_COL, EXP_FINISH_COL) \
2335 assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
2336 (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2338 /* Test of lexing a file using libcpp, verifying tokens and their
2339 location information. */
2341 static void
2342 test_lexer (const line_table_case &case_)
2344 /* Create a tempfile and write some text to it. */
2345 const char *content =
2346 /*00000000011111111112222222222333333.3333444444444.455555555556
2347 12345678901234567890123456789012345.6789012345678.901234567890. */
2348 ("test_name /* c-style comment */\n"
2349 " \"test literal\"\n"
2350 " // test c++-style comment\n"
2351 " 42\n");
2352 temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2354 line_table_test ltt (case_);
2356 cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2358 const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2359 ASSERT_NE (fname, NULL);
2361 /* Verify that we get the expected tokens back, with the correct
2362 location information. */
2364 location_t loc;
2365 const cpp_token *tok;
2366 tok = cpp_get_token_with_location (parser, &loc);
2367 ASSERT_NE (tok, NULL);
2368 ASSERT_EQ (tok->type, CPP_NAME);
2369 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2370 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2372 tok = cpp_get_token_with_location (parser, &loc);
2373 ASSERT_NE (tok, NULL);
2374 ASSERT_EQ (tok->type, CPP_STRING);
2375 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2376 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2378 tok = cpp_get_token_with_location (parser, &loc);
2379 ASSERT_NE (tok, NULL);
2380 ASSERT_EQ (tok->type, CPP_NUMBER);
2381 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2382 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2384 tok = cpp_get_token_with_location (parser, &loc);
2385 ASSERT_NE (tok, NULL);
2386 ASSERT_EQ (tok->type, CPP_EOF);
2388 cpp_finish (parser, NULL);
2389 cpp_destroy (parser);
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.  */

class lexer_test_options
{
 public:
  virtual void apply (lexer_test &) = 0;
};
2406 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2407 in its dtor.
2409 This is needed by struct lexer_test to ensure that the cleanup of the
2410 cpp_reader happens *after* the cleanup of the temp_source_file. */
2412 class cpp_reader_ptr
2414 public:
2415 cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2417 ~cpp_reader_ptr ()
2419 cpp_finish (m_ptr, NULL);
2420 cpp_destroy (m_ptr);
2423 operator cpp_reader * () const { return m_ptr; }
2425 private:
2426 cpp_reader *m_ptr;
2429 /* A struct for writing lexer tests. */
2431 class lexer_test
2433 public:
2434 lexer_test (const line_table_case &case_, const char *content,
2435 lexer_test_options *options);
2436 ~lexer_test ();
2438 const cpp_token *get_token ();
2440 /* The ordering of these fields matters.
2441 The line_table_test must be first, since the cpp_reader_ptr
2442 uses it.
2443 The cpp_reader must be cleaned up *after* the temp_source_file
2444 since the filenames in input.cc's input cache are owned by the
2445 cpp_reader; in particular, when ~temp_source_file evicts the
2446 filename the filenames must still be alive. */
2447 line_table_test m_ltt;
2448 cpp_reader_ptr m_parser;
2449 temp_source_file m_tempfile;
2450 file_cache m_file_cache;
2451 string_concat_db m_concats;
2452 bool m_implicitly_expect_EOF;
2455 /* Use an EBCDIC encoding for the execution charset, specifically
2456 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2458 This exercises iconv integration within libcpp.
2459 Not every build of iconv supports the given charset,
2460 so we need to flag this error and handle it gracefully. */
2462 class ebcdic_execution_charset : public lexer_test_options
2464 public:
2465 ebcdic_execution_charset () : m_num_iconv_errors (0)
2467 gcc_assert (s_singleton == NULL);
2468 s_singleton = this;
2470 ~ebcdic_execution_charset ()
2472 gcc_assert (s_singleton == this);
2473 s_singleton = NULL;
2476 void apply (lexer_test &test) final override
2478 cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2479 cpp_opts->narrow_charset = "IBM1047";
2481 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2482 callbacks->diagnostic = on_diagnostic;
2485 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2486 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2487 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2488 rich_location *richloc ATTRIBUTE_UNUSED,
2489 const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2490 ATTRIBUTE_FPTR_PRINTF(5,0)
2492 gcc_assert (s_singleton);
2493 /* Avoid exgettext from picking this up, it is translated in libcpp. */
2494 const char *msg = "conversion from %s to %s not supported by iconv";
2495 #ifdef ENABLE_NLS
2496 msg = dgettext ("cpplib", msg);
2497 #endif
2498 /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2499 when the local iconv build doesn't support the conversion. */
2500 if (strcmp (msgid, msg) == 0)
2502 s_singleton->m_num_iconv_errors++;
2503 return true;
2506 /* Otherwise, we have an unexpected error. */
2507 abort ();
2510 bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2512 private:
2513 static ebcdic_execution_charset *s_singleton;
2514 int m_num_iconv_errors;
2517 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2519 /* A lexer_test_options subclass that records a list of diagnostic
2520 messages emitted by the lexer. */
2522 class lexer_diagnostic_sink : public lexer_test_options
2524 public:
2525 lexer_diagnostic_sink ()
2527 gcc_assert (s_singleton == NULL);
2528 s_singleton = this;
2530 ~lexer_diagnostic_sink ()
2532 gcc_assert (s_singleton == this);
2533 s_singleton = NULL;
2535 int i;
2536 char *str;
2537 FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2538 free (str);
2541 void apply (lexer_test &test) final override
2543 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2544 callbacks->diagnostic = on_diagnostic;
2547 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2548 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2549 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2550 rich_location *richloc ATTRIBUTE_UNUSED,
2551 const char *msgid, va_list *ap)
2552 ATTRIBUTE_FPTR_PRINTF(5,0)
2554 char *msg = xvasprintf (msgid, *ap);
2555 s_singleton->m_diagnostics.safe_push (msg);
2556 return true;
2559 auto_vec<char *> m_diagnostics;
2561 private:
2562 static lexer_diagnostic_sink *s_singleton;
2565 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2567 /* Constructor. Override line_table with a new instance based on CASE_,
2568 and write CONTENT to a tempfile. Create a cpp_reader, and use it to
2569 start parsing the tempfile. */
2571 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2572 lexer_test_options *options)
2573 : m_ltt (case_),
2574 m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2575 /* Create a tempfile and write the text to it. */
2576 m_tempfile (SELFTEST_LOCATION, ".c", content),
2577 m_concats (),
2578 m_implicitly_expect_EOF (true)
2580 if (options)
2581 options->apply (*this);
2583 cpp_init_iconv (m_parser);
2585 /* Parse the file. */
2586 const char *fname = cpp_read_main_file (m_parser,
2587 m_tempfile.get_filename ());
2588 ASSERT_NE (fname, NULL);
2591 /* Destructor. By default, verify that the next token in m_parser is EOF. */
2593 lexer_test::~lexer_test ()
2595 location_t loc;
2596 const cpp_token *tok;
2598 if (m_implicitly_expect_EOF)
2600 tok = cpp_get_token_with_location (m_parser, &loc);
2601 ASSERT_NE (tok, NULL);
2602 ASSERT_EQ (tok->type, CPP_EOF);
2606 /* Get the next token from m_parser. */
2608 const cpp_token *
2609 lexer_test::get_token ()
2611 location_t loc;
2612 const cpp_token *tok;
2614 tok = cpp_get_token_with_location (m_parser, &loc);
2615 ASSERT_NE (tok, NULL);
2616 return tok;
2619 /* Verify that locations within string literals are correctly handled. */
2621 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2622 using the string concatenation database for TEST.
2624 Assert that the character at index IDX is on EXPECTED_LINE,
2625 and that it begins at column EXPECTED_START_COL and ends at
2626 EXPECTED_FINISH_COL (unless the locations are beyond
2627 LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2628 columns). */
2630 static void
2631 assert_char_at_range (const location &loc,
2632 lexer_test& test,
2633 location_t strloc, enum cpp_ttype type, int idx,
2634 int expected_line, int expected_start_col,
2635 int expected_finish_col)
2637 cpp_reader *pfile = test.m_parser;
2638 string_concat_db *concats = &test.m_concats;
2640 source_range actual_range = source_range();
2641 const char *err
2642 = get_source_range_for_char (pfile, test.m_file_cache,
2643 concats, strloc, type, idx,
2644 &actual_range);
2645 if (should_have_column_data_p (strloc))
2646 ASSERT_EQ_AT (loc, NULL, err);
2647 else
2649 ASSERT_STREQ_AT (loc,
2650 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2651 err);
2652 return;
2655 int actual_start_line = LOCATION_LINE (actual_range.m_start);
2656 ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2657 int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2658 ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2660 if (should_have_column_data_p (actual_range.m_start))
2662 int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2663 ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2665 if (should_have_column_data_p (actual_range.m_finish))
2667 int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2668 ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2672 /* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
2673 the effective location of any errors. */
2675 #define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
2676 EXPECTED_START_COL, EXPECTED_FINISH_COL) \
2677 assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
2678 (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
2679 (EXPECTED_FINISH_COL))
2681 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2682 using the string concatenation database for TEST.
2684 Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES. */
2686 static void
2687 assert_num_substring_ranges (const location &loc,
2688 lexer_test& test,
2689 location_t strloc,
2690 enum cpp_ttype type,
2691 int expected_num_ranges)
2693 cpp_reader *pfile = test.m_parser;
2694 string_concat_db *concats = &test.m_concats;
2696 int actual_num_ranges = -1;
2697 const char *err
2698 = get_num_source_ranges_for_substring (pfile, test.m_file_cache,
2699 concats, strloc, type,
2700 &actual_num_ranges);
2701 if (should_have_column_data_p (strloc))
2702 ASSERT_EQ_AT (loc, NULL, err);
2703 else
2705 ASSERT_STREQ_AT (loc,
2706 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2707 err);
2708 return;
2710 ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2713 /* Macro for calling assert_num_substring_ranges, supplying
2714 SELFTEST_LOCATION for the effective location of any errors. */
2716 #define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
2717 EXPECTED_NUM_RANGES) \
2718 assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
2719 (TYPE), (EXPECTED_NUM_RANGES))
2722 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2723 returns an error (using the string concatenation database for TEST). */
2725 static void
2726 assert_has_no_substring_ranges (const location &loc,
2727 lexer_test& test,
2728 location_t strloc,
2729 enum cpp_ttype type,
2730 const char *expected_err)
2732 cpp_reader *pfile = test.m_parser;
2733 string_concat_db *concats = &test.m_concats;
2734 cpp_substring_ranges ranges;
2735 const char *actual_err
2736 = get_substring_ranges_for_loc (pfile, test.m_file_cache, concats, strloc,
2737 type, ranges);
2738 if (should_have_column_data_p (strloc))
2739 ASSERT_STREQ_AT (loc, expected_err, actual_err);
2740 else
2741 ASSERT_STREQ_AT (loc,
2742 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2743 actual_err);
2746 #define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR) \
2747 assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
2748 (STRLOC), (TYPE), (ERR))
2750 /* Lex a simple string literal. Verify the substring location data, before
2751 and after running cpp_interpret_string on it. */
2753 static void
2754 test_lexer_string_locations_simple (const line_table_case &case_)
2756 /* Digits 0-9 (with 0 at column 10), the simple way.
2757 ....................000000000.11111111112.2222222223333333333
2758 ....................123456789.01234567890.1234567890123456789
2759 We add a trailing comment to ensure that we correctly locate
2760 the end of the string literal token. */
2761 const char *content = " \"0123456789\" /* not a string */\n";
2762 lexer_test test (case_, content, NULL);
2764 /* Verify that we get the expected token back, with the correct
2765 location information. */
2766 const cpp_token *tok = test.get_token ();
2767 ASSERT_EQ (tok->type, CPP_STRING);
2768 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2769 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2771 /* At this point in lexing, the quote characters are treated as part of
2772 the string (they are stripped off by cpp_interpret_string). */
2774 ASSERT_EQ (tok->val.str.len, 12);
2776 /* Verify that cpp_interpret_string works. */
2777 cpp_string dst_string;
2778 const enum cpp_ttype type = CPP_STRING;
2779 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2780 &dst_string, type);
2781 ASSERT_TRUE (result);
2782 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2783 free (const_cast <unsigned char *> (dst_string.text));
2785 /* Verify ranges of individual characters. This no longer includes the
2786 opening quote, but does include the closing quote. */
2787 for (int i = 0; i <= 10; i++)
2788 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2789 10 + i, 10 + i);
2791 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2794 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2795 encoding. */
2797 static void
2798 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2800 /* EBCDIC support requires iconv. */
2801 if (!HAVE_ICONV)
2802 return;
2804 /* Digits 0-9 (with 0 at column 10), the simple way.
2805 ....................000000000.11111111112.2222222223333333333
2806 ....................123456789.01234567890.1234567890123456789
2807 We add a trailing comment to ensure that we correctly locate
2808 the end of the string literal token. */
2809 const char *content = " \"0123456789\" /* not a string */\n";
2810 ebcdic_execution_charset use_ebcdic;
2811 lexer_test test (case_, content, &use_ebcdic);
2813 /* Verify that we get the expected token back, with the correct
2814 location information. */
2815 const cpp_token *tok = test.get_token ();
2816 ASSERT_EQ (tok->type, CPP_STRING);
2817 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2818 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2820 /* At this point in lexing, the quote characters are treated as part of
2821 the string (they are stripped off by cpp_interpret_string). */
2823 ASSERT_EQ (tok->val.str.len, 12);
2825 /* The remainder of the test requires an iconv implementation that
2826 can convert from UTF-8 to the EBCDIC encoding requested above. */
2827 if (use_ebcdic.iconv_errors_occurred_p ())
2828 return;
2830 /* Verify that cpp_interpret_string works. */
2831 cpp_string dst_string;
2832 const enum cpp_ttype type = CPP_STRING;
2833 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2834 &dst_string, type);
2835 ASSERT_TRUE (result);
2836 /* We should now have EBCDIC-encoded text, specifically
2837 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2838 The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
2839 ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2840 (const char *)dst_string.text);
2841 free (const_cast <unsigned char *> (dst_string.text));
2843 /* Verify that we don't attempt to record substring location information
2844 for such cases. */
2845 ASSERT_HAS_NO_SUBSTRING_RANGES
2846 (test, tok->src_loc, type,
2847 "execution character set != source character set");
2850 /* Lex a string literal containing a hex-escaped character.
2851 Verify the substring location data, before and after running
2852 cpp_interpret_string on it. */
2854 static void
2855 test_lexer_string_locations_hex (const line_table_case &case_)
2857 /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2858 and with a space in place of digit 6, to terminate the escaped
2859 hex code.
2860 ....................000000000.111111.11112222.
2861 ....................123456789.012345.67890123. */
2862 const char *content = " \"01234\\x35 789\"\n";
2863 lexer_test test (case_, content, NULL);
2865 /* Verify that we get the expected token back, with the correct
2866 location information. */
2867 const cpp_token *tok = test.get_token ();
2868 ASSERT_EQ (tok->type, CPP_STRING);
2869 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2870 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2872 /* At this point in lexing, the quote characters are treated as part of
2873 the string (they are stripped off by cpp_interpret_string). */
2874 ASSERT_EQ (tok->val.str.len, 15);
2876 /* Verify that cpp_interpret_string works. */
2877 cpp_string dst_string;
2878 const enum cpp_ttype type = CPP_STRING;
2879 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2880 &dst_string, type);
2881 ASSERT_TRUE (result);
2882 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2883 free (const_cast <unsigned char *> (dst_string.text));
2885 /* Verify ranges of individual characters. This no longer includes the
2886 opening quote, but does include the closing quote. */
2887 for (int i = 0; i <= 4; i++)
2888 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2889 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2890 for (int i = 6; i <= 10; i++)
2891 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2893 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2896 /* Lex a string literal containing an octal-escaped character.
2897 Verify the substring location data after running cpp_interpret_string
2898 on it. */
2900 static void
2901 test_lexer_string_locations_oct (const line_table_case &case_)
2903 /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2904 and with a space in place of digit 6, to terminate the escaped
2905 octal code.
2906 ....................000000000.111111.11112222.2222223333333333444
2907 ....................123456789.012345.67890123.4567890123456789012 */
2908 const char *content = " \"01234\\065 789\" /* not a string */\n";
2909 lexer_test test (case_, content, NULL);
2911 /* Verify that we get the expected token back, with the correct
2912 location information. */
2913 const cpp_token *tok = test.get_token ();
2914 ASSERT_EQ (tok->type, CPP_STRING);
2915 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2917 /* Verify that cpp_interpret_string works. */
2918 cpp_string dst_string;
2919 const enum cpp_ttype type = CPP_STRING;
2920 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2921 &dst_string, type);
2922 ASSERT_TRUE (result);
2923 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2924 free (const_cast <unsigned char *> (dst_string.text));
2926 /* Verify ranges of individual characters. This no longer includes the
2927 opening quote, but does include the closing quote. */
2928 for (int i = 0; i < 5; i++)
2929 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2930 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2931 for (int i = 6; i <= 10; i++)
2932 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2934 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2937 /* Test of string literal containing letter escapes.
   Each two-column escape ("\t", "\\", "\n") is expected to yield one
   character whose range covers both columns of the escape. */
2939 static void
2940 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2942 /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2943 .....................000000000.1.11111.1.1.11222.22222223333333
2944 .....................123456789.0.12345.6.7.89012.34567890123456. */
2945 const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2946 lexer_test test (case_, content, NULL);
2948 /* Verify that we get the expected tokens back. */
2949 const cpp_token *tok = test.get_token ();
2950 ASSERT_EQ (tok->type, CPP_STRING);
2951 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2953 /* Verify ranges of individual characters. */
2954 /* "\t". */
2955 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2956 0, 1, 10, 11);
2957 /* "foo". */
2958 for (int i = 1; i <= 3; i++)
2959 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2960 i, 1, 11 + i, 11 + i);
2961 /* "\\" and "\n". */
2962 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2963 4, 1, 15, 16);
2964 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2965 5, 1, 17, 18);
2967 /* "bar" and closing quote for nul-terminator. */
2968 for (int i = 6; i <= 9; i++)
2969 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2970 i, 1, 13 + i, 13 + i);
/* Nine interpreted characters (tab, "foo", backslash, newline, "bar")
   plus the nul-terminator.  */
2972 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2975 /* Another test of a string literal containing a letter escape.
2976 Based on string seen in
2977 printf ("%-%\n");
2978 in gcc.dg/format/c90-printf-1.c. */
2980 static void
2981 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2983 /* .....................000000000.1111.11.1111.22222222223.
2984 .....................123456789.0123.45.6789.01234567890. */
2985 const char *content = (" \"%-%\\n\" /* non-str */\n");
2986 lexer_test test (case_, content, NULL);
2988 /* Verify that we get the expected tokens back. */
2989 const cpp_token *tok = test.get_token ();
2990 ASSERT_EQ (tok->type, CPP_STRING);
2991 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2993 /* Verify ranges of individual characters. */
2994 /* "%-%". */
2995 for (int i = 0; i < 3; i++)
2996 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2997 i, 1, 10 + i, 10 + i);
2998 /* "\n". */
2999 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3000 3, 1, 13, 14);
3002 /* Closing quote for nul-terminator. */
3003 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3004 4, 1, 15, 15);
/* Three characters for "%-%", one for the newline escape, and one for
   the nul-terminator.  */
3006 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3009 /* Lex a string literal containing UCN 4 characters.
3010 Verify the substring location data after running cpp_interpret_string
3011 on it. */
3013 static void
3014 test_lexer_string_locations_ucn4 (const line_table_case &case_)
3016 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3017 as UCN 4.
3018 ....................000000000.111111.111122.222222223.33333333344444
3019 ....................123456789.012345.678901.234567890.12345678901234 */
3020 const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
3021 lexer_test test (case_, content, NULL);
3023 /* Verify that we get the expected token back, with the correct
3024 location information. */
3025 const cpp_token *tok = test.get_token ();
3026 ASSERT_EQ (tok->type, CPP_STRING);
3027 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3029 /* Verify that cpp_interpret_string works.
3030 The string should be encoded in the execution character
3031 set. Assuming that is UTF-8, we should have the following:
3032 ----------- ---- ----- ------- ----------------
3033 Byte offset Byte Octal Unicode Source Column(s)
3034 ----------- ---- ----- ------- ----------------
3035 0 0x30 '0' 10
3036 1 0x31 '1' 11
3037 2 0x32 '2' 12
3038 3 0x33 '3' 13
3039 4 0x34 '4' 14
3040 5 0xE2 \342 U+2174 15-20
3041 6 0x85 \205 (cont) 15-20
3042 7 0xB4 \264 (cont) 15-20
3043 8 0xE2 \342 U+2175 21-26
3044 9 0x85 \205 (cont) 21-26
3045 10 0xB5 \265 (cont) 21-26
3046 11 0x37 '7' 27
3047 12 0x38 '8' 28
3048 13 0x39 '9' 29
3049 14 0x00 30 (closing quote)
3050 ----------- ---- ----- ------- ---------------. */
3052 cpp_string dst_string;
3053 const enum cpp_ttype type = CPP_STRING;
3054 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3055 &dst_string, type);
3056 ASSERT_TRUE (result);
3057 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3058 (const char *)dst_string.text);
3059 free (const_cast <unsigned char *> (dst_string.text));
3061 /* Verify ranges of individual characters. This no longer includes the
3062 opening quote, but does include the closing quote.
3063 '01234'. */
3064 for (int i = 0; i <= 4; i++)
3065 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
/* Each UCN occupies six source columns but yields three bytes; every
   byte of a given UCN is expected to report the whole UCN's columns.  */
3066 /* U+2174. */
3067 for (int i = 5; i <= 7; i++)
3068 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3069 /* U+2175. */
3070 for (int i = 8; i <= 10; i++)
3071 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3072 /* '789' and nul terminator */
3073 for (int i = 11; i <= 14; i++)
3074 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
/* Byte offsets 0-14 from the table above, i.e. 15 ranges.  */
3076 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3079 /* Lex a string literal containing UCN 8 characters.
3080 Verify the substring location data after running cpp_interpret_string
3081 on it. */
3083 static void
3084 test_lexer_string_locations_ucn8 (const line_table_case &case_)
3086 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3087 ....................000000000.111111.1111222222.2222333333333.344444
3088 ....................123456789.012345.6789012345.6789012345678.901234 */
3089 const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
3090 lexer_test test (case_, content, NULL);
3092 /* Verify that we get the expected token back, with the correct
3093 location information. */
3094 const cpp_token *tok = test.get_token ();
3095 ASSERT_EQ (tok->type, CPP_STRING);
3096 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3097 "\"01234\\U00002174\\U00002175789\"");
3099 /* Verify that cpp_interpret_string works.
3100 The UTF-8 encoding of the string is identical to that from
3101 the ucn4 testcase above; the only difference is the column
3102 locations. */
3103 cpp_string dst_string;
3104 const enum cpp_ttype type = CPP_STRING;
3105 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3106 &dst_string, type);
3107 ASSERT_TRUE (result);
3108 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3109 (const char *)dst_string.text);
3110 free (const_cast <unsigned char *> (dst_string.text));
3112 /* Verify ranges of individual characters. This no longer includes the
3113 opening quote, but does include the closing quote.
3114 '01234'. */
3115 for (int i = 0; i <= 4; i++)
3116 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
/* Each UCN 8 occupies ten source columns but yields three bytes; all
   three bytes are expected to report the same ten-column range.  */
3117 /* U+2174. */
3118 for (int i = 5; i <= 7; i++)
3119 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3120 /* U+2175. */
3121 for (int i = 8; i <= 10; i++)
3122 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3123 /* '789' at columns 35-37 */
3124 for (int i = 11; i <= 13; i++)
3125 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3126 /* Closing quote/nul-terminator at column 38. */
3127 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
/* Indices 0-14 were checked above, i.e. 15 ranges.  */
3129 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
/* Read the four bytes stored at PTR_BE_VALUE as a big-endian 32-bit
   quantity and return the result in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);

  /* Accumulate from the most significant byte downwards.  */
  uint32_t host_value = 0;
  for (int idx = 0; idx < 4; idx++)
    host_value = (host_value << 8) | bytes[idx];
  return host_value;
}
3144 /* Lex a wide string literal and verify that attempts to read substring
3145 location data from it fail gracefully. */
3147 static void
3148 test_lexer_string_locations_wide_string (const line_table_case &case_)
3150 /* Digits 0-9.
3151 ....................000000000.11111111112.22222222233333
3152 ....................123456789.01234567890.12345678901234 */
3153 const char *content = " L\"0123456789\" /* non-str */\n";
3154 lexer_test test (case_, content, NULL);
3156 /* Verify that we get the expected token back, with the correct
3157 location information. */
3158 const cpp_token *tok = test.get_token ();
3159 ASSERT_EQ (tok->type, CPP_WSTRING);
3160 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3162 /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
3163 cpp_string dst_string;
3164 const enum cpp_ttype type = CPP_WSTRING;
3165 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3166 &dst_string, type);
3167 ASSERT_TRUE (result);
3168 /* The cpp_reader defaults to big-endian with
3169 CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3170 now be encoded as UTF-32BE. */
3171 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
/* Spot-check the first, a middle, and the last digit, then the
   terminator at index 10; each element is converted from big-endian
   before it is compared.  */
3172 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3173 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3174 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3175 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3176 free (const_cast <unsigned char *> (dst_string.text));
3178 /* We don't yet support generating substring location information
3179 for L"" strings. */
3180 ASSERT_HAS_NO_SUBSTRING_RANGES
3181 (test, tok->src_loc, type,
3182 "execution character set != source character set");
/* Read the two bytes stored at PTR_BE_VALUE as a big-endian 16-bit
   quantity and return the result in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  const unsigned int high_byte = bytes[0];
  const unsigned int low_byte = bytes[1];
  return static_cast <uint16_t> ((high_byte << 8) | low_byte);
}
3194 /* Lex a u"" string literal and verify that attempts to read substring
3195 location data from it fail gracefully. */
3197 static void
3198 test_lexer_string_locations_string16 (const line_table_case &case_)
3200 /* Digits 0-9.
3201 ....................000000000.11111111112.22222222233333
3202 ....................123456789.01234567890.12345678901234 */
3203 const char *content = " u\"0123456789\" /* non-str */\n";
3204 lexer_test test (case_, content, NULL);
3206 /* Verify that we get the expected token back, with the correct
3207 location information. */
3208 const cpp_token *tok = test.get_token ();
3209 ASSERT_EQ (tok->type, CPP_STRING16);
3210 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3212 /* Verify that cpp_interpret_string works, using CPP_STRING16. */
3213 cpp_string dst_string;
3214 const enum cpp_ttype type = CPP_STRING16;
3215 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3216 &dst_string, type);
3217 ASSERT_TRUE (result);
3219 /* The cpp_reader defaults to big-endian, so dst_string should
3220 now be encoded as UTF-16BE. */
3221 const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
/* Spot-check the first, a middle, and the last digit, then the
   terminator at index 10; each element is converted from big-endian
   before it is compared.  */
3222 ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3223 ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3224 ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3225 ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3226 free (const_cast <unsigned char *> (dst_string.text));
3228 /* We don't yet support generating substring location information
3229 for u"" strings. */
3230 ASSERT_HAS_NO_SUBSTRING_RANGES
3231 (test, tok->src_loc, type,
3232 "execution character set != source character set");
3235 /* Lex a U"" string literal and verify that attempts to read substring
3236 location data from it fail gracefully. */
3238 static void
3239 test_lexer_string_locations_string32 (const line_table_case &case_)
3241 /* Digits 0-9.
3242 ....................000000000.11111111112.22222222233333
3243 ....................123456789.01234567890.12345678901234 */
3244 const char *content = " U\"0123456789\" /* non-str */\n";
3245 lexer_test test (case_, content, NULL);
3247 /* Verify that we get the expected token back, with the correct
3248 location information. */
3249 const cpp_token *tok = test.get_token ();
3250 ASSERT_EQ (tok->type, CPP_STRING32);
3251 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3253 /* Verify that cpp_interpret_string works, using CPP_STRING32. */
3254 cpp_string dst_string;
3255 const enum cpp_ttype type = CPP_STRING32;
3256 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3257 &dst_string, type);
3258 ASSERT_TRUE (result);
3260 /* The cpp_reader defaults to big-endian, so dst_string should
3261 now be encoded as UTF-32BE. */
3262 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
/* Spot-check the first, a middle, and the last digit, then the
   terminator at index 10; each element is converted from big-endian
   before it is compared.  */
3263 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3264 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3265 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3266 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3267 free (const_cast <unsigned char *> (dst_string.text));
3269 /* We don't yet support generating substring location information
3270 for U"" strings. */
3271 ASSERT_HAS_NO_SUBSTRING_RANGES
3272 (test, tok->src_loc, type,
3273 "execution character set != source character set");
3276 /* Lex a u8-string literal.
3277 Verify the substring location data after running cpp_interpret_string
3278 on it. */
3280 static void
3281 test_lexer_string_locations_u8 (const line_table_case &case_)
3283 /* Digits 0-9.
3284 ....................000000000.11111111112.22222222233333
3285 ....................123456789.01234567890.12345678901234 */
3286 const char *content = " u8\"0123456789\" /* non-str */\n";
3287 lexer_test test (case_, content, NULL);
3289 /* Verify that we get the expected token back, with the correct
3290 location information. */
3291 const cpp_token *tok = test.get_token ();
3292 ASSERT_EQ (tok->type, CPP_UTF8STRING);
3293 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3295 /* Verify that cpp_interpret_string works. */
3296 cpp_string dst_string;
/* NOTE(review): the token was lexed as CPP_UTF8STRING, but CPP_STRING
   is the type handed to cpp_interpret_string and to the range checks
   below - confirm that this is intentional.  */
3297 const enum cpp_ttype type = CPP_STRING;
3298 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3299 &dst_string, type);
3300 ASSERT_TRUE (result);
3301 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3302 free (const_cast <unsigned char *> (dst_string.text));
3304 /* Verify ranges of individual characters. This no longer includes the
3305 opening quote, but does include the closing quote. */
/* Indices 0-10 are checked; this test has no
   ASSERT_NUM_SUBSTRING_RANGES check on the total count.  */
3306 for (int i = 0; i <= 10; i++)
3307 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3310 /* Lex a string literal containing UTF-8 source characters.
3311 Verify the substring location data after running cpp_interpret_string
3312 on it. */
3314 static void
3315 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3317 /* This string literal is written out to the source file as UTF-8,
3318 and is of the form "before mojibake after", where "mojibake"
3319 is written as the following four unicode code points:
3320 U+6587 CJK UNIFIED IDEOGRAPH-6587
3321 U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3322 U+5316 CJK UNIFIED IDEOGRAPH-5316
3323 U+3051 HIRAGANA LETTER KE.
3324 Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3325 "before" and "after" are 1 byte per unicode character.
3327 The numbering shown are "columns", which are *byte* numbers within
3328 the line, rather than unicode character numbers.
3330 .................... 000000000.1111111.
3331 .................... 123456789.0123456. */
3332 const char *content = (" \"before "
3333 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3334 UTF-8: 0xE6 0x96 0x87
3335 C octal escaped UTF-8: \346\226\207
3336 "column" numbers: 17-19. */
3337 "\346\226\207"
3339 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3340 UTF-8: 0xE5 0xAD 0x97
3341 C octal escaped UTF-8: \345\255\227
3342 "column" numbers: 20-22. */
3343 "\345\255\227"
3345 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3346 UTF-8: 0xE5 0x8C 0x96
3347 C octal escaped UTF-8: \345\214\226
3348 "column" numbers: 23-25. */
3349 "\345\214\226"
3351 /* U+3051 HIRAGANA LETTER KE
3352 UTF-8: 0xE3 0x81 0x91
3353 C octal escaped UTF-8: \343\201\221
3354 "column" numbers: 26-28. */
3355 "\343\201\221"
3357 /* column numbers 29 onwards
3358 2333333.33334444444444
3359 9012345.67890123456789. */
3360 " after\" /* non-str */\n");
3361 lexer_test test (case_, content, NULL);
3363 /* Verify that we get the expected token back, with the correct
3364 location information. */
3365 const cpp_token *tok = test.get_token ();
3366 ASSERT_EQ (tok->type, CPP_STRING);
3367 ASSERT_TOKEN_AS_TEXT_EQ
3368 (test.m_parser, tok,
3369 "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3371 /* Verify that cpp_interpret_string works. */
3372 cpp_string dst_string;
3373 const enum cpp_ttype type = CPP_STRING;
3374 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3375 &dst_string, type);
3376 ASSERT_TRUE (result);
3377 ASSERT_STREQ
3378 ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3379 (const char *)dst_string.text);
3380 free (const_cast <unsigned char *> (dst_string.text));
3382 /* Verify ranges of individual characters. This no longer includes the
3383 opening quote, but does include the closing quote.
3384 Assuming that both source and execution encodings are UTF-8, we have
3385 a run of 25 octets in each, plus the NUL terminator. */
/* The 25 octets are 7 for "before ", 12 for the four 3-byte code
   points, and 6 for " after"; octet i is expected at column 10 + i,
   each with a one-column range.  */
3386 for (int i = 0; i < 25; i++)
3387 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3388 /* NUL-terminator should use the closing quote at column 35. */
3389 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3391 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3394 /* Test of string literal concatenation.
   Two adjacent literals on separate lines are lexed, interpreted as one
   string, and recorded as a concatenation; ranges are then queried via
   the location of the first literal. */
3396 static void
3397 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3399 /* Digits 0-9.
3400 .....................000000000.111111.11112222222222
3401 .....................123456789.012345.67890123456789. */
3402 const char *content = (" \"01234\" /* non-str */\n"
3403 " \"56789\" /* non-str */\n");
3404 lexer_test test (case_, content, NULL);
3406 location_t input_locs[2];
3408 /* Verify that we get the expected tokens back. */
3409 auto_vec <cpp_string> input_strings;
3410 const cpp_token *tok_a = test.get_token ();
3411 ASSERT_EQ (tok_a->type, CPP_STRING);
3412 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3413 input_strings.safe_push (tok_a->val.str);
3414 input_locs[0] = tok_a->src_loc;
3416 const cpp_token *tok_b = test.get_token ();
3417 ASSERT_EQ (tok_b->type, CPP_STRING);
3418 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3419 input_strings.safe_push (tok_b->val.str);
3420 input_locs[1] = tok_b->src_loc;
3422 /* Verify that cpp_interpret_string works. */
3423 cpp_string dst_string;
3424 const enum cpp_ttype type = CPP_STRING;
3425 bool result = cpp_interpret_string (test.m_parser,
3426 input_strings.address (), 2,
3427 &dst_string, type);
3428 ASSERT_TRUE (result);
3429 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3430 free (const_cast <unsigned char *> (dst_string.text));
3432 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3433 test.m_concats.record_string_concatenation (2, input_locs);
3435 location_t initial_loc = input_locs[0];
3437 /* "01234" on line 1. */
3438 for (int i = 0; i <= 4; i++)
3439 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3440 /* "56789" in line 2, plus its closing quote for the nul terminator. */
/* Indices 5-10 map to columns 10-15 of line 2, hence 5 + i.  */
3441 for (int i = 5; i <= 10; i++)
3442 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3444 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3447 /* Another test of string literal concatenation.
   Five two-digit literals, one per line, are concatenated. */
3449 static void
3450 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3452 /* Digits 0-9.
3453 .....................000000000.111.11111112222222
3454 .....................123456789.012.34567890123456. */
3455 const char *content = (" \"01\" /* non-str */\n"
3456 " \"23\" /* non-str */\n"
3457 " \"45\" /* non-str */\n"
3458 " \"67\" /* non-str */\n"
3459 " \"89\" /* non-str */\n");
3460 lexer_test test (case_, content, NULL);
3462 auto_vec <cpp_string> input_strings;
3463 location_t input_locs[5];
3465 /* Verify that we get the expected tokens back. */
3466 for (int i = 0; i < 5; i++)
3468 const cpp_token *tok = test.get_token ();
3469 ASSERT_EQ (tok->type, CPP_STRING);
3470 input_strings.safe_push (tok->val.str);
3471 input_locs[i] = tok->src_loc;
3474 /* Verify that cpp_interpret_string works. */
3475 cpp_string dst_string;
3476 const enum cpp_ttype type = CPP_STRING;
3477 bool result = cpp_interpret_string (test.m_parser,
3478 input_strings.address (), 5,
3479 &dst_string, type);
3480 ASSERT_TRUE (result);
3481 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3482 free (const_cast <unsigned char *> (dst_string.text));
3484 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3485 test.m_concats.record_string_concatenation (5, input_locs);
3487 location_t initial_loc = input_locs[0];
3489 /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3490 detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3491 and expect get_source_range_for_substring to fail.
3492 However, for a string concatenation test, we can have a case
3493 where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3494 but subsequent strings can be after it.
3495 Attempting to detect this within assert_char_at_range
3496 would overcomplicate the logic for the common test cases, so
3497 we detect it here. */
3498 if (should_have_column_data_p (input_locs[0])
3499 && !should_have_column_data_p (input_locs[4]))
3501 /* Verify that get_source_range_for_substring gracefully rejects
3502 this case. */
3503 source_range actual_range;
3504 const char *err
3505 = get_source_range_for_char (test.m_parser, test.m_file_cache,
3506 &test.m_concats,
3507 initial_loc, type, 0, &actual_range);
3508 ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3509 return;
/* Literal I (0-based) is on line I + 1; its two digits are expected at
   columns 10 and 11 and map to string indices 2 * I and 2 * I + 1.  */
3512 for (int i = 0; i < 5; i++)
3513 for (int j = 0; j < 2; j++)
3514 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3515 i + 1, 10 + j, 10 + j);
3517 /* NUL-terminator should use the final closing quote at line 5 column 12. */
3518 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3520 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3523 /* Another test of string literal concatenation, this time combined with
3524 various kinds of escaped characters. */
3526 static void
3527 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3529 /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3530 digit 6 in ASCII as octal "\066", concatenating multiple strings. */
3531 const char *content
3532 /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3533 .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3534 = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
3535 lexer_test test (case_, content, NULL);
3537 auto_vec <cpp_string> input_strings;
3538 location_t input_locs[4];
3540 /* Verify that we get the expected tokens back. */
3541 for (int i = 0; i < 4; i++)
3543 const cpp_token *tok = test.get_token ();
3544 ASSERT_EQ (tok->type, CPP_STRING);
3545 input_strings.safe_push (tok->val.str);
3546 input_locs[i] = tok->src_loc;
3549 /* Verify that cpp_interpret_string works. */
3550 cpp_string dst_string;
3551 const enum cpp_ttype type = CPP_STRING;
3552 bool result = cpp_interpret_string (test.m_parser,
3553 input_strings.address (), 4,
3554 &dst_string, type);
3555 ASSERT_TRUE (result);
3556 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3557 free (const_cast <unsigned char *> (dst_string.text));
3559 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3560 test.m_concats.record_string_concatenation (4, input_locs);
3562 location_t initial_loc = input_locs[0];
/* "01234" from the first literal.  */
3564 for (int i = 0; i <= 4; i++)
3565 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
/* The hex escape (columns 19-22) and the octal escape (columns 27-30)
   each produce a single character spanning the whole escape.  */
3566 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3567 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
/* "789" from the last literal, at columns 35-37.  */
3568 for (int i = 7; i <= 9; i++)
3569 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3571 /* NUL-terminator should use the location of the final closing quote. */
3572 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3574 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3577 /* Test of string literal in a macro.
   The string token obtained from expanding MACRO on line 2 is expected
   to report character ranges on line 1, where the macro is defined. */
3579 static void
3580 test_lexer_string_locations_macro (const line_table_case &case_)
3582 /* Digits 0-9.
3583 .....................0000000001111111111.22222222223.
3584 .....................1234567890123456789.01234567890. */
3585 const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
3586 " MACRO");
3587 lexer_test test (case_, content, NULL);
3589 /* Verify that we get the expected tokens back. */
/* A CPP_PADDING token is expected both before and after the string.  */
3590 const cpp_token *tok = test.get_token ();
3591 ASSERT_EQ (tok->type, CPP_PADDING);
3593 tok = test.get_token ();
3594 ASSERT_EQ (tok->type, CPP_STRING);
3595 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3597 /* Verify ranges of individual characters. We ought to
3598 see columns within the macro definition. */
3599 for (int i = 0; i <= 10; i++)
3600 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3601 i, 1, 20 + i, 20 + i);
3603 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3605 tok = test.get_token ();
3606 ASSERT_EQ (tok->type, CPP_PADDING);
3609 /* Test of stringification of a macro argument.
   Substring ranges are not available for such a string; the test checks
   that the query is rejected with an error message. */
3611 static void
3612 test_lexer_string_locations_stringified_macro_argument
3613 (const line_table_case &case_)
3615 /* .....................000000000111111111122222222223.
3616 .....................123456789012345678901234567890. */
3617 const char *content = ("#define MACRO(X) #X /* non-str */\n"
3618 "MACRO(foo)\n");
3619 lexer_test test (case_, content, NULL);
3621 /* Verify that we get the expected token back. */
3622 const cpp_token *tok = test.get_token ();
3623 ASSERT_EQ (tok->type, CPP_PADDING);
3625 tok = test.get_token ();
3626 ASSERT_EQ (tok->type, CPP_STRING);
3627 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3629 /* We don't support getting the location of a stringified macro
3630 argument. Verify that it fails gracefully. */
3631 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3632 "cpp_interpret_string_1 failed");
/* Two further CPP_PADDING tokens are expected after the string.  */
3634 tok = test.get_token ();
3635 ASSERT_EQ (tok->type, CPP_PADDING);
3637 tok = test.get_token ();
3638 ASSERT_EQ (tok->type, CPP_PADDING);
3641 /* Ensure that we fail gracefully if something attempts to pass
3642 in a location that isn't a string literal token. Seen on this code:
3644 const char a[] = " %d ";
3645 __builtin_printf (a, 0.5);
3648 when c-format.cc erroneously used the indicated one-character
3649 location as the format string location, leading to a read past the
3650 end of a string buffer in cpp_interpret_string_1. */
3652 static void
3653 test_lexer_string_locations_non_string (const line_table_case &case_)
3655 /* .....................000000000111111111122222222223.
3656 .....................123456789012345678901234567890. */
3657 const char *content = (" a\n");
3658 lexer_test test (case_, content, NULL);
3660 /* Verify that we get the expected token back. */
3661 const cpp_token *tok = test.get_token ();
3662 ASSERT_EQ (tok->type, CPP_NAME);
3663 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3665 /* At this point, libcpp is attempting to interpret the name as a
3666 string literal, despite it not starting with a quote. We don't detect
3667 that, but we should at least fail gracefully. */
/* The location of the CPP_NAME token is deliberately queried as though
   it were a CPP_STRING.  */
3668 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3669 "cpp_interpret_string_1 failed");
3672 /* Ensure that we can read substring information for a token which
3673 starts in one linemap and ends in another. Adapted from
3674 gcc.dg/cpp/pr69985.c. */
3676 static void
3677 test_lexer_string_locations_long_line (const line_table_case &case_)
3679 /* .....................000000.000111111111
3680 .....................123456.789012346789. */
3681 const char *content = ("/* A very long line, so that we start a new line map. */\n"
3682 " \"0123456789012345678901234567890123456789"
3683 "0123456789012345678901234567890123456789"
3684 "0123456789012345678901234567890123456789"
3685 "0123456789\"\n");
3687 lexer_test test (case_, content, NULL);
3689 /* Verify that we get the expected token back. */
3690 const cpp_token *tok = test.get_token ();
3691 ASSERT_EQ (tok->type, CPP_STRING);
/* Skip the range checks if should_have_column_data_p rejects the
   highest location in the line table.  */
3693 if (!should_have_column_data_p (line_table->highest_location))
3694 return;
3696 /* Verify ranges of individual characters. */
/* The literal holds 130 digits (40 + 40 + 40 + 10); the 131st range is
   presumably the closing quote standing in for the nul-terminator.  */
3697 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3698 for (int i = 0; i < 131; i++)
3699 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3700 i, 2, 7 + i, 7 + i);
3703 /* Test of locations within a raw string that doesn't contain a newline. */
3705 static void
3706 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3708 /* .....................00.0000000111111111122.
3709 .....................12.3456789012345678901. */
3710 const char *content = ("R\"foo(0123456789)foo\"\n");
3711 lexer_test test (case_, content, NULL);
3713 /* Verify that we get the expected token back. */
3714 const cpp_token *tok = test.get_token ();
3715 ASSERT_EQ (tok->type, CPP_STRING);
3717 /* Verify that cpp_interpret_string works. */
3718 cpp_string dst_string;
3719 const enum cpp_ttype type = CPP_STRING;
3720 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3721 &dst_string, type);
3722 ASSERT_TRUE (result);
3723 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3724 free (const_cast <unsigned char *> (dst_string.text));
/* Skip the range checks if should_have_column_data_p rejects the
   highest location in the line table.  */
3726 if (!should_have_column_data_p (line_table->highest_location))
3727 return;
3729 /* 0-9, plus the nil terminator. */
/* Column 7 is the first character after the six-column opener; index
   10 (the terminator) therefore lands on column 17, the ')' that
   begins the closing delimiter.  */
3730 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3731 for (int i = 0; i < 11; i++)
3732 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3733 i, 1, 7 + i, 7 + i);
3736 /* Test of locations within a raw string that contains a newline. */
3738 static void
3739 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3741 /* .....................00.0000.
3742 .....................12.3456. */
3743 const char *content = ("R\"foo(\n"
3744 /* .....................00000.
3745 .....................12345. */
3746 "hello\n"
3747 "world\n"
3748 /* .....................00000.
3749 .....................12345. */
3750 ")foo\"\n");
3751 lexer_test test (case_, content, NULL);
3753 /* Verify that we get the expected token back. */
3754 const cpp_token *tok = test.get_token ();
3755 ASSERT_EQ (tok->type, CPP_STRING);
3757 /* Verify that cpp_interpret_string works. */
3758 cpp_string dst_string;
3759 const enum cpp_ttype type = CPP_STRING;
3760 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3761 &dst_string, type);
3762 ASSERT_TRUE (result);
/* The interpreted string keeps the newline that follows the opening
   delimiter and the one that precedes the closing delimiter.  */
3763 ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3764 free (const_cast <unsigned char *> (dst_string.text));
3766 if (!should_have_column_data_p (line_table->highest_location))
3767 return;
3769 /* Currently we don't support locations within raw strings that
3770 contain newlines. */
/* tok->type is passed here rather than the local TYPE; both are
   CPP_STRING, as asserted above.  */
3771 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3772 "range endpoints are on different lines");
3775 /* Test of parsing an unterminated raw string. */
3777 static void
3778 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
/* The opening delimiter is "ouch" but the would-be closing delimiter is
   spelled "ouCh", so the raw string is never terminated.  */
3780 const char *content = "R\"ouch()ouCh\" /* etc */";
3782 lexer_diagnostic_sink diagnostics;
3783 lexer_test test (case_, content, &diagnostics);
/* NOTE(review): presumably this stops lexer_test from itself expecting
   an EOF token later, since EOF is consumed explicitly below -
   confirm against lexer_test.  */
3784 test.m_implicitly_expect_EOF = false;
3786 /* Attempt to parse the raw string. */
3787 const cpp_token *tok = test.get_token ();
3788 ASSERT_EQ (tok->type, CPP_EOF);
/* Exactly one diagnostic should have been collected by the sink.  */
3790 ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3791 /* We expect the message "unterminated raw string"
3792 in the "cpplib" translation domain.
3793 It's not clear that dgettext is available on all supported hosts,
3794 so this assertion is commented-out for now.
3795 ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3796 diagnostics.m_diagnostics[0]);
3800 /* Test of lexing char constants. */
3802 static void
3803 test_lexer_char_constants (const line_table_case &case_)
3805 /* Various char constants.
3806 .....................0000000001111111111.22222222223.
3807 .....................1234567890123456789.01234567890. */
3808 const char *content = (" 'a'\n"
3809 " u'a'\n"
3810 " U'a'\n"
3811 " L'a'\n"
3812 " 'abc'\n");
3813 lexer_test test (case_, content, NULL);
3815 /* Verify that we get the expected tokens back. */
3816 /* 'a'. */
3817 const cpp_token *tok = test.get_token ();
3818 ASSERT_EQ (tok->type, CPP_CHAR);
3819 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3821 unsigned int chars_seen;
3822 int unsignedp;
3823 cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3824 &chars_seen, &unsignedp);
3825 ASSERT_EQ (cc, 'a');
3826 ASSERT_EQ (chars_seen, 1);
3828 /* u'a'. */
3829 tok = test.get_token ();
3830 ASSERT_EQ (tok->type, CPP_CHAR16);
3831 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3833 /* U'a'. */
3834 tok = test.get_token ();
3835 ASSERT_EQ (tok->type, CPP_CHAR32);
3836 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3838 /* L'a'. */
3839 tok = test.get_token ();
3840 ASSERT_EQ (tok->type, CPP_WCHAR);
3841 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3843 /* 'abc' (c-char-sequence). */
3844 tok = test.get_token ();
3845 ASSERT_EQ (tok->type, CPP_CHAR);
3846 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3848 /* A table of interesting location_t values, giving one axis of our test
3849 matrix. */
3851 static const location_t boundary_locations[] = {
3852 /* Zero means "don't override the default values for a new line_table". */
3855 /* An arbitrary non-zero value that isn't close to one of
3856 the boundary values below. */
3857 0x10000,
3859 /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES. */
3860 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3861 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3862 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3863 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3864 LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3866 /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS. */
3867 LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3868 LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3869 LINE_MAP_MAX_LOCATION_WITH_COLS,
3870 LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3871 LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3874 /* Run TESTCASE multiple times, once for each case in our test matrix. */
3876 void
3877 for_each_line_table_case (void (*testcase) (const line_table_case &))
3879 /* As noted above in the description of struct line_table_case,
3880 we want to explore a test matrix of interesting line_table
3881 situations, running various selftests for each case within the
3882 matrix. */
3884 /* Run all tests with:
3885 (a) line_table->default_range_bits == 0, and
3886 (b) line_table->default_range_bits == 5. */
3887 int num_cases_tested = 0;
3888 for (int default_range_bits = 0; default_range_bits <= 5;
3889 default_range_bits += 5)
3891 /* ...and use each of the "interesting" location values as
3892 the starting location within line_table. */
3893 const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3894 for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3896 line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3898 testcase (c);
3900 num_cases_tested++;
3904 /* Verify that we fully covered the test matrix. */
3905 ASSERT_EQ (num_cases_tested, 2 * 12);
3908 /* Verify that when presented with a consecutive pair of locations with
3909 a very large line offset, we don't attempt to consolidate them into
3910 a single ordinary linemap where the line offsets within the line map
3911 would lead to overflow (PR lto/88147). */
3913 static void
3914 test_line_offset_overflow ()
3916 line_table_test ltt (line_table_case (5, 0));
3918 linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3919 linemap_line_start (line_table, 1, 100);
3920 location_t loc_a = linemap_line_start (line_table, 2578, 255);
3921 assert_loceq ("foo.c", 2578, 0, loc_a);
3923 const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3924 ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3925 ASSERT_EQ (ordmap_a->m_range_bits, 5);
3927 location_t loc_b = linemap_line_start (line_table, 404198, 512);
3928 assert_loceq ("foo.c", 404198, 0, loc_b);
3930 /* We should have started a new linemap, rather than attempting to store
3931 a very large line offset. */
3932 const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3933 ASSERT_NE (ordmap_a, ordmap_b);
3936 void test_cpp_utf8 ()
3938 const int def_tabstop = 8;
3939 cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3941 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
3943 int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3944 ASSERT_EQ (8, w_bad);
3945 int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3946 ASSERT_EQ (5, w_ctrl);
3949 /* Verify that wcwidth of valid UTF-8 is as expected. */
3951 const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3952 ASSERT_EQ (1, w_pi);
3953 const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3954 ASSERT_EQ (2, w_emoji);
3955 const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3956 policy);
3957 ASSERT_EQ (1, w_umlaut_precomposed);
3958 const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3959 policy);
3960 ASSERT_EQ (1, w_umlaut_combining);
3961 const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3962 ASSERT_EQ (2, w_han);
3963 const int w_ascii = cpp_display_width ("GCC", 3, policy);
3964 ASSERT_EQ (3, w_ascii);
3965 const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3966 "\x9f! \xe4\xb8\xba y\xcc\x88",
3967 24, policy);
3968 ASSERT_EQ (18, w_mixed);
3971 /* Verify that display width properly expands tabs. */
3973 const char *tstr = "\tabc\td";
3974 ASSERT_EQ (6, cpp_display_width (tstr, 6,
3975 cpp_char_column_policy (1, cpp_wcwidth)));
3976 ASSERT_EQ (10, cpp_display_width (tstr, 6,
3977 cpp_char_column_policy (3, cpp_wcwidth)));
3978 ASSERT_EQ (17, cpp_display_width (tstr, 6,
3979 cpp_char_column_policy (8, cpp_wcwidth)));
3980 ASSERT_EQ (1,
3981 cpp_display_column_to_byte_column
3982 (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
3985 /* Verify that cpp_byte_column_to_display_column can go past the end,
3986 and similar edge cases. */
3988 const char *str
3989 /* Display columns.
3990 111111112345 */
3991 = "\xcf\x80 abc";
3992 /* 111122223456
3993 Byte columns. */
3995 ASSERT_EQ (5, cpp_display_width (str, 6, policy));
3996 ASSERT_EQ (105,
3997 cpp_byte_column_to_display_column (str, 6, 106, policy));
3998 ASSERT_EQ (10000,
3999 cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4000 ASSERT_EQ (0,
4001 cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4004 /* Verify that cpp_display_column_to_byte_column can go past the end,
4005 and similar edge cases, and check invertibility. */
4007 const char *str
4008 /* Display columns.
4009 000000000000000000000000000000000000011
4010 111111112222222234444444455555555678901 */
4011 = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4012 /* 000000000000000000000000000000000111111
4013 111122223333444456666777788889999012345
4014 Byte columns. */
4015 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4016 ASSERT_EQ (15,
4017 cpp_display_column_to_byte_column (str, 15, 11, policy));
4018 ASSERT_EQ (115,
4019 cpp_display_column_to_byte_column (str, 15, 111, policy));
4020 ASSERT_EQ (10000,
4021 cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4022 ASSERT_EQ (0,
4023 cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4025 /* Verify that we do not interrupt a UTF-8 sequence. */
4026 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4028 for (int byte_col = 1; byte_col <= 15; ++byte_col)
4030 const int disp_col
4031 = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
4032 const int byte_col2
4033 = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
4035 /* If we ask for the display column in the middle of a UTF-8
4036 sequence, it will return the length of the partial sequence,
4037 matching the behavior of GCC before display column support.
4038 Otherwise check the round trip was successful. */
4039 if (byte_col < 4)
4040 ASSERT_EQ (byte_col, disp_col);
4041 else if (byte_col >= 6 && byte_col < 9)
4042 ASSERT_EQ (3 + (byte_col - 5), disp_col);
4043 else
4044 ASSERT_EQ (byte_col2, byte_col);
4049 static bool
4050 check_cpp_valid_utf8_p (const char *str)
4052 return cpp_valid_utf8_p (str, strlen (str));
4055 /* Check that cpp_valid_utf8_p works as expected. */
4057 static void
4058 test_cpp_valid_utf8_p ()
4060 ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
4062 /* 2-byte char (pi). */
4063 ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
4065 /* 3-byte chars (the Japanese word "mojibake"). */
4066 ASSERT_TRUE (check_cpp_valid_utf8_p
4068 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
4069 UTF-8: 0xE6 0x96 0x87
4070 C octal escaped UTF-8: \346\226\207. */
4071 "\346\226\207"
4072 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
4073 UTF-8: 0xE5 0xAD 0x97
4074 C octal escaped UTF-8: \345\255\227. */
4075 "\345\255\227"
4076 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
4077 UTF-8: 0xE5 0x8C 0x96
4078 C octal escaped UTF-8: \345\214\226. */
4079 "\345\214\226"
4080 /* U+3051 HIRAGANA LETTER KE
4081 UTF-8: 0xE3 0x81 0x91
4082 C octal escaped UTF-8: \343\201\221. */
4083 "\343\201\221"));
4085 /* 4-byte char: an emoji. */
4086 ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
4088 /* Control codes, including the NUL byte. */
4089 ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
4091 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
4093 /* Unexpected continuation bytes. */
4094 for (unsigned char continuation_byte = 0x80;
4095 continuation_byte <= 0xbf;
4096 continuation_byte++)
4097 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
4099 /* "Lonely start characters" for 2-byte sequences. */
4101 unsigned char buf[2];
4102 buf[1] = ' ';
4103 for (buf[0] = 0xc0;
4104 buf[0] <= 0xdf;
4105 buf[0]++)
4106 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4109 /* "Lonely start characters" for 3-byte sequences. */
4111 unsigned char buf[2];
4112 buf[1] = ' ';
4113 for (buf[0] = 0xe0;
4114 buf[0] <= 0xef;
4115 buf[0]++)
4116 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4119 /* "Lonely start characters" for 4-byte sequences. */
4121 unsigned char buf[2];
4122 buf[1] = ' ';
4123 for (buf[0] = 0xf0;
4124 buf[0] <= 0xf4;
4125 buf[0]++)
4126 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4129 /* Invalid start characters (formerly valid for 5-byte and 6-byte
4130 sequences). */
4132 unsigned char buf[2];
4133 buf[1] = ' ';
4134 for (buf[0] = 0xf5;
4135 buf[0] <= 0xfd;
4136 buf[0]++)
4137 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4140 /* Impossible bytes. */
4141 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
4142 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
4143 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
4144 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
4147 /* Run all of the selftests within this file. */
4149 void
4150 input_cc_tests ()
4152 test_linenum_comparisons ();
4153 test_should_have_column_data_p ();
4154 test_unknown_location ();
4155 test_builtins ();
4156 for_each_line_table_case (test_make_location_nonpure_range_endpoints);
4158 for_each_line_table_case (test_accessing_ordinary_linemaps);
4159 for_each_line_table_case (test_lexer);
4160 for_each_line_table_case (test_lexer_string_locations_simple);
4161 for_each_line_table_case (test_lexer_string_locations_ebcdic);
4162 for_each_line_table_case (test_lexer_string_locations_hex);
4163 for_each_line_table_case (test_lexer_string_locations_oct);
4164 for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
4165 for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
4166 for_each_line_table_case (test_lexer_string_locations_ucn4);
4167 for_each_line_table_case (test_lexer_string_locations_ucn8);
4168 for_each_line_table_case (test_lexer_string_locations_wide_string);
4169 for_each_line_table_case (test_lexer_string_locations_string16);
4170 for_each_line_table_case (test_lexer_string_locations_string32);
4171 for_each_line_table_case (test_lexer_string_locations_u8);
4172 for_each_line_table_case (test_lexer_string_locations_utf8_source);
4173 for_each_line_table_case (test_lexer_string_locations_concatenation_1);
4174 for_each_line_table_case (test_lexer_string_locations_concatenation_2);
4175 for_each_line_table_case (test_lexer_string_locations_concatenation_3);
4176 for_each_line_table_case (test_lexer_string_locations_macro);
4177 for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
4178 for_each_line_table_case (test_lexer_string_locations_non_string);
4179 for_each_line_table_case (test_lexer_string_locations_long_line);
4180 for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
4181 for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
4182 for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
4183 for_each_line_table_case (test_lexer_char_constants);
4185 test_reading_source_line ();
4187 test_line_offset_overflow ();
4189 test_cpp_utf8 ();
4190 test_cpp_valid_utf8_p ();
4193 } // namespace selftest
4195 #endif /* CHECKING_P */