gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2017 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   /* The file_path is the key for identifying a particular file in
  67      the cache.
  68      For libcpp-using code, the underlying buffer for this field is
  69      owned by the corresponding _cpp_file within the cpp_reader.  */
  70   const char *file_path;
  71
  72   FILE *fp;
  73
  74   /* This points to the content of the file that we've read so
  75      far.  */
  76   char *data;
  77
  78   /*  The size of the DATA array above.*/
  79   size_t size;
  80
  81   /* The number of bytes read from the underlying file so far.  This
  82      must be less (or equal) than SIZE above.  */
  83   size_t nb_read;
  84
  85   /* The index of the beginning of the current line.  */
  86   size_t line_start_idx;
  87
  88   /* The number of the previous line read.  This starts at 1.  Zero
  89      means we've read no line so far.  */
  90   size_t line_num;
  91
  92   /* This is the total number of lines of the current file.  At the
  93      moment, we try to get this information from the line map
  94      subsystem.  Note that this is just a hint.  When using the C++
  95      front-end, this hint is correct because the input file is then
  96      completely tokenized before parsing starts; so the line map knows
  97      the number of lines before compilation really starts.  For e.g,
  98      the C front-end, it can happen that we start emitting diagnostics
  99      before the line map has seen the end of the file.  */
 100   size_t total_lines;
 101
 102   /* Could this file be missing a trailing newline on its final line?
 103      Initially true (to cope with empty files), set to true/false
 104      as each line is read.  */
 105   bool missing_trailing_newline;
 106
 107   /* This is a record of the beginning and end of the lines we've seen
 108      while reading the file.  This is useful to avoid walking the data
 109      from the beginning when we are asked to read a line that is
 110      before LINE_START_IDX above.  Note that the maximum size of this
 111      record is fcache_line_record_size, so that the memory consumption
 112      doesn't explode.  We thus scale total_lines down to
 113      fcache_line_record_size.  */
 114   vec<line_info, va_heap> line_record;
 115
 116   fcache ();
 117   ~fcache ();
 118 };
 119
 120 /* Current position in real source file.  */
 121
 122 location_t input_location = UNKNOWN_LOCATION;
 123
 124 struct line_maps *line_table;
 125
 126 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 127    This needs to be a global so that it can be a GC root, and thus
 128    prevent the stashed copy from being garbage-collected if the GC runs
 129    during a line_table_test.  */
 130
 131 struct line_maps *saved_line_table;
 132
 133 static fcache *fcache_tab;
 134 static const size_t fcache_tab_size = 16;
 135 static const size_t fcache_buffer_size = 4 * 1024;
 136 static const size_t fcache_line_record_size = 100;
 137
 138 /* Expand the source location LOC into a human readable location.  If
 139    LOC resolves to a builtin location, the file name of the readable
 140    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 141    TRUE and LOC is virtual, then it is resolved to the expansion
 142    point of the involved macro.  Otherwise, it is resolved to the
 143    spelling location of the token.
 144
 145    When resolving to the spelling location of the token, if the
 146    resulting location is for a built-in location (that is, it has no
 147    associated line/column) in the context of a macro expansion, the
 148    returned location is the first one (while unwinding the macro
 149    location towards its expansion point) that is in real source
 150    code.  */
 151
 152 static expanded_location
 153 expand_location_1 (source_location loc,
 154                    bool expansion_point_p)
 155 {
 156   expanded_location xloc;
 157   const line_map_ordinary *map;
 158   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 159   tree block = NULL;
 160
 161   if (IS_ADHOC_LOC (loc))
 162     {
 163       block = LOCATION_BLOCK (loc);
 164       loc = LOCATION_LOCUS (loc);
 165     }
 166
 167   memset (&xloc, 0, sizeof (xloc));
 168
 169   if (loc >= RESERVED_LOCATION_COUNT)
 170     {
 171       if (!expansion_point_p)
 172         {
 173           /* We want to resolve LOC to its spelling location.
 174
 175              But if that spelling location is a reserved location that
 176              appears in the context of a macro expansion (like for a
 177              location for a built-in token), let's consider the first
 178              location (toward the expansion point) that is not reserved;
 179              that is, the first location that is in real source code.  */
 180           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 181                                                           loc, NULL);
 182           lrk = LRK_SPELLING_LOCATION;
 183         }
 184       loc = linemap_resolve_location (line_table, loc,
 185                                       lrk, &map);
 186       xloc = linemap_expand_location (line_table, map, loc);
 187     }
 188
 189   xloc.data = block;
 190   if (loc <= BUILTINS_LOCATION)
 191     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 192
 193   return xloc;
 194 }
 195
 196 /* Initialize the set of cache used for files accessed by caret
 197    diagnostic.  */
 198
 199 static void
 200 diagnostic_file_cache_init (void)
 201 {
 202   if (fcache_tab == NULL)
 203     fcache_tab = new fcache[fcache_tab_size];
 204 }
 205
 206 /* Free the resources used by the set of cache used for files accessed
 207    by caret diagnostic.  */
 208
 209 void
 210 diagnostic_file_cache_fini (void)
 211 {
 212   if (fcache_tab)
 213     {
 214       delete [] (fcache_tab);
 215       fcache_tab = NULL;
 216     }
 217 }
 218
 219 /* Return the total lines number that have been read so far by the
 220    line map (in the preprocessor) so far.  For languages like C++ that
 221    entirely preprocess the input file before starting to parse, this
 222    equals the actual number of lines of the file.  */
 223
 224 static size_t
 225 total_lines_num (const char *file_path)
 226 {
 227   size_t r = 0;
 228   source_location l = 0;
 229   if (linemap_get_file_highest_location (line_table, file_path, &l))
 230     {
 231       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 232       expanded_location xloc = expand_location (l);
 233       r = xloc.line;
 234     }
 235   return r;
 236 }
 237
 238 /* Lookup the cache used for the content of a given file accessed by
 239    caret diagnostic.  Return the found cached file, or NULL if no
 240    cached file was found.  */
 241
 242 static fcache*
 243 lookup_file_in_cache_tab (const char *file_path)
 244 {
 245   if (file_path == NULL)
 246     return NULL;
 247
 248   diagnostic_file_cache_init ();
 249
 250   /* This will contain the found cached file.  */
 251   fcache *r = NULL;
 252   for (unsigned i = 0; i < fcache_tab_size; ++i)
 253     {
 254       fcache *c = &fcache_tab[i];
 255       if (c->file_path && !strcmp (c->file_path, file_path))
 256         {
 257           ++c->use_count;
 258           r = c;
 259         }
 260     }
 261
 262   if (r)
 263     ++r->use_count;
 264
 265   return r;
 266 }
 267
 268 /* Purge any mention of FILENAME from the cache of files used for
 269    printing source code.  For use in selftests when working
 270    with tempfiles.  */
 271
 272 void
 273 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 274 {
 275   gcc_assert (file_path);
 276
 277   fcache *r = lookup_file_in_cache_tab (file_path);
 278   if (!r)
 279     /* Not found.  */
 280     return;
 281
 282   r->file_path = NULL;
 283   if (r->fp)
 284     fclose (r->fp);
 285   r->fp = NULL;
 286   r->nb_read = 0;
 287   r->line_start_idx = 0;
 288   r->line_num = 0;
 289   r->line_record.truncate (0);
 290   r->use_count = 0;
 291   r->total_lines = 0;
 292   r->missing_trailing_newline = true;
 293 }
 294
 295 /* Return the file cache that has been less used, recently, or the
 296    first empty one.  If HIGHEST_USE_COUNT is non-null,
 297    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 298    in the cache table.  */
 299
 300 static fcache*
 301 evicted_cache_tab_entry (unsigned *highest_use_count)
 302 {
 303   diagnostic_file_cache_init ();
 304
 305   fcache *to_evict = &fcache_tab[0];
 306   unsigned huc = to_evict->use_count;
 307   for (unsigned i = 1; i < fcache_tab_size; ++i)
 308     {
 309       fcache *c = &fcache_tab[i];
 310       bool c_is_empty = (c->file_path == NULL);
 311
 312       if (c->use_count < to_evict->use_count
 313           || (to_evict->file_path && c_is_empty))
 314         /* We evict C because it's either an entry with a lower use
 315            count or one that is empty.  */
 316         to_evict = c;
 317
 318       if (huc < c->use_count)
 319         huc = c->use_count;
 320
 321       if (c_is_empty)
 322         /* We've reached the end of the cache; subsequent elements are
 323            all empty.  */
 324         break;
 325     }
 326
 327   if (highest_use_count)
 328     *highest_use_count = huc;
 329
 330   return to_evict;
 331 }
 332
 333 /* Create the cache used for the content of a given file to be
 334    accessed by caret diagnostic.  This cache is added to an array of
 335    cache and can be retrieved by lookup_file_in_cache_tab.  This
 336    function returns the created cache.  Note that only the last
 337    fcache_tab_size files are cached.  */
 338
 339 static fcache*
 340 add_file_to_cache_tab (const char *file_path)
 341 {
 342
 343   FILE *fp = fopen (file_path, "r");
 344   if (fp == NULL)
 345     return NULL;
 346
 347   unsigned highest_use_count = 0;
 348   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 349   r->file_path = file_path;
 350   if (r->fp)
 351     fclose (r->fp);
 352   r->fp = fp;
 353   r->nb_read = 0;
 354   r->line_start_idx = 0;
 355   r->line_num = 0;
 356   r->line_record.truncate (0);
 357   /* Ensure that this cache entry doesn't get evicted next time
 358      add_file_to_cache_tab is called.  */
 359   r->use_count = ++highest_use_count;
 360   r->total_lines = total_lines_num (file_path);
 361   r->missing_trailing_newline = true;
 362
 363   return r;
 364 }
 365
 366 /* Lookup the cache used for the content of a given file accessed by
 367    caret diagnostic.  If no cached file was found, create a new cache
 368    for this file, add it to the array of cached file and return
 369    it.  */
 370
 371 static fcache*
 372 lookup_or_add_file_to_cache_tab (const char *file_path)
 373 {
 374   fcache *r = lookup_file_in_cache_tab (file_path);
 375   if (r == NULL)
 376     r = add_file_to_cache_tab (file_path);
 377   return r;
 378 }
 379
 380 /* Default constructor for a cache of file used by caret
 381    diagnostic.  */
 382
 383 fcache::fcache ()
 384 : use_count (0), file_path (NULL), fp (NULL), data (0),
 385   size (0), nb_read (0), line_start_idx (0), line_num (0),
 386   total_lines (0), missing_trailing_newline (true)
 387 {
 388   line_record.create (0);
 389 }
 390
 391 /* Destructor for a cache of file used by caret diagnostic.  */
 392
 393 fcache::~fcache ()
 394 {
 395   if (fp)
 396     {
 397       fclose (fp);
 398       fp = NULL;
 399     }
 400   if (data)
 401     {
 402       XDELETEVEC (data);
 403       data = 0;
 404     }
 405   line_record.release ();
 406 }
 407
 408 /* Returns TRUE iff the cache would need to be filled with data coming
 409    from the file.  That is, either the cache is empty or full or the
 410    current line is empty.  Note that if the cache is full, it would
 411    need to be extended and filled again.  */
 412
 413 static bool
 414 needs_read (fcache *c)
 415 {
 416   return (c->nb_read == 0
 417           || c->nb_read == c->size
 418           || (c->line_start_idx >= c->nb_read - 1));
 419 }
 420
 421 /*  Return TRUE iff the cache is full and thus needs to be
 422     extended.  */
 423
 424 static bool
 425 needs_grow (fcache *c)
 426 {
 427   return c->nb_read == c->size;
 428 }
 429
 430 /* Grow the cache if it needs to be extended.  */
 431
 432 static void
 433 maybe_grow (fcache *c)
 434 {
 435   if (!needs_grow (c))
 436     return;
 437
 438   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 439   c->data = XRESIZEVEC (char, c->data, size);
 440   c->size = size;
 441 }
 442
 443 /*  Read more data into the cache.  Extends the cache if need be.
 444     Returns TRUE iff new data could be read.  */
 445
 446 static bool
 447 read_data (fcache *c)
 448 {
 449   if (feof (c->fp) || ferror (c->fp))
 450     return false;
 451
 452   maybe_grow (c);
 453
 454   char * from = c->data + c->nb_read;
 455   size_t to_read = c->size - c->nb_read;
 456   size_t nb_read = fread (from, 1, to_read, c->fp);
 457
 458   if (ferror (c->fp))
 459     return false;
 460
 461   c->nb_read += nb_read;
 462   return !!nb_read;
 463 }
 464
 465 /* Read new data iff the cache needs to be filled with more data
 466    coming from the file FP.  Return TRUE iff the cache was filled with
 467    mode data.  */
 468
 469 static bool
 470 maybe_read_data (fcache *c)
 471 {
 472   if (!needs_read (c))
 473     return false;
 474   return read_data (c);
 475 }
 476
 477 /* Read a new line from file FP, using C as a cache for the data
 478    coming from the file.  Upon successful completion, *LINE is set to
 479    the beginning of the line found.  *LINE points directly in the
 480    line cache and is only valid until the next call of get_next_line.
 481    *LINE_LEN is set to the length of the line.  Note that the line
 482    does not contain any terminal delimiter.  This function returns
 483    true if some data was read or process from the cache, false
 484    otherwise.  Note that subsequent calls to get_next_line might
 485    make the content of *LINE invalid.  */
 486
 487 static bool
 488 get_next_line (fcache *c, char **line, ssize_t *line_len)
 489 {
 490   /* Fill the cache with data to process.  */
 491   maybe_read_data (c);
 492
 493   size_t remaining_size = c->nb_read - c->line_start_idx;
 494   if (remaining_size == 0)
 495     /* There is no more data to process.  */
 496     return false;
 497
 498   char *line_start = c->data + c->line_start_idx;
 499
 500   char *next_line_start = NULL;
 501   size_t len = 0;
 502   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 503   if (line_end == NULL)
 504     {
 505       /* We haven't found the end-of-line delimiter in the cache.
 506          Fill the cache with more data from the file and look for the
 507          '\n'.  */
 508       while (maybe_read_data (c))
 509         {
 510           line_start = c->data + c->line_start_idx;
 511           remaining_size = c->nb_read - c->line_start_idx;
 512           line_end = (char *) memchr (line_start, '\n', remaining_size);
 513           if (line_end != NULL)
 514             {
 515               next_line_start = line_end + 1;
 516               break;
 517             }
 518         }
 519       if (line_end == NULL)
 520         {
 521           /* We've loadded all the file into the cache and still no
 522              '\n'.  Let's say the line ends up at one byte passed the
 523              end of the file.  This is to stay consistent with the case
 524              of when the line ends up with a '\n' and line_end points to
 525              that terminal '\n'.  That consistency is useful below in
 526              the len calculation.  */
 527           line_end = c->data + c->nb_read ;
 528           c->missing_trailing_newline = true;
 529         }
 530       else
 531         c->missing_trailing_newline = false;
 532     }
 533   else
 534     {
 535       next_line_start = line_end + 1;
 536       c->missing_trailing_newline = false;
 537     }
 538
 539   if (ferror (c->fp))
 540     return false;
 541
 542   /* At this point, we've found the end of the of line.  It either
 543      points to the '\n' or to one byte after the last byte of the
 544      file.  */
 545   gcc_assert (line_end != NULL);
 546
 547   len = line_end - line_start;
 548
 549   if (c->line_start_idx < c->nb_read)
 550     *line = line_start;
 551
 552   ++c->line_num;
 553
 554   /* Before we update our line record, make sure the hint about the
 555      total number of lines of the file is correct.  If it's not, then
 556      we give up recording line boundaries from now on.  */
 557   bool update_line_record = true;
 558   if (c->line_num > c->total_lines)
 559     update_line_record = false;
 560
 561     /* Now update our line record so that re-reading lines from the
 562      before c->line_start_idx is faster.  */
 563   if (update_line_record
 564       && c->line_record.length () < fcache_line_record_size)
 565     {
 566       /* If the file lines fits in the line record, we just record all
 567          its lines ...*/
 568       if (c->total_lines <= fcache_line_record_size
 569           && c->line_num > c->line_record.length ())
 570         c->line_record.safe_push (fcache::line_info (c->line_num,
 571                                                  c->line_start_idx,
 572                                                  line_end - c->data));
 573       else if (c->total_lines > fcache_line_record_size)
 574         {
 575           /* ... otherwise, we just scale total_lines down to
 576              (fcache_line_record_size lines.  */
 577           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 578           if (c->line_record.length () == 0
 579               || n >= c->line_record.length ())
 580             c->line_record.safe_push (fcache::line_info (c->line_num,
 581                                                      c->line_start_idx,
 582                                                      line_end - c->data));
 583         }
 584     }
 585
 586   /* Update c->line_start_idx so that it points to the next line to be
 587      read.  */
 588   if (next_line_start)
 589     c->line_start_idx = next_line_start - c->data;
 590   else
 591     /* We didn't find any terminal '\n'.  Let's consider that the end
 592        of line is the end of the data in the cache.  The next
 593        invocation of get_next_line will either read more data from the
 594        underlying file or return false early because we've reached the
 595        end of the file.  */
 596     c->line_start_idx = c->nb_read;
 597
 598   *line_len = len;
 599
 600   return true;
 601 }
 602
 603 /* Consume the next bytes coming from the cache (or from its
 604    underlying file if there are remaining unread bytes in the file)
 605    until we reach the next end-of-line (or end-of-file).  There is no
 606    copying from the cache involved.  Return TRUE upon successful
 607    completion.  */
 608
 609 static bool
 610 goto_next_line (fcache *cache)
 611 {
 612   char *l;
 613   ssize_t len;
 614
 615   return get_next_line (cache, &l, &len);
 616 }
 617
 618 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 619    If the line was read successfully, *LINE points to the beginning
 620    of the line in the file cache and *LINE_LEN is the length of the
 621    line.  *LINE is not nul-terminated, but may contain zero bytes.
 622    *LINE is only valid until the next call of read_line_num.
 623    This function returns bool if a line was read.  */
 624
 625 static bool
 626 read_line_num (fcache *c, size_t line_num,
 627                char **line, ssize_t *line_len)
 628 {
 629   gcc_assert (line_num > 0);
 630
 631   if (line_num <= c->line_num)
 632     {
 633       /* We've been asked to read lines that are before c->line_num.
 634          So lets use our line record (if it's not empty) to try to
 635          avoid re-reading the file from the beginning again.  */
 636
 637       if (c->line_record.is_empty ())
 638         {
 639           c->line_start_idx = 0;
 640           c->line_num = 0;
 641         }
 642       else
 643         {
 644           fcache::line_info *i = NULL;
 645           if (c->total_lines <= fcache_line_record_size)
 646             {
 647               /* In languages where the input file is not totally
 648                  preprocessed up front, the c->total_lines hint
 649                  can be smaller than the number of lines of the
 650                  file.  In that case, only the first
 651                  c->total_lines have been recorded.
 652
 653                  Otherwise, the first c->total_lines we've read have
 654                  their start/end recorded here.  */
 655               i = (line_num <= c->total_lines)
 656                 ? &c->line_record[line_num - 1]
 657                 : &c->line_record[c->total_lines - 1];
 658               gcc_assert (i->line_num <= line_num);
 659             }
 660           else
 661             {
 662               /*  So the file had more lines than our line record
 663                   size.  Thus the number of lines we've recorded has
 664                   been scaled down to fcache_line_reacord_size.  Let's
 665                   pick the start/end of the recorded line that is
 666                   closest to line_num.  */
 667               size_t n = (line_num <= c->total_lines)
 668                 ? line_num * fcache_line_record_size / c->total_lines
 669                 : c ->line_record.length () - 1;
 670               if (n < c->line_record.length ())
 671                 {
 672                   i = &c->line_record[n];
 673                   gcc_assert (i->line_num <= line_num);
 674                 }
 675             }
 676
 677           if (i && i->line_num == line_num)
 678             {
 679               /* We have the start/end of the line.  */
 680               *line = c->data + i->start_pos;
 681               *line_len = i->end_pos - i->start_pos;
 682               return true;
 683             }
 684
 685           if (i)
 686             {
 687               c->line_start_idx = i->start_pos;
 688               c->line_num = i->line_num - 1;
 689             }
 690           else
 691             {
 692               c->line_start_idx = 0;
 693               c->line_num = 0;
 694             }
 695         }
 696     }
 697
 698   /*  Let's walk from line c->line_num up to line_num - 1, without
 699       copying any line.  */
 700   while (c->line_num < line_num - 1)
 701     if (!goto_next_line (c))
 702       return false;
 703
 704   /* The line we want is the next one.  Let's read and copy it back to
 705      the caller.  */
 706   return get_next_line (c, line, line_len);
 707 }
 708
 709 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 710    The line is not nul-terminated.  The returned pointer is only
 711    valid until the next call of location_get_source_line.
 712    Note that the line can contain several null characters,
 713    so LINE_LEN, if non-null, points to the actual length of the line.
 714    If the function fails, NULL is returned.  */
 715
 716 const char *
 717 location_get_source_line (const char *file_path, int line,
 718                           int *line_len)
 719 {
 720   char *buffer = NULL;
 721   ssize_t len;
 722
 723   if (line == 0)
 724     return NULL;
 725
 726   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 727   if (c == NULL)
 728     return NULL;
 729
 730   bool read = read_line_num (c, line, &buffer, &len);
 731
 732   if (read && line_len)
 733     *line_len = len;
 734
 735   return read ? buffer : NULL;
 736 }
 737
 738 /* Determine if FILE_PATH missing a trailing newline on its final line.
 739    Only valid to call once all of the file has been loaded, by
 740    requesting a line number beyond the end of the file.  */
 741
 742 bool
 743 location_missing_trailing_newline (const char *file_path)
 744 {
 745   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 746   if (c == NULL)
 747     return false;
 748
 749   return c->missing_trailing_newline;
 750 }
 751
 752 /* Test if the location originates from the spelling location of a
 753    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 754    virtual) location of a built-in token that appears in the expansion
 755    list of a macro.  Please note that this function also works on
 756    tokens that result from built-in tokens.  For instance, the
 757    function would return true if passed a token "4" that is the result
 758    of the expansion of the built-in __LINE__ macro.  */
 759 bool
 760 is_location_from_builtin_token (source_location loc)
 761 {
 762   const line_map_ordinary *map = NULL;
 763   loc = linemap_resolve_location (line_table, loc,
 764                                   LRK_SPELLING_LOCATION, &map);
 765   return loc == BUILTINS_LOCATION;
 766 }
 767
 768 /* Expand the source location LOC into a human readable location.  If
 769    LOC is virtual, it resolves to the expansion point of the involved
 770    macro.  If LOC resolves to a builtin location, the file name of the
 771    readable location is set to the string "<built-in>".  */
 772
 773 expanded_location
 774 expand_location (source_location loc)
 775 {
 776   return expand_location_1 (loc, /*expansion_point_p=*/true);
 777 }
 778
 779 /* Expand the source location LOC into a human readable location.  If
 780    LOC is virtual, it resolves to the expansion location of the
 781    relevant macro.  If LOC resolves to a builtin location, the file
 782    name of the readable location is set to the string
 783    "<built-in>".  */
 784
 785 expanded_location
 786 expand_location_to_spelling_point (source_location loc)
 787 {
 788   return expand_location_1 (loc, /*expansion_point_p=*/false);
 789 }
 790
 791 /* The rich_location class within libcpp requires a way to expand
 792    source_location instances, and relies on the client code
 793    providing a symbol named
 794      linemap_client_expand_location_to_spelling_point
 795    to do this.
 796
 797    This is the implementation for libcommon.a (all host binaries),
 798    which simply calls into expand_location_to_spelling_point.  */
 799
 800 expanded_location
 801 linemap_client_expand_location_to_spelling_point (source_location loc)
 802 {
 803   return expand_location_to_spelling_point (loc);
 804 }
 805
 806
 807 /* If LOCATION is in a system header and if it is a virtual location for
 808    a token coming from the expansion of a macro, unwind it to the
 809    location of the expansion point of the macro.  Otherwise, just return
 810    LOCATION.
 811
 812    This is used for instance when we want to emit diagnostics about a
 813    token that may be located in a macro that is itself defined in a
 814    system header, for example, for the NULL macro.  In such a case, if
 815    LOCATION were passed directly to diagnostic functions such as
 816    warning_at, the diagnostic would be suppressed (unless
 817    -Wsystem-headers).  */
 818
 819 source_location
 820 expansion_point_location_if_in_system_header (source_location location)
 821 {
 822   if (in_system_header_at (location))
 823     location = linemap_resolve_location (line_table, location,
 824                                          LRK_MACRO_EXPANSION_POINT,
 825                                          NULL);
 826   return location;
 827 }
 828
 829 /* If LOCATION is a virtual location for a token coming from the expansion
 830    of a macro, unwind to the location of the expansion point of the macro.  */
 831
 832 source_location
 833 expansion_point_location (source_location location)
 834 {
 835   return linemap_resolve_location (line_table, location,
 836                                    LRK_MACRO_EXPANSION_POINT, NULL);
 837 }
 838
 839 /* Construct a location with caret at CARET, ranging from START to
 840    finish e.g.
 841
 842                  11111111112
 843         12345678901234567890
 844      522
 845      523   return foo + bar;
 846                   ~~~~^~~~~
 847      524
 848
 849    The location's caret is at the "+", line 523 column 15, but starts
 850    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 851    of "bar" at column 19.  */
 852
 853 location_t
 854 make_location (location_t caret, location_t start, location_t finish)
 855 {
 856   location_t pure_loc = get_pure_location (caret);
 857   source_range src_range;
 858   src_range.m_start = get_start (start);
 859   src_range.m_finish = get_finish (finish);
 860   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 861                                                    pure_loc,
 862                                                    src_range,
 863                                                    NULL);
 864   return combined_loc;
 865 }
 866
 867 #define ONE_K 1024
 868 #define ONE_M (ONE_K * ONE_K)
 869
 870 /* Display a number as an integer multiple of either:
 871    - 1024, if said integer is >= to 10 K (in base 2)
 872    - 1024 * 1024, if said integer is >= 10 M in (base 2)
 873  */
 874 #define SCALE(x) ((unsigned long) ((x) < 10 * ONE_K \
 875                   ? (x) \
 876                   : ((x) < 10 * ONE_M \
 877                      ? (x) / ONE_K \
 878                      : (x) / ONE_M)))
 879
 880 /* For a given integer, display either:
 881    - the character 'k', if the number is higher than 10 K (in base 2)
 882      but strictly lower than 10 M (in base 2)
 883    - the character 'M' if the number is higher than 10 M (in base2)
 884    - the charcter ' ' if the number is strictly lower  than 10 K  */
 885 #define STAT_LABEL(x) ((x) < 10 * ONE_K ? ' ' : ((x) < 10 * ONE_M ? 'k' : 'M'))
 886
 887 /* Display an integer amount as multiple of 1K or 1M (in base 2).
 888    Display the correct unit (either k, M, or ' ') after the amout, as
 889    well.  */
 890 #define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 891
 892 /* Dump statistics to stderr about the memory usage of the line_table
 893    set of line maps.  This also displays some statistics about macro
 894    expansion.  */
 895
 896 void
 897 dump_line_table_statistics (void)
 898 {
 899   struct linemap_stats s;
 900   long total_used_map_size,
 901     macro_maps_size,
 902     total_allocated_map_size;
 903
 904   memset (&s, 0, sizeof (s));
 905
 906   linemap_get_statistics (line_table, &s);
 907
 908   macro_maps_size = s.macro_maps_used_size
 909     + s.macro_maps_locations_size;
 910
 911   total_allocated_map_size = s.ordinary_maps_allocated_size
 912     + s.macro_maps_allocated_size
 913     + s.macro_maps_locations_size;
 914
 915   total_used_map_size = s.ordinary_maps_used_size
 916     + s.macro_maps_used_size
 917     + s.macro_maps_locations_size;
 918
 919   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 920            s.num_expanded_macros);
 921   if (s.num_expanded_macros != 0)
 922     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 923              s.num_macro_tokens / s.num_expanded_macros);
 924   fprintf (stderr,
 925            "\nLine Table allocations during the "
 926            "compilation process\n");
 927   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 928            SCALE (s.num_ordinary_maps_used),
 929            STAT_LABEL (s.num_ordinary_maps_used));
 930   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 931            SCALE (s.ordinary_maps_used_size),
 932            STAT_LABEL (s.ordinary_maps_used_size));
 933   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 934            SCALE (s.num_ordinary_maps_allocated),
 935            STAT_LABEL (s.num_ordinary_maps_allocated));
 936   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 937            SCALE (s.ordinary_maps_allocated_size),
 938            STAT_LABEL (s.ordinary_maps_allocated_size));
 939   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 940            SCALE (s.num_macro_maps_used),
 941            STAT_LABEL (s.num_macro_maps_used));
 942   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 943            SCALE (s.macro_maps_used_size),
 944            STAT_LABEL (s.macro_maps_used_size));
 945   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 946            SCALE (s.macro_maps_locations_size),
 947            STAT_LABEL (s.macro_maps_locations_size));
 948   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 949            SCALE (macro_maps_size),
 950            STAT_LABEL (macro_maps_size));
 951   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 952            SCALE (s.duplicated_macro_maps_locations_size),
 953            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 954   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 955            SCALE (total_allocated_map_size),
 956            STAT_LABEL (total_allocated_map_size));
 957   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 958            SCALE (total_used_map_size),
 959            STAT_LABEL (total_used_map_size));
 960   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 961            SCALE (s.adhoc_table_size),
 962            STAT_LABEL (s.adhoc_table_size));
 963   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 964            s.adhoc_table_entries_used);
 965   fprintf (stderr, "optimized_ranges: %i\n",
 966            line_table->num_optimized_ranges);
 967   fprintf (stderr, "unoptimized_ranges: %i\n",
 968            line_table->num_unoptimized_ranges);
 969
 970   fprintf (stderr, "\n");
 971 }
 972
 973 /* Get location one beyond the final location in ordinary map IDX.  */
 974
 975 static source_location
 976 get_end_location (struct line_maps *set, unsigned int idx)
 977 {
 978   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
 979     return set->highest_location;
 980
 981   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
 982   return MAP_START_LOCATION (next_map);
 983 }
 984
 985 /* Helper function for write_digit_row.  */
 986
 987 static void
 988 write_digit (FILE *stream, int digit)
 989 {
 990   fputc ('0' + (digit % 10), stream);
 991 }
 992
 993 /* Helper function for dump_location_info.
 994    Write a row of numbers to STREAM, numbering a source line,
 995    giving the units, tens, hundreds etc of the column number.  */
 996
 997 static void
 998 write_digit_row (FILE *stream, int indent,
 999                  const line_map_ordinary *map,
1000                  source_location loc, int max_col, int divisor)
1001 {
1002   fprintf (stream, "%*c", indent, ' ');
1003   fprintf (stream, "|");
1004   for (int column = 1; column < max_col; column++)
1005     {
1006       source_location column_loc = loc + (column << map->m_range_bits);
1007       write_digit (stream, column_loc / divisor);
1008     }
1009   fprintf (stream, "\n");
1010 }
1011
1012 /* Write a half-closed (START) / half-open (END) interval of
1013    source_location to STREAM.  */
1014
1015 static void
1016 dump_location_range (FILE *stream,
1017                      source_location start, source_location end)
1018 {
1019   fprintf (stream,
1020            "  source_location interval: %u <= loc < %u\n",
1021            start, end);
1022 }
1023
1024 /* Write a labelled description of a half-closed (START) / half-open (END)
1025    interval of source_location to STREAM.  */
1026
1027 static void
1028 dump_labelled_location_range (FILE *stream,
1029                               const char *name,
1030                               source_location start, source_location end)
1031 {
1032   fprintf (stream, "%s\n", name);
1033   dump_location_range (stream, start, end);
1034   fprintf (stream, "\n");
1035 }
1036
1037 /* Write a visualization of the locations in the line_table to STREAM.  */
1038
1039 void
1040 dump_location_info (FILE *stream)
1041 {
1042   /* Visualize the reserved locations.  */
1043   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1044                                 0, RESERVED_LOCATION_COUNT);
1045
1046   /* Visualize the ordinary line_map instances, rendering the sources. */
1047   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1048     {
1049       source_location end_location = get_end_location (line_table, idx);
1050       /* half-closed: doesn't include this one. */
1051
1052       const line_map_ordinary *map
1053         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1054       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1055       dump_location_range (stream,
1056                            MAP_START_LOCATION (map), end_location);
1057       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1058       fprintf (stream, "  starting at line: %i\n",
1059                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1060       fprintf (stream, "  column and range bits: %i\n",
1061                map->m_column_and_range_bits);
1062       fprintf (stream, "  column bits: %i\n",
1063                map->m_column_and_range_bits - map->m_range_bits);
1064       fprintf (stream, "  range bits: %i\n",
1065                map->m_range_bits);
1066
1067       /* Render the span of source lines that this "map" covers.  */
1068       for (source_location loc = MAP_START_LOCATION (map);
1069            loc < end_location;
1070            loc += (1 << map->m_range_bits) )
1071         {
1072           gcc_assert (pure_location_p (line_table, loc) );
1073
1074           expanded_location exploc
1075             = linemap_expand_location (line_table, map, loc);
1076
1077           if (0 == exploc.column)
1078             {
1079               /* Beginning of a new source line: draw the line.  */
1080
1081               int line_size;
1082               const char *line_text = location_get_source_line (exploc.file,
1083                                                                 exploc.line,
1084                                                                 &line_size);
1085               if (!line_text)
1086                 break;
1087               fprintf (stream,
1088                        "%s:%3i|loc:%5i|%.*s\n",
1089                        exploc.file, exploc.line,
1090                        loc,
1091                        line_size, line_text);
1092
1093               /* "loc" is at column 0, which means "the whole line".
1094                  Render the locations *within* the line, by underlining
1095                  it, showing the source_location numeric values
1096                  at each column.  */
1097               int max_col = (1 << map->m_column_and_range_bits) - 1;
1098               if (max_col > line_size)
1099                 max_col = line_size + 1;
1100
1101               int indent = 14 + strlen (exploc.file);
1102
1103               /* Thousands.  */
1104               if (end_location > 999)
1105                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1106
1107               /* Hundreds.  */
1108               if (end_location > 99)
1109                 write_digit_row (stream, indent, map, loc, max_col, 100);
1110
1111               /* Tens.  */
1112               write_digit_row (stream, indent, map, loc, max_col, 10);
1113
1114               /* Units.  */
1115               write_digit_row (stream, indent, map, loc, max_col, 1);
1116             }
1117         }
1118       fprintf (stream, "\n");
1119     }
1120
1121   /* Visualize unallocated values.  */
1122   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1123                                 line_table->highest_location,
1124                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1125
1126   /* Visualize the macro line_map instances, rendering the sources. */
1127   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1128     {
1129       /* Each macro map that is allocated owns source_location values
1130          that are *lower* that the one before them.
1131          Hence it's meaningful to view them either in order of ascending
1132          source locations, or in order of ascending macro map index.  */
1133       const bool ascending_source_locations = true;
1134       unsigned int idx = (ascending_source_locations
1135                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1136                           : i);
1137       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1138       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1139                idx,
1140                linemap_map_get_macro_name (map),
1141                MACRO_MAP_NUM_MACRO_TOKENS (map));
1142       dump_location_range (stream,
1143                            map->start_location,
1144                            (map->start_location
1145                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1146       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1147               "expansion point is location %i",
1148               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1149       fprintf (stream, "  map->start_location: %u\n",
1150                map->start_location);
1151
1152       fprintf (stream, "  macro_locations:\n");
1153       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1154         {
1155           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1156           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1157
1158           /* linemap_add_macro_token encodes token numbers in an expansion
1159              by putting them after MAP_START_LOCATION. */
1160
1161           /* I'm typically seeing 4 uninitialized entries at the end of
1162              0xafafafaf.
1163              This appears to be due to macro.c:replace_args
1164              adding 2 extra args for padding tokens; presumably there may
1165              be a leading and/or trailing padding token injected,
1166              each for 2 more location slots.
1167              This would explain there being up to 4 source_locations slots
1168              that may be uninitialized.  */
1169
1170           fprintf (stream, "    %u: %u, %u\n",
1171                    i,
1172                    x,
1173                    y);
1174           if (x == y)
1175             {
1176               if (x < MAP_START_LOCATION (map))
1177                 inform (x, "token %u has x-location == y-location == %u", i, x);
1178               else
1179                 fprintf (stream,
1180                          "x-location == y-location == %u encodes token # %u\n",
1181                          x, x - MAP_START_LOCATION (map));
1182                 }
1183           else
1184             {
1185               inform (x, "token %u has x-location == %u", i, x);
1186               inform (x, "token %u has y-location == %u", i, y);
1187             }
1188         }
1189       fprintf (stream, "\n");
1190     }
1191
1192   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1193      macro map, presumably due to an off-by-one error somewhere
1194      between the logic in linemap_enter_macro and
1195      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1196   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1197                                 MAX_SOURCE_LOCATION,
1198                                 MAX_SOURCE_LOCATION + 1);
1199
1200   /* Visualize ad-hoc values.  */
1201   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1202                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1203 }
1204
1205 /* string_concat's constructor.  */
1206
1207 string_concat::string_concat (int num, location_t *locs)
1208   : m_num (num)
1209 {
1210   m_locs = ggc_vec_alloc <location_t> (num);
1211   for (int i = 0; i < num; i++)
1212     m_locs[i] = locs[i];
1213 }
1214
1215 /* string_concat_db's constructor.  */
1216
1217 string_concat_db::string_concat_db ()
1218 {
1219   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1220 }
1221
1222 /* Record that a string concatenation occurred, covering NUM
1223    string literal tokens.  LOCS is an array of size NUM, containing the
1224    locations of the tokens.  A copy of LOCS is taken.  */
1225
1226 void
1227 string_concat_db::record_string_concatenation (int num, location_t *locs)
1228 {
1229   gcc_assert (num > 1);
1230   gcc_assert (locs);
1231
1232   location_t key_loc = get_key_loc (locs[0]);
1233
1234   string_concat *concat
1235     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1236   m_table->put (key_loc, concat);
1237 }
1238
1239 /* Determine if LOC was the location of the the initial token of a
1240    concatenation of string literal tokens.
1241    If so, *OUT_NUM is written to with the number of tokens, and
1242    *OUT_LOCS with the location of an array of locations of the
1243    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1244    storage owned by the string_concat_db.
1245    Otherwise, return false.  */
1246
1247 bool
1248 string_concat_db::get_string_concatenation (location_t loc,
1249                                             int *out_num,
1250                                             location_t **out_locs)
1251 {
1252   gcc_assert (out_num);
1253   gcc_assert (out_locs);
1254
1255   location_t key_loc = get_key_loc (loc);
1256
1257   string_concat **concat = m_table->get (key_loc);
1258   if (!concat)
1259     return false;
1260
1261   *out_num = (*concat)->m_num;
1262   *out_locs =(*concat)->m_locs;
1263   return true;
1264 }
1265
1266 /* Internal function.  Canonicalize LOC into a form suitable for
1267    use as a key within the database, stripping away macro expansion,
1268    ad-hoc information, and range information, using the location of
1269    the start of LOC within an ordinary linemap.  */
1270
1271 location_t
1272 string_concat_db::get_key_loc (location_t loc)
1273 {
1274   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1275                                   NULL);
1276
1277   loc = get_range_from_loc (line_table, loc).m_start;
1278
1279   return loc;
1280 }
1281
1282 /* Helper class for use within get_substring_ranges_for_loc.
1283    An vec of cpp_string with responsibility for releasing all of the
1284    str->text for each str in the vector.  */
1285
1286 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1287 {
1288  public:
1289   auto_cpp_string_vec (int alloc)
1290     : auto_vec <cpp_string> (alloc) {}
1291
1292   ~auto_cpp_string_vec ()
1293   {
1294     /* Clean up the copies within this vec.  */
1295     int i;
1296     cpp_string *str;
1297     FOR_EACH_VEC_ELT (*this, i, str)
1298       free (const_cast <unsigned char *> (str->text));
1299   }
1300 };
1301
1302 /* Attempt to populate RANGES with source location information on the
1303    individual characters within the string literal found at STRLOC.
1304    If CONCATS is non-NULL, then any string literals that the token at
1305    STRLOC  was concatenated with are also added to RANGES.
1306
1307    Return NULL if successful, or an error message if any errors occurred (in
1308    which case RANGES may be only partially populated and should not
1309    be used).
1310
1311    This is implemented by re-parsing the relevant source line(s).  */
1312
1313 static const char *
1314 get_substring_ranges_for_loc (cpp_reader *pfile,
1315                               string_concat_db *concats,
1316                               location_t strloc,
1317                               enum cpp_ttype type,
1318                               cpp_substring_ranges &ranges)
1319 {
1320   gcc_assert (pfile);
1321
1322   if (strloc == UNKNOWN_LOCATION)
1323     return "unknown location";
1324
1325   /* Reparsing the strings requires accurate location information.
1326      If -ftrack-macro-expansion has been overridden from its default
1327      of 2, then we might have a location of a macro expansion point,
1328      rather than the location of the literal itself.
1329      Avoid this by requiring that we have full macro expansion tracking
1330      for substring locations to be available.  */
1331   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1332     return "track_macro_expansion != 2";
1333
1334   /* If #line or # 44 "file"-style directives are present, then there's
1335      no guarantee that the line numbers we have can be used to locate
1336      the strings.  For example, we might have a .i file with # directives
1337      pointing back to lines within a .c file, but the .c file might
1338      have been edited since the .i file was created.
1339      In such a case, the safest course is to disable on-demand substring
1340      locations.  */
1341   if (line_table->seen_line_directive)
1342     return "seen line directive";
1343
1344   /* If string concatenation has occurred at STRLOC, get the locations
1345      of all of the literal tokens making up the compound string.
1346      Otherwise, just use STRLOC.  */
1347   int num_locs = 1;
1348   location_t *strlocs = &strloc;
1349   if (concats)
1350     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1351
1352   auto_cpp_string_vec strs (num_locs);
1353   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1354   for (int i = 0; i < num_locs; i++)
1355     {
1356       /* Get range of strloc.  We will use it to locate the start and finish
1357          of the literal token within the line.  */
1358       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1359
1360       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1361         /* If the string is within a macro expansion, we can't get at the
1362            end location.  */
1363         return "macro expansion";
1364
1365       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1366         /* If so, we can't reliably determine where the token started within
1367            its line.  */
1368         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1369
1370       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1371         /* If so, we can't reliably determine where the token finished within
1372            its line.  */
1373         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1374
1375       expanded_location start
1376         = expand_location_to_spelling_point (src_range.m_start);
1377       expanded_location finish
1378         = expand_location_to_spelling_point (src_range.m_finish);
1379       if (start.file != finish.file)
1380         return "range endpoints are in different files";
1381       if (start.line != finish.line)
1382         return "range endpoints are on different lines";
1383       if (start.column > finish.column)
1384         return "range endpoints are reversed";
1385
1386       int line_width;
1387       const char *line = location_get_source_line (start.file, start.line,
1388                                                    &line_width);
1389       if (line == NULL)
1390         return "unable to read source line";
1391
1392       /* Determine the location of the literal (including quotes
1393          and leading prefix chars, such as the 'u' in a u""
1394          token).  */
1395       const char *literal = line + start.column - 1;
1396       int literal_length = finish.column - start.column + 1;
1397
1398       gcc_assert (line_width >= (start.column - 1 + literal_length));
1399       cpp_string from;
1400       from.len = literal_length;
1401       /* Make a copy of the literal, to avoid having to rely on
1402          the lifetime of the copy of the line within the cache.
1403          This will be released by the auto_cpp_string_vec dtor.  */
1404       from.text = XDUPVEC (unsigned char, literal, literal_length);
1405       strs.safe_push (from);
1406
1407       /* For very long lines, a new linemap could have started
1408          halfway through the token.
1409          Ensure that the loc_reader uses the linemap of the
1410          *end* of the token for its start location.  */
1411       const line_map_ordinary *final_ord_map;
1412       linemap_resolve_location (line_table, src_range.m_finish,
1413                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1414       location_t start_loc
1415         = linemap_position_for_line_and_column (line_table, final_ord_map,
1416                                                 start.line, start.column);
1417
1418       cpp_string_location_reader loc_reader (start_loc, line_table);
1419       loc_readers.safe_push (loc_reader);
1420     }
1421
1422   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1423   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1424                                                  loc_readers.address (),
1425                                                  num_locs, &ranges, type);
1426   if (err)
1427     return err;
1428
1429   /* Success: "ranges" should now contain information on the string.  */
1430   return NULL;
1431 }
1432
1433 /* Attempt to populate *OUT_LOC with source location information on the
1434    given characters within the string literal found at STRLOC.
1435    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1436    character set.
1437
1438    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1439    and string literal "012345\n789"
1440    *OUT_LOC is written to with:
1441      "012345\n789"
1442          ~^~~~~
1443
1444    If CONCATS is non-NULL, then any string literals that the token at
1445    STRLOC was concatenated with are also considered.
1446
1447    This is implemented by re-parsing the relevant source line(s).
1448
1449    Return NULL if successful, or an error message if any errors occurred.
1450    Error messages are intended for GCC developers (to help debugging) rather
1451    than for end-users.  */
1452
1453 const char *
1454 get_source_location_for_substring (cpp_reader *pfile,
1455                                    string_concat_db *concats,
1456                                    location_t strloc,
1457                                    enum cpp_ttype type,
1458                                    int caret_idx, int start_idx, int end_idx,
1459                                    source_location *out_loc)
1460 {
1461   gcc_checking_assert (caret_idx >= 0);
1462   gcc_checking_assert (start_idx >= 0);
1463   gcc_checking_assert (end_idx >= 0);
1464   gcc_assert (out_loc);
1465
1466   cpp_substring_ranges ranges;
1467   const char *err
1468     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1469   if (err)
1470     return err;
1471
1472   if (caret_idx >= ranges.get_num_ranges ())
1473     return "caret_idx out of range";
1474   if (start_idx >= ranges.get_num_ranges ())
1475     return "start_idx out of range";
1476   if (end_idx >= ranges.get_num_ranges ())
1477     return "end_idx out of range";
1478
1479   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1480                             ranges.get_range (start_idx).m_start,
1481                             ranges.get_range (end_idx).m_finish);
1482   return NULL;
1483 }
1484
1485 #if CHECKING_P
1486
1487 namespace selftest {
1488
1489 /* Selftests of location handling.  */
1490
1491 /* Attempt to populate *OUT_RANGE with source location information on the
1492    given character within the string literal found at STRLOC.
1493    CHAR_IDX refers to an offset within the execution character set.
1494    If CONCATS is non-NULL, then any string literals that the token at
1495    STRLOC was concatenated with are also considered.
1496
1497    This is implemented by re-parsing the relevant source line(s).
1498
1499    Return NULL if successful, or an error message if any errors occurred.
1500    Error messages are intended for GCC developers (to help debugging) rather
1501    than for end-users.  */
1502
1503 static const char *
1504 get_source_range_for_char (cpp_reader *pfile,
1505                            string_concat_db *concats,
1506                            location_t strloc,
1507                            enum cpp_ttype type,
1508                            int char_idx,
1509                            source_range *out_range)
1510 {
1511   gcc_checking_assert (char_idx >= 0);
1512   gcc_assert (out_range);
1513
1514   cpp_substring_ranges ranges;
1515   const char *err
1516     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1517   if (err)
1518     return err;
1519
1520   if (char_idx >= ranges.get_num_ranges ())
1521     return "char_idx out of range";
1522
1523   *out_range = ranges.get_range (char_idx);
1524   return NULL;
1525 }
1526
1527 /* As get_source_range_for_char, but write to *OUT the number
1528    of ranges that are available.  */
1529
1530 static const char *
1531 get_num_source_ranges_for_substring (cpp_reader *pfile,
1532                                      string_concat_db *concats,
1533                                      location_t strloc,
1534                                      enum cpp_ttype type,
1535                                      int *out)
1536 {
1537   gcc_assert (out);
1538
1539   cpp_substring_ranges ranges;
1540   const char *err
1541     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1542
1543   if (err)
1544     return err;
1545
1546   *out = ranges.get_num_ranges ();
1547   return NULL;
1548 }
1549
1550 /* Selftests of location handling.  */
1551
1552 /* Helper function for verifying location data: when location_t
1553    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1554    as having column 0.  */
1555
1556 static bool
1557 should_have_column_data_p (location_t loc)
1558 {
1559   if (IS_ADHOC_LOC (loc))
1560     loc = get_location_from_adhoc_loc (line_table, loc);
1561   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1562     return false;
1563   return true;
1564 }
1565
1566 /* Selftest for should_have_column_data_p.  */
1567
1568 static void
1569 test_should_have_column_data_p ()
1570 {
1571   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1572   ASSERT_TRUE
1573     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1574   ASSERT_FALSE
1575     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1576 }
1577
1578 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1579    on LOC.  */
1580
1581 static void
1582 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1583               location_t loc)
1584 {
1585   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1586   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1587   /* If location_t values are sufficiently high, then column numbers
1588      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1589      When close to the threshold, column numbers *may* be present: if
1590      the final linemap before the threshold contains a line that straddles
1591      the threshold, locations in that line have column information.  */
1592   if (should_have_column_data_p (loc))
1593     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1594 }
1595
/* Many of the selftests build a line table and populate it with one or
   more line maps.

   To maximize test coverage these tests are run under a variety of
   conditions:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are several threshold
   values of source_location/location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  These can be exercised by starting the
   line_table at interesting values at or near those thresholds.

   An instance of the following struct describes one such case within
   the test matrix.  */

struct line_table_case
{
  /* The value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* The location at which the line_table should start, or 0 to leave
     its starting point alone.  */
  int m_base_location;

  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }
};
1622
1623 /* Constructor.  Store the old value of line_table, and create a new
1624    one, using sane defaults.  */
1625
1626 line_table_test::line_table_test ()
1627 {
1628   gcc_assert (saved_line_table == NULL);
1629   saved_line_table = line_table;
1630   line_table = ggc_alloc<line_maps> ();
1631   linemap_init (line_table, BUILTINS_LOCATION);
1632   gcc_assert (saved_line_table->reallocator);
1633   line_table->reallocator = saved_line_table->reallocator;
1634   gcc_assert (saved_line_table->round_alloc_size);
1635   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1636   line_table->default_range_bits = 0;
1637 }
1638
1639 /* Constructor.  Store the old value of line_table, and create a new
1640    one, using the sitation described in CASE_.  */
1641
1642 line_table_test::line_table_test (const line_table_case &case_)
1643 {
1644   gcc_assert (saved_line_table == NULL);
1645   saved_line_table = line_table;
1646   line_table = ggc_alloc<line_maps> ();
1647   linemap_init (line_table, BUILTINS_LOCATION);
1648   gcc_assert (saved_line_table->reallocator);
1649   line_table->reallocator = saved_line_table->reallocator;
1650   gcc_assert (saved_line_table->round_alloc_size);
1651   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1652   line_table->default_range_bits = case_.m_default_range_bits;
1653   if (case_.m_base_location)
1654     {
1655       line_table->highest_location = case_.m_base_location;
1656       line_table->highest_line = case_.m_base_location;
1657     }
1658 }
1659
1660 /* Destructor.  Restore the old value of line_table.  */
1661
1662 line_table_test::~line_table_test ()
1663 {
1664   gcc_assert (saved_line_table != NULL);
1665   line_table = saved_line_table;
1666   saved_line_table = NULL;
1667 }
1668
1669 /* Verify basic operation of ordinary linemaps.  */
1670
1671 static void
1672 test_accessing_ordinary_linemaps (const line_table_case &case_)
1673 {
1674   line_table_test ltt (case_);
1675
1676   /* Build a simple linemap describing some locations. */
1677   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1678
1679   linemap_line_start (line_table, 1, 100);
1680   location_t loc_a = linemap_position_for_column (line_table, 1);
1681   location_t loc_b = linemap_position_for_column (line_table, 23);
1682
1683   linemap_line_start (line_table, 2, 100);
1684   location_t loc_c = linemap_position_for_column (line_table, 1);
1685   location_t loc_d = linemap_position_for_column (line_table, 17);
1686
1687   /* Example of a very long line.  */
1688   linemap_line_start (line_table, 3, 2000);
1689   location_t loc_e = linemap_position_for_column (line_table, 700);
1690
1691   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1692
1693   /* Multiple files.  */
1694   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1695   linemap_line_start (line_table, 1, 200);
1696   location_t loc_f = linemap_position_for_column (line_table, 150);
1697   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1698
1699   /* Verify that we can recover the location info.  */
1700   assert_loceq ("foo.c", 1, 1, loc_a);
1701   assert_loceq ("foo.c", 1, 23, loc_b);
1702   assert_loceq ("foo.c", 2, 1, loc_c);
1703   assert_loceq ("foo.c", 2, 17, loc_d);
1704   assert_loceq ("foo.c", 3, 700, loc_e);
1705   assert_loceq ("bar.c", 1, 150, loc_f);
1706
1707   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1708   ASSERT_TRUE (pure_location_p (line_table, loc_a));
1709
1710   /* Verify using make_location to build a range, and extracting data
1711      back from it.  */
1712   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
1713   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
1714   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
1715   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
1716   ASSERT_EQ (loc_b, src_range.m_start);
1717   ASSERT_EQ (loc_d, src_range.m_finish);
1718 }
1719
1720 /* Verify various properties of UNKNOWN_LOCATION.  */
1721
1722 static void
1723 test_unknown_location ()
1724 {
1725   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1726   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1727   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1728 }
1729
1730 /* Verify various properties of BUILTINS_LOCATION.  */
1731
1732 static void
1733 test_builtins ()
1734 {
1735   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1736   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1737 }
1738
1739 /* Regression test for make_location.
1740    Ensure that we use pure locations for the start/finish of the range,
1741    rather than storing a packed or ad-hoc range as the start/finish.  */
1742
1743 static void
1744 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1745 {
1746   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1747      with C++ frontend.
1748      ....................0000000001111111111222.
1749      ....................1234567890123456789012.  */
1750   const char *content = "     r += !aaa == bbb;\n";
1751   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1752   line_table_test ltt (case_);
1753   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1754
1755   const location_t c11 = linemap_position_for_column (line_table, 11);
1756   const location_t c12 = linemap_position_for_column (line_table, 12);
1757   const location_t c13 = linemap_position_for_column (line_table, 13);
1758   const location_t c14 = linemap_position_for_column (line_table, 14);
1759   const location_t c21 = linemap_position_for_column (line_table, 21);
1760
1761   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1762     return;
1763
1764   /* Use column 13 for the caret location, arbitrarily, to verify that we
1765      handle start != caret.  */
1766   const location_t aaa = make_location (c13, c12, c14);
1767   ASSERT_EQ (c13, get_pure_location (aaa));
1768   ASSERT_EQ (c12, get_start (aaa));
1769   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1770   ASSERT_EQ (c14, get_finish (aaa));
1771   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1772
1773   /* Make a location using a location with a range as the start-point.  */
1774   const location_t not_aaa = make_location (c11, aaa, c14);
1775   ASSERT_EQ (c11, get_pure_location (not_aaa));
1776   /* It should use the start location of the range, not store the range
1777      itself.  */
1778   ASSERT_EQ (c12, get_start (not_aaa));
1779   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1780   ASSERT_EQ (c14, get_finish (not_aaa));
1781   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1782
1783   /* Similarly, make a location with a range as the end-point.  */
1784   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1785   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1786   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1787   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1788   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1789   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1790   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1791   /* It should use the finish location of the range, not store the range
1792      itself.  */
1793   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1794   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1795   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1796   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1797   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1798 }
1799
1800 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1801
1802 static void
1803 test_reading_source_line ()
1804 {
1805   /* Create a tempfile and write some text to it.  */
1806   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1807                         "01234567890123456789\n"
1808                         "This is the test text\n"
1809                         "This is the 3rd line");
1810
1811   /* Read back a specific line from the tempfile.  */
1812   int line_size;
1813   const char *source_line = location_get_source_line (tmp.get_filename (),
1814                                                       3, &line_size);
1815   ASSERT_TRUE (source_line != NULL);
1816   ASSERT_EQ (20, line_size);
1817   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1818                          source_line, line_size));
1819
1820   source_line = location_get_source_line (tmp.get_filename (),
1821                                           2, &line_size);
1822   ASSERT_TRUE (source_line != NULL);
1823   ASSERT_EQ (21, line_size);
1824   ASSERT_TRUE (!strncmp ("This is the test text",
1825                          source_line, line_size));
1826
1827   source_line = location_get_source_line (tmp.get_filename (),
1828                                           4, &line_size);
1829   ASSERT_TRUE (source_line == NULL);
1830 }
1831
1832 /* Tests of lexing.  */
1833
/* Assert that the spelling of token TOK, as produced by
   cpp_token_as_text for PARSER, is the string EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *spelling_ = cpp_token_as_text ((PARSER), (TOK));     \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)spelling_);            \
  SELFTEST_END_STMT
1842
1843 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1844    and ranges from EXP_START_COL to EXP_FINISH_COL.
1845    Use LOC as the effective location of the selftest.  */
1846
1847 static void
1848 assert_token_loc_eq (const location &loc,
1849                      const cpp_token *tok,
1850                      const char *exp_filename, int exp_linenum,
1851                      int exp_start_col, int exp_finish_col)
1852 {
1853   location_t tok_loc = tok->src_loc;
1854   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1855   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1856
1857   /* If location_t values are sufficiently high, then column numbers
1858      will be unavailable.  */
1859   if (!should_have_column_data_p (tok_loc))
1860     return;
1861
1862   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1863   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1864   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1865   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1866 }
1867
/* Convenience wrapper around assert_token_loc_eq for checking
   TOK->src_loc, reporting any failure at the point of use (via
   SELFTEST_LOCATION).  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,            \
                            EXP_START_COL, EXP_FINISH_COL)             \
  assert_token_loc_eq (SELFTEST_LOCATION,                              \
                       (TOK), (EXP_FILENAME), (EXP_LINENUM),           \
                       (EXP_START_COL), (EXP_FINISH_COL))
1875
1876 /* Test of lexing a file using libcpp, verifying tokens and their
1877    location information.  */
1878
1879 static void
1880 test_lexer (const line_table_case &case_)
1881 {
1882   /* Create a tempfile and write some text to it.  */
1883   const char *content =
1884     /*00000000011111111112222222222333333.3333444444444.455555555556
1885       12345678901234567890123456789012345.6789012345678.901234567890.  */
1886     ("test_name /* c-style comment */\n"
1887      "                                  \"test literal\"\n"
1888      " // test c++-style comment\n"
1889      "   42\n");
1890   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
1891
1892   line_table_test ltt (case_);
1893
1894   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
1895
1896   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
1897   ASSERT_NE (fname, NULL);
1898
1899   /* Verify that we get the expected tokens back, with the correct
1900      location information.  */
1901
1902   location_t loc;
1903   const cpp_token *tok;
1904   tok = cpp_get_token_with_location (parser, &loc);
1905   ASSERT_NE (tok, NULL);
1906   ASSERT_EQ (tok->type, CPP_NAME);
1907   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
1908   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
1909
1910   tok = cpp_get_token_with_location (parser, &loc);
1911   ASSERT_NE (tok, NULL);
1912   ASSERT_EQ (tok->type, CPP_STRING);
1913   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
1914   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
1915
1916   tok = cpp_get_token_with_location (parser, &loc);
1917   ASSERT_NE (tok, NULL);
1918   ASSERT_EQ (tok->type, CPP_NUMBER);
1919   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
1920   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
1921
1922   tok = cpp_get_token_with_location (parser, &loc);
1923   ASSERT_NE (tok, NULL);
1924   ASSERT_EQ (tok->type, CPP_EOF);
1925
1926   cpp_finish (parser, NULL);
1927   cpp_destroy (parser);
1928 }
1929
/* Forward decls.  */

struct lexer_test;
class lexer_test_options;

/* Abstract base class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor, and
   receives the lexer_test being constructed.  */

class lexer_test_options
{
 public:
  /* This is a polymorphic base class, so give it a virtual destructor:
     destroying a subclass instance through a lexer_test_options
     pointer is then well-defined and runs the subclass's cleanup.  */
  virtual ~lexer_test_options () {}

  virtual void apply (lexer_test &) = 0;
};
1943
1944 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
1945    in its dtor.
1946
1947    This is needed by struct lexer_test to ensure that the cleanup of the
1948    cpp_reader happens *after* the cleanup of the temp_source_file.  */
1949
1950 class cpp_reader_ptr
1951 {
1952  public:
1953   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
1954
1955   ~cpp_reader_ptr ()
1956   {
1957     cpp_finish (m_ptr, NULL);
1958     cpp_destroy (m_ptr);
1959   }
1960
1961   operator cpp_reader * () const { return m_ptr; }
1962
1963  private:
1964   cpp_reader *m_ptr;
1965 };
1966
1967 /* A struct for writing lexer tests.  */
1968
1969 struct lexer_test
1970 {
1971   lexer_test (const line_table_case &case_, const char *content,
1972               lexer_test_options *options);
1973   ~lexer_test ();
1974
1975   const cpp_token *get_token ();
1976
1977   /* The ordering of these fields matters.
1978      The line_table_test must be first, since the cpp_reader_ptr
1979      uses it.
1980      The cpp_reader must be cleaned up *after* the temp_source_file
1981      since the filenames in input.c's input cache are owned by the
1982      cpp_reader; in particular, when ~temp_source_file evicts the
1983      filename the filenames must still be alive.  */
1984   line_table_test m_ltt;
1985   cpp_reader_ptr m_parser;
1986   temp_source_file m_tempfile;
1987   string_concat_db m_concats;
1988   bool m_implicitly_expect_EOF;
1989 };
1990
1991 /* Use an EBCDIC encoding for the execution charset, specifically
1992    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
1993
1994    This exercises iconv integration within libcpp.
1995    Not every build of iconv supports the given charset,
1996    so we need to flag this error and handle it gracefully.  */
1997
1998 class ebcdic_execution_charset : public lexer_test_options
1999 {
2000  public:
2001   ebcdic_execution_charset () : m_num_iconv_errors (0)
2002     {
2003       gcc_assert (s_singleton == NULL);
2004       s_singleton = this;
2005     }
2006   ~ebcdic_execution_charset ()
2007     {
2008       gcc_assert (s_singleton == this);
2009       s_singleton = NULL;
2010     }
2011
2012   void apply (lexer_test &test) FINAL OVERRIDE
2013   {
2014     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2015     cpp_opts->narrow_charset = "IBM1047";
2016
2017     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2018     callbacks->error = on_error;
2019   }
2020
2021   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2022                         int level ATTRIBUTE_UNUSED,
2023                         int reason ATTRIBUTE_UNUSED,
2024                         rich_location *richloc ATTRIBUTE_UNUSED,
2025                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2026     ATTRIBUTE_FPTR_PRINTF(5,0)
2027   {
2028     gcc_assert (s_singleton);
2029     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2030     const char *msg = "conversion from %s to %s not supported by iconv";
2031 #ifdef ENABLE_NLS
2032     msg = dgettext ("cpplib", msg);
2033 #endif
2034     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2035        when the local iconv build doesn't support the conversion.  */
2036     if (strcmp (msgid, msg) == 0)
2037       {
2038         s_singleton->m_num_iconv_errors++;
2039         return true;
2040       }
2041
2042     /* Otherwise, we have an unexpected error.  */
2043     abort ();
2044   }
2045
2046   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2047
2048  private:
2049   static ebcdic_execution_charset *s_singleton;
2050   int m_num_iconv_errors;
2051 };
2052
2053 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2054
2055 /* A lexer_test_options subclass that records a list of error
2056    messages emitted by the lexer.  */
2057
2058 class lexer_error_sink : public lexer_test_options
2059 {
2060  public:
2061   lexer_error_sink ()
2062   {
2063     gcc_assert (s_singleton == NULL);
2064     s_singleton = this;
2065   }
2066   ~lexer_error_sink ()
2067   {
2068     gcc_assert (s_singleton == this);
2069     s_singleton = NULL;
2070
2071     int i;
2072     char *str;
2073     FOR_EACH_VEC_ELT (m_errors, i, str)
2074       free (str);
2075   }
2076
2077   void apply (lexer_test &test) FINAL OVERRIDE
2078   {
2079     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2080     callbacks->error = on_error;
2081   }
2082
2083   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2084                         int level ATTRIBUTE_UNUSED,
2085                         int reason ATTRIBUTE_UNUSED,
2086                         rich_location *richloc ATTRIBUTE_UNUSED,
2087                         const char *msgid, va_list *ap)
2088     ATTRIBUTE_FPTR_PRINTF(5,0)
2089   {
2090     char *msg = xvasprintf (msgid, *ap);
2091     s_singleton->m_errors.safe_push (msg);
2092     return true;
2093   }
2094
2095   auto_vec<char *> m_errors;
2096
2097  private:
2098   static lexer_error_sink *s_singleton;
2099 };
2100
2101 lexer_error_sink *lexer_error_sink::s_singleton;
2102
2103 /* Constructor.  Override line_table with a new instance based on CASE_,
2104    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2105    start parsing the tempfile.  */
2106
2107 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2108                         lexer_test_options *options)
2109 : m_ltt (case_),
2110   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2111   /* Create a tempfile and write the text to it.  */
2112   m_tempfile (SELFTEST_LOCATION, ".c", content),
2113   m_concats (),
2114   m_implicitly_expect_EOF (true)
2115 {
2116   if (options)
2117     options->apply (*this);
2118
2119   cpp_init_iconv (m_parser);
2120
2121   /* Parse the file.  */
2122   const char *fname = cpp_read_main_file (m_parser,
2123                                           m_tempfile.get_filename ());
2124   ASSERT_NE (fname, NULL);
2125 }
2126
2127 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2128
2129 lexer_test::~lexer_test ()
2130 {
2131   location_t loc;
2132   const cpp_token *tok;
2133
2134   if (m_implicitly_expect_EOF)
2135     {
2136       tok = cpp_get_token_with_location (m_parser, &loc);
2137       ASSERT_NE (tok, NULL);
2138       ASSERT_EQ (tok->type, CPP_EOF);
2139     }
2140 }
2141
2142 /* Get the next token from m_parser.  */
2143
2144 const cpp_token *
2145 lexer_test::get_token ()
2146 {
2147   location_t loc;
2148   const cpp_token *tok;
2149
2150   tok = cpp_get_token_with_location (m_parser, &loc);
2151   ASSERT_NE (tok, NULL);
2152   return tok;
2153 }
2154
2155 /* Verify that locations within string literals are correctly handled.  */
2156
2157 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2158    using the string concatenation database for TEST.
2159
2160    Assert that the character at index IDX is on EXPECTED_LINE,
2161    and that it begins at column EXPECTED_START_COL and ends at
2162    EXPECTED_FINISH_COL (unless the locations are beyond
2163    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2164    columns).  */
2165
2166 static void
2167 assert_char_at_range (const location &loc,
2168                       lexer_test& test,
2169                       location_t strloc, enum cpp_ttype type, int idx,
2170                       int expected_line, int expected_start_col,
2171                       int expected_finish_col)
2172 {
2173   cpp_reader *pfile = test.m_parser;
2174   string_concat_db *concats = &test.m_concats;
2175
2176   source_range actual_range;
2177   const char *err
2178     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2179                                  &actual_range);
2180   if (should_have_column_data_p (strloc))
2181     ASSERT_EQ_AT (loc, NULL, err);
2182   else
2183     {
2184       ASSERT_STREQ_AT (loc,
2185                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2186                        err);
2187       return;
2188     }
2189
2190   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2191   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2192   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2193   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2194
2195   if (should_have_column_data_p (actual_range.m_start))
2196     {
2197       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2198       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2199     }
2200   if (should_have_column_data_p (actual_range.m_finish))
2201     {
2202       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2203       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2204     }
2205 }
2206
/* Convenience wrapper around assert_char_at_range, reporting any
   failure at the point of use (via SELFTEST_LOCATION).  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,             \
                             EXPECTED_LINE, EXPECTED_START_COL,         \
                             EXPECTED_FINISH_COL)                       \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (LEXER_TEST), (STRLOC), (TYPE), (IDX),          \
                        (EXPECTED_LINE), (EXPECTED_START_COL),          \
                        (EXPECTED_FINISH_COL))
2215
2216 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2217    using the string concatenation database for TEST.
2218
2219    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2220
2221 static void
2222 assert_num_substring_ranges (const location &loc,
2223                              lexer_test& test,
2224                              location_t strloc,
2225                              enum cpp_ttype type,
2226                              int expected_num_ranges)
2227 {
2228   cpp_reader *pfile = test.m_parser;
2229   string_concat_db *concats = &test.m_concats;
2230
2231   int actual_num_ranges = -1;
2232   const char *err
2233     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2234                                            &actual_num_ranges);
2235   if (should_have_column_data_p (strloc))
2236     ASSERT_EQ_AT (loc, NULL, err);
2237   else
2238     {
2239       ASSERT_STREQ_AT (loc,
2240                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2241                        err);
2242       return;
2243     }
2244   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2245 }
2246
/* Convenience wrapper around assert_num_substring_ranges, reporting
   any failure at the point of use (via SELFTEST_LOCATION).  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,           \
                                    EXPECTED_NUM_RANGES)                \
  assert_num_substring_ranges (SELFTEST_LOCATION,                       \
                               (LEXER_TEST), (STRLOC), (TYPE),          \
                               (EXPECTED_NUM_RANGES))
2254
2255
2256 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2257    returns an error (using the string concatenation database for TEST).  */
2258
2259 static void
2260 assert_has_no_substring_ranges (const location &loc,
2261                                 lexer_test& test,
2262                                 location_t strloc,
2263                                 enum cpp_ttype type,
2264                                 const char *expected_err)
2265 {
2266   cpp_reader *pfile = test.m_parser;
2267   string_concat_db *concats = &test.m_concats;
2268   cpp_substring_ranges ranges;
2269   const char *actual_err
2270     = get_substring_ranges_for_loc (pfile, concats, strloc,
2271                                     type, ranges);
2272   if (should_have_column_data_p (strloc))
2273     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2274   else
2275     ASSERT_STREQ_AT (loc,
2276                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2277                      actual_err);
2278 }
2279
/* Convenience wrapper around assert_has_no_substring_ranges, reporting
   any failure at the point of use (via SELFTEST_LOCATION).  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)   \
  assert_has_no_substring_ranges (SELFTEST_LOCATION,                    \
                                  (LEXER_TEST), (STRLOC), (TYPE), (ERR))
2283
2284 /* Lex a simple string literal.  Verify the substring location data, before
2285    and after running cpp_interpret_string on it.  */
2286
2287 static void
2288 test_lexer_string_locations_simple (const line_table_case &case_)
2289 {
2290   /* Digits 0-9 (with 0 at column 10), the simple way.
2291      ....................000000000.11111111112.2222222223333333333
2292      ....................123456789.01234567890.1234567890123456789
2293      We add a trailing comment to ensure that we correctly locate
2294      the end of the string literal token.  */
2295   const char *content = "        \"0123456789\" /* not a string */\n";
2296   lexer_test test (case_, content, NULL);
2297
2298   /* Verify that we get the expected token back, with the correct
2299      location information.  */
2300   const cpp_token *tok = test.get_token ();
2301   ASSERT_EQ (tok->type, CPP_STRING);
2302   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2303   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2304
2305   /* At this point in lexing, the quote characters are treated as part of
2306      the string (they are stripped off by cpp_interpret_string).  */
2307
2308   ASSERT_EQ (tok->val.str.len, 12);
2309
2310   /* Verify that cpp_interpret_string works.  */
2311   cpp_string dst_string;
2312   const enum cpp_ttype type = CPP_STRING;
2313   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2314                                       &dst_string, type);
2315   ASSERT_TRUE (result);
2316   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2317   free (const_cast <unsigned char *> (dst_string.text));
2318
2319   /* Verify ranges of individual characters.  This no longer includes the
2320      opening quote, but does include the closing quote.  */
2321   for (int i = 0; i <= 10; i++)
2322     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2323                           10 + i, 10 + i);
2324
2325   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2326 }
2327
2328 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2329    encoding.  */
2330
2331 static void
2332 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2333 {
2334   /* EBCDIC support requires iconv.  */
2335   if (!HAVE_ICONV)
2336     return;
2337
2338   /* Digits 0-9 (with 0 at column 10), the simple way.
2339      ....................000000000.11111111112.2222222223333333333
2340      ....................123456789.01234567890.1234567890123456789
2341      We add a trailing comment to ensure that we correctly locate
2342      the end of the string literal token.  */
2343   const char *content = "        \"0123456789\" /* not a string */\n";
2344   ebcdic_execution_charset use_ebcdic;
2345   lexer_test test (case_, content, &use_ebcdic);
2346
2347   /* Verify that we get the expected token back, with the correct
2348      location information.  */
2349   const cpp_token *tok = test.get_token ();
2350   ASSERT_EQ (tok->type, CPP_STRING);
2351   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2352   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2353
2354   /* At this point in lexing, the quote characters are treated as part of
2355      the string (they are stripped off by cpp_interpret_string).  */
2356
2357   ASSERT_EQ (tok->val.str.len, 12);
2358
2359   /* The remainder of the test requires an iconv implementation that
2360      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2361   if (use_ebcdic.iconv_errors_occurred_p ())
2362     return;
2363
2364   /* Verify that cpp_interpret_string works.  */
2365   cpp_string dst_string;
2366   const enum cpp_ttype type = CPP_STRING;
2367   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2368                                       &dst_string, type);
2369   ASSERT_TRUE (result);
2370   /* We should now have EBCDIC-encoded text, specifically
2371      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2372      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2373   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2374                 (const char *)dst_string.text);
2375   free (const_cast <unsigned char *> (dst_string.text));
2376
2377   /* Verify that we don't attempt to record substring location information
2378      for such cases.  */
2379   ASSERT_HAS_NO_SUBSTRING_RANGES
2380     (test, tok->src_loc, type,
2381      "execution character set != source character set");
2382 }
2383
2384 /* Lex a string literal containing a hex-escaped character.
2385    Verify the substring location data, before and after running
2386    cpp_interpret_string on it.  */
2387
2388 static void
2389 test_lexer_string_locations_hex (const line_table_case &case_)
2390 {
2391   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2392      and with a space in place of digit 6, to terminate the escaped
2393      hex code.
2394      ....................000000000.111111.11112222.
2395      ....................123456789.012345.67890123.  */
2396   const char *content = "        \"01234\\x35 789\"\n";
2397   lexer_test test (case_, content, NULL);
2398
2399   /* Verify that we get the expected token back, with the correct
2400      location information.  */
2401   const cpp_token *tok = test.get_token ();
2402   ASSERT_EQ (tok->type, CPP_STRING);
2403   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2404   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2405
2406   /* At this point in lexing, the quote characters are treated as part of
2407      the string (they are stripped off by cpp_interpret_string).  */
2408   ASSERT_EQ (tok->val.str.len, 15);
2409
2410   /* Verify that cpp_interpret_string works.  */
2411   cpp_string dst_string;
2412   const enum cpp_ttype type = CPP_STRING;
2413   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2414                                       &dst_string, type);
2415   ASSERT_TRUE (result);
2416   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2417   free (const_cast <unsigned char *> (dst_string.text));
2418
2419   /* Verify ranges of individual characters.  This no longer includes the
2420      opening quote, but does include the closing quote.  */
2421   for (int i = 0; i <= 4; i++)
2422     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2423   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2424   for (int i = 6; i <= 10; i++)
2425     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2426
2427   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2428 }
2429
2430 /* Lex a string literal containing an octal-escaped character.
2431    Verify the substring location data after running cpp_interpret_string
2432    on it.  */
2433
2434 static void
2435 test_lexer_string_locations_oct (const line_table_case &case_)
2436 {
2437   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2438      and with a space in place of digit 6, to terminate the escaped
2439      octal code.
2440      ....................000000000.111111.11112222.2222223333333333444
2441      ....................123456789.012345.67890123.4567890123456789012  */
2442   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2443   lexer_test test (case_, content, NULL);
2444
2445   /* Verify that we get the expected token back, with the correct
2446      location information.  */
2447   const cpp_token *tok = test.get_token ();
2448   ASSERT_EQ (tok->type, CPP_STRING);
2449   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2450
2451   /* Verify that cpp_interpret_string works.  */
2452   cpp_string dst_string;
2453   const enum cpp_ttype type = CPP_STRING;
2454   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2455                                       &dst_string, type);
2456   ASSERT_TRUE (result);
2457   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2458   free (const_cast <unsigned char *> (dst_string.text));
2459
2460   /* Verify ranges of individual characters.  This no longer includes the
2461      opening quote, but does include the closing quote.  */
2462   for (int i = 0; i < 5; i++)
2463     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2464   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2465   for (int i = 6; i <= 10; i++)
2466     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2467
2468   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2469 }
2470
2471 /* Test of string literal containing letter escapes.  */
2472
2473 static void
2474 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2475 {
2476   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2477      .....................000000000.1.11111.1.1.11222.22222223333333
2478      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2479   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2480   lexer_test test (case_, content, NULL);
2481
2482   /* Verify that we get the expected tokens back.  */
2483   const cpp_token *tok = test.get_token ();
2484   ASSERT_EQ (tok->type, CPP_STRING);
2485   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2486
2487   /* Verify ranges of individual characters. */
2488   /* "\t".  */
2489   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2490                         0, 1, 10, 11);
2491   /* "foo". */
2492   for (int i = 1; i <= 3; i++)
2493     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2494                           i, 1, 11 + i, 11 + i);
2495   /* "\\" and "\n".  */
2496   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2497                         4, 1, 15, 16);
2498   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2499                         5, 1, 17, 18);
2500
2501   /* "bar" and closing quote for nul-terminator.  */
2502   for (int i = 6; i <= 9; i++)
2503     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2504                           i, 1, 13 + i, 13 + i);
2505
2506   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2507 }
2508
2509 /* Another test of a string literal containing a letter escape.
2510    Based on string seen in
2511      printf ("%-%\n");
2512    in gcc.dg/format/c90-printf-1.c.  */
2513
2514 static void
2515 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2516 {
2517   /* .....................000000000.1111.11.1111.22222222223.
2518      .....................123456789.0123.45.6789.01234567890.  */
2519   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2520   lexer_test test (case_, content, NULL);
2521
2522   /* Verify that we get the expected tokens back.  */
2523   const cpp_token *tok = test.get_token ();
2524   ASSERT_EQ (tok->type, CPP_STRING);
2525   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2526
2527   /* Verify ranges of individual characters. */
2528   /* "%-%".  */
2529   for (int i = 0; i < 3; i++)
2530     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2531                           i, 1, 10 + i, 10 + i);
2532   /* "\n".  */
2533   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2534                         3, 1, 13, 14);
2535
2536   /* Closing quote for nul-terminator.  */
2537   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2538                         4, 1, 15, 15);
2539
2540   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2541 }
2542
2543 /* Lex a string literal containing UCN 4 characters.
2544    Verify the substring location data after running cpp_interpret_string
2545    on it.  */
2546
2547 static void
2548 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2549 {
2550   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2551      as UCN 4.
2552      ....................000000000.111111.111122.222222223.33333333344444
2553      ....................123456789.012345.678901.234567890.12345678901234  */
2554   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2555   lexer_test test (case_, content, NULL);
2556
2557   /* Verify that we get the expected token back, with the correct
2558      location information.  */
2559   const cpp_token *tok = test.get_token ();
2560   ASSERT_EQ (tok->type, CPP_STRING);
2561   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2562
2563   /* Verify that cpp_interpret_string works.
2564      The string should be encoded in the execution character
2565      set.  Assuming that that is UTF-8, we should have the following:
2566      -----------  ----  -----  -------  ----------------
2567      Byte offset  Byte  Octal  Unicode  Source Column(s)
2568      -----------  ----  -----  -------  ----------------
2569      0            0x30         '0'      10
2570      1            0x31         '1'      11
2571      2            0x32         '2'      12
2572      3            0x33         '3'      13
2573      4            0x34         '4'      14
2574      5            0xE2  \342   U+2174   15-20
2575      6            0x85  \205    (cont)  15-20
2576      7            0xB4  \264    (cont)  15-20
2577      8            0xE2  \342   U+2175   21-26
2578      9            0x85  \205    (cont)  21-26
2579      10           0xB5  \265    (cont)  21-26
2580      11           0x37         '7'      27
2581      12           0x38         '8'      28
2582      13           0x39         '9'      29
2583      14           0x00                  30 (closing quote)
2584      -----------  ----  -----  -------  ---------------.  */
2585
2586   cpp_string dst_string;
2587   const enum cpp_ttype type = CPP_STRING;
2588   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2589                                       &dst_string, type);
2590   ASSERT_TRUE (result);
2591   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2592                 (const char *)dst_string.text);
2593   free (const_cast <unsigned char *> (dst_string.text));
2594
2595   /* Verify ranges of individual characters.  This no longer includes the
2596      opening quote, but does include the closing quote.
2597      '01234'.  */
2598   for (int i = 0; i <= 4; i++)
2599     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2600   /* U+2174.  */
2601   for (int i = 5; i <= 7; i++)
2602     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2603   /* U+2175.  */
2604   for (int i = 8; i <= 10; i++)
2605     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2606   /* '789' and nul terminator  */
2607   for (int i = 11; i <= 14; i++)
2608     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2609
2610   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2611 }
2612
2613 /* Lex a string literal containing UCN 8 characters.
2614    Verify the substring location data after running cpp_interpret_string
2615    on it.  */
2616
2617 static void
2618 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2619 {
2620   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2621      ....................000000000.111111.1111222222.2222333333333.344444
2622      ....................123456789.012345.6789012345.6789012345678.901234  */
2623   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2624   lexer_test test (case_, content, NULL);
2625
2626   /* Verify that we get the expected token back, with the correct
2627      location information.  */
2628   const cpp_token *tok = test.get_token ();
2629   ASSERT_EQ (tok->type, CPP_STRING);
2630   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2631                            "\"01234\\U00002174\\U00002175789\"");
2632
2633   /* Verify that cpp_interpret_string works.
2634      The UTF-8 encoding of the string is identical to that from
2635      the ucn4 testcase above; the only difference is the column
2636      locations.  */
2637   cpp_string dst_string;
2638   const enum cpp_ttype type = CPP_STRING;
2639   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2640                                       &dst_string, type);
2641   ASSERT_TRUE (result);
2642   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2643                 (const char *)dst_string.text);
2644   free (const_cast <unsigned char *> (dst_string.text));
2645
2646   /* Verify ranges of individual characters.  This no longer includes the
2647      opening quote, but does include the closing quote.
2648      '01234'.  */
2649   for (int i = 0; i <= 4; i++)
2650     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2651   /* U+2174.  */
2652   for (int i = 5; i <= 7; i++)
2653     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2654   /* U+2175.  */
2655   for (int i = 8; i <= 10; i++)
2656     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2657   /* '789' at columns 35-37  */
2658   for (int i = 11; i <= 13; i++)
2659     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2660   /* Closing quote/nul-terminator at column 38.  */
2661   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2662
2663   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2664 }
2665
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   (most significant byte first) and return it as a host value.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t host_value = 0;
  /* Shift in one byte at a time, most significant first.  */
  for (int idx = 0; idx < 4; idx++)
    host_value = (host_value << 8) | (uint32_t) bytes[idx];
  return host_value;
}
2677
2678 /* Lex a wide string literal and verify that attempts to read substring
2679    location data from it fail gracefully.  */
2680
2681 static void
2682 test_lexer_string_locations_wide_string (const line_table_case &case_)
2683 {
2684   /* Digits 0-9.
2685      ....................000000000.11111111112.22222222233333
2686      ....................123456789.01234567890.12345678901234  */
2687   const char *content = "       L\"0123456789\" /* non-str */\n";
2688   lexer_test test (case_, content, NULL);
2689
2690   /* Verify that we get the expected token back, with the correct
2691      location information.  */
2692   const cpp_token *tok = test.get_token ();
2693   ASSERT_EQ (tok->type, CPP_WSTRING);
2694   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2695
2696   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2697   cpp_string dst_string;
2698   const enum cpp_ttype type = CPP_WSTRING;
2699   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2700                                       &dst_string, type);
2701   ASSERT_TRUE (result);
2702   /* The cpp_reader defaults to big-endian with
2703      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2704      now be encoded as UTF-32BE.  */
2705   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2706   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2707   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2708   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2709   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2710   free (const_cast <unsigned char *> (dst_string.text));
2711
2712   /* We don't yet support generating substring location information
2713      for L"" strings.  */
2714   ASSERT_HAS_NO_SUBSTRING_RANGES
2715     (test, tok->src_loc, type,
2716      "execution character set != source character set");
2717 }
2718
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   (most significant byte first) and return it as a host value.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return (uint16_t) ((high << 8) | low);
}
2727
2728 /* Lex a u"" string literal and verify that attempts to read substring
2729    location data from it fail gracefully.  */
2730
2731 static void
2732 test_lexer_string_locations_string16 (const line_table_case &case_)
2733 {
2734   /* Digits 0-9.
2735      ....................000000000.11111111112.22222222233333
2736      ....................123456789.01234567890.12345678901234  */
2737   const char *content = "       u\"0123456789\" /* non-str */\n";
2738   lexer_test test (case_, content, NULL);
2739
2740   /* Verify that we get the expected token back, with the correct
2741      location information.  */
2742   const cpp_token *tok = test.get_token ();
2743   ASSERT_EQ (tok->type, CPP_STRING16);
2744   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2745
2746   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2747   cpp_string dst_string;
2748   const enum cpp_ttype type = CPP_STRING16;
2749   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2750                                       &dst_string, type);
2751   ASSERT_TRUE (result);
2752
2753   /* The cpp_reader defaults to big-endian, so dst_string should
2754      now be encoded as UTF-16BE.  */
2755   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2756   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2757   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2758   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2759   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2760   free (const_cast <unsigned char *> (dst_string.text));
2761
2762   /* We don't yet support generating substring location information
2763      for L"" strings.  */
2764   ASSERT_HAS_NO_SUBSTRING_RANGES
2765     (test, tok->src_loc, type,
2766      "execution character set != source character set");
2767 }
2768
2769 /* Lex a U"" string literal and verify that attempts to read substring
2770    location data from it fail gracefully.  */
2771
2772 static void
2773 test_lexer_string_locations_string32 (const line_table_case &case_)
2774 {
2775   /* Digits 0-9.
2776      ....................000000000.11111111112.22222222233333
2777      ....................123456789.01234567890.12345678901234  */
2778   const char *content = "       U\"0123456789\" /* non-str */\n";
2779   lexer_test test (case_, content, NULL);
2780
2781   /* Verify that we get the expected token back, with the correct
2782      location information.  */
2783   const cpp_token *tok = test.get_token ();
2784   ASSERT_EQ (tok->type, CPP_STRING32);
2785   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2786
2787   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2788   cpp_string dst_string;
2789   const enum cpp_ttype type = CPP_STRING32;
2790   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2791                                       &dst_string, type);
2792   ASSERT_TRUE (result);
2793
2794   /* The cpp_reader defaults to big-endian, so dst_string should
2795      now be encoded as UTF-32BE.  */
2796   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2797   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2798   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2799   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2800   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2801   free (const_cast <unsigned char *> (dst_string.text));
2802
2803   /* We don't yet support generating substring location information
2804      for L"" strings.  */
2805   ASSERT_HAS_NO_SUBSTRING_RANGES
2806     (test, tok->src_loc, type,
2807      "execution character set != source character set");
2808 }
2809
2810 /* Lex a u8-string literal.
2811    Verify the substring location data after running cpp_interpret_string
2812    on it.  */
2813
2814 static void
2815 test_lexer_string_locations_u8 (const line_table_case &case_)
2816 {
2817   /* Digits 0-9.
2818      ....................000000000.11111111112.22222222233333
2819      ....................123456789.01234567890.12345678901234  */
2820   const char *content = "      u8\"0123456789\" /* non-str */\n";
2821   lexer_test test (case_, content, NULL);
2822
2823   /* Verify that we get the expected token back, with the correct
2824      location information.  */
2825   const cpp_token *tok = test.get_token ();
2826   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2827   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2828
2829   /* Verify that cpp_interpret_string works.  */
2830   cpp_string dst_string;
2831   const enum cpp_ttype type = CPP_STRING;
2832   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2833                                       &dst_string, type);
2834   ASSERT_TRUE (result);
2835   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2836   free (const_cast <unsigned char *> (dst_string.text));
2837
2838   /* Verify ranges of individual characters.  This no longer includes the
2839      opening quote, but does include the closing quote.  */
2840   for (int i = 0; i <= 10; i++)
2841     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2842 }
2843
2844 /* Lex a string literal containing UTF-8 source characters.
2845    Verify the substring location data after running cpp_interpret_string
2846    on it.  */
2847
2848 static void
2849 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2850 {
2851  /* This string literal is written out to the source file as UTF-8,
2852     and is of the form "before mojibake after", where "mojibake"
2853     is written as the following four unicode code points:
2854        U+6587 CJK UNIFIED IDEOGRAPH-6587
2855        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2856        U+5316 CJK UNIFIED IDEOGRAPH-5316
2857        U+3051 HIRAGANA LETTER KE.
2858      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2859      "before" and "after" are 1 byte per unicode character.
2860
2861      The numbering shown are "columns", which are *byte* numbers within
2862      the line, rather than unicode character numbers.
2863
2864      .................... 000000000.1111111.
2865      .................... 123456789.0123456.  */
2866   const char *content = ("        \"before "
2867                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2868                               UTF-8: 0xE6 0x96 0x87
2869                               C octal escaped UTF-8: \346\226\207
2870                             "column" numbers: 17-19.  */
2871                          "\346\226\207"
2872
2873                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2874                               UTF-8: 0xE5 0xAD 0x97
2875                               C octal escaped UTF-8: \345\255\227
2876                             "column" numbers: 20-22.  */
2877                          "\345\255\227"
2878
2879                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2880                               UTF-8: 0xE5 0x8C 0x96
2881                               C octal escaped UTF-8: \345\214\226
2882                             "column" numbers: 23-25.  */
2883                          "\345\214\226"
2884
2885                          /* U+3051 HIRAGANA LETTER KE
2886                               UTF-8: 0xE3 0x81 0x91
2887                               C octal escaped UTF-8: \343\201\221
2888                             "column" numbers: 26-28.  */
2889                          "\343\201\221"
2890
2891                          /* column numbers 29 onwards
2892                           2333333.33334444444444
2893                           9012345.67890123456789. */
2894                          " after\" /* non-str */\n");
2895   lexer_test test (case_, content, NULL);
2896
2897   /* Verify that we get the expected token back, with the correct
2898      location information.  */
2899   const cpp_token *tok = test.get_token ();
2900   ASSERT_EQ (tok->type, CPP_STRING);
2901   ASSERT_TOKEN_AS_TEXT_EQ
2902     (test.m_parser, tok,
2903      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2904
2905   /* Verify that cpp_interpret_string works.  */
2906   cpp_string dst_string;
2907   const enum cpp_ttype type = CPP_STRING;
2908   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2909                                       &dst_string, type);
2910   ASSERT_TRUE (result);
2911   ASSERT_STREQ
2912     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2913      (const char *)dst_string.text);
2914   free (const_cast <unsigned char *> (dst_string.text));
2915
2916   /* Verify ranges of individual characters.  This no longer includes the
2917      opening quote, but does include the closing quote.
2918      Assuming that both source and execution encodings are UTF-8, we have
2919      a run of 25 octets in each, plus the NUL terminator.  */
2920   for (int i = 0; i < 25; i++)
2921     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2922   /* NUL-terminator should use the closing quote at column 35.  */
2923   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2924
2925   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2926 }
2927
2928 /* Test of string literal concatenation.  */
2929
2930 static void
2931 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2932 {
2933   /* Digits 0-9.
2934      .....................000000000.111111.11112222222222
2935      .....................123456789.012345.67890123456789.  */
2936   const char *content = ("        \"01234\" /* non-str */\n"
2937                          "        \"56789\" /* non-str */\n");
2938   lexer_test test (case_, content, NULL);
2939
2940   location_t input_locs[2];
2941
2942   /* Verify that we get the expected tokens back.  */
2943   auto_vec <cpp_string> input_strings;
2944   const cpp_token *tok_a = test.get_token ();
2945   ASSERT_EQ (tok_a->type, CPP_STRING);
2946   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2947   input_strings.safe_push (tok_a->val.str);
2948   input_locs[0] = tok_a->src_loc;
2949
2950   const cpp_token *tok_b = test.get_token ();
2951   ASSERT_EQ (tok_b->type, CPP_STRING);
2952   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2953   input_strings.safe_push (tok_b->val.str);
2954   input_locs[1] = tok_b->src_loc;
2955
2956   /* Verify that cpp_interpret_string works.  */
2957   cpp_string dst_string;
2958   const enum cpp_ttype type = CPP_STRING;
2959   bool result = cpp_interpret_string (test.m_parser,
2960                                       input_strings.address (), 2,
2961                                       &dst_string, type);
2962   ASSERT_TRUE (result);
2963   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2964   free (const_cast <unsigned char *> (dst_string.text));
2965
2966   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2967   test.m_concats.record_string_concatenation (2, input_locs);
2968
2969   location_t initial_loc = input_locs[0];
2970
2971   /* "01234" on line 1.  */
2972   for (int i = 0; i <= 4; i++)
2973     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
2974   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
2975   for (int i = 5; i <= 10; i++)
2976     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
2977
2978   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2979 }
2980
2981 /* Another test of string literal concatenation.  */
2982
2983 static void
2984 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
2985 {
2986   /* Digits 0-9.
2987      .....................000000000.111.11111112222222
2988      .....................123456789.012.34567890123456.  */
2989   const char *content = ("        \"01\" /* non-str */\n"
2990                          "        \"23\" /* non-str */\n"
2991                          "        \"45\" /* non-str */\n"
2992                          "        \"67\" /* non-str */\n"
2993                          "        \"89\" /* non-str */\n");
2994   lexer_test test (case_, content, NULL);
2995
2996   auto_vec <cpp_string> input_strings;
2997   location_t input_locs[5];
2998
2999   /* Verify that we get the expected tokens back.  */
3000   for (int i = 0; i < 5; i++)
3001     {
3002       const cpp_token *tok = test.get_token ();
3003       ASSERT_EQ (tok->type, CPP_STRING);
3004       input_strings.safe_push (tok->val.str);
3005       input_locs[i] = tok->src_loc;
3006     }
3007
3008   /* Verify that cpp_interpret_string works.  */
3009   cpp_string dst_string;
3010   const enum cpp_ttype type = CPP_STRING;
3011   bool result = cpp_interpret_string (test.m_parser,
3012                                       input_strings.address (), 5,
3013                                       &dst_string, type);
3014   ASSERT_TRUE (result);
3015   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3016   free (const_cast <unsigned char *> (dst_string.text));
3017
3018   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3019   test.m_concats.record_string_concatenation (5, input_locs);
3020
3021   location_t initial_loc = input_locs[0];
3022
3023   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3024      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3025      and expect get_source_range_for_substring to fail.
3026      However, for a string concatenation test, we can have a case
3027      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3028      but subsequent strings can be after it.
3029      Attempting to detect this within assert_char_at_range
3030      would overcomplicate the logic for the common test cases, so
3031      we detect it here.  */
3032   if (should_have_column_data_p (input_locs[0])
3033       && !should_have_column_data_p (input_locs[4]))
3034     {
3035       /* Verify that get_source_range_for_substring gracefully rejects
3036          this case.  */
3037       source_range actual_range;
3038       const char *err
3039         = get_source_range_for_char (test.m_parser, &test.m_concats,
3040                                      initial_loc, type, 0, &actual_range);
3041       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3042       return;
3043     }
3044
3045   for (int i = 0; i < 5; i++)
3046     for (int j = 0; j < 2; j++)
3047       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3048                             i + 1, 10 + j, 10 + j);
3049
3050   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3051   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3052
3053   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3054 }
3055
3056 /* Another test of string literal concatenation, this time combined with
3057    various kinds of escaped characters.  */
3058
3059 static void
3060 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3061 {
3062   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3063      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3064   const char *content
3065     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3066        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3067     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3068   lexer_test test (case_, content, NULL);
3069
3070   auto_vec <cpp_string> input_strings;
3071   location_t input_locs[4];
3072
3073   /* Verify that we get the expected tokens back.  */
3074   for (int i = 0; i < 4; i++)
3075     {
3076       const cpp_token *tok = test.get_token ();
3077       ASSERT_EQ (tok->type, CPP_STRING);
3078       input_strings.safe_push (tok->val.str);
3079       input_locs[i] = tok->src_loc;
3080     }
3081
3082   /* Verify that cpp_interpret_string works.  */
3083   cpp_string dst_string;
3084   const enum cpp_ttype type = CPP_STRING;
3085   bool result = cpp_interpret_string (test.m_parser,
3086                                       input_strings.address (), 4,
3087                                       &dst_string, type);
3088   ASSERT_TRUE (result);
3089   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3090   free (const_cast <unsigned char *> (dst_string.text));
3091
3092   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3093   test.m_concats.record_string_concatenation (4, input_locs);
3094
3095   location_t initial_loc = input_locs[0];
3096
3097   for (int i = 0; i <= 4; i++)
3098     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3099   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3100   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3101   for (int i = 7; i <= 9; i++)
3102     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3103
3104   /* NUL-terminator should use the location of the final closing quote.  */
3105   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3106
3107   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3108 }
3109
3110 /* Test of string literal in a macro.  */
3111
3112 static void
3113 test_lexer_string_locations_macro (const line_table_case &case_)
3114 {
3115   /* Digits 0-9.
3116      .....................0000000001111111111.22222222223.
3117      .....................1234567890123456789.01234567890.  */
3118   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3119                          "  MACRO");
3120   lexer_test test (case_, content, NULL);
3121
3122   /* Verify that we get the expected tokens back.  */
3123   const cpp_token *tok = test.get_token ();
3124   ASSERT_EQ (tok->type, CPP_PADDING);
3125
3126   tok = test.get_token ();
3127   ASSERT_EQ (tok->type, CPP_STRING);
3128   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3129
3130   /* Verify ranges of individual characters.  We ought to
3131      see columns within the macro definition.  */
3132   for (int i = 0; i <= 10; i++)
3133     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3134                           i, 1, 20 + i, 20 + i);
3135
3136   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3137
3138   tok = test.get_token ();
3139   ASSERT_EQ (tok->type, CPP_PADDING);
3140 }
3141
3142 /* Test of stringification of a macro argument.  */
3143
3144 static void
3145 test_lexer_string_locations_stringified_macro_argument
3146   (const line_table_case &case_)
3147 {
3148   /* .....................000000000111111111122222222223.
3149      .....................123456789012345678901234567890.  */
3150   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3151                          "MACRO(foo)\n");
3152   lexer_test test (case_, content, NULL);
3153
3154   /* Verify that we get the expected token back.  */
3155   const cpp_token *tok = test.get_token ();
3156   ASSERT_EQ (tok->type, CPP_PADDING);
3157
3158   tok = test.get_token ();
3159   ASSERT_EQ (tok->type, CPP_STRING);
3160   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3161
3162   /* We don't support getting the location of a stringified macro
3163      argument.  Verify that it fails gracefully.  */
3164   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3165                                   "cpp_interpret_string_1 failed");
3166
3167   tok = test.get_token ();
3168   ASSERT_EQ (tok->type, CPP_PADDING);
3169
3170   tok = test.get_token ();
3171   ASSERT_EQ (tok->type, CPP_PADDING);
3172 }
3173
3174 /* Ensure that we are fail gracefully if something attempts to pass
3175    in a location that isn't a string literal token.  Seen on this code:
3176
3177      const char a[] = " %d ";
3178      __builtin_printf (a, 0.5);
3179                        ^
3180
3181    when c-format.c erroneously used the indicated one-character
3182    location as the format string location, leading to a read past the
3183    end of a string buffer in cpp_interpret_string_1.  */
3184
3185 static void
3186 test_lexer_string_locations_non_string (const line_table_case &case_)
3187 {
3188   /* .....................000000000111111111122222222223.
3189      .....................123456789012345678901234567890.  */
3190   const char *content = ("         a\n");
3191   lexer_test test (case_, content, NULL);
3192
3193   /* Verify that we get the expected token back.  */
3194   const cpp_token *tok = test.get_token ();
3195   ASSERT_EQ (tok->type, CPP_NAME);
3196   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3197
3198   /* At this point, libcpp is attempting to interpret the name as a
3199      string literal, despite it not starting with a quote.  We don't detect
3200      that, but we should at least fail gracefully.  */
3201   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3202                                   "cpp_interpret_string_1 failed");
3203 }
3204
3205 /* Ensure that we can read substring information for a token which
3206    starts in one linemap and ends in another .  Adapted from
3207    gcc.dg/cpp/pr69985.c.  */
3208
3209 static void
3210 test_lexer_string_locations_long_line (const line_table_case &case_)
3211 {
3212   /* .....................000000.000111111111
3213      .....................123456.789012346789.  */
3214   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3215                          "     \"0123456789012345678901234567890123456789"
3216                          "0123456789012345678901234567890123456789"
3217                          "0123456789012345678901234567890123456789"
3218                          "0123456789\"\n");
3219
3220   lexer_test test (case_, content, NULL);
3221
3222   /* Verify that we get the expected token back.  */
3223   const cpp_token *tok = test.get_token ();
3224   ASSERT_EQ (tok->type, CPP_STRING);
3225
3226   if (!should_have_column_data_p (line_table->highest_location))
3227     return;
3228
3229   /* Verify ranges of individual characters.  */
3230   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3231   for (int i = 0; i < 131; i++)
3232     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3233                           i, 2, 7 + i, 7 + i);
3234 }
3235
3236 /* Test of locations within a raw string that doesn't contain a newline.  */
3237
3238 static void
3239 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3240 {
3241   /* .....................00.0000000111111111122.
3242      .....................12.3456789012345678901.  */
3243   const char *content = ("R\"foo(0123456789)foo\"\n");
3244   lexer_test test (case_, content, NULL);
3245
3246   /* Verify that we get the expected token back.  */
3247   const cpp_token *tok = test.get_token ();
3248   ASSERT_EQ (tok->type, CPP_STRING);
3249
3250   /* Verify that cpp_interpret_string works.  */
3251   cpp_string dst_string;
3252   const enum cpp_ttype type = CPP_STRING;
3253   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3254                                       &dst_string, type);
3255   ASSERT_TRUE (result);
3256   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3257   free (const_cast <unsigned char *> (dst_string.text));
3258
3259   if (!should_have_column_data_p (line_table->highest_location))
3260     return;
3261
3262   /* 0-9, plus the nil terminator.  */
3263   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3264   for (int i = 0; i < 11; i++)
3265     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3266                           i, 1, 7 + i, 7 + i);
3267 }
3268
3269 /* Test of locations within a raw string that contains a newline.  */
3270
3271 static void
3272 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3273 {
3274   /* .....................00.0000.
3275      .....................12.3456.  */
3276   const char *content = ("R\"foo(\n"
3277   /* .....................00000.
3278      .....................12345.  */
3279                          "hello\n"
3280                          "world\n"
3281   /* .....................00000.
3282      .....................12345.  */
3283                          ")foo\"\n");
3284   lexer_test test (case_, content, NULL);
3285
3286   /* Verify that we get the expected token back.  */
3287   const cpp_token *tok = test.get_token ();
3288   ASSERT_EQ (tok->type, CPP_STRING);
3289
3290   /* Verify that cpp_interpret_string works.  */
3291   cpp_string dst_string;
3292   const enum cpp_ttype type = CPP_STRING;
3293   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3294                                       &dst_string, type);
3295   ASSERT_TRUE (result);
3296   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3297   free (const_cast <unsigned char *> (dst_string.text));
3298
3299   if (!should_have_column_data_p (line_table->highest_location))
3300     return;
3301
3302   /* Currently we don't support locations within raw strings that
3303      contain newlines.  */
3304   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3305                                   "range endpoints are on different lines");
3306 }
3307
3308 /* Test of parsing an unterminated raw string.  */
3309
3310 static void
3311 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3312 {
3313   const char *content = "R\"ouch()ouCh\" /* etc */";
3314
3315   lexer_error_sink errors;
3316   lexer_test test (case_, content, &errors);
3317   test.m_implicitly_expect_EOF = false;
3318
3319   /* Attempt to parse the raw string.  */
3320   const cpp_token *tok = test.get_token ();
3321   ASSERT_EQ (tok->type, CPP_EOF);
3322
3323   ASSERT_EQ (1, errors.m_errors.length ());
3324   /* We expect the message "unterminated raw string"
3325      in the "cpplib" translation domain.
3326      It's not clear that dgettext is available on all supported hosts,
3327      so this assertion is commented-out for now.
3328        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3329                      errors.m_errors[0]);
3330   */
3331 }
3332
3333 /* Test of lexing char constants.  */
3334
3335 static void
3336 test_lexer_char_constants (const line_table_case &case_)
3337 {
3338   /* Various char constants.
3339      .....................0000000001111111111.22222222223.
3340      .....................1234567890123456789.01234567890.  */
3341   const char *content = ("         'a'\n"
3342                          "        u'a'\n"
3343                          "        U'a'\n"
3344                          "        L'a'\n"
3345                          "         'abc'\n");
3346   lexer_test test (case_, content, NULL);
3347
3348   /* Verify that we get the expected tokens back.  */
3349   /* 'a'.  */
3350   const cpp_token *tok = test.get_token ();
3351   ASSERT_EQ (tok->type, CPP_CHAR);
3352   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3353
3354   unsigned int chars_seen;
3355   int unsignedp;
3356   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3357                                           &chars_seen, &unsignedp);
3358   ASSERT_EQ (cc, 'a');
3359   ASSERT_EQ (chars_seen, 1);
3360
3361   /* u'a'.  */
3362   tok = test.get_token ();
3363   ASSERT_EQ (tok->type, CPP_CHAR16);
3364   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3365
3366   /* U'a'.  */
3367   tok = test.get_token ();
3368   ASSERT_EQ (tok->type, CPP_CHAR32);
3369   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3370
3371   /* L'a'.  */
3372   tok = test.get_token ();
3373   ASSERT_EQ (tok->type, CPP_WCHAR);
3374   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3375
3376   /* 'abc' (c-char-sequence).  */
3377   tok = test.get_token ();
3378   ASSERT_EQ (tok->type, CPP_CHAR);
3379   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3380 }
3381 /* A table of interesting location_t values, giving one axis of our test
3382    matrix.  */
3383
3384 static const location_t boundary_locations[] = {
3385   /* Zero means "don't override the default values for a new line_table".  */
3386   0,
3387
3388   /* An arbitrary non-zero value that isn't close to one of
3389      the boundary values below.  */
3390   0x10000,
3391
3392   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3393   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3394   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3395   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3396   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3397   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3398
3399   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3400   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3401   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3402   LINE_MAP_MAX_LOCATION_WITH_COLS,
3403   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3404   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3405 };
3406
3407 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3408
3409 void
3410 for_each_line_table_case (void (*testcase) (const line_table_case &))
3411 {
3412   /* As noted above in the description of struct line_table_case,
3413      we want to explore a test matrix of interesting line_table
3414      situations, running various selftests for each case within the
3415      matrix.  */
3416
3417   /* Run all tests with:
3418      (a) line_table->default_range_bits == 0, and
3419      (b) line_table->default_range_bits == 5.  */
3420   int num_cases_tested = 0;
3421   for (int default_range_bits = 0; default_range_bits <= 5;
3422        default_range_bits += 5)
3423     {
3424       /* ...and use each of the "interesting" location values as
3425          the starting location within line_table.  */
3426       const int num_boundary_locations
3427         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3428       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3429         {
3430           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3431
3432           testcase (c);
3433
3434           num_cases_tested++;
3435         }
3436     }
3437
3438   /* Verify that we fully covered the test matrix.  */
3439   ASSERT_EQ (num_cases_tested, 2 * 12);
3440 }
3441
3442 /* Run all of the selftests within this file.  */
3443
3444 void
3445 input_c_tests ()
3446 {
3447   test_should_have_column_data_p ();
3448   test_unknown_location ();
3449   test_builtins ();
3450   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3451
3452   for_each_line_table_case (test_accessing_ordinary_linemaps);
3453   for_each_line_table_case (test_lexer);
3454   for_each_line_table_case (test_lexer_string_locations_simple);
3455   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3456   for_each_line_table_case (test_lexer_string_locations_hex);
3457   for_each_line_table_case (test_lexer_string_locations_oct);
3458   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3459   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3460   for_each_line_table_case (test_lexer_string_locations_ucn4);
3461   for_each_line_table_case (test_lexer_string_locations_ucn8);
3462   for_each_line_table_case (test_lexer_string_locations_wide_string);
3463   for_each_line_table_case (test_lexer_string_locations_string16);
3464   for_each_line_table_case (test_lexer_string_locations_string32);
3465   for_each_line_table_case (test_lexer_string_locations_u8);
3466   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3467   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3468   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3469   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3470   for_each_line_table_case (test_lexer_string_locations_macro);
3471   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3472   for_each_line_table_case (test_lexer_string_locations_non_string);
3473   for_each_line_table_case (test_lexer_string_locations_long_line);
3474   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3475   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3476   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3477   for_each_line_table_case (test_lexer_char_constants);
3478
3479   test_reading_source_line ();
3480 }
3481
3482 } // namespace selftest
3483
3484 #endif /* CHECKING_P */