gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2017 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   /* The file_path is the key for identifying a particular file in
  67      the cache.
  68      For libcpp-using code, the underlying buffer for this field is
  69      owned by the corresponding _cpp_file within the cpp_reader.  */
  70   const char *file_path;
  71
  72   FILE *fp;
  73
  74   /* This points to the content of the file that we've read so
  75      far.  */
  76   char *data;
  77
  78   /*  The size of the DATA array above.*/
  79   size_t size;
  80
  81   /* The number of bytes read from the underlying file so far.  This
  82      must be less (or equal) than SIZE above.  */
  83   size_t nb_read;
  84
  85   /* The index of the beginning of the current line.  */
  86   size_t line_start_idx;
  87
  88   /* The number of the previous line read.  This starts at 1.  Zero
  89      means we've read no line so far.  */
  90   size_t line_num;
  91
  92   /* This is the total number of lines of the current file.  At the
  93      moment, we try to get this information from the line map
  94      subsystem.  Note that this is just a hint.  When using the C++
  95      front-end, this hint is correct because the input file is then
  96      completely tokenized before parsing starts; so the line map knows
  97      the number of lines before compilation really starts.  For e.g,
  98      the C front-end, it can happen that we start emitting diagnostics
  99      before the line map has seen the end of the file.  */
 100   size_t total_lines;
 101
 102   /* Could this file be missing a trailing newline on its final line?
 103      Initially true (to cope with empty files), set to true/false
 104      as each line is read.  */
 105   bool missing_trailing_newline;
 106
 107   /* This is a record of the beginning and end of the lines we've seen
 108      while reading the file.  This is useful to avoid walking the data
 109      from the beginning when we are asked to read a line that is
 110      before LINE_START_IDX above.  Note that the maximum size of this
 111      record is fcache_line_record_size, so that the memory consumption
 112      doesn't explode.  We thus scale total_lines down to
 113      fcache_line_record_size.  */
 114   vec<line_info, va_heap> line_record;
 115
 116   fcache ();
 117   ~fcache ();
 118 };
 119
 120 /* Current position in real source file.  */
 121
 122 location_t input_location = UNKNOWN_LOCATION;
 123
 124 struct line_maps *line_table;
 125
 126 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 127    This needs to be a global so that it can be a GC root, and thus
 128    prevent the stashed copy from being garbage-collected if the GC runs
 129    during a line_table_test.  */
 130
 131 struct line_maps *saved_line_table;
 132
 133 static fcache *fcache_tab;
 134 static const size_t fcache_tab_size = 16;
 135 static const size_t fcache_buffer_size = 4 * 1024;
 136 static const size_t fcache_line_record_size = 100;
 137
 138 /* Expand the source location LOC into a human readable location.  If
 139    LOC resolves to a builtin location, the file name of the readable
 140    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 141    TRUE and LOC is virtual, then it is resolved to the expansion
 142    point of the involved macro.  Otherwise, it is resolved to the
 143    spelling location of the token.
 144
 145    When resolving to the spelling location of the token, if the
 146    resulting location is for a built-in location (that is, it has no
 147    associated line/column) in the context of a macro expansion, the
 148    returned location is the first one (while unwinding the macro
 149    location towards its expansion point) that is in real source
 150    code.  */
 151
 152 static expanded_location
 153 expand_location_1 (source_location loc,
 154                    bool expansion_point_p)
 155 {
 156   expanded_location xloc;
 157   const line_map_ordinary *map;
 158   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 159   tree block = NULL;
 160
 161   if (IS_ADHOC_LOC (loc))
 162     {
 163       block = LOCATION_BLOCK (loc);
 164       loc = LOCATION_LOCUS (loc);
 165     }
 166
 167   memset (&xloc, 0, sizeof (xloc));
 168
 169   if (loc >= RESERVED_LOCATION_COUNT)
 170     {
 171       if (!expansion_point_p)
 172         {
 173           /* We want to resolve LOC to its spelling location.
 174
 175              But if that spelling location is a reserved location that
 176              appears in the context of a macro expansion (like for a
 177              location for a built-in token), let's consider the first
 178              location (toward the expansion point) that is not reserved;
 179              that is, the first location that is in real source code.  */
 180           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 181                                                           loc, NULL);
 182           lrk = LRK_SPELLING_LOCATION;
 183         }
 184       loc = linemap_resolve_location (line_table, loc,
 185                                       lrk, &map);
 186       xloc = linemap_expand_location (line_table, map, loc);
 187     }
 188
 189   xloc.data = block;
 190   if (loc <= BUILTINS_LOCATION)
 191     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 192
 193   return xloc;
 194 }
 195
 196 /* Initialize the set of cache used for files accessed by caret
 197    diagnostic.  */
 198
 199 static void
 200 diagnostic_file_cache_init (void)
 201 {
 202   if (fcache_tab == NULL)
 203     fcache_tab = new fcache[fcache_tab_size];
 204 }
 205
 206 /* Free the resources used by the set of cache used for files accessed
 207    by caret diagnostic.  */
 208
 209 void
 210 diagnostic_file_cache_fini (void)
 211 {
 212   if (fcache_tab)
 213     {
 214       delete [] (fcache_tab);
 215       fcache_tab = NULL;
 216     }
 217 }
 218
 219 /* Return the total lines number that have been read so far by the
 220    line map (in the preprocessor) so far.  For languages like C++ that
 221    entirely preprocess the input file before starting to parse, this
 222    equals the actual number of lines of the file.  */
 223
 224 static size_t
 225 total_lines_num (const char *file_path)
 226 {
 227   size_t r = 0;
 228   source_location l = 0;
 229   if (linemap_get_file_highest_location (line_table, file_path, &l))
 230     {
 231       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 232       expanded_location xloc = expand_location (l);
 233       r = xloc.line;
 234     }
 235   return r;
 236 }
 237
 238 /* Lookup the cache used for the content of a given file accessed by
 239    caret diagnostic.  Return the found cached file, or NULL if no
 240    cached file was found.  */
 241
 242 static fcache*
 243 lookup_file_in_cache_tab (const char *file_path)
 244 {
 245   if (file_path == NULL)
 246     return NULL;
 247
 248   diagnostic_file_cache_init ();
 249
 250   /* This will contain the found cached file.  */
 251   fcache *r = NULL;
 252   for (unsigned i = 0; i < fcache_tab_size; ++i)
 253     {
 254       fcache *c = &fcache_tab[i];
 255       if (c->file_path && !strcmp (c->file_path, file_path))
 256         {
 257           ++c->use_count;
 258           r = c;
 259         }
 260     }
 261
 262   if (r)
 263     ++r->use_count;
 264
 265   return r;
 266 }
 267
 268 /* Purge any mention of FILENAME from the cache of files used for
 269    printing source code.  For use in selftests when working
 270    with tempfiles.  */
 271
 272 void
 273 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 274 {
 275   gcc_assert (file_path);
 276
 277   fcache *r = lookup_file_in_cache_tab (file_path);
 278   if (!r)
 279     /* Not found.  */
 280     return;
 281
 282   r->file_path = NULL;
 283   if (r->fp)
 284     fclose (r->fp);
 285   r->fp = NULL;
 286   r->nb_read = 0;
 287   r->line_start_idx = 0;
 288   r->line_num = 0;
 289   r->line_record.truncate (0);
 290   r->use_count = 0;
 291   r->total_lines = 0;
 292   r->missing_trailing_newline = true;
 293 }
 294
 295 /* Return the file cache that has been less used, recently, or the
 296    first empty one.  If HIGHEST_USE_COUNT is non-null,
 297    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 298    in the cache table.  */
 299
 300 static fcache*
 301 evicted_cache_tab_entry (unsigned *highest_use_count)
 302 {
 303   diagnostic_file_cache_init ();
 304
 305   fcache *to_evict = &fcache_tab[0];
 306   unsigned huc = to_evict->use_count;
 307   for (unsigned i = 1; i < fcache_tab_size; ++i)
 308     {
 309       fcache *c = &fcache_tab[i];
 310       bool c_is_empty = (c->file_path == NULL);
 311
 312       if (c->use_count < to_evict->use_count
 313           || (to_evict->file_path && c_is_empty))
 314         /* We evict C because it's either an entry with a lower use
 315            count or one that is empty.  */
 316         to_evict = c;
 317
 318       if (huc < c->use_count)
 319         huc = c->use_count;
 320
 321       if (c_is_empty)
 322         /* We've reached the end of the cache; subsequent elements are
 323            all empty.  */
 324         break;
 325     }
 326
 327   if (highest_use_count)
 328     *highest_use_count = huc;
 329
 330   return to_evict;
 331 }
 332
 333 /* Create the cache used for the content of a given file to be
 334    accessed by caret diagnostic.  This cache is added to an array of
 335    cache and can be retrieved by lookup_file_in_cache_tab.  This
 336    function returns the created cache.  Note that only the last
 337    fcache_tab_size files are cached.  */
 338
 339 static fcache*
 340 add_file_to_cache_tab (const char *file_path)
 341 {
 342
 343   FILE *fp = fopen (file_path, "r");
 344   if (fp == NULL)
 345     return NULL;
 346
 347   unsigned highest_use_count = 0;
 348   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 349   r->file_path = file_path;
 350   if (r->fp)
 351     fclose (r->fp);
 352   r->fp = fp;
 353   r->nb_read = 0;
 354   r->line_start_idx = 0;
 355   r->line_num = 0;
 356   r->line_record.truncate (0);
 357   /* Ensure that this cache entry doesn't get evicted next time
 358      add_file_to_cache_tab is called.  */
 359   r->use_count = ++highest_use_count;
 360   r->total_lines = total_lines_num (file_path);
 361   r->missing_trailing_newline = true;
 362
 363   return r;
 364 }
 365
 366 /* Lookup the cache used for the content of a given file accessed by
 367    caret diagnostic.  If no cached file was found, create a new cache
 368    for this file, add it to the array of cached file and return
 369    it.  */
 370
 371 static fcache*
 372 lookup_or_add_file_to_cache_tab (const char *file_path)
 373 {
 374   fcache *r = lookup_file_in_cache_tab (file_path);
 375   if (r == NULL)
 376     r = add_file_to_cache_tab (file_path);
 377   return r;
 378 }
 379
 380 /* Default constructor for a cache of file used by caret
 381    diagnostic.  */
 382
 383 fcache::fcache ()
 384 : use_count (0), file_path (NULL), fp (NULL), data (0),
 385   size (0), nb_read (0), line_start_idx (0), line_num (0),
 386   total_lines (0), missing_trailing_newline (true)
 387 {
 388   line_record.create (0);
 389 }
 390
 391 /* Destructor for a cache of file used by caret diagnostic.  */
 392
 393 fcache::~fcache ()
 394 {
 395   if (fp)
 396     {
 397       fclose (fp);
 398       fp = NULL;
 399     }
 400   if (data)
 401     {
 402       XDELETEVEC (data);
 403       data = 0;
 404     }
 405   line_record.release ();
 406 }
 407
 408 /* Returns TRUE iff the cache would need to be filled with data coming
 409    from the file.  That is, either the cache is empty or full or the
 410    current line is empty.  Note that if the cache is full, it would
 411    need to be extended and filled again.  */
 412
 413 static bool
 414 needs_read (fcache *c)
 415 {
 416   return (c->nb_read == 0
 417           || c->nb_read == c->size
 418           || (c->line_start_idx >= c->nb_read - 1));
 419 }
 420
 421 /*  Return TRUE iff the cache is full and thus needs to be
 422     extended.  */
 423
 424 static bool
 425 needs_grow (fcache *c)
 426 {
 427   return c->nb_read == c->size;
 428 }
 429
 430 /* Grow the cache if it needs to be extended.  */
 431
 432 static void
 433 maybe_grow (fcache *c)
 434 {
 435   if (!needs_grow (c))
 436     return;
 437
 438   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 439   c->data = XRESIZEVEC (char, c->data, size);
 440   c->size = size;
 441 }
 442
 443 /*  Read more data into the cache.  Extends the cache if need be.
 444     Returns TRUE iff new data could be read.  */
 445
 446 static bool
 447 read_data (fcache *c)
 448 {
 449   if (feof (c->fp) || ferror (c->fp))
 450     return false;
 451
 452   maybe_grow (c);
 453
 454   char * from = c->data + c->nb_read;
 455   size_t to_read = c->size - c->nb_read;
 456   size_t nb_read = fread (from, 1, to_read, c->fp);
 457
 458   if (ferror (c->fp))
 459     return false;
 460
 461   c->nb_read += nb_read;
 462   return !!nb_read;
 463 }
 464
 465 /* Read new data iff the cache needs to be filled with more data
 466    coming from the file FP.  Return TRUE iff the cache was filled with
 467    mode data.  */
 468
 469 static bool
 470 maybe_read_data (fcache *c)
 471 {
 472   if (!needs_read (c))
 473     return false;
 474   return read_data (c);
 475 }
 476
 477 /* Read a new line from file FP, using C as a cache for the data
 478    coming from the file.  Upon successful completion, *LINE is set to
 479    the beginning of the line found.  *LINE points directly in the
 480    line cache and is only valid until the next call of get_next_line.
 481    *LINE_LEN is set to the length of the line.  Note that the line
 482    does not contain any terminal delimiter.  This function returns
 483    true if some data was read or process from the cache, false
 484    otherwise.  Note that subsequent calls to get_next_line might
 485    make the content of *LINE invalid.  */
 486
 487 static bool
 488 get_next_line (fcache *c, char **line, ssize_t *line_len)
 489 {
 490   /* Fill the cache with data to process.  */
 491   maybe_read_data (c);
 492
 493   size_t remaining_size = c->nb_read - c->line_start_idx;
 494   if (remaining_size == 0)
 495     /* There is no more data to process.  */
 496     return false;
 497
 498   char *line_start = c->data + c->line_start_idx;
 499
 500   char *next_line_start = NULL;
 501   size_t len = 0;
 502   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 503   if (line_end == NULL)
 504     {
 505       /* We haven't found the end-of-line delimiter in the cache.
 506          Fill the cache with more data from the file and look for the
 507          '\n'.  */
 508       while (maybe_read_data (c))
 509         {
 510           line_start = c->data + c->line_start_idx;
 511           remaining_size = c->nb_read - c->line_start_idx;
 512           line_end = (char *) memchr (line_start, '\n', remaining_size);
 513           if (line_end != NULL)
 514             {
 515               next_line_start = line_end + 1;
 516               break;
 517             }
 518         }
 519       if (line_end == NULL)
 520         {
 521           /* We've loadded all the file into the cache and still no
 522              '\n'.  Let's say the line ends up at one byte passed the
 523              end of the file.  This is to stay consistent with the case
 524              of when the line ends up with a '\n' and line_end points to
 525              that terminal '\n'.  That consistency is useful below in
 526              the len calculation.  */
 527           line_end = c->data + c->nb_read ;
 528           c->missing_trailing_newline = true;
 529         }
 530       else
 531         c->missing_trailing_newline = false;
 532     }
 533   else
 534     {
 535       next_line_start = line_end + 1;
 536       c->missing_trailing_newline = false;
 537     }
 538
 539   if (ferror (c->fp))
 540     return false;
 541
 542   /* At this point, we've found the end of the of line.  It either
 543      points to the '\n' or to one byte after the last byte of the
 544      file.  */
 545   gcc_assert (line_end != NULL);
 546
 547   len = line_end - line_start;
 548
 549   if (c->line_start_idx < c->nb_read)
 550     *line = line_start;
 551
 552   ++c->line_num;
 553
 554   /* Before we update our line record, make sure the hint about the
 555      total number of lines of the file is correct.  If it's not, then
 556      we give up recording line boundaries from now on.  */
 557   bool update_line_record = true;
 558   if (c->line_num > c->total_lines)
 559     update_line_record = false;
 560
 561     /* Now update our line record so that re-reading lines from the
 562      before c->line_start_idx is faster.  */
 563   if (update_line_record
 564       && c->line_record.length () < fcache_line_record_size)
 565     {
 566       /* If the file lines fits in the line record, we just record all
 567          its lines ...*/
 568       if (c->total_lines <= fcache_line_record_size
 569           && c->line_num > c->line_record.length ())
 570         c->line_record.safe_push (fcache::line_info (c->line_num,
 571                                                  c->line_start_idx,
 572                                                  line_end - c->data));
 573       else if (c->total_lines > fcache_line_record_size)
 574         {
 575           /* ... otherwise, we just scale total_lines down to
 576              (fcache_line_record_size lines.  */
 577           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 578           if (c->line_record.length () == 0
 579               || n >= c->line_record.length ())
 580             c->line_record.safe_push (fcache::line_info (c->line_num,
 581                                                      c->line_start_idx,
 582                                                      line_end - c->data));
 583         }
 584     }
 585
 586   /* Update c->line_start_idx so that it points to the next line to be
 587      read.  */
 588   if (next_line_start)
 589     c->line_start_idx = next_line_start - c->data;
 590   else
 591     /* We didn't find any terminal '\n'.  Let's consider that the end
 592        of line is the end of the data in the cache.  The next
 593        invocation of get_next_line will either read more data from the
 594        underlying file or return false early because we've reached the
 595        end of the file.  */
 596     c->line_start_idx = c->nb_read;
 597
 598   *line_len = len;
 599
 600   return true;
 601 }
 602
 603 /* Consume the next bytes coming from the cache (or from its
 604    underlying file if there are remaining unread bytes in the file)
 605    until we reach the next end-of-line (or end-of-file).  There is no
 606    copying from the cache involved.  Return TRUE upon successful
 607    completion.  */
 608
 609 static bool
 610 goto_next_line (fcache *cache)
 611 {
 612   char *l;
 613   ssize_t len;
 614
 615   return get_next_line (cache, &l, &len);
 616 }
 617
 618 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 619    If the line was read successfully, *LINE points to the beginning
 620    of the line in the file cache and *LINE_LEN is the length of the
 621    line.  *LINE is not nul-terminated, but may contain zero bytes.
 622    *LINE is only valid until the next call of read_line_num.
 623    This function returns bool if a line was read.  */
 624
 625 static bool
 626 read_line_num (fcache *c, size_t line_num,
 627                char **line, ssize_t *line_len)
 628 {
 629   gcc_assert (line_num > 0);
 630
 631   if (line_num <= c->line_num)
 632     {
 633       /* We've been asked to read lines that are before c->line_num.
 634          So lets use our line record (if it's not empty) to try to
 635          avoid re-reading the file from the beginning again.  */
 636
 637       if (c->line_record.is_empty ())
 638         {
 639           c->line_start_idx = 0;
 640           c->line_num = 0;
 641         }
 642       else
 643         {
 644           fcache::line_info *i = NULL;
 645           if (c->total_lines <= fcache_line_record_size)
 646             {
 647               /* In languages where the input file is not totally
 648                  preprocessed up front, the c->total_lines hint
 649                  can be smaller than the number of lines of the
 650                  file.  In that case, only the first
 651                  c->total_lines have been recorded.
 652
 653                  Otherwise, the first c->total_lines we've read have
 654                  their start/end recorded here.  */
 655               i = (line_num <= c->total_lines)
 656                 ? &c->line_record[line_num - 1]
 657                 : &c->line_record[c->total_lines - 1];
 658               gcc_assert (i->line_num <= line_num);
 659             }
 660           else
 661             {
 662               /*  So the file had more lines than our line record
 663                   size.  Thus the number of lines we've recorded has
 664                   been scaled down to fcache_line_reacord_size.  Let's
 665                   pick the start/end of the recorded line that is
 666                   closest to line_num.  */
 667               size_t n = (line_num <= c->total_lines)
 668                 ? line_num * fcache_line_record_size / c->total_lines
 669                 : c ->line_record.length () - 1;
 670               if (n < c->line_record.length ())
 671                 {
 672                   i = &c->line_record[n];
 673                   gcc_assert (i->line_num <= line_num);
 674                 }
 675             }
 676
 677           if (i && i->line_num == line_num)
 678             {
 679               /* We have the start/end of the line.  */
 680               *line = c->data + i->start_pos;
 681               *line_len = i->end_pos - i->start_pos;
 682               return true;
 683             }
 684
 685           if (i)
 686             {
 687               c->line_start_idx = i->start_pos;
 688               c->line_num = i->line_num - 1;
 689             }
 690           else
 691             {
 692               c->line_start_idx = 0;
 693               c->line_num = 0;
 694             }
 695         }
 696     }
 697
 698   /*  Let's walk from line c->line_num up to line_num - 1, without
 699       copying any line.  */
 700   while (c->line_num < line_num - 1)
 701     if (!goto_next_line (c))
 702       return false;
 703
 704   /* The line we want is the next one.  Let's read and copy it back to
 705      the caller.  */
 706   return get_next_line (c, line, line_len);
 707 }
 708
 709 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 710    The line is not nul-terminated.  The returned pointer is only
 711    valid until the next call of location_get_source_line.
 712    Note that the line can contain several null characters,
 713    so LINE_LEN, if non-null, points to the actual length of the line.
 714    If the function fails, NULL is returned.  */
 715
 716 const char *
 717 location_get_source_line (const char *file_path, int line,
 718                           int *line_len)
 719 {
 720   char *buffer = NULL;
 721   ssize_t len;
 722
 723   if (line == 0)
 724     return NULL;
 725
 726   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 727   if (c == NULL)
 728     return NULL;
 729
 730   bool read = read_line_num (c, line, &buffer, &len);
 731
 732   if (read && line_len)
 733     *line_len = len;
 734
 735   return read ? buffer : NULL;
 736 }
 737
 738 /* Determine if FILE_PATH missing a trailing newline on its final line.
 739    Only valid to call once all of the file has been loaded, by
 740    requesting a line number beyond the end of the file.  */
 741
 742 bool
 743 location_missing_trailing_newline (const char *file_path)
 744 {
 745   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 746   if (c == NULL)
 747     return false;
 748
 749   return c->missing_trailing_newline;
 750 }
 751
 752 /* Test if the location originates from the spelling location of a
 753    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 754    virtual) location of a built-in token that appears in the expansion
 755    list of a macro.  Please note that this function also works on
 756    tokens that result from built-in tokens.  For instance, the
 757    function would return true if passed a token "4" that is the result
 758    of the expansion of the built-in __LINE__ macro.  */
 759 bool
 760 is_location_from_builtin_token (source_location loc)
 761 {
 762   const line_map_ordinary *map = NULL;
 763   loc = linemap_resolve_location (line_table, loc,
 764                                   LRK_SPELLING_LOCATION, &map);
 765   return loc == BUILTINS_LOCATION;
 766 }
 767
 768 /* Expand the source location LOC into a human readable location.  If
 769    LOC is virtual, it resolves to the expansion point of the involved
 770    macro.  If LOC resolves to a builtin location, the file name of the
 771    readable location is set to the string "<built-in>".  */
 772
 773 expanded_location
 774 expand_location (source_location loc)
 775 {
 776   return expand_location_1 (loc, /*expansion_point_p=*/true);
 777 }
 778
 779 /* Expand the source location LOC into a human readable location.  If
 780    LOC is virtual, it resolves to the expansion location of the
 781    relevant macro.  If LOC resolves to a builtin location, the file
 782    name of the readable location is set to the string
 783    "<built-in>".  */
 784
 785 expanded_location
 786 expand_location_to_spelling_point (source_location loc)
 787 {
 788   return expand_location_1 (loc, /*expansion_point_p=*/false);
 789 }
 790
 791 /* The rich_location class within libcpp requires a way to expand
 792    source_location instances, and relies on the client code
 793    providing a symbol named
 794      linemap_client_expand_location_to_spelling_point
 795    to do this.
 796
 797    This is the implementation for libcommon.a (all host binaries),
 798    which simply calls into expand_location_to_spelling_point.  */
 799
 800 expanded_location
 801 linemap_client_expand_location_to_spelling_point (source_location loc)
 802 {
 803   return expand_location_to_spelling_point (loc);
 804 }
 805
 806
 807 /* If LOCATION is in a system header and if it is a virtual location for
 808    a token coming from the expansion of a macro, unwind it to the
 809    location of the expansion point of the macro.  Otherwise, just return
 810    LOCATION.
 811
 812    This is used for instance when we want to emit diagnostics about a
 813    token that may be located in a macro that is itself defined in a
 814    system header, for example, for the NULL macro.  In such a case, if
 815    LOCATION were passed directly to diagnostic functions such as
 816    warning_at, the diagnostic would be suppressed (unless
 817    -Wsystem-headers).  */
 818
 819 source_location
 820 expansion_point_location_if_in_system_header (source_location location)
 821 {
 822   if (in_system_header_at (location))
 823     location = linemap_resolve_location (line_table, location,
 824                                          LRK_MACRO_EXPANSION_POINT,
 825                                          NULL);
 826   return location;
 827 }
 828
 829 /* If LOCATION is a virtual location for a token coming from the expansion
 830    of a macro, unwind to the location of the expansion point of the macro.  */
 831
 832 source_location
 833 expansion_point_location (source_location location)
 834 {
 835   return linemap_resolve_location (line_table, location,
 836                                    LRK_MACRO_EXPANSION_POINT, NULL);
 837 }
 838
 839 /* Construct a location with caret at CARET, ranging from START to
 840    finish e.g.
 841
 842                  11111111112
 843         12345678901234567890
 844      522
 845      523   return foo + bar;
 846                   ~~~~^~~~~
 847      524
 848
 849    The location's caret is at the "+", line 523 column 15, but starts
 850    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 851    of "bar" at column 19.  */
 852
 853 location_t
 854 make_location (location_t caret, location_t start, location_t finish)
 855 {
 856   location_t pure_loc = get_pure_location (caret);
 857   source_range src_range;
 858   src_range.m_start = get_start (start);
 859   src_range.m_finish = get_finish (finish);
 860   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 861                                                    pure_loc,
 862                                                    src_range,
 863                                                    NULL);
 864   return combined_loc;
 865 }
 866
 867 #define ONE_K 1024
 868 #define ONE_M (ONE_K * ONE_K)
 869
 870 /* Display a number as an integer multiple of either:
 871    - 1024, if said integer is >= to 10 K (in base 2)
 872    - 1024 * 1024, if said integer is >= 10 M in (base 2)
 873  */
 874 #define SCALE(x) ((unsigned long) ((x) < 10 * ONE_K \
 875                   ? (x) \
 876                   : ((x) < 10 * ONE_M \
 877                      ? (x) / ONE_K \
 878                      : (x) / ONE_M)))
 879
 880 /* For a given integer, display either:
 881    - the character 'k', if the number is higher than 10 K (in base 2)
 882      but strictly lower than 10 M (in base 2)
 883    - the character 'M' if the number is higher than 10 M (in base2)
 884    - the charcter ' ' if the number is strictly lower  than 10 K  */
 885 #define STAT_LABEL(x) ((x) < 10 * ONE_K ? ' ' : ((x) < 10 * ONE_M ? 'k' : 'M'))
 886
 887 /* Display an integer amount as multiple of 1K or 1M (in base 2).
 888    Display the correct unit (either k, M, or ' ') after the amout, as
 889    well.  */
 890 #define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 891
 892 /* Dump statistics to stderr about the memory usage of the line_table
 893    set of line maps.  This also displays some statistics about macro
 894    expansion.  */
 895
 896 void
 897 dump_line_table_statistics (void)
 898 {
 899   struct linemap_stats s;
 900   long total_used_map_size,
 901     macro_maps_size,
 902     total_allocated_map_size;
 903
 904   memset (&s, 0, sizeof (s));
 905
 906   linemap_get_statistics (line_table, &s);
 907
 908   macro_maps_size = s.macro_maps_used_size
 909     + s.macro_maps_locations_size;
 910
 911   total_allocated_map_size = s.ordinary_maps_allocated_size
 912     + s.macro_maps_allocated_size
 913     + s.macro_maps_locations_size;
 914
 915   total_used_map_size = s.ordinary_maps_used_size
 916     + s.macro_maps_used_size
 917     + s.macro_maps_locations_size;
 918
 919   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 920            s.num_expanded_macros);
 921   if (s.num_expanded_macros != 0)
 922     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 923              s.num_macro_tokens / s.num_expanded_macros);
 924   fprintf (stderr,
 925            "\nLine Table allocations during the "
 926            "compilation process\n");
 927   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 928            SCALE (s.num_ordinary_maps_used),
 929            STAT_LABEL (s.num_ordinary_maps_used));
 930   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 931            SCALE (s.ordinary_maps_used_size),
 932            STAT_LABEL (s.ordinary_maps_used_size));
 933   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 934            SCALE (s.num_ordinary_maps_allocated),
 935            STAT_LABEL (s.num_ordinary_maps_allocated));
 936   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 937            SCALE (s.ordinary_maps_allocated_size),
 938            STAT_LABEL (s.ordinary_maps_allocated_size));
 939   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 940            SCALE (s.num_macro_maps_used),
 941            STAT_LABEL (s.num_macro_maps_used));
 942   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 943            SCALE (s.macro_maps_used_size),
 944            STAT_LABEL (s.macro_maps_used_size));
 945   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 946            SCALE (s.macro_maps_locations_size),
 947            STAT_LABEL (s.macro_maps_locations_size));
 948   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 949            SCALE (macro_maps_size),
 950            STAT_LABEL (macro_maps_size));
 951   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 952            SCALE (s.duplicated_macro_maps_locations_size),
 953            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 954   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 955            SCALE (total_allocated_map_size),
 956            STAT_LABEL (total_allocated_map_size));
 957   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 958            SCALE (total_used_map_size),
 959            STAT_LABEL (total_used_map_size));
 960   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 961            SCALE (s.adhoc_table_size),
 962            STAT_LABEL (s.adhoc_table_size));
 963   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 964            s.adhoc_table_entries_used);
 965   fprintf (stderr, "optimized_ranges: %i\n",
 966            line_table->num_optimized_ranges);
 967   fprintf (stderr, "unoptimized_ranges: %i\n",
 968            line_table->num_unoptimized_ranges);
 969
 970   fprintf (stderr, "\n");
 971 }
 972
 973 /* Get location one beyond the final location in ordinary map IDX.  */
 974
 975 static source_location
 976 get_end_location (struct line_maps *set, unsigned int idx)
 977 {
 978   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
 979     return set->highest_location;
 980
 981   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
 982   return MAP_START_LOCATION (next_map);
 983 }
 984
 985 /* Helper function for write_digit_row.  */
 986
 987 static void
 988 write_digit (FILE *stream, int digit)
 989 {
 990   fputc ('0' + (digit % 10), stream);
 991 }
 992
 993 /* Helper function for dump_location_info.
 994    Write a row of numbers to STREAM, numbering a source line,
 995    giving the units, tens, hundreds etc of the column number.  */
 996
 997 static void
 998 write_digit_row (FILE *stream, int indent,
 999                  const line_map_ordinary *map,
1000                  source_location loc, int max_col, int divisor)
1001 {
1002   fprintf (stream, "%*c", indent, ' ');
1003   fprintf (stream, "|");
1004   for (int column = 1; column < max_col; column++)
1005     {
1006       source_location column_loc = loc + (column << map->m_range_bits);
1007       write_digit (stream, column_loc / divisor);
1008     }
1009   fprintf (stream, "\n");
1010 }
1011
1012 /* Write a half-closed (START) / half-open (END) interval of
1013    source_location to STREAM.  */
1014
1015 static void
1016 dump_location_range (FILE *stream,
1017                      source_location start, source_location end)
1018 {
1019   fprintf (stream,
1020            "  source_location interval: %u <= loc < %u\n",
1021            start, end);
1022 }
1023
1024 /* Write a labelled description of a half-closed (START) / half-open (END)
1025    interval of source_location to STREAM.  */
1026
1027 static void
1028 dump_labelled_location_range (FILE *stream,
1029                               const char *name,
1030                               source_location start, source_location end)
1031 {
1032   fprintf (stream, "%s\n", name);
1033   dump_location_range (stream, start, end);
1034   fprintf (stream, "\n");
1035 }
1036
1037 /* Write a visualization of the locations in the line_table to STREAM.  */
1038
1039 void
1040 dump_location_info (FILE *stream)
1041 {
1042   /* Visualize the reserved locations.  */
1043   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1044                                 0, RESERVED_LOCATION_COUNT);
1045
1046   /* Visualize the ordinary line_map instances, rendering the sources. */
1047   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1048     {
1049       source_location end_location = get_end_location (line_table, idx);
1050       /* half-closed: doesn't include this one. */
1051
1052       const line_map_ordinary *map
1053         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1054       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1055       dump_location_range (stream,
1056                            MAP_START_LOCATION (map), end_location);
1057       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1058       fprintf (stream, "  starting at line: %i\n",
1059                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1060       fprintf (stream, "  column and range bits: %i\n",
1061                map->m_column_and_range_bits);
1062       fprintf (stream, "  column bits: %i\n",
1063                map->m_column_and_range_bits - map->m_range_bits);
1064       fprintf (stream, "  range bits: %i\n",
1065                map->m_range_bits);
1066
1067       /* Render the span of source lines that this "map" covers.  */
1068       for (source_location loc = MAP_START_LOCATION (map);
1069            loc < end_location;
1070            loc += (1 << map->m_range_bits) )
1071         {
1072           gcc_assert (pure_location_p (line_table, loc) );
1073
1074           expanded_location exploc
1075             = linemap_expand_location (line_table, map, loc);
1076
1077           if (0 == exploc.column)
1078             {
1079               /* Beginning of a new source line: draw the line.  */
1080
1081               int line_size;
1082               const char *line_text = location_get_source_line (exploc.file,
1083                                                                 exploc.line,
1084                                                                 &line_size);
1085               if (!line_text)
1086                 break;
1087               fprintf (stream,
1088                        "%s:%3i|loc:%5i|%.*s\n",
1089                        exploc.file, exploc.line,
1090                        loc,
1091                        line_size, line_text);
1092
1093               /* "loc" is at column 0, which means "the whole line".
1094                  Render the locations *within* the line, by underlining
1095                  it, showing the source_location numeric values
1096                  at each column.  */
1097               int max_col = (1 << map->m_column_and_range_bits) - 1;
1098               if (max_col > line_size)
1099                 max_col = line_size + 1;
1100
1101               int indent = 14 + strlen (exploc.file);
1102
1103               /* Thousands.  */
1104               if (end_location > 999)
1105                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1106
1107               /* Hundreds.  */
1108               if (end_location > 99)
1109                 write_digit_row (stream, indent, map, loc, max_col, 100);
1110
1111               /* Tens.  */
1112               write_digit_row (stream, indent, map, loc, max_col, 10);
1113
1114               /* Units.  */
1115               write_digit_row (stream, indent, map, loc, max_col, 1);
1116             }
1117         }
1118       fprintf (stream, "\n");
1119     }
1120
1121   /* Visualize unallocated values.  */
1122   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1123                                 line_table->highest_location,
1124                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1125
1126   /* Visualize the macro line_map instances, rendering the sources. */
1127   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1128     {
1129       /* Each macro map that is allocated owns source_location values
1130          that are *lower* that the one before them.
1131          Hence it's meaningful to view them either in order of ascending
1132          source locations, or in order of ascending macro map index.  */
1133       const bool ascending_source_locations = true;
1134       unsigned int idx = (ascending_source_locations
1135                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1136                           : i);
1137       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1138       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1139                idx,
1140                linemap_map_get_macro_name (map),
1141                MACRO_MAP_NUM_MACRO_TOKENS (map));
1142       dump_location_range (stream,
1143                            map->start_location,
1144                            (map->start_location
1145                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1146       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1147               "expansion point is location %i",
1148               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1149       fprintf (stream, "  map->start_location: %u\n",
1150                map->start_location);
1151
1152       fprintf (stream, "  macro_locations:\n");
1153       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1154         {
1155           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1156           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1157
1158           /* linemap_add_macro_token encodes token numbers in an expansion
1159              by putting them after MAP_START_LOCATION. */
1160
1161           /* I'm typically seeing 4 uninitialized entries at the end of
1162              0xafafafaf.
1163              This appears to be due to macro.c:replace_args
1164              adding 2 extra args for padding tokens; presumably there may
1165              be a leading and/or trailing padding token injected,
1166              each for 2 more location slots.
1167              This would explain there being up to 4 source_locations slots
1168              that may be uninitialized.  */
1169
1170           fprintf (stream, "    %u: %u, %u\n",
1171                    i,
1172                    x,
1173                    y);
1174           if (x == y)
1175             {
1176               if (x < MAP_START_LOCATION (map))
1177                 inform (x, "token %u has x-location == y-location == %u", i, x);
1178               else
1179                 fprintf (stream,
1180                          "x-location == y-location == %u encodes token # %u\n",
1181                          x, x - MAP_START_LOCATION (map));
1182                 }
1183           else
1184             {
1185               inform (x, "token %u has x-location == %u", i, x);
1186               inform (x, "token %u has y-location == %u", i, y);
1187             }
1188         }
1189       fprintf (stream, "\n");
1190     }
1191
1192   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1193      macro map, presumably due to an off-by-one error somewhere
1194      between the logic in linemap_enter_macro and
1195      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1196   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1197                                 MAX_SOURCE_LOCATION,
1198                                 MAX_SOURCE_LOCATION + 1);
1199
1200   /* Visualize ad-hoc values.  */
1201   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1202                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1203 }
1204
1205 /* string_concat's constructor.  */
1206
1207 string_concat::string_concat (int num, location_t *locs)
1208   : m_num (num)
1209 {
1210   m_locs = ggc_vec_alloc <location_t> (num);
1211   for (int i = 0; i < num; i++)
1212     m_locs[i] = locs[i];
1213 }
1214
1215 /* string_concat_db's constructor.  */
1216
1217 string_concat_db::string_concat_db ()
1218 {
1219   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1220 }
1221
1222 /* Record that a string concatenation occurred, covering NUM
1223    string literal tokens.  LOCS is an array of size NUM, containing the
1224    locations of the tokens.  A copy of LOCS is taken.  */
1225
1226 void
1227 string_concat_db::record_string_concatenation (int num, location_t *locs)
1228 {
1229   gcc_assert (num > 1);
1230   gcc_assert (locs);
1231
1232   location_t key_loc = get_key_loc (locs[0]);
1233
1234   string_concat *concat
1235     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1236   m_table->put (key_loc, concat);
1237 }
1238
1239 /* Determine if LOC was the location of the the initial token of a
1240    concatenation of string literal tokens.
1241    If so, *OUT_NUM is written to with the number of tokens, and
1242    *OUT_LOCS with the location of an array of locations of the
1243    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1244    storage owned by the string_concat_db.
1245    Otherwise, return false.  */
1246
1247 bool
1248 string_concat_db::get_string_concatenation (location_t loc,
1249                                             int *out_num,
1250                                             location_t **out_locs)
1251 {
1252   gcc_assert (out_num);
1253   gcc_assert (out_locs);
1254
1255   location_t key_loc = get_key_loc (loc);
1256
1257   string_concat **concat = m_table->get (key_loc);
1258   if (!concat)
1259     return false;
1260
1261   *out_num = (*concat)->m_num;
1262   *out_locs =(*concat)->m_locs;
1263   return true;
1264 }
1265
1266 /* Internal function.  Canonicalize LOC into a form suitable for
1267    use as a key within the database, stripping away macro expansion,
1268    ad-hoc information, and range information, using the location of
1269    the start of LOC within an ordinary linemap.  */
1270
1271 location_t
1272 string_concat_db::get_key_loc (location_t loc)
1273 {
1274   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1275                                   NULL);
1276
1277   loc = get_range_from_loc (line_table, loc).m_start;
1278
1279   return loc;
1280 }
1281
1282 /* Helper class for use within get_substring_ranges_for_loc.
1283    An vec of cpp_string with responsibility for releasing all of the
1284    str->text for each str in the vector.  */
1285
1286 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1287 {
1288  public:
1289   auto_cpp_string_vec (int alloc)
1290     : auto_vec <cpp_string> (alloc) {}
1291
1292   ~auto_cpp_string_vec ()
1293   {
1294     /* Clean up the copies within this vec.  */
1295     int i;
1296     cpp_string *str;
1297     FOR_EACH_VEC_ELT (*this, i, str)
1298       free (const_cast <unsigned char *> (str->text));
1299   }
1300 };
1301
1302 /* Attempt to populate RANGES with source location information on the
1303    individual characters within the string literal found at STRLOC.
1304    If CONCATS is non-NULL, then any string literals that the token at
1305    STRLOC  was concatenated with are also added to RANGES.
1306
1307    Return NULL if successful, or an error message if any errors occurred (in
1308    which case RANGES may be only partially populated and should not
1309    be used).
1310
1311    This is implemented by re-parsing the relevant source line(s).  */
1312
1313 static const char *
1314 get_substring_ranges_for_loc (cpp_reader *pfile,
1315                               string_concat_db *concats,
1316                               location_t strloc,
1317                               enum cpp_ttype type,
1318                               cpp_substring_ranges &ranges)
1319 {
1320   gcc_assert (pfile);
1321
1322   if (strloc == UNKNOWN_LOCATION)
1323     return "unknown location";
1324
1325   /* Reparsing the strings requires accurate location information.
1326      If -ftrack-macro-expansion has been overridden from its default
1327      of 2, then we might have a location of a macro expansion point,
1328      rather than the location of the literal itself.
1329      Avoid this by requiring that we have full macro expansion tracking
1330      for substring locations to be available.  */
1331   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1332     return "track_macro_expansion != 2";
1333
1334   /* If #line or # 44 "file"-style directives are present, then there's
1335      no guarantee that the line numbers we have can be used to locate
1336      the strings.  For example, we might have a .i file with # directives
1337      pointing back to lines within a .c file, but the .c file might
1338      have been edited since the .i file was created.
1339      In such a case, the safest course is to disable on-demand substring
1340      locations.  */
1341   if (line_table->seen_line_directive)
1342     return "seen line directive";
1343
1344   /* If string concatenation has occurred at STRLOC, get the locations
1345      of all of the literal tokens making up the compound string.
1346      Otherwise, just use STRLOC.  */
1347   int num_locs = 1;
1348   location_t *strlocs = &strloc;
1349   if (concats)
1350     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1351
1352   auto_cpp_string_vec strs (num_locs);
1353   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1354   for (int i = 0; i < num_locs; i++)
1355     {
1356       /* Get range of strloc.  We will use it to locate the start and finish
1357          of the literal token within the line.  */
1358       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1359
1360       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1361         /* If the string is within a macro expansion, we can't get at the
1362            end location.  */
1363         return "macro expansion";
1364
1365       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1366         /* If so, we can't reliably determine where the token started within
1367            its line.  */
1368         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1369
1370       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1371         /* If so, we can't reliably determine where the token finished within
1372            its line.  */
1373         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1374
1375       expanded_location start
1376         = expand_location_to_spelling_point (src_range.m_start);
1377       expanded_location finish
1378         = expand_location_to_spelling_point (src_range.m_finish);
1379       if (start.file != finish.file)
1380         return "range endpoints are in different files";
1381       if (start.line != finish.line)
1382         return "range endpoints are on different lines";
1383       if (start.column > finish.column)
1384         return "range endpoints are reversed";
1385
1386       int line_width;
1387       const char *line = location_get_source_line (start.file, start.line,
1388                                                    &line_width);
1389       if (line == NULL)
1390         return "unable to read source line";
1391
1392       /* Determine the location of the literal (including quotes
1393          and leading prefix chars, such as the 'u' in a u""
1394          token).  */
1395       const char *literal = line + start.column - 1;
1396       int literal_length = finish.column - start.column + 1;
1397
1398       gcc_assert (line_width >= (start.column - 1 + literal_length));
1399       cpp_string from;
1400       from.len = literal_length;
1401       /* Make a copy of the literal, to avoid having to rely on
1402          the lifetime of the copy of the line within the cache.
1403          This will be released by the auto_cpp_string_vec dtor.  */
1404       from.text = XDUPVEC (unsigned char, literal, literal_length);
1405       strs.safe_push (from);
1406
1407       /* For very long lines, a new linemap could have started
1408          halfway through the token.
1409          Ensure that the loc_reader uses the linemap of the
1410          *end* of the token for its start location.  */
1411       const line_map_ordinary *final_ord_map;
1412       linemap_resolve_location (line_table, src_range.m_finish,
1413                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1414       location_t start_loc
1415         = linemap_position_for_line_and_column (line_table, final_ord_map,
1416                                                 start.line, start.column);
1417
1418       cpp_string_location_reader loc_reader (start_loc, line_table);
1419       loc_readers.safe_push (loc_reader);
1420     }
1421
1422   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1423   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1424                                                  loc_readers.address (),
1425                                                  num_locs, &ranges, type);
1426   if (err)
1427     return err;
1428
1429   /* Success: "ranges" should now contain information on the string.  */
1430   return NULL;
1431 }
1432
1433 /* Attempt to populate *OUT_LOC with source location information on the
1434    given characters within the string literal found at STRLOC.
1435    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1436    character set.
1437
1438    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1439    and string literal "012345\n789"
1440    *OUT_LOC is written to with:
1441      "012345\n789"
1442          ~^~~~~
1443
1444    If CONCATS is non-NULL, then any string literals that the token at
1445    STRLOC was concatenated with are also considered.
1446
1447    This is implemented by re-parsing the relevant source line(s).
1448
1449    Return NULL if successful, or an error message if any errors occurred.
1450    Error messages are intended for GCC developers (to help debugging) rather
1451    than for end-users.  */
1452
1453 const char *
1454 get_source_location_for_substring (cpp_reader *pfile,
1455                                    string_concat_db *concats,
1456                                    location_t strloc,
1457                                    enum cpp_ttype type,
1458                                    int caret_idx, int start_idx, int end_idx,
1459                                    source_location *out_loc)
1460 {
1461   gcc_checking_assert (caret_idx >= 0);
1462   gcc_checking_assert (start_idx >= 0);
1463   gcc_checking_assert (end_idx >= 0);
1464   gcc_assert (out_loc);
1465
1466   cpp_substring_ranges ranges;
1467   const char *err
1468     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1469   if (err)
1470     return err;
1471
1472   if (caret_idx >= ranges.get_num_ranges ())
1473     return "caret_idx out of range";
1474   if (start_idx >= ranges.get_num_ranges ())
1475     return "start_idx out of range";
1476   if (end_idx >= ranges.get_num_ranges ())
1477     return "end_idx out of range";
1478
1479   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1480                             ranges.get_range (start_idx).m_start,
1481                             ranges.get_range (end_idx).m_finish);
1482   return NULL;
1483 }
1484
1485 #if CHECKING_P
1486
1487 namespace selftest {
1488
1489 /* Selftests of location handling.  */
1490
1491 /* Attempt to populate *OUT_RANGE with source location information on the
1492    given character within the string literal found at STRLOC.
1493    CHAR_IDX refers to an offset within the execution character set.
1494    If CONCATS is non-NULL, then any string literals that the token at
1495    STRLOC was concatenated with are also considered.
1496
1497    This is implemented by re-parsing the relevant source line(s).
1498
1499    Return NULL if successful, or an error message if any errors occurred.
1500    Error messages are intended for GCC developers (to help debugging) rather
1501    than for end-users.  */
1502
1503 static const char *
1504 get_source_range_for_char (cpp_reader *pfile,
1505                            string_concat_db *concats,
1506                            location_t strloc,
1507                            enum cpp_ttype type,
1508                            int char_idx,
1509                            source_range *out_range)
1510 {
1511   gcc_checking_assert (char_idx >= 0);
1512   gcc_assert (out_range);
1513
1514   cpp_substring_ranges ranges;
1515   const char *err
1516     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1517   if (err)
1518     return err;
1519
1520   if (char_idx >= ranges.get_num_ranges ())
1521     return "char_idx out of range";
1522
1523   *out_range = ranges.get_range (char_idx);
1524   return NULL;
1525 }
1526
1527 /* As get_source_range_for_char, but write to *OUT the number
1528    of ranges that are available.  */
1529
1530 static const char *
1531 get_num_source_ranges_for_substring (cpp_reader *pfile,
1532                                      string_concat_db *concats,
1533                                      location_t strloc,
1534                                      enum cpp_ttype type,
1535                                      int *out)
1536 {
1537   gcc_assert (out);
1538
1539   cpp_substring_ranges ranges;
1540   const char *err
1541     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1542
1543   if (err)
1544     return err;
1545
1546   *out = ranges.get_num_ranges ();
1547   return NULL;
1548 }
1549
1550 /* Selftests of location handling.  */
1551
1552 /* Helper function for verifying location data: when location_t
1553    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1554    as having column 0.  */
1555
1556 static bool
1557 should_have_column_data_p (location_t loc)
1558 {
1559   if (IS_ADHOC_LOC (loc))
1560     loc = get_location_from_adhoc_loc (line_table, loc);
1561   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1562     return false;
1563   return true;
1564 }
1565
1566 /* Selftest for should_have_column_data_p.  */
1567
1568 static void
1569 test_should_have_column_data_p ()
1570 {
1571   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1572   ASSERT_TRUE
1573     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1574   ASSERT_FALSE
1575     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1576 }
1577
1578 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1579    on LOC.  */
1580
1581 static void
1582 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1583               location_t loc)
1584 {
1585   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1586   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1587   /* If location_t values are sufficiently high, then column numbers
1588      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1589      When close to the threshold, column numbers *may* be present: if
1590      the final linemap before the threshold contains a line that straddles
1591      the threshold, locations in that line have column information.  */
1592   if (should_have_column_data_p (loc))
1593     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1594 }
1595
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests in a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for source_location/location_t beyond which line-map.c
   changes behavior (disabling of the range-packing optimization,
   disabling of column-tracking).  We can exercise these by starting
   the line_table at interesting values at or near these thresholds.

   The following struct describes a particular case within our test
   matrix.  */

struct line_table_case
{
  /* The value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If non-zero, the value at which the line_table's highest_location
     and highest_line start.  */
  int m_base_location;

  line_table_case (int default_range_bits, int base_location)
  : m_default_range_bits (default_range_bits),
    m_base_location (base_location)
  {}
};
1622
1623 /* Constructor.  Store the old value of line_table, and create a new
1624    one, using sane defaults.  */
1625
1626 line_table_test::line_table_test ()
1627 {
1628   gcc_assert (saved_line_table == NULL);
1629   saved_line_table = line_table;
1630   line_table = ggc_alloc<line_maps> ();
1631   linemap_init (line_table, BUILTINS_LOCATION);
1632   gcc_assert (saved_line_table->reallocator);
1633   line_table->reallocator = saved_line_table->reallocator;
1634   gcc_assert (saved_line_table->round_alloc_size);
1635   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1636   line_table->default_range_bits = 0;
1637 }
1638
1639 /* Constructor.  Store the old value of line_table, and create a new
1640    one, using the sitation described in CASE_.  */
1641
1642 line_table_test::line_table_test (const line_table_case &case_)
1643 {
1644   gcc_assert (saved_line_table == NULL);
1645   saved_line_table = line_table;
1646   line_table = ggc_alloc<line_maps> ();
1647   linemap_init (line_table, BUILTINS_LOCATION);
1648   gcc_assert (saved_line_table->reallocator);
1649   line_table->reallocator = saved_line_table->reallocator;
1650   gcc_assert (saved_line_table->round_alloc_size);
1651   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1652   line_table->default_range_bits = case_.m_default_range_bits;
1653   if (case_.m_base_location)
1654     {
1655       line_table->highest_location = case_.m_base_location;
1656       line_table->highest_line = case_.m_base_location;
1657     }
1658 }
1659
1660 /* Destructor.  Restore the old value of line_table.  */
1661
1662 line_table_test::~line_table_test ()
1663 {
1664   gcc_assert (saved_line_table != NULL);
1665   line_table = saved_line_table;
1666   saved_line_table = NULL;
1667 }
1668
1669 /* Verify basic operation of ordinary linemaps.  */
1670
1671 static void
1672 test_accessing_ordinary_linemaps (const line_table_case &case_)
1673 {
1674   line_table_test ltt (case_);
1675
1676   /* Build a simple linemap describing some locations. */
1677   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1678
1679   linemap_line_start (line_table, 1, 100);
1680   location_t loc_a = linemap_position_for_column (line_table, 1);
1681   location_t loc_b = linemap_position_for_column (line_table, 23);
1682
1683   linemap_line_start (line_table, 2, 100);
1684   location_t loc_c = linemap_position_for_column (line_table, 1);
1685   location_t loc_d = linemap_position_for_column (line_table, 17);
1686
1687   /* Example of a very long line.  */
1688   linemap_line_start (line_table, 3, 2000);
1689   location_t loc_e = linemap_position_for_column (line_table, 700);
1690
1691   /* Transitioning back to a short line.  */
1692   linemap_line_start (line_table, 4, 0);
1693   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1694
1695   if (should_have_column_data_p (loc_back_to_short))
1696     {
1697       /* Verify that we switched to short lines in the linemap.  */
1698       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1699       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1700     }
1701
1702   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1703
1704   /* Multiple files.  */
1705   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1706   linemap_line_start (line_table, 1, 200);
1707   location_t loc_f = linemap_position_for_column (line_table, 150);
1708   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1709
1710   /* Verify that we can recover the location info.  */
1711   assert_loceq ("foo.c", 1, 1, loc_a);
1712   assert_loceq ("foo.c", 1, 23, loc_b);
1713   assert_loceq ("foo.c", 2, 1, loc_c);
1714   assert_loceq ("foo.c", 2, 17, loc_d);
1715   assert_loceq ("foo.c", 3, 700, loc_e);
1716   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1717   assert_loceq ("bar.c", 1, 150, loc_f);
1718
1719   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1720   ASSERT_TRUE (pure_location_p (line_table, loc_a));
1721
1722   /* Verify using make_location to build a range, and extracting data
1723      back from it.  */
1724   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
1725   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
1726   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
1727   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
1728   ASSERT_EQ (loc_b, src_range.m_start);
1729   ASSERT_EQ (loc_d, src_range.m_finish);
1730 }
1731
1732 /* Verify various properties of UNKNOWN_LOCATION.  */
1733
1734 static void
1735 test_unknown_location ()
1736 {
1737   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1738   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1739   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1740 }
1741
1742 /* Verify various properties of BUILTINS_LOCATION.  */
1743
1744 static void
1745 test_builtins ()
1746 {
1747   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1748   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1749 }
1750
1751 /* Regression test for make_location.
1752    Ensure that we use pure locations for the start/finish of the range,
1753    rather than storing a packed or ad-hoc range as the start/finish.  */
1754
1755 static void
1756 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1757 {
1758   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1759      with C++ frontend.
1760      ....................0000000001111111111222.
1761      ....................1234567890123456789012.  */
1762   const char *content = "     r += !aaa == bbb;\n";
1763   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1764   line_table_test ltt (case_);
1765   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1766
1767   const location_t c11 = linemap_position_for_column (line_table, 11);
1768   const location_t c12 = linemap_position_for_column (line_table, 12);
1769   const location_t c13 = linemap_position_for_column (line_table, 13);
1770   const location_t c14 = linemap_position_for_column (line_table, 14);
1771   const location_t c21 = linemap_position_for_column (line_table, 21);
1772
1773   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1774     return;
1775
1776   /* Use column 13 for the caret location, arbitrarily, to verify that we
1777      handle start != caret.  */
1778   const location_t aaa = make_location (c13, c12, c14);
1779   ASSERT_EQ (c13, get_pure_location (aaa));
1780   ASSERT_EQ (c12, get_start (aaa));
1781   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1782   ASSERT_EQ (c14, get_finish (aaa));
1783   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1784
1785   /* Make a location using a location with a range as the start-point.  */
1786   const location_t not_aaa = make_location (c11, aaa, c14);
1787   ASSERT_EQ (c11, get_pure_location (not_aaa));
1788   /* It should use the start location of the range, not store the range
1789      itself.  */
1790   ASSERT_EQ (c12, get_start (not_aaa));
1791   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1792   ASSERT_EQ (c14, get_finish (not_aaa));
1793   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1794
1795   /* Similarly, make a location with a range as the end-point.  */
1796   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1797   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1798   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1799   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1800   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1801   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1802   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1803   /* It should use the finish location of the range, not store the range
1804      itself.  */
1805   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1806   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1807   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1808   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1809   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1810 }
1811
1812 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1813
1814 static void
1815 test_reading_source_line ()
1816 {
1817   /* Create a tempfile and write some text to it.  */
1818   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1819                         "01234567890123456789\n"
1820                         "This is the test text\n"
1821                         "This is the 3rd line");
1822
1823   /* Read back a specific line from the tempfile.  */
1824   int line_size;
1825   const char *source_line = location_get_source_line (tmp.get_filename (),
1826                                                       3, &line_size);
1827   ASSERT_TRUE (source_line != NULL);
1828   ASSERT_EQ (20, line_size);
1829   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1830                          source_line, line_size));
1831
1832   source_line = location_get_source_line (tmp.get_filename (),
1833                                           2, &line_size);
1834   ASSERT_TRUE (source_line != NULL);
1835   ASSERT_EQ (21, line_size);
1836   ASSERT_TRUE (!strncmp ("This is the test text",
1837                          source_line, line_size));
1838
1839   source_line = location_get_source_line (tmp.get_filename (),
1840                                           4, &line_size);
1841   ASSERT_TRUE (source_line == NULL);
1842 }
1843
1844 /* Tests of lexing.  */
1845
/* Check that cpp_token_as_text, called on token TOK from PARSER,
   yields a string equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    const char *tok_spelling_                                           \
      = (const char *) cpp_token_as_text ((PARSER), (TOK));             \
    ASSERT_STREQ ((EXPECTED_TEXT), tok_spelling_);                      \
  SELFTEST_END_STMT
1854
1855 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1856    and ranges from EXP_START_COL to EXP_FINISH_COL.
1857    Use LOC as the effective location of the selftest.  */
1858
1859 static void
1860 assert_token_loc_eq (const location &loc,
1861                      const cpp_token *tok,
1862                      const char *exp_filename, int exp_linenum,
1863                      int exp_start_col, int exp_finish_col)
1864 {
1865   location_t tok_loc = tok->src_loc;
1866   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1867   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1868
1869   /* If location_t values are sufficiently high, then column numbers
1870      will be unavailable.  */
1871   if (!should_have_column_data_p (tok_loc))
1872     return;
1873
1874   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1875   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1876   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1877   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1878 }
1879
/* Convenience wrapper around assert_token_loc_eq that checks
   TOK->src_loc, reporting any failure at SELFTEST_LOCATION.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
                            EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION,                               \
                       (TOK),                                           \
                       (EXP_FILENAME),                                  \
                       (EXP_LINENUM),                                   \
                       (EXP_START_COL),                                 \
                       (EXP_FINISH_COL))
1887
1888 /* Test of lexing a file using libcpp, verifying tokens and their
1889    location information.  */
1890
1891 static void
1892 test_lexer (const line_table_case &case_)
1893 {
1894   /* Create a tempfile and write some text to it.  */
1895   const char *content =
1896     /*00000000011111111112222222222333333.3333444444444.455555555556
1897       12345678901234567890123456789012345.6789012345678.901234567890.  */
1898     ("test_name /* c-style comment */\n"
1899      "                                  \"test literal\"\n"
1900      " // test c++-style comment\n"
1901      "   42\n");
1902   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
1903
1904   line_table_test ltt (case_);
1905
1906   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
1907
1908   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
1909   ASSERT_NE (fname, NULL);
1910
1911   /* Verify that we get the expected tokens back, with the correct
1912      location information.  */
1913
1914   location_t loc;
1915   const cpp_token *tok;
1916   tok = cpp_get_token_with_location (parser, &loc);
1917   ASSERT_NE (tok, NULL);
1918   ASSERT_EQ (tok->type, CPP_NAME);
1919   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
1920   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
1921
1922   tok = cpp_get_token_with_location (parser, &loc);
1923   ASSERT_NE (tok, NULL);
1924   ASSERT_EQ (tok->type, CPP_STRING);
1925   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
1926   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
1927
1928   tok = cpp_get_token_with_location (parser, &loc);
1929   ASSERT_NE (tok, NULL);
1930   ASSERT_EQ (tok->type, CPP_NUMBER);
1931   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
1932   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
1933
1934   tok = cpp_get_token_with_location (parser, &loc);
1935   ASSERT_NE (tok, NULL);
1936   ASSERT_EQ (tok->type, CPP_EOF);
1937
1938   cpp_finish (parser, NULL);
1939   cpp_destroy (parser);
1940 }
1941
1942 /* Forward decls.  */
1943
1944 struct lexer_test;
1945 class lexer_test_options;
1946
1947 /* A class for specifying options of a lexer_test.
1948    The "apply" vfunc is called during the lexer_test constructor.  */
1949
1950 class lexer_test_options
1951 {
1952  public:
1953   virtual void apply (lexer_test &) = 0;
1954 };
1955
1956 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
1957    in its dtor.
1958
1959    This is needed by struct lexer_test to ensure that the cleanup of the
1960    cpp_reader happens *after* the cleanup of the temp_source_file.  */
1961
1962 class cpp_reader_ptr
1963 {
1964  public:
1965   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
1966
1967   ~cpp_reader_ptr ()
1968   {
1969     cpp_finish (m_ptr, NULL);
1970     cpp_destroy (m_ptr);
1971   }
1972
1973   operator cpp_reader * () const { return m_ptr; }
1974
1975  private:
1976   cpp_reader *m_ptr;
1977 };
1978
1979 /* A struct for writing lexer tests.  */
1980
1981 struct lexer_test
1982 {
1983   lexer_test (const line_table_case &case_, const char *content,
1984               lexer_test_options *options);
1985   ~lexer_test ();
1986
1987   const cpp_token *get_token ();
1988
1989   /* The ordering of these fields matters.
1990      The line_table_test must be first, since the cpp_reader_ptr
1991      uses it.
1992      The cpp_reader must be cleaned up *after* the temp_source_file
1993      since the filenames in input.c's input cache are owned by the
1994      cpp_reader; in particular, when ~temp_source_file evicts the
1995      filename the filenames must still be alive.  */
1996   line_table_test m_ltt;
1997   cpp_reader_ptr m_parser;
1998   temp_source_file m_tempfile;
1999   string_concat_db m_concats;
2000   bool m_implicitly_expect_EOF;
2001 };
2002
2003 /* Use an EBCDIC encoding for the execution charset, specifically
2004    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2005
2006    This exercises iconv integration within libcpp.
2007    Not every build of iconv supports the given charset,
2008    so we need to flag this error and handle it gracefully.  */
2009
2010 class ebcdic_execution_charset : public lexer_test_options
2011 {
2012  public:
2013   ebcdic_execution_charset () : m_num_iconv_errors (0)
2014     {
2015       gcc_assert (s_singleton == NULL);
2016       s_singleton = this;
2017     }
2018   ~ebcdic_execution_charset ()
2019     {
2020       gcc_assert (s_singleton == this);
2021       s_singleton = NULL;
2022     }
2023
2024   void apply (lexer_test &test) FINAL OVERRIDE
2025   {
2026     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2027     cpp_opts->narrow_charset = "IBM1047";
2028
2029     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2030     callbacks->error = on_error;
2031   }
2032
2033   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2034                         int level ATTRIBUTE_UNUSED,
2035                         int reason ATTRIBUTE_UNUSED,
2036                         rich_location *richloc ATTRIBUTE_UNUSED,
2037                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2038     ATTRIBUTE_FPTR_PRINTF(5,0)
2039   {
2040     gcc_assert (s_singleton);
2041     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2042     const char *msg = "conversion from %s to %s not supported by iconv";
2043 #ifdef ENABLE_NLS
2044     msg = dgettext ("cpplib", msg);
2045 #endif
2046     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2047        when the local iconv build doesn't support the conversion.  */
2048     if (strcmp (msgid, msg) == 0)
2049       {
2050         s_singleton->m_num_iconv_errors++;
2051         return true;
2052       }
2053
2054     /* Otherwise, we have an unexpected error.  */
2055     abort ();
2056   }
2057
2058   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2059
2060  private:
2061   static ebcdic_execution_charset *s_singleton;
2062   int m_num_iconv_errors;
2063 };
2064
2065 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2066
2067 /* A lexer_test_options subclass that records a list of error
2068    messages emitted by the lexer.  */
2069
2070 class lexer_error_sink : public lexer_test_options
2071 {
2072  public:
2073   lexer_error_sink ()
2074   {
2075     gcc_assert (s_singleton == NULL);
2076     s_singleton = this;
2077   }
2078   ~lexer_error_sink ()
2079   {
2080     gcc_assert (s_singleton == this);
2081     s_singleton = NULL;
2082
2083     int i;
2084     char *str;
2085     FOR_EACH_VEC_ELT (m_errors, i, str)
2086       free (str);
2087   }
2088
2089   void apply (lexer_test &test) FINAL OVERRIDE
2090   {
2091     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2092     callbacks->error = on_error;
2093   }
2094
2095   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2096                         int level ATTRIBUTE_UNUSED,
2097                         int reason ATTRIBUTE_UNUSED,
2098                         rich_location *richloc ATTRIBUTE_UNUSED,
2099                         const char *msgid, va_list *ap)
2100     ATTRIBUTE_FPTR_PRINTF(5,0)
2101   {
2102     char *msg = xvasprintf (msgid, *ap);
2103     s_singleton->m_errors.safe_push (msg);
2104     return true;
2105   }
2106
2107   auto_vec<char *> m_errors;
2108
2109  private:
2110   static lexer_error_sink *s_singleton;
2111 };
2112
2113 lexer_error_sink *lexer_error_sink::s_singleton;
2114
2115 /* Constructor.  Override line_table with a new instance based on CASE_,
2116    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2117    start parsing the tempfile.  */
2118
2119 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2120                         lexer_test_options *options)
2121 : m_ltt (case_),
2122   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2123   /* Create a tempfile and write the text to it.  */
2124   m_tempfile (SELFTEST_LOCATION, ".c", content),
2125   m_concats (),
2126   m_implicitly_expect_EOF (true)
2127 {
2128   if (options)
2129     options->apply (*this);
2130
2131   cpp_init_iconv (m_parser);
2132
2133   /* Parse the file.  */
2134   const char *fname = cpp_read_main_file (m_parser,
2135                                           m_tempfile.get_filename ());
2136   ASSERT_NE (fname, NULL);
2137 }
2138
2139 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2140
2141 lexer_test::~lexer_test ()
2142 {
2143   location_t loc;
2144   const cpp_token *tok;
2145
2146   if (m_implicitly_expect_EOF)
2147     {
2148       tok = cpp_get_token_with_location (m_parser, &loc);
2149       ASSERT_NE (tok, NULL);
2150       ASSERT_EQ (tok->type, CPP_EOF);
2151     }
2152 }
2153
2154 /* Get the next token from m_parser.  */
2155
2156 const cpp_token *
2157 lexer_test::get_token ()
2158 {
2159   location_t loc;
2160   const cpp_token *tok;
2161
2162   tok = cpp_get_token_with_location (m_parser, &loc);
2163   ASSERT_NE (tok, NULL);
2164   return tok;
2165 }
2166
2167 /* Verify that locations within string literals are correctly handled.  */
2168
2169 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2170    using the string concatenation database for TEST.
2171
2172    Assert that the character at index IDX is on EXPECTED_LINE,
2173    and that it begins at column EXPECTED_START_COL and ends at
2174    EXPECTED_FINISH_COL (unless the locations are beyond
2175    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2176    columns).  */
2177
2178 static void
2179 assert_char_at_range (const location &loc,
2180                       lexer_test& test,
2181                       location_t strloc, enum cpp_ttype type, int idx,
2182                       int expected_line, int expected_start_col,
2183                       int expected_finish_col)
2184 {
2185   cpp_reader *pfile = test.m_parser;
2186   string_concat_db *concats = &test.m_concats;
2187
2188   source_range actual_range = source_range();
2189   const char *err
2190     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2191                                  &actual_range);
2192   if (should_have_column_data_p (strloc))
2193     ASSERT_EQ_AT (loc, NULL, err);
2194   else
2195     {
2196       ASSERT_STREQ_AT (loc,
2197                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2198                        err);
2199       return;
2200     }
2201
2202   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2203   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2204   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2205   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2206
2207   if (should_have_column_data_p (actual_range.m_start))
2208     {
2209       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2210       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2211     }
2212   if (should_have_column_data_p (actual_range.m_finish))
2213     {
2214       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2215       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2216     }
2217 }
2218
/* Convenience wrapper around assert_char_at_range that reports any
   failure at SELFTEST_LOCATION.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,             \
                             EXPECTED_LINE, EXPECTED_START_COL,         \
                             EXPECTED_FINISH_COL)                       \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (LEXER_TEST),                                   \
                        (STRLOC),                                       \
                        (TYPE),                                         \
                        (IDX),                                          \
                        (EXPECTED_LINE),                                \
                        (EXPECTED_START_COL),                           \
                        (EXPECTED_FINISH_COL))
2227
2228 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2229    using the string concatenation database for TEST.
2230
2231    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2232
2233 static void
2234 assert_num_substring_ranges (const location &loc,
2235                              lexer_test& test,
2236                              location_t strloc,
2237                              enum cpp_ttype type,
2238                              int expected_num_ranges)
2239 {
2240   cpp_reader *pfile = test.m_parser;
2241   string_concat_db *concats = &test.m_concats;
2242
2243   int actual_num_ranges = -1;
2244   const char *err
2245     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2246                                            &actual_num_ranges);
2247   if (should_have_column_data_p (strloc))
2248     ASSERT_EQ_AT (loc, NULL, err);
2249   else
2250     {
2251       ASSERT_STREQ_AT (loc,
2252                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2253                        err);
2254       return;
2255     }
2256   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2257 }
2258
/* Convenience wrapper around assert_num_substring_ranges that reports
   any failure at SELFTEST_LOCATION.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,           \
                                    EXPECTED_NUM_RANGES)                \
  assert_num_substring_ranges (SELFTEST_LOCATION,                       \
                               (LEXER_TEST),                            \
                               (STRLOC),                                \
                               (TYPE),                                  \
                               (EXPECTED_NUM_RANGES))
2266
2267
2268 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2269    returns an error (using the string concatenation database for TEST).  */
2270
2271 static void
2272 assert_has_no_substring_ranges (const location &loc,
2273                                 lexer_test& test,
2274                                 location_t strloc,
2275                                 enum cpp_ttype type,
2276                                 const char *expected_err)
2277 {
2278   cpp_reader *pfile = test.m_parser;
2279   string_concat_db *concats = &test.m_concats;
2280   cpp_substring_ranges ranges;
2281   const char *actual_err
2282     = get_substring_ranges_for_loc (pfile, concats, strloc,
2283                                     type, ranges);
2284   if (should_have_column_data_p (strloc))
2285     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2286   else
2287     ASSERT_STREQ_AT (loc,
2288                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2289                      actual_err);
2290 }
2291
/* Convenience wrapper around assert_has_no_substring_ranges that
   reports any failure at SELFTEST_LOCATION.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)   \
  assert_has_no_substring_ranges (SELFTEST_LOCATION,                    \
                                  (LEXER_TEST),                         \
                                  (STRLOC),                             \
                                  (TYPE),                               \
                                  (ERR))
2295
2296 /* Lex a simple string literal.  Verify the substring location data, before
2297    and after running cpp_interpret_string on it.  */
2298
2299 static void
2300 test_lexer_string_locations_simple (const line_table_case &case_)
2301 {
2302   /* Digits 0-9 (with 0 at column 10), the simple way.
2303      ....................000000000.11111111112.2222222223333333333
2304      ....................123456789.01234567890.1234567890123456789
2305      We add a trailing comment to ensure that we correctly locate
2306      the end of the string literal token.  */
2307   const char *content = "        \"0123456789\" /* not a string */\n";
2308   lexer_test test (case_, content, NULL);
2309
2310   /* Verify that we get the expected token back, with the correct
2311      location information.  */
2312   const cpp_token *tok = test.get_token ();
2313   ASSERT_EQ (tok->type, CPP_STRING);
2314   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2315   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2316
2317   /* At this point in lexing, the quote characters are treated as part of
2318      the string (they are stripped off by cpp_interpret_string).  */
2319
2320   ASSERT_EQ (tok->val.str.len, 12);
2321
2322   /* Verify that cpp_interpret_string works.  */
2323   cpp_string dst_string;
2324   const enum cpp_ttype type = CPP_STRING;
2325   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2326                                       &dst_string, type);
2327   ASSERT_TRUE (result);
2328   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2329   free (const_cast <unsigned char *> (dst_string.text));
2330
2331   /* Verify ranges of individual characters.  This no longer includes the
2332      opening quote, but does include the closing quote.  */
2333   for (int i = 0; i <= 10; i++)
2334     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2335                           10 + i, 10 + i);
2336
2337   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2338 }
2339
2340 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2341    encoding.  */
2342
2343 static void
2344 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2345 {
2346   /* EBCDIC support requires iconv.  */
2347   if (!HAVE_ICONV)
2348     return;
2349
2350   /* Digits 0-9 (with 0 at column 10), the simple way.
2351      ....................000000000.11111111112.2222222223333333333
2352      ....................123456789.01234567890.1234567890123456789
2353      We add a trailing comment to ensure that we correctly locate
2354      the end of the string literal token.  */
2355   const char *content = "        \"0123456789\" /* not a string */\n";
2356   ebcdic_execution_charset use_ebcdic;
2357   lexer_test test (case_, content, &use_ebcdic);
2358
2359   /* Verify that we get the expected token back, with the correct
2360      location information.  */
2361   const cpp_token *tok = test.get_token ();
2362   ASSERT_EQ (tok->type, CPP_STRING);
2363   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2364   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2365
2366   /* At this point in lexing, the quote characters are treated as part of
2367      the string (they are stripped off by cpp_interpret_string).  */
2368
2369   ASSERT_EQ (tok->val.str.len, 12);
2370
2371   /* The remainder of the test requires an iconv implementation that
2372      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2373   if (use_ebcdic.iconv_errors_occurred_p ())
2374     return;
2375
2376   /* Verify that cpp_interpret_string works.  */
2377   cpp_string dst_string;
2378   const enum cpp_ttype type = CPP_STRING;
2379   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2380                                       &dst_string, type);
2381   ASSERT_TRUE (result);
2382   /* We should now have EBCDIC-encoded text, specifically
2383      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2384      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2385   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2386                 (const char *)dst_string.text);
2387   free (const_cast <unsigned char *> (dst_string.text));
2388
2389   /* Verify that we don't attempt to record substring location information
2390      for such cases.  */
2391   ASSERT_HAS_NO_SUBSTRING_RANGES
2392     (test, tok->src_loc, type,
2393      "execution character set != source character set");
2394 }
2395
2396 /* Lex a string literal containing a hex-escaped character.
2397    Verify the substring location data, before and after running
2398    cpp_interpret_string on it.  */
2399
2400 static void
2401 test_lexer_string_locations_hex (const line_table_case &case_)
2402 {
2403   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2404      and with a space in place of digit 6, to terminate the escaped
2405      hex code.
2406      ....................000000000.111111.11112222.
2407      ....................123456789.012345.67890123.  */
2408   const char *content = "        \"01234\\x35 789\"\n";
2409   lexer_test test (case_, content, NULL);
2410
2411   /* Verify that we get the expected token back, with the correct
2412      location information.  */
2413   const cpp_token *tok = test.get_token ();
2414   ASSERT_EQ (tok->type, CPP_STRING);
2415   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2416   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2417
2418   /* At this point in lexing, the quote characters are treated as part of
2419      the string (they are stripped off by cpp_interpret_string).  */
2420   ASSERT_EQ (tok->val.str.len, 15);
2421
2422   /* Verify that cpp_interpret_string works.  */
2423   cpp_string dst_string;
2424   const enum cpp_ttype type = CPP_STRING;
2425   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2426                                       &dst_string, type);
2427   ASSERT_TRUE (result);
2428   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2429   free (const_cast <unsigned char *> (dst_string.text));
2430
2431   /* Verify ranges of individual characters.  This no longer includes the
2432      opening quote, but does include the closing quote.  */
2433   for (int i = 0; i <= 4; i++)
2434     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2435   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2436   for (int i = 6; i <= 10; i++)
2437     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2438
2439   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2440 }
2441
2442 /* Lex a string literal containing an octal-escaped character.
2443    Verify the substring location data after running cpp_interpret_string
2444    on it.  */
2445
2446 static void
2447 test_lexer_string_locations_oct (const line_table_case &case_)
2448 {
2449   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2450      and with a space in place of digit 6, to terminate the escaped
2451      octal code.
2452      ....................000000000.111111.11112222.2222223333333333444
2453      ....................123456789.012345.67890123.4567890123456789012  */
2454   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2455   lexer_test test (case_, content, NULL);
2456
2457   /* Verify that we get the expected token back, with the correct
2458      location information.  */
2459   const cpp_token *tok = test.get_token ();
2460   ASSERT_EQ (tok->type, CPP_STRING);
2461   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2462
2463   /* Verify that cpp_interpret_string works.  */
2464   cpp_string dst_string;
2465   const enum cpp_ttype type = CPP_STRING;
2466   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2467                                       &dst_string, type);
2468   ASSERT_TRUE (result);
2469   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2470   free (const_cast <unsigned char *> (dst_string.text));
2471
2472   /* Verify ranges of individual characters.  This no longer includes the
2473      opening quote, but does include the closing quote.  */
2474   for (int i = 0; i < 5; i++)
2475     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2476   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2477   for (int i = 6; i <= 10; i++)
2478     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2479
2480   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2481 }
2482
2483 /* Test of string literal containing letter escapes.  */
2484
2485 static void
2486 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2487 {
2488   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2489      .....................000000000.1.11111.1.1.11222.22222223333333
2490      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2491   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2492   lexer_test test (case_, content, NULL);
2493
2494   /* Verify that we get the expected tokens back.  */
2495   const cpp_token *tok = test.get_token ();
2496   ASSERT_EQ (tok->type, CPP_STRING);
2497   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2498
2499   /* Verify ranges of individual characters. */
2500   /* "\t".  */
2501   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2502                         0, 1, 10, 11);
2503   /* "foo". */
2504   for (int i = 1; i <= 3; i++)
2505     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2506                           i, 1, 11 + i, 11 + i);
2507   /* "\\" and "\n".  */
2508   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2509                         4, 1, 15, 16);
2510   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2511                         5, 1, 17, 18);
2512
2513   /* "bar" and closing quote for nul-terminator.  */
2514   for (int i = 6; i <= 9; i++)
2515     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2516                           i, 1, 13 + i, 13 + i);
2517
2518   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2519 }
2520
2521 /* Another test of a string literal containing a letter escape.
2522    Based on string seen in
2523      printf ("%-%\n");
2524    in gcc.dg/format/c90-printf-1.c.  */
2525
2526 static void
2527 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2528 {
2529   /* .....................000000000.1111.11.1111.22222222223.
2530      .....................123456789.0123.45.6789.01234567890.  */
2531   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2532   lexer_test test (case_, content, NULL);
2533
2534   /* Verify that we get the expected tokens back.  */
2535   const cpp_token *tok = test.get_token ();
2536   ASSERT_EQ (tok->type, CPP_STRING);
2537   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2538
2539   /* Verify ranges of individual characters. */
2540   /* "%-%".  */
2541   for (int i = 0; i < 3; i++)
2542     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2543                           i, 1, 10 + i, 10 + i);
2544   /* "\n".  */
2545   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2546                         3, 1, 13, 14);
2547
2548   /* Closing quote for nul-terminator.  */
2549   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2550                         4, 1, 15, 15);
2551
2552   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2553 }
2554
2555 /* Lex a string literal containing UCN 4 characters.
2556    Verify the substring location data after running cpp_interpret_string
2557    on it.  */
2558
2559 static void
2560 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2561 {
2562   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2563      as UCN 4.
2564      ....................000000000.111111.111122.222222223.33333333344444
2565      ....................123456789.012345.678901.234567890.12345678901234  */
2566   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2567   lexer_test test (case_, content, NULL);
2568
2569   /* Verify that we get the expected token back, with the correct
2570      location information.  */
2571   const cpp_token *tok = test.get_token ();
2572   ASSERT_EQ (tok->type, CPP_STRING);
2573   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2574
2575   /* Verify that cpp_interpret_string works.
2576      The string should be encoded in the execution character
2577      set.  Assuming that that is UTF-8, we should have the following:
2578      -----------  ----  -----  -------  ----------------
2579      Byte offset  Byte  Octal  Unicode  Source Column(s)
2580      -----------  ----  -----  -------  ----------------
2581      0            0x30         '0'      10
2582      1            0x31         '1'      11
2583      2            0x32         '2'      12
2584      3            0x33         '3'      13
2585      4            0x34         '4'      14
2586      5            0xE2  \342   U+2174   15-20
2587      6            0x85  \205    (cont)  15-20
2588      7            0xB4  \264    (cont)  15-20
2589      8            0xE2  \342   U+2175   21-26
2590      9            0x85  \205    (cont)  21-26
2591      10           0xB5  \265    (cont)  21-26
2592      11           0x37         '7'      27
2593      12           0x38         '8'      28
2594      13           0x39         '9'      29
2595      14           0x00                  30 (closing quote)
2596      -----------  ----  -----  -------  ---------------.  */
2597
2598   cpp_string dst_string;
2599   const enum cpp_ttype type = CPP_STRING;
2600   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2601                                       &dst_string, type);
2602   ASSERT_TRUE (result);
2603   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2604                 (const char *)dst_string.text);
2605   free (const_cast <unsigned char *> (dst_string.text));
2606
2607   /* Verify ranges of individual characters.  This no longer includes the
2608      opening quote, but does include the closing quote.
2609      '01234'.  */
2610   for (int i = 0; i <= 4; i++)
2611     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2612   /* U+2174.  */
2613   for (int i = 5; i <= 7; i++)
2614     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2615   /* U+2175.  */
2616   for (int i = 8; i <= 10; i++)
2617     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2618   /* '789' and nul terminator  */
2619   for (int i = 11; i <= 14; i++)
2620     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2621
2622   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2623 }
2624
2625 /* Lex a string literal containing UCN 8 characters.
2626    Verify the substring location data after running cpp_interpret_string
2627    on it.  */
2628
2629 static void
2630 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2631 {
2632   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2633      ....................000000000.111111.1111222222.2222333333333.344444
2634      ....................123456789.012345.6789012345.6789012345678.901234  */
2635   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2636   lexer_test test (case_, content, NULL);
2637
2638   /* Verify that we get the expected token back, with the correct
2639      location information.  */
2640   const cpp_token *tok = test.get_token ();
2641   ASSERT_EQ (tok->type, CPP_STRING);
2642   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2643                            "\"01234\\U00002174\\U00002175789\"");
2644
2645   /* Verify that cpp_interpret_string works.
2646      The UTF-8 encoding of the string is identical to that from
2647      the ucn4 testcase above; the only difference is the column
2648      locations.  */
2649   cpp_string dst_string;
2650   const enum cpp_ttype type = CPP_STRING;
2651   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2652                                       &dst_string, type);
2653   ASSERT_TRUE (result);
2654   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2655                 (const char *)dst_string.text);
2656   free (const_cast <unsigned char *> (dst_string.text));
2657
2658   /* Verify ranges of individual characters.  This no longer includes the
2659      opening quote, but does include the closing quote.
2660      '01234'.  */
2661   for (int i = 0; i <= 4; i++)
2662     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2663   /* U+2174.  */
2664   for (int i = 5; i <= 7; i++)
2665     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2666   /* U+2175.  */
2667   for (int i = 8; i <= 10; i++)
2668     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2669   /* '789' at columns 35-37  */
2670   for (int i = 11; i <= 13; i++)
2671     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2672   /* Closing quote/nul-terminator at column 38.  */
2673   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2674
2675   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2676 }
2677
/* Read the 32-bit value stored most-significant-byte-first at
   PTR_BE_VALUE and return it in host byte order.  The storage is read
   one byte at a time, so the result does not depend on the host's
   endianness.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t host_value = 0;

  /* Fold the four bytes in, most significant first.  */
  for (int i = 0; i < 4; i++)
    host_value = (host_value << 8) | bytes[i];

  return host_value;
}
2689
2690 /* Lex a wide string literal and verify that attempts to read substring
2691    location data from it fail gracefully.  */
2692
2693 static void
2694 test_lexer_string_locations_wide_string (const line_table_case &case_)
2695 {
2696   /* Digits 0-9.
2697      ....................000000000.11111111112.22222222233333
2698      ....................123456789.01234567890.12345678901234  */
2699   const char *content = "       L\"0123456789\" /* non-str */\n";
2700   lexer_test test (case_, content, NULL);
2701
2702   /* Verify that we get the expected token back, with the correct
2703      location information.  */
2704   const cpp_token *tok = test.get_token ();
2705   ASSERT_EQ (tok->type, CPP_WSTRING);
2706   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2707
2708   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2709   cpp_string dst_string;
2710   const enum cpp_ttype type = CPP_WSTRING;
2711   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2712                                       &dst_string, type);
2713   ASSERT_TRUE (result);
2714   /* The cpp_reader defaults to big-endian with
2715      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2716      now be encoded as UTF-32BE.  */
2717   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2718   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2719   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2720   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2721   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2722   free (const_cast <unsigned char *> (dst_string.text));
2723
2724   /* We don't yet support generating substring location information
2725      for L"" strings.  */
2726   ASSERT_HAS_NO_SUBSTRING_RANGES
2727     (test, tok->src_loc, type,
2728      "execution character set != source character set");
2729 }
2730
/* Read the 16-bit value stored most-significant-byte-first at
   PTR_BE_VALUE and return it in host byte order.  The storage is read
   one byte at a time, so the result does not depend on the host's
   endianness.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];

  return (uint16_t) ((high << 8) | low);
}
2739
2740 /* Lex a u"" string literal and verify that attempts to read substring
2741    location data from it fail gracefully.  */
2742
2743 static void
2744 test_lexer_string_locations_string16 (const line_table_case &case_)
2745 {
2746   /* Digits 0-9.
2747      ....................000000000.11111111112.22222222233333
2748      ....................123456789.01234567890.12345678901234  */
2749   const char *content = "       u\"0123456789\" /* non-str */\n";
2750   lexer_test test (case_, content, NULL);
2751
2752   /* Verify that we get the expected token back, with the correct
2753      location information.  */
2754   const cpp_token *tok = test.get_token ();
2755   ASSERT_EQ (tok->type, CPP_STRING16);
2756   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2757
2758   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2759   cpp_string dst_string;
2760   const enum cpp_ttype type = CPP_STRING16;
2761   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2762                                       &dst_string, type);
2763   ASSERT_TRUE (result);
2764
2765   /* The cpp_reader defaults to big-endian, so dst_string should
2766      now be encoded as UTF-16BE.  */
2767   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2768   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2769   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2770   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2771   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2772   free (const_cast <unsigned char *> (dst_string.text));
2773
2774   /* We don't yet support generating substring location information
2775      for L"" strings.  */
2776   ASSERT_HAS_NO_SUBSTRING_RANGES
2777     (test, tok->src_loc, type,
2778      "execution character set != source character set");
2779 }
2780
2781 /* Lex a U"" string literal and verify that attempts to read substring
2782    location data from it fail gracefully.  */
2783
2784 static void
2785 test_lexer_string_locations_string32 (const line_table_case &case_)
2786 {
2787   /* Digits 0-9.
2788      ....................000000000.11111111112.22222222233333
2789      ....................123456789.01234567890.12345678901234  */
2790   const char *content = "       U\"0123456789\" /* non-str */\n";
2791   lexer_test test (case_, content, NULL);
2792
2793   /* Verify that we get the expected token back, with the correct
2794      location information.  */
2795   const cpp_token *tok = test.get_token ();
2796   ASSERT_EQ (tok->type, CPP_STRING32);
2797   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2798
2799   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2800   cpp_string dst_string;
2801   const enum cpp_ttype type = CPP_STRING32;
2802   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2803                                       &dst_string, type);
2804   ASSERT_TRUE (result);
2805
2806   /* The cpp_reader defaults to big-endian, so dst_string should
2807      now be encoded as UTF-32BE.  */
2808   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2809   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2810   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2811   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2812   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2813   free (const_cast <unsigned char *> (dst_string.text));
2814
2815   /* We don't yet support generating substring location information
2816      for L"" strings.  */
2817   ASSERT_HAS_NO_SUBSTRING_RANGES
2818     (test, tok->src_loc, type,
2819      "execution character set != source character set");
2820 }
2821
2822 /* Lex a u8-string literal.
2823    Verify the substring location data after running cpp_interpret_string
2824    on it.  */
2825
2826 static void
2827 test_lexer_string_locations_u8 (const line_table_case &case_)
2828 {
2829   /* Digits 0-9.
2830      ....................000000000.11111111112.22222222233333
2831      ....................123456789.01234567890.12345678901234  */
2832   const char *content = "      u8\"0123456789\" /* non-str */\n";
2833   lexer_test test (case_, content, NULL);
2834
2835   /* Verify that we get the expected token back, with the correct
2836      location information.  */
2837   const cpp_token *tok = test.get_token ();
2838   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2839   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2840
2841   /* Verify that cpp_interpret_string works.  */
2842   cpp_string dst_string;
2843   const enum cpp_ttype type = CPP_STRING;
2844   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2845                                       &dst_string, type);
2846   ASSERT_TRUE (result);
2847   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2848   free (const_cast <unsigned char *> (dst_string.text));
2849
2850   /* Verify ranges of individual characters.  This no longer includes the
2851      opening quote, but does include the closing quote.  */
2852   for (int i = 0; i <= 10; i++)
2853     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2854 }
2855
2856 /* Lex a string literal containing UTF-8 source characters.
2857    Verify the substring location data after running cpp_interpret_string
2858    on it.  */
2859
2860 static void
2861 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2862 {
2863  /* This string literal is written out to the source file as UTF-8,
2864     and is of the form "before mojibake after", where "mojibake"
2865     is written as the following four unicode code points:
2866        U+6587 CJK UNIFIED IDEOGRAPH-6587
2867        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2868        U+5316 CJK UNIFIED IDEOGRAPH-5316
2869        U+3051 HIRAGANA LETTER KE.
2870      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2871      "before" and "after" are 1 byte per unicode character.
2872
2873      The numbering shown are "columns", which are *byte* numbers within
2874      the line, rather than unicode character numbers.
2875
2876      .................... 000000000.1111111.
2877      .................... 123456789.0123456.  */
2878   const char *content = ("        \"before "
2879                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2880                               UTF-8: 0xE6 0x96 0x87
2881                               C octal escaped UTF-8: \346\226\207
2882                             "column" numbers: 17-19.  */
2883                          "\346\226\207"
2884
2885                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2886                               UTF-8: 0xE5 0xAD 0x97
2887                               C octal escaped UTF-8: \345\255\227
2888                             "column" numbers: 20-22.  */
2889                          "\345\255\227"
2890
2891                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2892                               UTF-8: 0xE5 0x8C 0x96
2893                               C octal escaped UTF-8: \345\214\226
2894                             "column" numbers: 23-25.  */
2895                          "\345\214\226"
2896
2897                          /* U+3051 HIRAGANA LETTER KE
2898                               UTF-8: 0xE3 0x81 0x91
2899                               C octal escaped UTF-8: \343\201\221
2900                             "column" numbers: 26-28.  */
2901                          "\343\201\221"
2902
2903                          /* column numbers 29 onwards
2904                           2333333.33334444444444
2905                           9012345.67890123456789. */
2906                          " after\" /* non-str */\n");
2907   lexer_test test (case_, content, NULL);
2908
2909   /* Verify that we get the expected token back, with the correct
2910      location information.  */
2911   const cpp_token *tok = test.get_token ();
2912   ASSERT_EQ (tok->type, CPP_STRING);
2913   ASSERT_TOKEN_AS_TEXT_EQ
2914     (test.m_parser, tok,
2915      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2916
2917   /* Verify that cpp_interpret_string works.  */
2918   cpp_string dst_string;
2919   const enum cpp_ttype type = CPP_STRING;
2920   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2921                                       &dst_string, type);
2922   ASSERT_TRUE (result);
2923   ASSERT_STREQ
2924     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2925      (const char *)dst_string.text);
2926   free (const_cast <unsigned char *> (dst_string.text));
2927
2928   /* Verify ranges of individual characters.  This no longer includes the
2929      opening quote, but does include the closing quote.
2930      Assuming that both source and execution encodings are UTF-8, we have
2931      a run of 25 octets in each, plus the NUL terminator.  */
2932   for (int i = 0; i < 25; i++)
2933     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2934   /* NUL-terminator should use the closing quote at column 35.  */
2935   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2936
2937   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2938 }
2939
2940 /* Test of string literal concatenation.  */
2941
2942 static void
2943 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2944 {
2945   /* Digits 0-9.
2946      .....................000000000.111111.11112222222222
2947      .....................123456789.012345.67890123456789.  */
2948   const char *content = ("        \"01234\" /* non-str */\n"
2949                          "        \"56789\" /* non-str */\n");
2950   lexer_test test (case_, content, NULL);
2951
2952   location_t input_locs[2];
2953
2954   /* Verify that we get the expected tokens back.  */
2955   auto_vec <cpp_string> input_strings;
2956   const cpp_token *tok_a = test.get_token ();
2957   ASSERT_EQ (tok_a->type, CPP_STRING);
2958   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2959   input_strings.safe_push (tok_a->val.str);
2960   input_locs[0] = tok_a->src_loc;
2961
2962   const cpp_token *tok_b = test.get_token ();
2963   ASSERT_EQ (tok_b->type, CPP_STRING);
2964   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2965   input_strings.safe_push (tok_b->val.str);
2966   input_locs[1] = tok_b->src_loc;
2967
2968   /* Verify that cpp_interpret_string works.  */
2969   cpp_string dst_string;
2970   const enum cpp_ttype type = CPP_STRING;
2971   bool result = cpp_interpret_string (test.m_parser,
2972                                       input_strings.address (), 2,
2973                                       &dst_string, type);
2974   ASSERT_TRUE (result);
2975   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2976   free (const_cast <unsigned char *> (dst_string.text));
2977
2978   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2979   test.m_concats.record_string_concatenation (2, input_locs);
2980
2981   location_t initial_loc = input_locs[0];
2982
2983   /* "01234" on line 1.  */
2984   for (int i = 0; i <= 4; i++)
2985     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
2986   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
2987   for (int i = 5; i <= 10; i++)
2988     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
2989
2990   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2991 }
2992
2993 /* Another test of string literal concatenation.  */
2994
2995 static void
2996 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
2997 {
2998   /* Digits 0-9.
2999      .....................000000000.111.11111112222222
3000      .....................123456789.012.34567890123456.  */
3001   const char *content = ("        \"01\" /* non-str */\n"
3002                          "        \"23\" /* non-str */\n"
3003                          "        \"45\" /* non-str */\n"
3004                          "        \"67\" /* non-str */\n"
3005                          "        \"89\" /* non-str */\n");
3006   lexer_test test (case_, content, NULL);
3007
3008   auto_vec <cpp_string> input_strings;
3009   location_t input_locs[5];
3010
3011   /* Verify that we get the expected tokens back.  */
3012   for (int i = 0; i < 5; i++)
3013     {
3014       const cpp_token *tok = test.get_token ();
3015       ASSERT_EQ (tok->type, CPP_STRING);
3016       input_strings.safe_push (tok->val.str);
3017       input_locs[i] = tok->src_loc;
3018     }
3019
3020   /* Verify that cpp_interpret_string works.  */
3021   cpp_string dst_string;
3022   const enum cpp_ttype type = CPP_STRING;
3023   bool result = cpp_interpret_string (test.m_parser,
3024                                       input_strings.address (), 5,
3025                                       &dst_string, type);
3026   ASSERT_TRUE (result);
3027   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3028   free (const_cast <unsigned char *> (dst_string.text));
3029
3030   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3031   test.m_concats.record_string_concatenation (5, input_locs);
3032
3033   location_t initial_loc = input_locs[0];
3034
3035   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3036      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3037      and expect get_source_range_for_substring to fail.
3038      However, for a string concatenation test, we can have a case
3039      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3040      but subsequent strings can be after it.
3041      Attempting to detect this within assert_char_at_range
3042      would overcomplicate the logic for the common test cases, so
3043      we detect it here.  */
3044   if (should_have_column_data_p (input_locs[0])
3045       && !should_have_column_data_p (input_locs[4]))
3046     {
3047       /* Verify that get_source_range_for_substring gracefully rejects
3048          this case.  */
3049       source_range actual_range;
3050       const char *err
3051         = get_source_range_for_char (test.m_parser, &test.m_concats,
3052                                      initial_loc, type, 0, &actual_range);
3053       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3054       return;
3055     }
3056
3057   for (int i = 0; i < 5; i++)
3058     for (int j = 0; j < 2; j++)
3059       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3060                             i + 1, 10 + j, 10 + j);
3061
3062   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3063   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3064
3065   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3066 }
3067
3068 /* Another test of string literal concatenation, this time combined with
3069    various kinds of escaped characters.  */
3070
3071 static void
3072 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3073 {
3074   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3075      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3076   const char *content
3077     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3078        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3079     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3080   lexer_test test (case_, content, NULL);
3081
3082   auto_vec <cpp_string> input_strings;
3083   location_t input_locs[4];
3084
3085   /* Verify that we get the expected tokens back.  */
3086   for (int i = 0; i < 4; i++)
3087     {
3088       const cpp_token *tok = test.get_token ();
3089       ASSERT_EQ (tok->type, CPP_STRING);
3090       input_strings.safe_push (tok->val.str);
3091       input_locs[i] = tok->src_loc;
3092     }
3093
3094   /* Verify that cpp_interpret_string works.  */
3095   cpp_string dst_string;
3096   const enum cpp_ttype type = CPP_STRING;
3097   bool result = cpp_interpret_string (test.m_parser,
3098                                       input_strings.address (), 4,
3099                                       &dst_string, type);
3100   ASSERT_TRUE (result);
3101   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3102   free (const_cast <unsigned char *> (dst_string.text));
3103
3104   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3105   test.m_concats.record_string_concatenation (4, input_locs);
3106
3107   location_t initial_loc = input_locs[0];
3108
3109   for (int i = 0; i <= 4; i++)
3110     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3111   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3112   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3113   for (int i = 7; i <= 9; i++)
3114     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3115
3116   /* NUL-terminator should use the location of the final closing quote.  */
3117   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3118
3119   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3120 }
3121
3122 /* Test of string literal in a macro.  */
3123
3124 static void
3125 test_lexer_string_locations_macro (const line_table_case &case_)
3126 {
3127   /* Digits 0-9.
3128      .....................0000000001111111111.22222222223.
3129      .....................1234567890123456789.01234567890.  */
3130   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3131                          "  MACRO");
3132   lexer_test test (case_, content, NULL);
3133
3134   /* Verify that we get the expected tokens back.  */
3135   const cpp_token *tok = test.get_token ();
3136   ASSERT_EQ (tok->type, CPP_PADDING);
3137
3138   tok = test.get_token ();
3139   ASSERT_EQ (tok->type, CPP_STRING);
3140   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3141
3142   /* Verify ranges of individual characters.  We ought to
3143      see columns within the macro definition.  */
3144   for (int i = 0; i <= 10; i++)
3145     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3146                           i, 1, 20 + i, 20 + i);
3147
3148   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3149
3150   tok = test.get_token ();
3151   ASSERT_EQ (tok->type, CPP_PADDING);
3152 }
3153
3154 /* Test of stringification of a macro argument.  */
3155
3156 static void
3157 test_lexer_string_locations_stringified_macro_argument
3158   (const line_table_case &case_)
3159 {
3160   /* .....................000000000111111111122222222223.
3161      .....................123456789012345678901234567890.  */
3162   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3163                          "MACRO(foo)\n");
3164   lexer_test test (case_, content, NULL);
3165
3166   /* Verify that we get the expected token back.  */
3167   const cpp_token *tok = test.get_token ();
3168   ASSERT_EQ (tok->type, CPP_PADDING);
3169
3170   tok = test.get_token ();
3171   ASSERT_EQ (tok->type, CPP_STRING);
3172   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3173
3174   /* We don't support getting the location of a stringified macro
3175      argument.  Verify that it fails gracefully.  */
3176   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3177                                   "cpp_interpret_string_1 failed");
3178
3179   tok = test.get_token ();
3180   ASSERT_EQ (tok->type, CPP_PADDING);
3181
3182   tok = test.get_token ();
3183   ASSERT_EQ (tok->type, CPP_PADDING);
3184 }
3185
3186 /* Ensure that we are fail gracefully if something attempts to pass
3187    in a location that isn't a string literal token.  Seen on this code:
3188
3189      const char a[] = " %d ";
3190      __builtin_printf (a, 0.5);
3191                        ^
3192
3193    when c-format.c erroneously used the indicated one-character
3194    location as the format string location, leading to a read past the
3195    end of a string buffer in cpp_interpret_string_1.  */
3196
3197 static void
3198 test_lexer_string_locations_non_string (const line_table_case &case_)
3199 {
3200   /* .....................000000000111111111122222222223.
3201      .....................123456789012345678901234567890.  */
3202   const char *content = ("         a\n");
3203   lexer_test test (case_, content, NULL);
3204
3205   /* Verify that we get the expected token back.  */
3206   const cpp_token *tok = test.get_token ();
3207   ASSERT_EQ (tok->type, CPP_NAME);
3208   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3209
3210   /* At this point, libcpp is attempting to interpret the name as a
3211      string literal, despite it not starting with a quote.  We don't detect
3212      that, but we should at least fail gracefully.  */
3213   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3214                                   "cpp_interpret_string_1 failed");
3215 }
3216
3217 /* Ensure that we can read substring information for a token which
3218    starts in one linemap and ends in another .  Adapted from
3219    gcc.dg/cpp/pr69985.c.  */
3220
3221 static void
3222 test_lexer_string_locations_long_line (const line_table_case &case_)
3223 {
3224   /* .....................000000.000111111111
3225      .....................123456.789012346789.  */
3226   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3227                          "     \"0123456789012345678901234567890123456789"
3228                          "0123456789012345678901234567890123456789"
3229                          "0123456789012345678901234567890123456789"
3230                          "0123456789\"\n");
3231
3232   lexer_test test (case_, content, NULL);
3233
3234   /* Verify that we get the expected token back.  */
3235   const cpp_token *tok = test.get_token ();
3236   ASSERT_EQ (tok->type, CPP_STRING);
3237
3238   if (!should_have_column_data_p (line_table->highest_location))
3239     return;
3240
3241   /* Verify ranges of individual characters.  */
3242   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3243   for (int i = 0; i < 131; i++)
3244     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3245                           i, 2, 7 + i, 7 + i);
3246 }
3247
3248 /* Test of locations within a raw string that doesn't contain a newline.  */
3249
3250 static void
3251 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3252 {
3253   /* .....................00.0000000111111111122.
3254      .....................12.3456789012345678901.  */
3255   const char *content = ("R\"foo(0123456789)foo\"\n");
3256   lexer_test test (case_, content, NULL);
3257
3258   /* Verify that we get the expected token back.  */
3259   const cpp_token *tok = test.get_token ();
3260   ASSERT_EQ (tok->type, CPP_STRING);
3261
3262   /* Verify that cpp_interpret_string works.  */
3263   cpp_string dst_string;
3264   const enum cpp_ttype type = CPP_STRING;
3265   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3266                                       &dst_string, type);
3267   ASSERT_TRUE (result);
3268   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3269   free (const_cast <unsigned char *> (dst_string.text));
3270
3271   if (!should_have_column_data_p (line_table->highest_location))
3272     return;
3273
3274   /* 0-9, plus the nil terminator.  */
3275   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3276   for (int i = 0; i < 11; i++)
3277     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3278                           i, 1, 7 + i, 7 + i);
3279 }
3280
3281 /* Test of locations within a raw string that contains a newline.  */
3282
3283 static void
3284 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3285 {
3286   /* .....................00.0000.
3287      .....................12.3456.  */
3288   const char *content = ("R\"foo(\n"
3289   /* .....................00000.
3290      .....................12345.  */
3291                          "hello\n"
3292                          "world\n"
3293   /* .....................00000.
3294      .....................12345.  */
3295                          ")foo\"\n");
3296   lexer_test test (case_, content, NULL);
3297
3298   /* Verify that we get the expected token back.  */
3299   const cpp_token *tok = test.get_token ();
3300   ASSERT_EQ (tok->type, CPP_STRING);
3301
3302   /* Verify that cpp_interpret_string works.  */
3303   cpp_string dst_string;
3304   const enum cpp_ttype type = CPP_STRING;
3305   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3306                                       &dst_string, type);
3307   ASSERT_TRUE (result);
3308   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3309   free (const_cast <unsigned char *> (dst_string.text));
3310
3311   if (!should_have_column_data_p (line_table->highest_location))
3312     return;
3313
3314   /* Currently we don't support locations within raw strings that
3315      contain newlines.  */
3316   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3317                                   "range endpoints are on different lines");
3318 }
3319
3320 /* Test of parsing an unterminated raw string.  */
3321
3322 static void
3323 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3324 {
3325   const char *content = "R\"ouch()ouCh\" /* etc */";
3326
3327   lexer_error_sink errors;
3328   lexer_test test (case_, content, &errors);
3329   test.m_implicitly_expect_EOF = false;
3330
3331   /* Attempt to parse the raw string.  */
3332   const cpp_token *tok = test.get_token ();
3333   ASSERT_EQ (tok->type, CPP_EOF);
3334
3335   ASSERT_EQ (1, errors.m_errors.length ());
3336   /* We expect the message "unterminated raw string"
3337      in the "cpplib" translation domain.
3338      It's not clear that dgettext is available on all supported hosts,
3339      so this assertion is commented-out for now.
3340        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3341                      errors.m_errors[0]);
3342   */
3343 }
3344
3345 /* Test of lexing char constants.  */
3346
3347 static void
3348 test_lexer_char_constants (const line_table_case &case_)
3349 {
3350   /* Various char constants.
3351      .....................0000000001111111111.22222222223.
3352      .....................1234567890123456789.01234567890.  */
3353   const char *content = ("         'a'\n"
3354                          "        u'a'\n"
3355                          "        U'a'\n"
3356                          "        L'a'\n"
3357                          "         'abc'\n");
3358   lexer_test test (case_, content, NULL);
3359
3360   /* Verify that we get the expected tokens back.  */
3361   /* 'a'.  */
3362   const cpp_token *tok = test.get_token ();
3363   ASSERT_EQ (tok->type, CPP_CHAR);
3364   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3365
3366   unsigned int chars_seen;
3367   int unsignedp;
3368   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3369                                           &chars_seen, &unsignedp);
3370   ASSERT_EQ (cc, 'a');
3371   ASSERT_EQ (chars_seen, 1);
3372
3373   /* u'a'.  */
3374   tok = test.get_token ();
3375   ASSERT_EQ (tok->type, CPP_CHAR16);
3376   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3377
3378   /* U'a'.  */
3379   tok = test.get_token ();
3380   ASSERT_EQ (tok->type, CPP_CHAR32);
3381   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3382
3383   /* L'a'.  */
3384   tok = test.get_token ();
3385   ASSERT_EQ (tok->type, CPP_WCHAR);
3386   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3387
3388   /* 'abc' (c-char-sequence).  */
3389   tok = test.get_token ();
3390   ASSERT_EQ (tok->type, CPP_CHAR);
3391   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3392 }
3393 /* A table of interesting location_t values, giving one axis of our test
3394    matrix.  */
3395
3396 static const location_t boundary_locations[] = {
3397   /* Zero means "don't override the default values for a new line_table".  */
3398   0,
3399
3400   /* An arbitrary non-zero value that isn't close to one of
3401      the boundary values below.  */
3402   0x10000,
3403
3404   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3405   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3406   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3407   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3408   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3409   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3410
3411   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3412   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3413   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3414   LINE_MAP_MAX_LOCATION_WITH_COLS,
3415   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3416   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3417 };
3418
3419 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3420
3421 void
3422 for_each_line_table_case (void (*testcase) (const line_table_case &))
3423 {
3424   /* As noted above in the description of struct line_table_case,
3425      we want to explore a test matrix of interesting line_table
3426      situations, running various selftests for each case within the
3427      matrix.  */
3428
3429   /* Run all tests with:
3430      (a) line_table->default_range_bits == 0, and
3431      (b) line_table->default_range_bits == 5.  */
3432   int num_cases_tested = 0;
3433   for (int default_range_bits = 0; default_range_bits <= 5;
3434        default_range_bits += 5)
3435     {
3436       /* ...and use each of the "interesting" location values as
3437          the starting location within line_table.  */
3438       const int num_boundary_locations
3439         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3440       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3441         {
3442           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3443
3444           testcase (c);
3445
3446           num_cases_tested++;
3447         }
3448     }
3449
3450   /* Verify that we fully covered the test matrix.  */
3451   ASSERT_EQ (num_cases_tested, 2 * 12);
3452 }
3453
3454 /* Run all of the selftests within this file.  */
3455
3456 void
3457 input_c_tests ()
3458 {
3459   test_should_have_column_data_p ();
3460   test_unknown_location ();
3461   test_builtins ();
3462   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3463
3464   for_each_line_table_case (test_accessing_ordinary_linemaps);
3465   for_each_line_table_case (test_lexer);
3466   for_each_line_table_case (test_lexer_string_locations_simple);
3467   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3468   for_each_line_table_case (test_lexer_string_locations_hex);
3469   for_each_line_table_case (test_lexer_string_locations_oct);
3470   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3471   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3472   for_each_line_table_case (test_lexer_string_locations_ucn4);
3473   for_each_line_table_case (test_lexer_string_locations_ucn8);
3474   for_each_line_table_case (test_lexer_string_locations_wide_string);
3475   for_each_line_table_case (test_lexer_string_locations_string16);
3476   for_each_line_table_case (test_lexer_string_locations_string32);
3477   for_each_line_table_case (test_lexer_string_locations_u8);
3478   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3479   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3480   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3481   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3482   for_each_line_table_case (test_lexer_string_locations_macro);
3483   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3484   for_each_line_table_case (test_lexer_string_locations_non_string);
3485   for_each_line_table_case (test_lexer_string_locations_long_line);
3486   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3487   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3488   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3489   for_each_line_table_case (test_lexer_char_constants);
3490
3491   test_reading_source_line ();
3492 }
3493
3494 } // namespace selftest
3495
3496 #endif /* CHECKING_P */