gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2017 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   /* The file_path is the key for identifying a particular file in
  67      the cache.
  68      For libcpp-using code, the underlying buffer for this field is
  69      owned by the corresponding _cpp_file within the cpp_reader.  */
  70   const char *file_path;
  71
  72   FILE *fp;
  73
  74   /* This points to the content of the file that we've read so
  75      far.  */
  76   char *data;
  77
  78   /*  The size of the DATA array above.*/
  79   size_t size;
  80
  81   /* The number of bytes read from the underlying file so far.  This
  82      must be less (or equal) than SIZE above.  */
  83   size_t nb_read;
  84
  85   /* The index of the beginning of the current line.  */
  86   size_t line_start_idx;
  87
  88   /* The number of the previous line read.  This starts at 1.  Zero
  89      means we've read no line so far.  */
  90   size_t line_num;
  91
  92   /* This is the total number of lines of the current file.  At the
  93      moment, we try to get this information from the line map
  94      subsystem.  Note that this is just a hint.  When using the C++
  95      front-end, this hint is correct because the input file is then
  96      completely tokenized before parsing starts; so the line map knows
  97      the number of lines before compilation really starts.  For e.g,
  98      the C front-end, it can happen that we start emitting diagnostics
  99      before the line map has seen the end of the file.  */
 100   size_t total_lines;
 101
 102   /* Could this file be missing a trailing newline on its final line?
 103      Initially true (to cope with empty files), set to true/false
 104      as each line is read.  */
 105   bool missing_trailing_newline;
 106
 107   /* This is a record of the beginning and end of the lines we've seen
 108      while reading the file.  This is useful to avoid walking the data
 109      from the beginning when we are asked to read a line that is
 110      before LINE_START_IDX above.  Note that the maximum size of this
 111      record is fcache_line_record_size, so that the memory consumption
 112      doesn't explode.  We thus scale total_lines down to
 113      fcache_line_record_size.  */
 114   vec<line_info, va_heap> line_record;
 115
 116   fcache ();
 117   ~fcache ();
 118 };
 119
 120 /* Current position in real source file.  */
 121
 122 location_t input_location = UNKNOWN_LOCATION;
 123
 124 struct line_maps *line_table;
 125
 126 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 127    This needs to be a global so that it can be a GC root, and thus
 128    prevent the stashed copy from being garbage-collected if the GC runs
 129    during a line_table_test.  */
 130
 131 struct line_maps *saved_line_table;
 132
 133 static fcache *fcache_tab;
 134 static const size_t fcache_tab_size = 16;
 135 static const size_t fcache_buffer_size = 4 * 1024;
 136 static const size_t fcache_line_record_size = 100;
 137
 138 /* Expand the source location LOC into a human readable location.  If
 139    LOC resolves to a builtin location, the file name of the readable
 140    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 141    TRUE and LOC is virtual, then it is resolved to the expansion
 142    point of the involved macro.  Otherwise, it is resolved to the
 143    spelling location of the token.
 144
 145    When resolving to the spelling location of the token, if the
 146    resulting location is for a built-in location (that is, it has no
 147    associated line/column) in the context of a macro expansion, the
 148    returned location is the first one (while unwinding the macro
 149    location towards its expansion point) that is in real source
 150    code.  */
 151
 152 static expanded_location
 153 expand_location_1 (source_location loc,
 154                    bool expansion_point_p)
 155 {
 156   expanded_location xloc;
 157   const line_map_ordinary *map;
 158   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 159   tree block = NULL;
 160
 161   if (IS_ADHOC_LOC (loc))
 162     {
 163       block = LOCATION_BLOCK (loc);
 164       loc = LOCATION_LOCUS (loc);
 165     }
 166
 167   memset (&xloc, 0, sizeof (xloc));
 168
 169   if (loc >= RESERVED_LOCATION_COUNT)
 170     {
 171       if (!expansion_point_p)
 172         {
 173           /* We want to resolve LOC to its spelling location.
 174
 175              But if that spelling location is a reserved location that
 176              appears in the context of a macro expansion (like for a
 177              location for a built-in token), let's consider the first
 178              location (toward the expansion point) that is not reserved;
 179              that is, the first location that is in real source code.  */
 180           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 181                                                           loc, NULL);
 182           lrk = LRK_SPELLING_LOCATION;
 183         }
 184       loc = linemap_resolve_location (line_table, loc,
 185                                       lrk, &map);
 186       xloc = linemap_expand_location (line_table, map, loc);
 187     }
 188
 189   xloc.data = block;
 190   if (loc <= BUILTINS_LOCATION)
 191     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 192
 193   return xloc;
 194 }
 195
 196 /* Initialize the set of cache used for files accessed by caret
 197    diagnostic.  */
 198
 199 static void
 200 diagnostic_file_cache_init (void)
 201 {
 202   if (fcache_tab == NULL)
 203     fcache_tab = new fcache[fcache_tab_size];
 204 }
 205
 206 /* Free the resources used by the set of cache used for files accessed
 207    by caret diagnostic.  */
 208
 209 void
 210 diagnostic_file_cache_fini (void)
 211 {
 212   if (fcache_tab)
 213     {
 214       delete [] (fcache_tab);
 215       fcache_tab = NULL;
 216     }
 217 }
 218
 219 /* Return the total lines number that have been read so far by the
 220    line map (in the preprocessor) so far.  For languages like C++ that
 221    entirely preprocess the input file before starting to parse, this
 222    equals the actual number of lines of the file.  */
 223
 224 static size_t
 225 total_lines_num (const char *file_path)
 226 {
 227   size_t r = 0;
 228   source_location l = 0;
 229   if (linemap_get_file_highest_location (line_table, file_path, &l))
 230     {
 231       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 232       expanded_location xloc = expand_location (l);
 233       r = xloc.line;
 234     }
 235   return r;
 236 }
 237
 238 /* Lookup the cache used for the content of a given file accessed by
 239    caret diagnostic.  Return the found cached file, or NULL if no
 240    cached file was found.  */
 241
 242 static fcache*
 243 lookup_file_in_cache_tab (const char *file_path)
 244 {
 245   if (file_path == NULL)
 246     return NULL;
 247
 248   diagnostic_file_cache_init ();
 249
 250   /* This will contain the found cached file.  */
 251   fcache *r = NULL;
 252   for (unsigned i = 0; i < fcache_tab_size; ++i)
 253     {
 254       fcache *c = &fcache_tab[i];
 255       if (c->file_path && !strcmp (c->file_path, file_path))
 256         {
 257           ++c->use_count;
 258           r = c;
 259         }
 260     }
 261
 262   if (r)
 263     ++r->use_count;
 264
 265   return r;
 266 }
 267
 268 /* Purge any mention of FILENAME from the cache of files used for
 269    printing source code.  For use in selftests when working
 270    with tempfiles.  */
 271
 272 void
 273 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 274 {
 275   gcc_assert (file_path);
 276
 277   fcache *r = lookup_file_in_cache_tab (file_path);
 278   if (!r)
 279     /* Not found.  */
 280     return;
 281
 282   r->file_path = NULL;
 283   if (r->fp)
 284     fclose (r->fp);
 285   r->fp = NULL;
 286   r->nb_read = 0;
 287   r->line_start_idx = 0;
 288   r->line_num = 0;
 289   r->line_record.truncate (0);
 290   r->use_count = 0;
 291   r->total_lines = 0;
 292   r->missing_trailing_newline = true;
 293 }
 294
 295 /* Return the file cache that has been less used, recently, or the
 296    first empty one.  If HIGHEST_USE_COUNT is non-null,
 297    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 298    in the cache table.  */
 299
 300 static fcache*
 301 evicted_cache_tab_entry (unsigned *highest_use_count)
 302 {
 303   diagnostic_file_cache_init ();
 304
 305   fcache *to_evict = &fcache_tab[0];
 306   unsigned huc = to_evict->use_count;
 307   for (unsigned i = 1; i < fcache_tab_size; ++i)
 308     {
 309       fcache *c = &fcache_tab[i];
 310       bool c_is_empty = (c->file_path == NULL);
 311
 312       if (c->use_count < to_evict->use_count
 313           || (to_evict->file_path && c_is_empty))
 314         /* We evict C because it's either an entry with a lower use
 315            count or one that is empty.  */
 316         to_evict = c;
 317
 318       if (huc < c->use_count)
 319         huc = c->use_count;
 320
 321       if (c_is_empty)
 322         /* We've reached the end of the cache; subsequent elements are
 323            all empty.  */
 324         break;
 325     }
 326
 327   if (highest_use_count)
 328     *highest_use_count = huc;
 329
 330   return to_evict;
 331 }
 332
 333 /* Create the cache used for the content of a given file to be
 334    accessed by caret diagnostic.  This cache is added to an array of
 335    cache and can be retrieved by lookup_file_in_cache_tab.  This
 336    function returns the created cache.  Note that only the last
 337    fcache_tab_size files are cached.  */
 338
 339 static fcache*
 340 add_file_to_cache_tab (const char *file_path)
 341 {
 342
 343   FILE *fp = fopen (file_path, "r");
 344   if (fp == NULL)
 345     return NULL;
 346
 347   unsigned highest_use_count = 0;
 348   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 349   r->file_path = file_path;
 350   if (r->fp)
 351     fclose (r->fp);
 352   r->fp = fp;
 353   r->nb_read = 0;
 354   r->line_start_idx = 0;
 355   r->line_num = 0;
 356   r->line_record.truncate (0);
 357   /* Ensure that this cache entry doesn't get evicted next time
 358      add_file_to_cache_tab is called.  */
 359   r->use_count = ++highest_use_count;
 360   r->total_lines = total_lines_num (file_path);
 361   r->missing_trailing_newline = true;
 362
 363   return r;
 364 }
 365
 366 /* Lookup the cache used for the content of a given file accessed by
 367    caret diagnostic.  If no cached file was found, create a new cache
 368    for this file, add it to the array of cached file and return
 369    it.  */
 370
 371 static fcache*
 372 lookup_or_add_file_to_cache_tab (const char *file_path)
 373 {
 374   fcache *r = lookup_file_in_cache_tab (file_path);
 375   if (r == NULL)
 376     r = add_file_to_cache_tab (file_path);
 377   return r;
 378 }
 379
 380 /* Default constructor for a cache of file used by caret
 381    diagnostic.  */
 382
 383 fcache::fcache ()
 384 : use_count (0), file_path (NULL), fp (NULL), data (0),
 385   size (0), nb_read (0), line_start_idx (0), line_num (0),
 386   total_lines (0), missing_trailing_newline (true)
 387 {
 388   line_record.create (0);
 389 }
 390
 391 /* Destructor for a cache of file used by caret diagnostic.  */
 392
 393 fcache::~fcache ()
 394 {
 395   if (fp)
 396     {
 397       fclose (fp);
 398       fp = NULL;
 399     }
 400   if (data)
 401     {
 402       XDELETEVEC (data);
 403       data = 0;
 404     }
 405   line_record.release ();
 406 }
 407
 408 /* Returns TRUE iff the cache would need to be filled with data coming
 409    from the file.  That is, either the cache is empty or full or the
 410    current line is empty.  Note that if the cache is full, it would
 411    need to be extended and filled again.  */
 412
 413 static bool
 414 needs_read (fcache *c)
 415 {
 416   return (c->nb_read == 0
 417           || c->nb_read == c->size
 418           || (c->line_start_idx >= c->nb_read - 1));
 419 }
 420
 421 /*  Return TRUE iff the cache is full and thus needs to be
 422     extended.  */
 423
 424 static bool
 425 needs_grow (fcache *c)
 426 {
 427   return c->nb_read == c->size;
 428 }
 429
 430 /* Grow the cache if it needs to be extended.  */
 431
 432 static void
 433 maybe_grow (fcache *c)
 434 {
 435   if (!needs_grow (c))
 436     return;
 437
 438   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 439   c->data = XRESIZEVEC (char, c->data, size);
 440   c->size = size;
 441 }
 442
 443 /*  Read more data into the cache.  Extends the cache if need be.
 444     Returns TRUE iff new data could be read.  */
 445
 446 static bool
 447 read_data (fcache *c)
 448 {
 449   if (feof (c->fp) || ferror (c->fp))
 450     return false;
 451
 452   maybe_grow (c);
 453
 454   char * from = c->data + c->nb_read;
 455   size_t to_read = c->size - c->nb_read;
 456   size_t nb_read = fread (from, 1, to_read, c->fp);
 457
 458   if (ferror (c->fp))
 459     return false;
 460
 461   c->nb_read += nb_read;
 462   return !!nb_read;
 463 }
 464
 465 /* Read new data iff the cache needs to be filled with more data
 466    coming from the file FP.  Return TRUE iff the cache was filled with
 467    mode data.  */
 468
 469 static bool
 470 maybe_read_data (fcache *c)
 471 {
 472   if (!needs_read (c))
 473     return false;
 474   return read_data (c);
 475 }
 476
 477 /* Read a new line from file FP, using C as a cache for the data
 478    coming from the file.  Upon successful completion, *LINE is set to
 479    the beginning of the line found.  *LINE points directly in the
 480    line cache and is only valid until the next call of get_next_line.
 481    *LINE_LEN is set to the length of the line.  Note that the line
 482    does not contain any terminal delimiter.  This function returns
 483    true if some data was read or process from the cache, false
 484    otherwise.  Note that subsequent calls to get_next_line might
 485    make the content of *LINE invalid.  */
 486
 487 static bool
 488 get_next_line (fcache *c, char **line, ssize_t *line_len)
 489 {
 490   /* Fill the cache with data to process.  */
 491   maybe_read_data (c);
 492
 493   size_t remaining_size = c->nb_read - c->line_start_idx;
 494   if (remaining_size == 0)
 495     /* There is no more data to process.  */
 496     return false;
 497
 498   char *line_start = c->data + c->line_start_idx;
 499
 500   char *next_line_start = NULL;
 501   size_t len = 0;
 502   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 503   if (line_end == NULL)
 504     {
 505       /* We haven't found the end-of-line delimiter in the cache.
 506          Fill the cache with more data from the file and look for the
 507          '\n'.  */
 508       while (maybe_read_data (c))
 509         {
 510           line_start = c->data + c->line_start_idx;
 511           remaining_size = c->nb_read - c->line_start_idx;
 512           line_end = (char *) memchr (line_start, '\n', remaining_size);
 513           if (line_end != NULL)
 514             {
 515               next_line_start = line_end + 1;
 516               break;
 517             }
 518         }
 519       if (line_end == NULL)
 520         {
 521           /* We've loadded all the file into the cache and still no
 522              '\n'.  Let's say the line ends up at one byte passed the
 523              end of the file.  This is to stay consistent with the case
 524              of when the line ends up with a '\n' and line_end points to
 525              that terminal '\n'.  That consistency is useful below in
 526              the len calculation.  */
 527           line_end = c->data + c->nb_read ;
 528           c->missing_trailing_newline = true;
 529         }
 530       else
 531         c->missing_trailing_newline = false;
 532     }
 533   else
 534     {
 535       next_line_start = line_end + 1;
 536       c->missing_trailing_newline = false;
 537     }
 538
 539   if (ferror (c->fp))
 540     return false;
 541
 542   /* At this point, we've found the end of the of line.  It either
 543      points to the '\n' or to one byte after the last byte of the
 544      file.  */
 545   gcc_assert (line_end != NULL);
 546
 547   len = line_end - line_start;
 548
 549   if (c->line_start_idx < c->nb_read)
 550     *line = line_start;
 551
 552   ++c->line_num;
 553
 554   /* Before we update our line record, make sure the hint about the
 555      total number of lines of the file is correct.  If it's not, then
 556      we give up recording line boundaries from now on.  */
 557   bool update_line_record = true;
 558   if (c->line_num > c->total_lines)
 559     update_line_record = false;
 560
 561     /* Now update our line record so that re-reading lines from the
 562      before c->line_start_idx is faster.  */
 563   if (update_line_record
 564       && c->line_record.length () < fcache_line_record_size)
 565     {
 566       /* If the file lines fits in the line record, we just record all
 567          its lines ...*/
 568       if (c->total_lines <= fcache_line_record_size
 569           && c->line_num > c->line_record.length ())
 570         c->line_record.safe_push (fcache::line_info (c->line_num,
 571                                                  c->line_start_idx,
 572                                                  line_end - c->data));
 573       else if (c->total_lines > fcache_line_record_size)
 574         {
 575           /* ... otherwise, we just scale total_lines down to
 576              (fcache_line_record_size lines.  */
 577           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 578           if (c->line_record.length () == 0
 579               || n >= c->line_record.length ())
 580             c->line_record.safe_push (fcache::line_info (c->line_num,
 581                                                      c->line_start_idx,
 582                                                      line_end - c->data));
 583         }
 584     }
 585
 586   /* Update c->line_start_idx so that it points to the next line to be
 587      read.  */
 588   if (next_line_start)
 589     c->line_start_idx = next_line_start - c->data;
 590   else
 591     /* We didn't find any terminal '\n'.  Let's consider that the end
 592        of line is the end of the data in the cache.  The next
 593        invocation of get_next_line will either read more data from the
 594        underlying file or return false early because we've reached the
 595        end of the file.  */
 596     c->line_start_idx = c->nb_read;
 597
 598   *line_len = len;
 599
 600   return true;
 601 }
 602
 603 /* Consume the next bytes coming from the cache (or from its
 604    underlying file if there are remaining unread bytes in the file)
 605    until we reach the next end-of-line (or end-of-file).  There is no
 606    copying from the cache involved.  Return TRUE upon successful
 607    completion.  */
 608
 609 static bool
 610 goto_next_line (fcache *cache)
 611 {
 612   char *l;
 613   ssize_t len;
 614
 615   return get_next_line (cache, &l, &len);
 616 }
 617
 618 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 619    If the line was read successfully, *LINE points to the beginning
 620    of the line in the file cache and *LINE_LEN is the length of the
 621    line.  *LINE is not nul-terminated, but may contain zero bytes.
 622    *LINE is only valid until the next call of read_line_num.
 623    This function returns bool if a line was read.  */
 624
 625 static bool
 626 read_line_num (fcache *c, size_t line_num,
 627                char **line, ssize_t *line_len)
 628 {
 629   gcc_assert (line_num > 0);
 630
 631   if (line_num <= c->line_num)
 632     {
 633       /* We've been asked to read lines that are before c->line_num.
 634          So lets use our line record (if it's not empty) to try to
 635          avoid re-reading the file from the beginning again.  */
 636
 637       if (c->line_record.is_empty ())
 638         {
 639           c->line_start_idx = 0;
 640           c->line_num = 0;
 641         }
 642       else
 643         {
 644           fcache::line_info *i = NULL;
 645           if (c->total_lines <= fcache_line_record_size)
 646             {
 647               /* In languages where the input file is not totally
 648                  preprocessed up front, the c->total_lines hint
 649                  can be smaller than the number of lines of the
 650                  file.  In that case, only the first
 651                  c->total_lines have been recorded.
 652
 653                  Otherwise, the first c->total_lines we've read have
 654                  their start/end recorded here.  */
 655               i = (line_num <= c->total_lines)
 656                 ? &c->line_record[line_num - 1]
 657                 : &c->line_record[c->total_lines - 1];
 658               gcc_assert (i->line_num <= line_num);
 659             }
 660           else
 661             {
 662               /*  So the file had more lines than our line record
 663                   size.  Thus the number of lines we've recorded has
 664                   been scaled down to fcache_line_reacord_size.  Let's
 665                   pick the start/end of the recorded line that is
 666                   closest to line_num.  */
 667               size_t n = (line_num <= c->total_lines)
 668                 ? line_num * fcache_line_record_size / c->total_lines
 669                 : c ->line_record.length () - 1;
 670               if (n < c->line_record.length ())
 671                 {
 672                   i = &c->line_record[n];
 673                   gcc_assert (i->line_num <= line_num);
 674                 }
 675             }
 676
 677           if (i && i->line_num == line_num)
 678             {
 679               /* We have the start/end of the line.  */
 680               *line = c->data + i->start_pos;
 681               *line_len = i->end_pos - i->start_pos;
 682               return true;
 683             }
 684
 685           if (i)
 686             {
 687               c->line_start_idx = i->start_pos;
 688               c->line_num = i->line_num - 1;
 689             }
 690           else
 691             {
 692               c->line_start_idx = 0;
 693               c->line_num = 0;
 694             }
 695         }
 696     }
 697
 698   /*  Let's walk from line c->line_num up to line_num - 1, without
 699       copying any line.  */
 700   while (c->line_num < line_num - 1)
 701     if (!goto_next_line (c))
 702       return false;
 703
 704   /* The line we want is the next one.  Let's read and copy it back to
 705      the caller.  */
 706   return get_next_line (c, line, line_len);
 707 }
 708
 709 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 710    The line is not nul-terminated.  The returned pointer is only
 711    valid until the next call of location_get_source_line.
 712    Note that the line can contain several null characters,
 713    so LINE_LEN, if non-null, points to the actual length of the line.
 714    If the function fails, NULL is returned.  */
 715
 716 const char *
 717 location_get_source_line (const char *file_path, int line,
 718                           int *line_len)
 719 {
 720   char *buffer = NULL;
 721   ssize_t len;
 722
 723   if (line == 0)
 724     return NULL;
 725
 726   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 727   if (c == NULL)
 728     return NULL;
 729
 730   bool read = read_line_num (c, line, &buffer, &len);
 731
 732   if (read && line_len)
 733     *line_len = len;
 734
 735   return read ? buffer : NULL;
 736 }
 737
 738 /* Determine if FILE_PATH missing a trailing newline on its final line.
 739    Only valid to call once all of the file has been loaded, by
 740    requesting a line number beyond the end of the file.  */
 741
 742 bool
 743 location_missing_trailing_newline (const char *file_path)
 744 {
 745   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 746   if (c == NULL)
 747     return false;
 748
 749   return c->missing_trailing_newline;
 750 }
 751
 752 /* Test if the location originates from the spelling location of a
 753    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 754    virtual) location of a built-in token that appears in the expansion
 755    list of a macro.  Please note that this function also works on
 756    tokens that result from built-in tokens.  For instance, the
 757    function would return true if passed a token "4" that is the result
 758    of the expansion of the built-in __LINE__ macro.  */
 759 bool
 760 is_location_from_builtin_token (source_location loc)
 761 {
 762   const line_map_ordinary *map = NULL;
 763   loc = linemap_resolve_location (line_table, loc,
 764                                   LRK_SPELLING_LOCATION, &map);
 765   return loc == BUILTINS_LOCATION;
 766 }
 767
 768 /* Expand the source location LOC into a human readable location.  If
 769    LOC is virtual, it resolves to the expansion point of the involved
 770    macro.  If LOC resolves to a builtin location, the file name of the
 771    readable location is set to the string "<built-in>".  */
 772
 773 expanded_location
 774 expand_location (source_location loc)
 775 {
 776   return expand_location_1 (loc, /*expansion_point_p=*/true);
 777 }
 778
 779 /* Expand the source location LOC into a human readable location.  If
 780    LOC is virtual, it resolves to the expansion location of the
 781    relevant macro.  If LOC resolves to a builtin location, the file
 782    name of the readable location is set to the string
 783    "<built-in>".  */
 784
 785 expanded_location
 786 expand_location_to_spelling_point (source_location loc)
 787 {
 788   return expand_location_1 (loc, /*expansion_point_p=*/false);
 789 }
 790
 791 /* The rich_location class within libcpp requires a way to expand
 792    source_location instances, and relies on the client code
 793    providing a symbol named
 794      linemap_client_expand_location_to_spelling_point
 795    to do this.
 796
 797    This is the implementation for libcommon.a (all host binaries),
 798    which simply calls into expand_location_to_spelling_point.  */
 799
 800 expanded_location
 801 linemap_client_expand_location_to_spelling_point (source_location loc)
 802 {
 803   return expand_location_to_spelling_point (loc);
 804 }
 805
 806
 807 /* If LOCATION is in a system header and if it is a virtual location for
 808    a token coming from the expansion of a macro, unwind it to the
 809    location of the expansion point of the macro.  Otherwise, just return
 810    LOCATION.
 811
 812    This is used for instance when we want to emit diagnostics about a
 813    token that may be located in a macro that is itself defined in a
 814    system header, for example, for the NULL macro.  In such a case, if
 815    LOCATION were passed directly to diagnostic functions such as
 816    warning_at, the diagnostic would be suppressed (unless
 817    -Wsystem-headers).  */
 818
 819 source_location
 820 expansion_point_location_if_in_system_header (source_location location)
 821 {
 822   if (in_system_header_at (location))
 823     location = linemap_resolve_location (line_table, location,
 824                                          LRK_MACRO_EXPANSION_POINT,
 825                                          NULL);
 826   return location;
 827 }
 828
 829 /* If LOCATION is a virtual location for a token coming from the expansion
 830    of a macro, unwind to the location of the expansion point of the macro.  */
 831
 832 source_location
 833 expansion_point_location (source_location location)
 834 {
 835   return linemap_resolve_location (line_table, location,
 836                                    LRK_MACRO_EXPANSION_POINT, NULL);
 837 }
 838
 839 /* Construct a location with caret at CARET, ranging from START to
 840    finish e.g.
 841
 842                  11111111112
 843         12345678901234567890
 844      522
 845      523   return foo + bar;
 846                   ~~~~^~~~~
 847      524
 848
 849    The location's caret is at the "+", line 523 column 15, but starts
 850    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 851    of "bar" at column 19.  */
 852
 853 location_t
 854 make_location (location_t caret, location_t start, location_t finish)
 855 {
 856   location_t pure_loc = get_pure_location (caret);
 857   source_range src_range;
 858   src_range.m_start = get_start (start);
 859   src_range.m_finish = get_finish (finish);
 860   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 861                                                    pure_loc,
 862                                                    src_range,
 863                                                    NULL);
 864   return combined_loc;
 865 }
 866
/* Binary kilo and mega multipliers used when scaling amounts below.  */
#define ONE_K 1024
#define ONE_M (ONE_K * ONE_K)

/* Scale a number for display:
   - values below 10 K (in base 2) are left unchanged;
   - values of at least 10 K but below 10 M are divided by 1024;
   - values of at least 10 M are divided by 1024 * 1024.
   The result is cast to unsigned long.  Note that X is evaluated more
   than once, so it must not have side effects.
 */
#define SCALE(x) ((unsigned long) ((x) < 10 * ONE_K \
                  ? (x) \
                  : ((x) < 10 * ONE_M \
                     ? (x) / ONE_K \
                     : (x) / ONE_M)))

/* For a given integer, yield the unit character matching SCALE:
   - the character ' ' if the number is strictly lower than 10 K
     (in base 2)
   - the character 'k' if the number is at least 10 K but strictly
     lower than 10 M (in base 2)
   - the character 'M' if the number is at least 10 M (in base 2).
   X is evaluated more than once.  */
#define STAT_LABEL(x) ((x) < 10 * ONE_K ? ' ' : ((x) < 10 * ONE_M ? 'k' : 'M'))

/* Display an integer amount as multiple of 1K or 1M (in base 2).
   Display the correct unit (either k, M, or ' ') after the amount, as
   well.  This expands to two comma-separated arguments, the scaled
   amount followed by the unit character, intended to be passed to a
   printf-style function.  */
#define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 891
 892 /* Dump statistics to stderr about the memory usage of the line_table
 893    set of line maps.  This also displays some statistics about macro
 894    expansion.  */
 895
 896 void
 897 dump_line_table_statistics (void)
 898 {
 899   struct linemap_stats s;
 900   long total_used_map_size,
 901     macro_maps_size,
 902     total_allocated_map_size;
 903
 904   memset (&s, 0, sizeof (s));
 905
 906   linemap_get_statistics (line_table, &s);
 907
 908   macro_maps_size = s.macro_maps_used_size
 909     + s.macro_maps_locations_size;
 910
 911   total_allocated_map_size = s.ordinary_maps_allocated_size
 912     + s.macro_maps_allocated_size
 913     + s.macro_maps_locations_size;
 914
 915   total_used_map_size = s.ordinary_maps_used_size
 916     + s.macro_maps_used_size
 917     + s.macro_maps_locations_size;
 918
 919   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 920            s.num_expanded_macros);
 921   if (s.num_expanded_macros != 0)
 922     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 923              s.num_macro_tokens / s.num_expanded_macros);
 924   fprintf (stderr,
 925            "\nLine Table allocations during the "
 926            "compilation process\n");
 927   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 928            SCALE (s.num_ordinary_maps_used),
 929            STAT_LABEL (s.num_ordinary_maps_used));
 930   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 931            SCALE (s.ordinary_maps_used_size),
 932            STAT_LABEL (s.ordinary_maps_used_size));
 933   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 934            SCALE (s.num_ordinary_maps_allocated),
 935            STAT_LABEL (s.num_ordinary_maps_allocated));
 936   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 937            SCALE (s.ordinary_maps_allocated_size),
 938            STAT_LABEL (s.ordinary_maps_allocated_size));
 939   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 940            SCALE (s.num_macro_maps_used),
 941            STAT_LABEL (s.num_macro_maps_used));
 942   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 943            SCALE (s.macro_maps_used_size),
 944            STAT_LABEL (s.macro_maps_used_size));
 945   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 946            SCALE (s.macro_maps_locations_size),
 947            STAT_LABEL (s.macro_maps_locations_size));
 948   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 949            SCALE (macro_maps_size),
 950            STAT_LABEL (macro_maps_size));
 951   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 952            SCALE (s.duplicated_macro_maps_locations_size),
 953            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 954   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 955            SCALE (total_allocated_map_size),
 956            STAT_LABEL (total_allocated_map_size));
 957   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 958            SCALE (total_used_map_size),
 959            STAT_LABEL (total_used_map_size));
 960   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 961            SCALE (s.adhoc_table_size),
 962            STAT_LABEL (s.adhoc_table_size));
 963   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 964            s.adhoc_table_entries_used);
 965   fprintf (stderr, "optimized_ranges: %i\n",
 966            line_table->num_optimized_ranges);
 967   fprintf (stderr, "unoptimized_ranges: %i\n",
 968            line_table->num_unoptimized_ranges);
 969
 970   fprintf (stderr, "\n");
 971 }
 972
 973 /* Get location one beyond the final location in ordinary map IDX.  */
 974
 975 static source_location
 976 get_end_location (struct line_maps *set, unsigned int idx)
 977 {
 978   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
 979     return set->highest_location;
 980
 981   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
 982   return MAP_START_LOCATION (next_map);
 983 }
 984
 985 /* Helper function for write_digit_row.  */
 986
 987 static void
 988 write_digit (FILE *stream, int digit)
 989 {
 990   fputc ('0' + (digit % 10), stream);
 991 }
 992
 993 /* Helper function for dump_location_info.
 994    Write a row of numbers to STREAM, numbering a source line,
 995    giving the units, tens, hundreds etc of the column number.  */
 996
 997 static void
 998 write_digit_row (FILE *stream, int indent,
 999                  const line_map_ordinary *map,
1000                  source_location loc, int max_col, int divisor)
1001 {
1002   fprintf (stream, "%*c", indent, ' ');
1003   fprintf (stream, "|");
1004   for (int column = 1; column < max_col; column++)
1005     {
1006       source_location column_loc = loc + (column << map->m_range_bits);
1007       write_digit (stream, column_loc / divisor);
1008     }
1009   fprintf (stream, "\n");
1010 }
1011
1012 /* Write a half-closed (START) / half-open (END) interval of
1013    source_location to STREAM.  */
1014
1015 static void
1016 dump_location_range (FILE *stream,
1017                      source_location start, source_location end)
1018 {
1019   fprintf (stream,
1020            "  source_location interval: %u <= loc < %u\n",
1021            start, end);
1022 }
1023
1024 /* Write a labelled description of a half-closed (START) / half-open (END)
1025    interval of source_location to STREAM.  */
1026
1027 static void
1028 dump_labelled_location_range (FILE *stream,
1029                               const char *name,
1030                               source_location start, source_location end)
1031 {
1032   fprintf (stream, "%s\n", name);
1033   dump_location_range (stream, start, end);
1034   fprintf (stream, "\n");
1035 }
1036
1037 /* Write a visualization of the locations in the line_table to STREAM.  */
1038
1039 void
1040 dump_location_info (FILE *stream)
1041 {
1042   /* Visualize the reserved locations.  */
1043   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1044                                 0, RESERVED_LOCATION_COUNT);
1045
1046   /* Visualize the ordinary line_map instances, rendering the sources. */
1047   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1048     {
1049       source_location end_location = get_end_location (line_table, idx);
1050       /* half-closed: doesn't include this one. */
1051
1052       const line_map_ordinary *map
1053         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1054       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1055       dump_location_range (stream,
1056                            MAP_START_LOCATION (map), end_location);
1057       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1058       fprintf (stream, "  starting at line: %i\n",
1059                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1060       fprintf (stream, "  column and range bits: %i\n",
1061                map->m_column_and_range_bits);
1062       fprintf (stream, "  column bits: %i\n",
1063                map->m_column_and_range_bits - map->m_range_bits);
1064       fprintf (stream, "  range bits: %i\n",
1065                map->m_range_bits);
1066
1067       /* Render the span of source lines that this "map" covers.  */
1068       for (source_location loc = MAP_START_LOCATION (map);
1069            loc < end_location;
1070            loc += (1 << map->m_range_bits) )
1071         {
1072           gcc_assert (pure_location_p (line_table, loc) );
1073
1074           expanded_location exploc
1075             = linemap_expand_location (line_table, map, loc);
1076
1077           if (0 == exploc.column)
1078             {
1079               /* Beginning of a new source line: draw the line.  */
1080
1081               int line_size;
1082               const char *line_text = location_get_source_line (exploc.file,
1083                                                                 exploc.line,
1084                                                                 &line_size);
1085               if (!line_text)
1086                 break;
1087               fprintf (stream,
1088                        "%s:%3i|loc:%5i|%.*s\n",
1089                        exploc.file, exploc.line,
1090                        loc,
1091                        line_size, line_text);
1092
1093               /* "loc" is at column 0, which means "the whole line".
1094                  Render the locations *within* the line, by underlining
1095                  it, showing the source_location numeric values
1096                  at each column.  */
1097               int max_col = (1 << map->m_column_and_range_bits) - 1;
1098               if (max_col > line_size)
1099                 max_col = line_size + 1;
1100
1101               int indent = 14 + strlen (exploc.file);
1102
1103               /* Thousands.  */
1104               if (end_location > 999)
1105                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1106
1107               /* Hundreds.  */
1108               if (end_location > 99)
1109                 write_digit_row (stream, indent, map, loc, max_col, 100);
1110
1111               /* Tens.  */
1112               write_digit_row (stream, indent, map, loc, max_col, 10);
1113
1114               /* Units.  */
1115               write_digit_row (stream, indent, map, loc, max_col, 1);
1116             }
1117         }
1118       fprintf (stream, "\n");
1119     }
1120
1121   /* Visualize unallocated values.  */
1122   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1123                                 line_table->highest_location,
1124                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1125
1126   /* Visualize the macro line_map instances, rendering the sources. */
1127   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1128     {
1129       /* Each macro map that is allocated owns source_location values
1130          that are *lower* that the one before them.
1131          Hence it's meaningful to view them either in order of ascending
1132          source locations, or in order of ascending macro map index.  */
1133       const bool ascending_source_locations = true;
1134       unsigned int idx = (ascending_source_locations
1135                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1136                           : i);
1137       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1138       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1139                idx,
1140                linemap_map_get_macro_name (map),
1141                MACRO_MAP_NUM_MACRO_TOKENS (map));
1142       dump_location_range (stream,
1143                            map->start_location,
1144                            (map->start_location
1145                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1146       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1147               "expansion point is location %i",
1148               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1149       fprintf (stream, "  map->start_location: %u\n",
1150                map->start_location);
1151
1152       fprintf (stream, "  macro_locations:\n");
1153       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1154         {
1155           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1156           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1157
1158           /* linemap_add_macro_token encodes token numbers in an expansion
1159              by putting them after MAP_START_LOCATION. */
1160
1161           /* I'm typically seeing 4 uninitialized entries at the end of
1162              0xafafafaf.
1163              This appears to be due to macro.c:replace_args
1164              adding 2 extra args for padding tokens; presumably there may
1165              be a leading and/or trailing padding token injected,
1166              each for 2 more location slots.
1167              This would explain there being up to 4 source_locations slots
1168              that may be uninitialized.  */
1169
1170           fprintf (stream, "    %u: %u, %u\n",
1171                    i,
1172                    x,
1173                    y);
1174           if (x == y)
1175             {
1176               if (x < MAP_START_LOCATION (map))
1177                 inform (x, "token %u has x-location == y-location == %u", i, x);
1178               else
1179                 fprintf (stream,
1180                          "x-location == y-location == %u encodes token # %u\n",
1181                          x, x - MAP_START_LOCATION (map));
1182                 }
1183           else
1184             {
1185               inform (x, "token %u has x-location == %u", i, x);
1186               inform (x, "token %u has y-location == %u", i, y);
1187             }
1188         }
1189       fprintf (stream, "\n");
1190     }
1191
1192   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1193      macro map, presumably due to an off-by-one error somewhere
1194      between the logic in linemap_enter_macro and
1195      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1196   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1197                                 MAX_SOURCE_LOCATION,
1198                                 MAX_SOURCE_LOCATION + 1);
1199
1200   /* Visualize ad-hoc values.  */
1201   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1202                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1203 }
1204
1205 /* string_concat's constructor.  */
1206
1207 string_concat::string_concat (int num, location_t *locs)
1208   : m_num (num)
1209 {
1210   m_locs = ggc_vec_alloc <location_t> (num);
1211   for (int i = 0; i < num; i++)
1212     m_locs[i] = locs[i];
1213 }
1214
1215 /* string_concat_db's constructor.  */
1216
1217 string_concat_db::string_concat_db ()
1218 {
1219   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1220 }
1221
1222 /* Record that a string concatenation occurred, covering NUM
1223    string literal tokens.  LOCS is an array of size NUM, containing the
1224    locations of the tokens.  A copy of LOCS is taken.  */
1225
1226 void
1227 string_concat_db::record_string_concatenation (int num, location_t *locs)
1228 {
1229   gcc_assert (num > 1);
1230   gcc_assert (locs);
1231
1232   location_t key_loc = get_key_loc (locs[0]);
1233
1234   string_concat *concat
1235     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1236   m_table->put (key_loc, concat);
1237 }
1238
1239 /* Determine if LOC was the location of the the initial token of a
1240    concatenation of string literal tokens.
1241    If so, *OUT_NUM is written to with the number of tokens, and
1242    *OUT_LOCS with the location of an array of locations of the
1243    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1244    storage owned by the string_concat_db.
1245    Otherwise, return false.  */
1246
1247 bool
1248 string_concat_db::get_string_concatenation (location_t loc,
1249                                             int *out_num,
1250                                             location_t **out_locs)
1251 {
1252   gcc_assert (out_num);
1253   gcc_assert (out_locs);
1254
1255   location_t key_loc = get_key_loc (loc);
1256
1257   string_concat **concat = m_table->get (key_loc);
1258   if (!concat)
1259     return false;
1260
1261   *out_num = (*concat)->m_num;
1262   *out_locs =(*concat)->m_locs;
1263   return true;
1264 }
1265
1266 /* Internal function.  Canonicalize LOC into a form suitable for
1267    use as a key within the database, stripping away macro expansion,
1268    ad-hoc information, and range information, using the location of
1269    the start of LOC within an ordinary linemap.  */
1270
1271 location_t
1272 string_concat_db::get_key_loc (location_t loc)
1273 {
1274   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1275                                   NULL);
1276
1277   loc = get_range_from_loc (line_table, loc).m_start;
1278
1279   return loc;
1280 }
1281
1282 /* Helper class for use within get_substring_ranges_for_loc.
1283    An vec of cpp_string with responsibility for releasing all of the
1284    str->text for each str in the vector.  */
1285
1286 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1287 {
1288  public:
1289   auto_cpp_string_vec (int alloc)
1290     : auto_vec <cpp_string> (alloc) {}
1291
1292   ~auto_cpp_string_vec ()
1293   {
1294     /* Clean up the copies within this vec.  */
1295     int i;
1296     cpp_string *str;
1297     FOR_EACH_VEC_ELT (*this, i, str)
1298       free (const_cast <unsigned char *> (str->text));
1299   }
1300 };
1301
1302 /* Attempt to populate RANGES with source location information on the
1303    individual characters within the string literal found at STRLOC.
1304    If CONCATS is non-NULL, then any string literals that the token at
1305    STRLOC  was concatenated with are also added to RANGES.
1306
1307    Return NULL if successful, or an error message if any errors occurred (in
1308    which case RANGES may be only partially populated and should not
1309    be used).
1310
1311    This is implemented by re-parsing the relevant source line(s).  */
1312
1313 static const char *
1314 get_substring_ranges_for_loc (cpp_reader *pfile,
1315                               string_concat_db *concats,
1316                               location_t strloc,
1317                               enum cpp_ttype type,
1318                               cpp_substring_ranges &ranges)
1319 {
1320   gcc_assert (pfile);
1321
1322   if (strloc == UNKNOWN_LOCATION)
1323     return "unknown location";
1324
1325   /* Reparsing the strings requires accurate location information.
1326      If -ftrack-macro-expansion has been overridden from its default
1327      of 2, then we might have a location of a macro expansion point,
1328      rather than the location of the literal itself.
1329      Avoid this by requiring that we have full macro expansion tracking
1330      for substring locations to be available.  */
1331   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1332     return "track_macro_expansion != 2";
1333
1334   /* If #line or # 44 "file"-style directives are present, then there's
1335      no guarantee that the line numbers we have can be used to locate
1336      the strings.  For example, we might have a .i file with # directives
1337      pointing back to lines within a .c file, but the .c file might
1338      have been edited since the .i file was created.
1339      In such a case, the safest course is to disable on-demand substring
1340      locations.  */
1341   if (line_table->seen_line_directive)
1342     return "seen line directive";
1343
1344   /* If string concatenation has occurred at STRLOC, get the locations
1345      of all of the literal tokens making up the compound string.
1346      Otherwise, just use STRLOC.  */
1347   int num_locs = 1;
1348   location_t *strlocs = &strloc;
1349   if (concats)
1350     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1351
1352   auto_cpp_string_vec strs (num_locs);
1353   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1354   for (int i = 0; i < num_locs; i++)
1355     {
1356       /* Get range of strloc.  We will use it to locate the start and finish
1357          of the literal token within the line.  */
1358       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1359
1360       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1361         /* If the string is within a macro expansion, we can't get at the
1362            end location.  */
1363         return "macro expansion";
1364
1365       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1366         /* If so, we can't reliably determine where the token started within
1367            its line.  */
1368         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1369
1370       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1371         /* If so, we can't reliably determine where the token finished within
1372            its line.  */
1373         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1374
1375       expanded_location start
1376         = expand_location_to_spelling_point (src_range.m_start);
1377       expanded_location finish
1378         = expand_location_to_spelling_point (src_range.m_finish);
1379       if (start.file != finish.file)
1380         return "range endpoints are in different files";
1381       if (start.line != finish.line)
1382         return "range endpoints are on different lines";
1383       if (start.column > finish.column)
1384         return "range endpoints are reversed";
1385
1386       int line_width;
1387       const char *line = location_get_source_line (start.file, start.line,
1388                                                    &line_width);
1389       if (line == NULL)
1390         return "unable to read source line";
1391
1392       /* Determine the location of the literal (including quotes
1393          and leading prefix chars, such as the 'u' in a u""
1394          token).  */
1395       const char *literal = line + start.column - 1;
1396       int literal_length = finish.column - start.column + 1;
1397
1398       /* Ensure that we don't crash if we got the wrong location.  */
1399       if (line_width < (start.column - 1 + literal_length))
1400         return "line is not wide enough";
1401
1402       cpp_string from;
1403       from.len = literal_length;
1404       /* Make a copy of the literal, to avoid having to rely on
1405          the lifetime of the copy of the line within the cache.
1406          This will be released by the auto_cpp_string_vec dtor.  */
1407       from.text = XDUPVEC (unsigned char, literal, literal_length);
1408       strs.safe_push (from);
1409
1410       /* For very long lines, a new linemap could have started
1411          halfway through the token.
1412          Ensure that the loc_reader uses the linemap of the
1413          *end* of the token for its start location.  */
1414       const line_map_ordinary *final_ord_map;
1415       linemap_resolve_location (line_table, src_range.m_finish,
1416                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1417       location_t start_loc
1418         = linemap_position_for_line_and_column (line_table, final_ord_map,
1419                                                 start.line, start.column);
1420
1421       cpp_string_location_reader loc_reader (start_loc, line_table);
1422       loc_readers.safe_push (loc_reader);
1423     }
1424
1425   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1426   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1427                                                  loc_readers.address (),
1428                                                  num_locs, &ranges, type);
1429   if (err)
1430     return err;
1431
1432   /* Success: "ranges" should now contain information on the string.  */
1433   return NULL;
1434 }
1435
1436 /* Attempt to populate *OUT_LOC with source location information on the
1437    given characters within the string literal found at STRLOC.
1438    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1439    character set.
1440
1441    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1442    and string literal "012345\n789"
1443    *OUT_LOC is written to with:
1444      "012345\n789"
1445          ~^~~~~
1446
1447    If CONCATS is non-NULL, then any string literals that the token at
1448    STRLOC was concatenated with are also considered.
1449
1450    This is implemented by re-parsing the relevant source line(s).
1451
1452    Return NULL if successful, or an error message if any errors occurred.
1453    Error messages are intended for GCC developers (to help debugging) rather
1454    than for end-users.  */
1455
1456 const char *
1457 get_source_location_for_substring (cpp_reader *pfile,
1458                                    string_concat_db *concats,
1459                                    location_t strloc,
1460                                    enum cpp_ttype type,
1461                                    int caret_idx, int start_idx, int end_idx,
1462                                    source_location *out_loc)
1463 {
1464   gcc_checking_assert (caret_idx >= 0);
1465   gcc_checking_assert (start_idx >= 0);
1466   gcc_checking_assert (end_idx >= 0);
1467   gcc_assert (out_loc);
1468
1469   cpp_substring_ranges ranges;
1470   const char *err
1471     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1472   if (err)
1473     return err;
1474
1475   if (caret_idx >= ranges.get_num_ranges ())
1476     return "caret_idx out of range";
1477   if (start_idx >= ranges.get_num_ranges ())
1478     return "start_idx out of range";
1479   if (end_idx >= ranges.get_num_ranges ())
1480     return "end_idx out of range";
1481
1482   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1483                             ranges.get_range (start_idx).m_start,
1484                             ranges.get_range (end_idx).m_finish);
1485   return NULL;
1486 }
1487
1488 #if CHECKING_P
1489
1490 namespace selftest {
1491
1492 /* Selftests of location handling.  */
1493
1494 /* Attempt to populate *OUT_RANGE with source location information on the
1495    given character within the string literal found at STRLOC.
1496    CHAR_IDX refers to an offset within the execution character set.
1497    If CONCATS is non-NULL, then any string literals that the token at
1498    STRLOC was concatenated with are also considered.
1499
1500    This is implemented by re-parsing the relevant source line(s).
1501
1502    Return NULL if successful, or an error message if any errors occurred.
1503    Error messages are intended for GCC developers (to help debugging) rather
1504    than for end-users.  */
1505
1506 static const char *
1507 get_source_range_for_char (cpp_reader *pfile,
1508                            string_concat_db *concats,
1509                            location_t strloc,
1510                            enum cpp_ttype type,
1511                            int char_idx,
1512                            source_range *out_range)
1513 {
1514   gcc_checking_assert (char_idx >= 0);
1515   gcc_assert (out_range);
1516
1517   cpp_substring_ranges ranges;
1518   const char *err
1519     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1520   if (err)
1521     return err;
1522
1523   if (char_idx >= ranges.get_num_ranges ())
1524     return "char_idx out of range";
1525
1526   *out_range = ranges.get_range (char_idx);
1527   return NULL;
1528 }
1529
1530 /* As get_source_range_for_char, but write to *OUT the number
1531    of ranges that are available.  */
1532
1533 static const char *
1534 get_num_source_ranges_for_substring (cpp_reader *pfile,
1535                                      string_concat_db *concats,
1536                                      location_t strloc,
1537                                      enum cpp_ttype type,
1538                                      int *out)
1539 {
1540   gcc_assert (out);
1541
1542   cpp_substring_ranges ranges;
1543   const char *err
1544     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1545
1546   if (err)
1547     return err;
1548
1549   *out = ranges.get_num_ranges ();
1550   return NULL;
1551 }
1552
1553 /* Selftests of location handling.  */
1554
1555 /* Helper function for verifying location data: when location_t
1556    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1557    as having column 0.  */
1558
1559 static bool
1560 should_have_column_data_p (location_t loc)
1561 {
1562   if (IS_ADHOC_LOC (loc))
1563     loc = get_location_from_adhoc_loc (line_table, loc);
1564   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1565     return false;
1566   return true;
1567 }
1568
/* Selftest for should_have_column_data_p: check the first location
   past the reserved ones, the threshold
   LINE_MAP_MAX_LOCATION_WITH_COLS itself, and the first value beyond
   that threshold.  */

static void
test_should_have_column_data_p ()
{
  /* At or below the threshold: column data is expected.  */
  ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
  ASSERT_TRUE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
  /* One past the threshold: no column data.  */
  ASSERT_FALSE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
}
1580
1581 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1582    on LOC.  */
1583
1584 static void
1585 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1586               location_t loc)
1587 {
1588   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1589   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1590   /* If location_t values are sufficiently high, then column numbers
1591      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1592      When close to the threshold, column numbers *may* be present: if
1593      the final linemap before the threshold contains a line that straddles
1594      the threshold, locations in that line have column information.  */
1595   if (should_have_column_data_p (loc))
1596     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1597 }
1598
/* Many selftests build a line table and populate it with one or more
   line maps.

   To maximize coverage these tests should be run in a variety of
   situations:
   - different values of line_table->default_range_bits: some
   frontends use a non-zero value and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for source_location/location_t beyond which line-map.c
   changes behavior (disabling of the range-packing optimization,
   disabling of column-tracking).  These can be exercised by starting
   the line_table at interesting values at or near these thresholds.

   Each instance of the following struct describes one case within
   that test matrix.  */

struct line_table_case
{
  /* The value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* The location at which to start the line_table; zero means that
     the line_table's starting point is left alone.  */
  int m_base_location;

  line_table_case (int default_range_bits, int base_location)
  : m_default_range_bits (default_range_bits),
    m_base_location (base_location)
  {
  }
};
1625
1626 /* Constructor.  Store the old value of line_table, and create a new
1627    one, using sane defaults.  */
1628
1629 line_table_test::line_table_test ()
1630 {
1631   gcc_assert (saved_line_table == NULL);
1632   saved_line_table = line_table;
1633   line_table = ggc_alloc<line_maps> ();
1634   linemap_init (line_table, BUILTINS_LOCATION);
1635   gcc_assert (saved_line_table->reallocator);
1636   line_table->reallocator = saved_line_table->reallocator;
1637   gcc_assert (saved_line_table->round_alloc_size);
1638   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1639   line_table->default_range_bits = 0;
1640 }
1641
1642 /* Constructor.  Store the old value of line_table, and create a new
1643    one, using the sitation described in CASE_.  */
1644
1645 line_table_test::line_table_test (const line_table_case &case_)
1646 {
1647   gcc_assert (saved_line_table == NULL);
1648   saved_line_table = line_table;
1649   line_table = ggc_alloc<line_maps> ();
1650   linemap_init (line_table, BUILTINS_LOCATION);
1651   gcc_assert (saved_line_table->reallocator);
1652   line_table->reallocator = saved_line_table->reallocator;
1653   gcc_assert (saved_line_table->round_alloc_size);
1654   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1655   line_table->default_range_bits = case_.m_default_range_bits;
1656   if (case_.m_base_location)
1657     {
1658       line_table->highest_location = case_.m_base_location;
1659       line_table->highest_line = case_.m_base_location;
1660     }
1661 }
1662
1663 /* Destructor.  Restore the old value of line_table.  */
1664
1665 line_table_test::~line_table_test ()
1666 {
1667   gcc_assert (saved_line_table != NULL);
1668   line_table = saved_line_table;
1669   saved_line_table = NULL;
1670 }
1671
/* Verify basic operation of ordinary linemaps.

   Using a temporary line_table configured per CASE_, build locations in
   two files ("foo.c" and "bar.c"), covering short lines, a very long
   line, a line that outgrows the column limit, and a return to short
   lines.  Then check that each location expands back to the expected
   file/line/column, and that make_location round-trips a caret plus
   start/finish range.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  /* Swap in a fresh line_table for the duration of this function.  */
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations. */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  /* Transitioning back to a short line.  */
  linemap_line_start (line_table, 4, 0);
  location_t loc_back_to_short = linemap_position_for_column (line_table, 100);

  /* This check only makes sense when column data is being tracked for
     the location.  */
  if (should_have_column_data_p (loc_back_to_short))
    {
      /* Verify that we switched to short lines in the linemap: the
	 latest ordinary map should have 7 bits left for the column once
	 the range bits are subtracted.  */
      line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
      ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
    }

  /* Example of a line that will eventually be seen to be longer
     than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
     below that.  */
  linemap_line_start (line_table, 5, 2000);

  location_t loc_start_of_very_long_line
    = linemap_position_for_column (line_table, 2000);
  location_t loc_too_wide
    = linemap_position_for_column (line_table, 4097);
  location_t loc_too_wide_2
    = linemap_position_for_column (line_table, 4098);

  /* ...and back to a sane line length.  */
  linemap_line_start (line_table, 6, 100);
  location_t loc_sane_again = linemap_position_for_column (line_table, 10);

  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("foo.c", 4, 100, loc_back_to_short);

  /* In the very wide line, the initial location should be fully tracked.  */
  assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
  /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
     be disabled.  */
  assert_loceq ("foo.c", 5, 0, loc_too_wide);
  assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
  /*...and column-tracking should be re-enabled for subsequent lines.  */
  assert_loceq ("foo.c", 6, 10, loc_sane_again);

  assert_loceq ("bar.c", 1, 150, loc_f);

  /* A location created directly from a column is neither a builtin
     location nor a non-pure one.  */
  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it.  The caret is loc_c, the range runs from loc_b to
     loc_d.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
1760
/* Verify various properties of UNKNOWN_LOCATION: it should have a NULL
   filename, and both its line and its column should be 0.  */

static void
test_unknown_location ()
{
  ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
}
1770
/* Verify various properties of BUILTINS_LOCATION: it should compare
   equal (via assert_loceq) to the translated "<built-in>" filename at
   line 0, column 0, and is_location_from_builtin_token should hold
   for it.  */

static void
test_builtins ()
{
  assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
  ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
}
1779
/* Regression test for make_location.
   Ensure that we use pure locations for the start/finish of the range,
   rather than storing a packed or ad-hoc range as the start/finish.

   CASE_ selects the line_table configuration to run under.  The checks
   are skipped when the generated locations are above
   LINE_MAP_MAX_LOCATION_WITH_COLS.  */

static void
test_make_location_nonpure_range_endpoints (const line_table_case &case_)
{
  /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
     with C++ frontend.
     ....................0000000001111111111222.
     ....................1234567890123456789012.  */
  const char *content = "     r += !aaa == bbb;\n";
  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
  line_table_test ltt (case_);
  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);

  /* Pure locations for the columns of interest: 11 is the '!', 12-14
     is "aaa" and 21 is the final 'b' of "bbb".  */
  const location_t c11 = linemap_position_for_column (line_table, 11);
  const location_t c12 = linemap_position_for_column (line_table, 12);
  const location_t c13 = linemap_position_for_column (line_table, 13);
  const location_t c14 = linemap_position_for_column (line_table, 14);
  const location_t c21 = linemap_position_for_column (line_table, 21);

  /* Nothing further to verify once locations are beyond the limit for
     column tracking.  */
  if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
    return;

  /* Use column 13 for the caret location, arbitrarily, to verify that we
     handle start != caret.  */
  const location_t aaa = make_location (c13, c12, c14);
  ASSERT_EQ (c13, get_pure_location (aaa));
  ASSERT_EQ (c12, get_start (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
  ASSERT_EQ (c14, get_finish (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));

  /* Make a location using a location with a range as the start-point.  */
  const location_t not_aaa = make_location (c11, aaa, c14);
  ASSERT_EQ (c11, get_pure_location (not_aaa));
  /* It should use the start location of the range, not store the range
     itself.  */
  ASSERT_EQ (c12, get_start (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
  ASSERT_EQ (c14, get_finish (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));

  /* Similarly, make a location with a range as the end-point.  First
     build the range for "aaa == bbb" from pure locations...  */
  const location_t aaa_eq_bbb = make_location (c12, c12, c21);
  ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
  /* ...then pass that range as the finish of a new location.  */
  const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
  /* It should use the finish location of the range, not store the range
     itself.  */
  ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
}
1840
/* Verify reading of input files (e.g. for caret-based diagnostics).

   Write three lines to a tempfile (the last one without a trailing
   newline), then check that location_get_source_line returns the
   expected text and length for lines 3 and 2, and NULL for a line
   beyond the end of the file.  */

static void
test_reading_source_line ()
{
  /* Create a tempfile and write some text to it.  */
  temp_source_file tmp (SELFTEST_LOCATION, ".txt",
                        "01234567890123456789\n"
                        "This is the test text\n"
                        "This is the 3rd line");

  /* Read back a specific line from the tempfile.  This is the final
     line, which has no terminating newline.  */
  int line_size;
  const char *source_line = location_get_source_line (tmp.get_filename (),
                                                      3, &line_size);
  ASSERT_TRUE (source_line != NULL);
  ASSERT_EQ (20, line_size);
  /* Compare only LINE_SIZE bytes, rather than relying on the returned
     buffer being NUL-terminated at the end of the line.  */
  ASSERT_TRUE (!strncmp ("This is the 3rd line",
                         source_line, line_size));

  /* Reading an earlier line after a later one should also work.  The
     expected size excludes the newline.  */
  source_line = location_get_source_line (tmp.get_filename (),
                                          2, &line_size);
  ASSERT_TRUE (source_line != NULL);
  ASSERT_EQ (21, line_size);
  ASSERT_TRUE (!strncmp ("This is the test text",
                         source_line, line_size));

  /* The file only has three lines, so line 4 should not be found.  */
  source_line = location_get_source_line (tmp.get_filename (),
                                          4, &line_size);
  ASSERT_TRUE (source_line == NULL);
}
1872
1873 /* Tests of lexing.  */
1874
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.

   PARSER and TOK are passed straight to cpp_token_as_text; the result
   is compared as a C string against EXPECTED_TEXT using ASSERT_STREQ.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
1883
1884 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1885    and ranges from EXP_START_COL to EXP_FINISH_COL.
1886    Use LOC as the effective location of the selftest.  */
1887
1888 static void
1889 assert_token_loc_eq (const location &loc,
1890                      const cpp_token *tok,
1891                      const char *exp_filename, int exp_linenum,
1892                      int exp_start_col, int exp_finish_col)
1893 {
1894   location_t tok_loc = tok->src_loc;
1895   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1896   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1897
1898   /* If location_t values are sufficiently high, then column numbers
1899      will be unavailable.  */
1900   if (!should_have_column_data_p (tok_loc))
1901     return;
1902
1903   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1904   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1905   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1906   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1907 }
1908
1909 /* Use assert_token_loc_eq to verify the TOK->src_loc, using
1910    SELFTEST_LOCATION as the effective location of the selftest.  */
1911
1912 #define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
1913                             EXP_START_COL, EXP_FINISH_COL) \
1914   assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
1915                        (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
1916
/* Test of lexing a file using libcpp, verifying tokens and their
   location information.

   CASE_ selects the line_table configuration to run under.  The file
   holds an identifier, a string literal and a number, interleaved with
   a C-style and a C++-style comment; the test expects exactly those
   three tokens followed by EOF, i.e. neither comment is expected to
   come back as a token.  */

static void
test_lexer (const line_table_case &case_)
{
  /* Create a tempfile and write some text to it.  */
  const char *content =
    /*00000000011111111112222222222333333.3333444444444.455555555556
      12345678901234567890123456789012345.6789012345678.901234567890.  */
    ("test_name /* c-style comment */\n"
     "                                  \"test literal\"\n"
     " // test c++-style comment\n"
     "   42\n");
  temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);

  /* Swap in a fresh line_table for the reader created below to use.  */
  line_table_test ltt (case_);

  cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);

  const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
  ASSERT_NE (fname, NULL);

  /* Verify that we get the expected tokens back, with the correct
     location information.  */

  location_t loc;
  const cpp_token *tok;
  /* "test_name": line 1, columns 1-9.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NAME);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);

  /* The string literal, quotes included: line 2, columns 35-48.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);

  /* "42": line 4, columns 4-5.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NUMBER);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);

  /* Nothing else should be left in the file.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_EOF);

  /* Tear down the reader.  */
  cpp_finish (parser, NULL);
  cpp_destroy (parser);
}
1970
/* Forward decls.  */

struct lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.

   This is a polymorphic base class, so it has a virtual destructor:
   destroying a subclass instance through a lexer_test_options pointer
   then runs the subclass's destructor too.  */

class lexer_test_options
{
 public:
  virtual ~lexer_test_options () {}

  /* Hook for customizing the given lexer_test; subclasses must
     implement this.  */
  virtual void apply (lexer_test &) = 0;
};
1984
1985 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
1986    in its dtor.
1987
1988    This is needed by struct lexer_test to ensure that the cleanup of the
1989    cpp_reader happens *after* the cleanup of the temp_source_file.  */
1990
1991 class cpp_reader_ptr
1992 {
1993  public:
1994   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
1995
1996   ~cpp_reader_ptr ()
1997   {
1998     cpp_finish (m_ptr, NULL);
1999     cpp_destroy (m_ptr);
2000   }
2001
2002   operator cpp_reader * () const { return m_ptr; }
2003
2004  private:
2005   cpp_reader *m_ptr;
2006 };
2007
/* A struct for writing lexer tests.

   An instance bundles a temporary line_table, a cpp_reader, a
   temporary source file holding the text to lex, and a string
   concatenation database.  */

struct lexer_test
{
  /* Set up a test of lexing CONTENT under the line_table configuration
     CASE_.  OPTIONS may be NULL; otherwise its "apply" hook is run on
     the new instance.  */
  lexer_test (const line_table_case &case_, const char *content,
              lexer_test_options *options);
  /* By default, verifies that the next token is EOF; see
     m_implicitly_expect_EOF.  */
  ~lexer_test ();

  /* Fetch the next token from m_parser.  */
  const cpp_token *get_token ();

  /* The ordering of these fields matters.
     The line_table_test must be first, since the cpp_reader_ptr
     uses it.
     The cpp_reader must be cleaned up *after* the temp_source_file
     since the filenames in input.c's input cache are owned by the
     cpp_reader; in particular, when ~temp_source_file evicts the
     filename the filenames must still be alive.  */
  line_table_test m_ltt;
  cpp_reader_ptr m_parser;
  temp_source_file m_tempfile;
  /* Database of string concatenations, for use by the substring
     location checks.  */
  string_concat_db m_concats;
  /* If true, the destructor asserts that the next token is CPP_EOF.  */
  bool m_implicitly_expect_EOF;
};
2031
2032 /* Use an EBCDIC encoding for the execution charset, specifically
2033    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2034
2035    This exercises iconv integration within libcpp.
2036    Not every build of iconv supports the given charset,
2037    so we need to flag this error and handle it gracefully.  */
2038
2039 class ebcdic_execution_charset : public lexer_test_options
2040 {
2041  public:
2042   ebcdic_execution_charset () : m_num_iconv_errors (0)
2043     {
2044       gcc_assert (s_singleton == NULL);
2045       s_singleton = this;
2046     }
2047   ~ebcdic_execution_charset ()
2048     {
2049       gcc_assert (s_singleton == this);
2050       s_singleton = NULL;
2051     }
2052
2053   void apply (lexer_test &test) FINAL OVERRIDE
2054   {
2055     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2056     cpp_opts->narrow_charset = "IBM1047";
2057
2058     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2059     callbacks->error = on_error;
2060   }
2061
2062   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2063                         int level ATTRIBUTE_UNUSED,
2064                         int reason ATTRIBUTE_UNUSED,
2065                         rich_location *richloc ATTRIBUTE_UNUSED,
2066                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2067     ATTRIBUTE_FPTR_PRINTF(5,0)
2068   {
2069     gcc_assert (s_singleton);
2070     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2071     const char *msg = "conversion from %s to %s not supported by iconv";
2072 #ifdef ENABLE_NLS
2073     msg = dgettext ("cpplib", msg);
2074 #endif
2075     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2076        when the local iconv build doesn't support the conversion.  */
2077     if (strcmp (msgid, msg) == 0)
2078       {
2079         s_singleton->m_num_iconv_errors++;
2080         return true;
2081       }
2082
2083     /* Otherwise, we have an unexpected error.  */
2084     abort ();
2085   }
2086
2087   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2088
2089  private:
2090   static ebcdic_execution_charset *s_singleton;
2091   int m_num_iconv_errors;
2092 };
2093
2094 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2095
2096 /* A lexer_test_options subclass that records a list of error
2097    messages emitted by the lexer.  */
2098
2099 class lexer_error_sink : public lexer_test_options
2100 {
2101  public:
2102   lexer_error_sink ()
2103   {
2104     gcc_assert (s_singleton == NULL);
2105     s_singleton = this;
2106   }
2107   ~lexer_error_sink ()
2108   {
2109     gcc_assert (s_singleton == this);
2110     s_singleton = NULL;
2111
2112     int i;
2113     char *str;
2114     FOR_EACH_VEC_ELT (m_errors, i, str)
2115       free (str);
2116   }
2117
2118   void apply (lexer_test &test) FINAL OVERRIDE
2119   {
2120     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2121     callbacks->error = on_error;
2122   }
2123
2124   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2125                         int level ATTRIBUTE_UNUSED,
2126                         int reason ATTRIBUTE_UNUSED,
2127                         rich_location *richloc ATTRIBUTE_UNUSED,
2128                         const char *msgid, va_list *ap)
2129     ATTRIBUTE_FPTR_PRINTF(5,0)
2130   {
2131     char *msg = xvasprintf (msgid, *ap);
2132     s_singleton->m_errors.safe_push (msg);
2133     return true;
2134   }
2135
2136   auto_vec<char *> m_errors;
2137
2138  private:
2139   static lexer_error_sink *s_singleton;
2140 };
2141
2142 lexer_error_sink *lexer_error_sink::s_singleton;
2143
2144 /* Constructor.  Override line_table with a new instance based on CASE_,
2145    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2146    start parsing the tempfile.  */
2147
2148 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2149                         lexer_test_options *options)
2150 : m_ltt (case_),
2151   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2152   /* Create a tempfile and write the text to it.  */
2153   m_tempfile (SELFTEST_LOCATION, ".c", content),
2154   m_concats (),
2155   m_implicitly_expect_EOF (true)
2156 {
2157   if (options)
2158     options->apply (*this);
2159
2160   cpp_init_iconv (m_parser);
2161
2162   /* Parse the file.  */
2163   const char *fname = cpp_read_main_file (m_parser,
2164                                           m_tempfile.get_filename ());
2165   ASSERT_NE (fname, NULL);
2166 }
2167
2168 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2169
2170 lexer_test::~lexer_test ()
2171 {
2172   location_t loc;
2173   const cpp_token *tok;
2174
2175   if (m_implicitly_expect_EOF)
2176     {
2177       tok = cpp_get_token_with_location (m_parser, &loc);
2178       ASSERT_NE (tok, NULL);
2179       ASSERT_EQ (tok->type, CPP_EOF);
2180     }
2181 }
2182
2183 /* Get the next token from m_parser.  */
2184
2185 const cpp_token *
2186 lexer_test::get_token ()
2187 {
2188   location_t loc;
2189   const cpp_token *tok;
2190
2191   tok = cpp_get_token_with_location (m_parser, &loc);
2192   ASSERT_NE (tok, NULL);
2193   return tok;
2194 }
2195
2196 /* Verify that locations within string literals are correctly handled.  */
2197
2198 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2199    using the string concatenation database for TEST.
2200
2201    Assert that the character at index IDX is on EXPECTED_LINE,
2202    and that it begins at column EXPECTED_START_COL and ends at
2203    EXPECTED_FINISH_COL (unless the locations are beyond
2204    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2205    columns).  */
2206
2207 static void
2208 assert_char_at_range (const location &loc,
2209                       lexer_test& test,
2210                       location_t strloc, enum cpp_ttype type, int idx,
2211                       int expected_line, int expected_start_col,
2212                       int expected_finish_col)
2213 {
2214   cpp_reader *pfile = test.m_parser;
2215   string_concat_db *concats = &test.m_concats;
2216
2217   source_range actual_range = source_range();
2218   const char *err
2219     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2220                                  &actual_range);
2221   if (should_have_column_data_p (strloc))
2222     ASSERT_EQ_AT (loc, NULL, err);
2223   else
2224     {
2225       ASSERT_STREQ_AT (loc,
2226                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2227                        err);
2228       return;
2229     }
2230
2231   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2232   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2233   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2234   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2235
2236   if (should_have_column_data_p (actual_range.m_start))
2237     {
2238       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2239       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2240     }
2241   if (should_have_column_data_p (actual_range.m_finish))
2242     {
2243       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2244       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2245     }
2246 }
2247
2248 /* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
2249    the effective location of any errors.  */
2250
2251 #define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
2252                              EXPECTED_START_COL, EXPECTED_FINISH_COL)   \
2253   assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
2254                         (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
2255                         (EXPECTED_FINISH_COL))
2256
2257 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2258    using the string concatenation database for TEST.
2259
2260    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2261
2262 static void
2263 assert_num_substring_ranges (const location &loc,
2264                              lexer_test& test,
2265                              location_t strloc,
2266                              enum cpp_ttype type,
2267                              int expected_num_ranges)
2268 {
2269   cpp_reader *pfile = test.m_parser;
2270   string_concat_db *concats = &test.m_concats;
2271
2272   int actual_num_ranges = -1;
2273   const char *err
2274     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2275                                            &actual_num_ranges);
2276   if (should_have_column_data_p (strloc))
2277     ASSERT_EQ_AT (loc, NULL, err);
2278   else
2279     {
2280       ASSERT_STREQ_AT (loc,
2281                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2282                        err);
2283       return;
2284     }
2285   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2286 }
2287
2288 /* Macro for calling assert_num_substring_ranges, supplying
2289    SELFTEST_LOCATION for the effective location of any errors.  */
2290
2291 #define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
2292                                     EXPECTED_NUM_RANGES)                \
2293   assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
2294                                (TYPE), (EXPECTED_NUM_RANGES))
2295
2296
2297 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2298    returns an error (using the string concatenation database for TEST).  */
2299
2300 static void
2301 assert_has_no_substring_ranges (const location &loc,
2302                                 lexer_test& test,
2303                                 location_t strloc,
2304                                 enum cpp_ttype type,
2305                                 const char *expected_err)
2306 {
2307   cpp_reader *pfile = test.m_parser;
2308   string_concat_db *concats = &test.m_concats;
2309   cpp_substring_ranges ranges;
2310   const char *actual_err
2311     = get_substring_ranges_for_loc (pfile, concats, strloc,
2312                                     type, ranges);
2313   if (should_have_column_data_p (strloc))
2314     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2315   else
2316     ASSERT_STREQ_AT (loc,
2317                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2318                      actual_err);
2319 }
2320
2321 #define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
2322     assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
2323                                     (STRLOC), (TYPE), (ERR))
2324
/* Lex a simple string literal.  Verify the substring location data, before
   and after running cpp_interpret_string on it.

   CASE_ selects the line_table configuration to run under.  */

static void
test_lexer_string_locations_simple (const line_table_case &case_)
{
  /* Digits 0-9 (with 0 at column 10), the simple way.
     ....................000000000.11111111112.2222222223333333333
     ....................123456789.01234567890.1234567890123456789
     We add a trailing comment to ensure that we correctly locate
     the end of the string literal token.  */
  const char *content = "        \"0123456789\" /* not a string */\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  The token spans columns 9-20, i.e. both
     quotes are included.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */

  ASSERT_EQ (tok->val.str.len, 12);

  /* Verify that cpp_interpret_string works.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
  /* The interpreted text is released here with free.  */
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify ranges of individual characters.  This no longer includes the
     opening quote, but does include the closing quote.  Each of the
     eleven entries (ten digits plus the closing quote) is expected to
     occupy a single column, starting at column 10.  */
  for (int i = 0; i <= 10; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
                          10 + i, 10 + i);

  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
}
2368
/* As test_lexer_string_locations_simple, but use an EBCDIC execution
   encoding.

   CASE_ selects the line_table configuration to run under.  The test
   does nothing when HAVE_ICONV is zero, and stops early if the reader
   reported that iconv cannot perform the requested conversion.  */

static void
test_lexer_string_locations_ebcdic (const line_table_case &case_)
{
  /* EBCDIC support requires iconv.  */
  if (!HAVE_ICONV)
    return;

  /* Digits 0-9 (with 0 at column 10), the simple way.
     ....................000000000.11111111112.2222222223333333333
     ....................123456789.01234567890.1234567890123456789
     We add a trailing comment to ensure that we correctly locate
     the end of the string literal token.  */
  const char *content = "        \"0123456789\" /* not a string */\n";
  /* USE_EBCDIC must be declared before TEST, which holds on to the
     error callback it installs.  */
  ebcdic_execution_charset use_ebcdic;
  lexer_test test (case_, content, &use_ebcdic);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */

  ASSERT_EQ (tok->val.str.len, 12);

  /* The remainder of the test requires an iconv implementation that
     can convert from UTF-8 to the EBCDIC encoding requested above.  */
  if (use_ebcdic.iconv_errors_occurred_p ())
    return;

  /* Verify that cpp_interpret_string works.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  /* We should now have EBCDIC-encoded text, specifically
     IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
     The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
  ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
                (const char *)dst_string.text);
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify that we don't attempt to record substring location information
     for such cases.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
2424
/* Lex a string literal containing a hex-escaped character.
   Verify the substring location data, before and after running
   cpp_interpret_string on it.

   CASE_ selects the line_table configuration to run under.  */

static void
test_lexer_string_locations_hex (const line_table_case &case_)
{
  /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
     and with a space in place of digit 6, to terminate the escaped
     hex code.
     ....................000000000.111111.11112222.
     ....................123456789.012345.67890123.  */
  const char *content = "        \"01234\\x35 789\"\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */
  ASSERT_EQ (tok->val.str.len, 15);

  /* Verify that cpp_interpret_string works: the escape should have
     become the single character '5'.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify ranges of individual characters.  This no longer includes the
     opening quote, but does include the closing quote.  */
  /* Characters before the escape each occupy one column, from 10.  */
  for (int i = 0; i <= 4; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
  /* The escaped character covers the four source columns of "\x35".  */
  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
  /* Everything after the escape is shifted right by the three extra
     source columns it took up.  */
  for (int i = 6; i <= 10; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);

  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
}
2470
2471 /* Lex a string literal containing an octal-escaped character.
2472    Verify the substring location data after running cpp_interpret_string
2473    on it.  */
2474
2475 static void
2476 test_lexer_string_locations_oct (const line_table_case &case_)
2477 {
2478   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2479      and with a space in place of digit 6, to terminate the escaped
2480      octal code.
2481      ....................000000000.111111.11112222.2222223333333333444
2482      ....................123456789.012345.67890123.4567890123456789012  */
2483   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2484   lexer_test test (case_, content, NULL);
2485
2486   /* Verify that we get the expected token back, with the correct
2487      location information.  */
2488   const cpp_token *tok = test.get_token ();
2489   ASSERT_EQ (tok->type, CPP_STRING);
2490   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2491
2492   /* Verify that cpp_interpret_string works.  */
2493   cpp_string dst_string;
2494   const enum cpp_ttype type = CPP_STRING;
2495   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2496                                       &dst_string, type);
2497   ASSERT_TRUE (result);
2498   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2499   free (const_cast <unsigned char *> (dst_string.text));
2500
2501   /* Verify ranges of individual characters.  This no longer includes the
2502      opening quote, but does include the closing quote.  */
2503   for (int i = 0; i < 5; i++)
2504     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2505   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2506   for (int i = 6; i <= 10; i++)
2507     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2508
2509   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2510 }
2511
2512 /* Test of string literal containing letter escapes.  */
2513
2514 static void
2515 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2516 {
2517   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2518      .....................000000000.1.11111.1.1.11222.22222223333333
2519      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2520   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2521   lexer_test test (case_, content, NULL);
2522
2523   /* Verify that we get the expected tokens back.  */
2524   const cpp_token *tok = test.get_token ();
2525   ASSERT_EQ (tok->type, CPP_STRING);
2526   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2527
2528   /* Verify ranges of individual characters. */
2529   /* "\t".  */
2530   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2531                         0, 1, 10, 11);
2532   /* "foo". */
2533   for (int i = 1; i <= 3; i++)
2534     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2535                           i, 1, 11 + i, 11 + i);
2536   /* "\\" and "\n".  */
2537   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2538                         4, 1, 15, 16);
2539   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2540                         5, 1, 17, 18);
2541
2542   /* "bar" and closing quote for nul-terminator.  */
2543   for (int i = 6; i <= 9; i++)
2544     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2545                           i, 1, 13 + i, 13 + i);
2546
2547   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2548 }
2549
2550 /* Another test of a string literal containing a letter escape.
2551    Based on string seen in
2552      printf ("%-%\n");
2553    in gcc.dg/format/c90-printf-1.c.  */
2554
2555 static void
2556 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2557 {
2558   /* .....................000000000.1111.11.1111.22222222223.
2559      .....................123456789.0123.45.6789.01234567890.  */
2560   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2561   lexer_test test (case_, content, NULL);
2562
2563   /* Verify that we get the expected tokens back.  */
2564   const cpp_token *tok = test.get_token ();
2565   ASSERT_EQ (tok->type, CPP_STRING);
2566   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2567
2568   /* Verify ranges of individual characters. */
2569   /* "%-%".  */
2570   for (int i = 0; i < 3; i++)
2571     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2572                           i, 1, 10 + i, 10 + i);
2573   /* "\n".  */
2574   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2575                         3, 1, 13, 14);
2576
2577   /* Closing quote for nul-terminator.  */
2578   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2579                         4, 1, 15, 15);
2580
2581   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2582 }
2583
2584 /* Lex a string literal containing UCN 4 characters.
2585    Verify the substring location data after running cpp_interpret_string
2586    on it.  */
2587
2588 static void
2589 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2590 {
2591   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2592      as UCN 4.
2593      ....................000000000.111111.111122.222222223.33333333344444
2594      ....................123456789.012345.678901.234567890.12345678901234  */
2595   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2596   lexer_test test (case_, content, NULL);
2597
2598   /* Verify that we get the expected token back, with the correct
2599      location information.  */
2600   const cpp_token *tok = test.get_token ();
2601   ASSERT_EQ (tok->type, CPP_STRING);
2602   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2603
2604   /* Verify that cpp_interpret_string works.
2605      The string should be encoded in the execution character
2606      set.  Assuming that that is UTF-8, we should have the following:
2607      -----------  ----  -----  -------  ----------------
2608      Byte offset  Byte  Octal  Unicode  Source Column(s)
2609      -----------  ----  -----  -------  ----------------
2610      0            0x30         '0'      10
2611      1            0x31         '1'      11
2612      2            0x32         '2'      12
2613      3            0x33         '3'      13
2614      4            0x34         '4'      14
2615      5            0xE2  \342   U+2174   15-20
2616      6            0x85  \205    (cont)  15-20
2617      7            0xB4  \264    (cont)  15-20
2618      8            0xE2  \342   U+2175   21-26
2619      9            0x85  \205    (cont)  21-26
2620      10           0xB5  \265    (cont)  21-26
2621      11           0x37         '7'      27
2622      12           0x38         '8'      28
2623      13           0x39         '9'      29
2624      14           0x00                  30 (closing quote)
2625      -----------  ----  -----  -------  ---------------.  */
2626
2627   cpp_string dst_string;
2628   const enum cpp_ttype type = CPP_STRING;
2629   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2630                                       &dst_string, type);
2631   ASSERT_TRUE (result);
2632   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2633                 (const char *)dst_string.text);
2634   free (const_cast <unsigned char *> (dst_string.text));
2635
2636   /* Verify ranges of individual characters.  This no longer includes the
2637      opening quote, but does include the closing quote.
2638      '01234'.  */
2639   for (int i = 0; i <= 4; i++)
2640     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2641   /* U+2174.  */
2642   for (int i = 5; i <= 7; i++)
2643     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2644   /* U+2175.  */
2645   for (int i = 8; i <= 10; i++)
2646     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2647   /* '789' and nul terminator  */
2648   for (int i = 11; i <= 14; i++)
2649     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2650
2651   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2652 }
2653
2654 /* Lex a string literal containing UCN 8 characters.
2655    Verify the substring location data after running cpp_interpret_string
2656    on it.  */
2657
2658 static void
2659 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2660 {
2661   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2662      ....................000000000.111111.1111222222.2222333333333.344444
2663      ....................123456789.012345.6789012345.6789012345678.901234  */
2664   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2665   lexer_test test (case_, content, NULL);
2666
2667   /* Verify that we get the expected token back, with the correct
2668      location information.  */
2669   const cpp_token *tok = test.get_token ();
2670   ASSERT_EQ (tok->type, CPP_STRING);
2671   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2672                            "\"01234\\U00002174\\U00002175789\"");
2673
2674   /* Verify that cpp_interpret_string works.
2675      The UTF-8 encoding of the string is identical to that from
2676      the ucn4 testcase above; the only difference is the column
2677      locations.  */
2678   cpp_string dst_string;
2679   const enum cpp_ttype type = CPP_STRING;
2680   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2681                                       &dst_string, type);
2682   ASSERT_TRUE (result);
2683   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2684                 (const char *)dst_string.text);
2685   free (const_cast <unsigned char *> (dst_string.text));
2686
2687   /* Verify ranges of individual characters.  This no longer includes the
2688      opening quote, but does include the closing quote.
2689      '01234'.  */
2690   for (int i = 0; i <= 4; i++)
2691     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2692   /* U+2174.  */
2693   for (int i = 5; i <= 7; i++)
2694     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2695   /* U+2175.  */
2696   for (int i = 8; i <= 10; i++)
2697     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2698   /* '789' at columns 35-37  */
2699   for (int i = 11; i <= 13; i++)
2700     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2701   /* Closing quote/nul-terminator at column 38.  */
2702   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2703
2704   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2705 }
2706
/* Read the 32-bit value stored in big-endian byte order at PTR_BE_VALUE
   and return it in host byte order.  The bytes are combined one at a
   time, most significant first.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t host_value = 0;
  for (int idx = 0; idx < 4; idx++)
    host_value = (host_value << 8) | (uint32_t) bytes[idx];
  return host_value;
}
2718
2719 /* Lex a wide string literal and verify that attempts to read substring
2720    location data from it fail gracefully.  */
2721
2722 static void
2723 test_lexer_string_locations_wide_string (const line_table_case &case_)
2724 {
2725   /* Digits 0-9.
2726      ....................000000000.11111111112.22222222233333
2727      ....................123456789.01234567890.12345678901234  */
2728   const char *content = "       L\"0123456789\" /* non-str */\n";
2729   lexer_test test (case_, content, NULL);
2730
2731   /* Verify that we get the expected token back, with the correct
2732      location information.  */
2733   const cpp_token *tok = test.get_token ();
2734   ASSERT_EQ (tok->type, CPP_WSTRING);
2735   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2736
2737   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2738   cpp_string dst_string;
2739   const enum cpp_ttype type = CPP_WSTRING;
2740   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2741                                       &dst_string, type);
2742   ASSERT_TRUE (result);
2743   /* The cpp_reader defaults to big-endian with
2744      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2745      now be encoded as UTF-32BE.  */
2746   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2747   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2748   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2749   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2750   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2751   free (const_cast <unsigned char *> (dst_string.text));
2752
2753   /* We don't yet support generating substring location information
2754      for L"" strings.  */
2755   ASSERT_HAS_NO_SUBSTRING_RANGES
2756     (test, tok->src_loc, type,
2757      "execution character set != source character set");
2758 }
2759
/* Read the 16-bit value stored in big-endian byte order at PTR_BE_VALUE
   and return it in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return (uint16_t) ((high << 8) | low);
}
2768
2769 /* Lex a u"" string literal and verify that attempts to read substring
2770    location data from it fail gracefully.  */
2771
2772 static void
2773 test_lexer_string_locations_string16 (const line_table_case &case_)
2774 {
2775   /* Digits 0-9.
2776      ....................000000000.11111111112.22222222233333
2777      ....................123456789.01234567890.12345678901234  */
2778   const char *content = "       u\"0123456789\" /* non-str */\n";
2779   lexer_test test (case_, content, NULL);
2780
2781   /* Verify that we get the expected token back, with the correct
2782      location information.  */
2783   const cpp_token *tok = test.get_token ();
2784   ASSERT_EQ (tok->type, CPP_STRING16);
2785   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2786
2787   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2788   cpp_string dst_string;
2789   const enum cpp_ttype type = CPP_STRING16;
2790   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2791                                       &dst_string, type);
2792   ASSERT_TRUE (result);
2793
2794   /* The cpp_reader defaults to big-endian, so dst_string should
2795      now be encoded as UTF-16BE.  */
2796   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2797   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2798   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2799   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2800   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2801   free (const_cast <unsigned char *> (dst_string.text));
2802
2803   /* We don't yet support generating substring location information
2804      for L"" strings.  */
2805   ASSERT_HAS_NO_SUBSTRING_RANGES
2806     (test, tok->src_loc, type,
2807      "execution character set != source character set");
2808 }
2809
2810 /* Lex a U"" string literal and verify that attempts to read substring
2811    location data from it fail gracefully.  */
2812
2813 static void
2814 test_lexer_string_locations_string32 (const line_table_case &case_)
2815 {
2816   /* Digits 0-9.
2817      ....................000000000.11111111112.22222222233333
2818      ....................123456789.01234567890.12345678901234  */
2819   const char *content = "       U\"0123456789\" /* non-str */\n";
2820   lexer_test test (case_, content, NULL);
2821
2822   /* Verify that we get the expected token back, with the correct
2823      location information.  */
2824   const cpp_token *tok = test.get_token ();
2825   ASSERT_EQ (tok->type, CPP_STRING32);
2826   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2827
2828   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2829   cpp_string dst_string;
2830   const enum cpp_ttype type = CPP_STRING32;
2831   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2832                                       &dst_string, type);
2833   ASSERT_TRUE (result);
2834
2835   /* The cpp_reader defaults to big-endian, so dst_string should
2836      now be encoded as UTF-32BE.  */
2837   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2838   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2839   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2840   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2841   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2842   free (const_cast <unsigned char *> (dst_string.text));
2843
2844   /* We don't yet support generating substring location information
2845      for L"" strings.  */
2846   ASSERT_HAS_NO_SUBSTRING_RANGES
2847     (test, tok->src_loc, type,
2848      "execution character set != source character set");
2849 }
2850
2851 /* Lex a u8-string literal.
2852    Verify the substring location data after running cpp_interpret_string
2853    on it.  */
2854
2855 static void
2856 test_lexer_string_locations_u8 (const line_table_case &case_)
2857 {
2858   /* Digits 0-9.
2859      ....................000000000.11111111112.22222222233333
2860      ....................123456789.01234567890.12345678901234  */
2861   const char *content = "      u8\"0123456789\" /* non-str */\n";
2862   lexer_test test (case_, content, NULL);
2863
2864   /* Verify that we get the expected token back, with the correct
2865      location information.  */
2866   const cpp_token *tok = test.get_token ();
2867   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2868   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2869
2870   /* Verify that cpp_interpret_string works.  */
2871   cpp_string dst_string;
2872   const enum cpp_ttype type = CPP_STRING;
2873   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2874                                       &dst_string, type);
2875   ASSERT_TRUE (result);
2876   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2877   free (const_cast <unsigned char *> (dst_string.text));
2878
2879   /* Verify ranges of individual characters.  This no longer includes the
2880      opening quote, but does include the closing quote.  */
2881   for (int i = 0; i <= 10; i++)
2882     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2883 }
2884
2885 /* Lex a string literal containing UTF-8 source characters.
2886    Verify the substring location data after running cpp_interpret_string
2887    on it.  */
2888
2889 static void
2890 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2891 {
2892  /* This string literal is written out to the source file as UTF-8,
2893     and is of the form "before mojibake after", where "mojibake"
2894     is written as the following four unicode code points:
2895        U+6587 CJK UNIFIED IDEOGRAPH-6587
2896        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2897        U+5316 CJK UNIFIED IDEOGRAPH-5316
2898        U+3051 HIRAGANA LETTER KE.
2899      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2900      "before" and "after" are 1 byte per unicode character.
2901
2902      The numbering shown are "columns", which are *byte* numbers within
2903      the line, rather than unicode character numbers.
2904
2905      .................... 000000000.1111111.
2906      .................... 123456789.0123456.  */
2907   const char *content = ("        \"before "
2908                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2909                               UTF-8: 0xE6 0x96 0x87
2910                               C octal escaped UTF-8: \346\226\207
2911                             "column" numbers: 17-19.  */
2912                          "\346\226\207"
2913
2914                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2915                               UTF-8: 0xE5 0xAD 0x97
2916                               C octal escaped UTF-8: \345\255\227
2917                             "column" numbers: 20-22.  */
2918                          "\345\255\227"
2919
2920                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2921                               UTF-8: 0xE5 0x8C 0x96
2922                               C octal escaped UTF-8: \345\214\226
2923                             "column" numbers: 23-25.  */
2924                          "\345\214\226"
2925
2926                          /* U+3051 HIRAGANA LETTER KE
2927                               UTF-8: 0xE3 0x81 0x91
2928                               C octal escaped UTF-8: \343\201\221
2929                             "column" numbers: 26-28.  */
2930                          "\343\201\221"
2931
2932                          /* column numbers 29 onwards
2933                           2333333.33334444444444
2934                           9012345.67890123456789. */
2935                          " after\" /* non-str */\n");
2936   lexer_test test (case_, content, NULL);
2937
2938   /* Verify that we get the expected token back, with the correct
2939      location information.  */
2940   const cpp_token *tok = test.get_token ();
2941   ASSERT_EQ (tok->type, CPP_STRING);
2942   ASSERT_TOKEN_AS_TEXT_EQ
2943     (test.m_parser, tok,
2944      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2945
2946   /* Verify that cpp_interpret_string works.  */
2947   cpp_string dst_string;
2948   const enum cpp_ttype type = CPP_STRING;
2949   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2950                                       &dst_string, type);
2951   ASSERT_TRUE (result);
2952   ASSERT_STREQ
2953     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2954      (const char *)dst_string.text);
2955   free (const_cast <unsigned char *> (dst_string.text));
2956
2957   /* Verify ranges of individual characters.  This no longer includes the
2958      opening quote, but does include the closing quote.
2959      Assuming that both source and execution encodings are UTF-8, we have
2960      a run of 25 octets in each, plus the NUL terminator.  */
2961   for (int i = 0; i < 25; i++)
2962     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2963   /* NUL-terminator should use the closing quote at column 35.  */
2964   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2965
2966   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2967 }
2968
2969 /* Test of string literal concatenation.  */
2970
2971 static void
2972 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2973 {
2974   /* Digits 0-9.
2975      .....................000000000.111111.11112222222222
2976      .....................123456789.012345.67890123456789.  */
2977   const char *content = ("        \"01234\" /* non-str */\n"
2978                          "        \"56789\" /* non-str */\n");
2979   lexer_test test (case_, content, NULL);
2980
2981   location_t input_locs[2];
2982
2983   /* Verify that we get the expected tokens back.  */
2984   auto_vec <cpp_string> input_strings;
2985   const cpp_token *tok_a = test.get_token ();
2986   ASSERT_EQ (tok_a->type, CPP_STRING);
2987   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2988   input_strings.safe_push (tok_a->val.str);
2989   input_locs[0] = tok_a->src_loc;
2990
2991   const cpp_token *tok_b = test.get_token ();
2992   ASSERT_EQ (tok_b->type, CPP_STRING);
2993   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2994   input_strings.safe_push (tok_b->val.str);
2995   input_locs[1] = tok_b->src_loc;
2996
2997   /* Verify that cpp_interpret_string works.  */
2998   cpp_string dst_string;
2999   const enum cpp_ttype type = CPP_STRING;
3000   bool result = cpp_interpret_string (test.m_parser,
3001                                       input_strings.address (), 2,
3002                                       &dst_string, type);
3003   ASSERT_TRUE (result);
3004   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3005   free (const_cast <unsigned char *> (dst_string.text));
3006
3007   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3008   test.m_concats.record_string_concatenation (2, input_locs);
3009
3010   location_t initial_loc = input_locs[0];
3011
3012   /* "01234" on line 1.  */
3013   for (int i = 0; i <= 4; i++)
3014     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3015   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3016   for (int i = 5; i <= 10; i++)
3017     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3018
3019   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3020 }
3021
3022 /* Another test of string literal concatenation.  */
3023
3024 static void
3025 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3026 {
3027   /* Digits 0-9.
3028      .....................000000000.111.11111112222222
3029      .....................123456789.012.34567890123456.  */
3030   const char *content = ("        \"01\" /* non-str */\n"
3031                          "        \"23\" /* non-str */\n"
3032                          "        \"45\" /* non-str */\n"
3033                          "        \"67\" /* non-str */\n"
3034                          "        \"89\" /* non-str */\n");
3035   lexer_test test (case_, content, NULL);
3036
3037   auto_vec <cpp_string> input_strings;
3038   location_t input_locs[5];
3039
3040   /* Verify that we get the expected tokens back.  */
3041   for (int i = 0; i < 5; i++)
3042     {
3043       const cpp_token *tok = test.get_token ();
3044       ASSERT_EQ (tok->type, CPP_STRING);
3045       input_strings.safe_push (tok->val.str);
3046       input_locs[i] = tok->src_loc;
3047     }
3048
3049   /* Verify that cpp_interpret_string works.  */
3050   cpp_string dst_string;
3051   const enum cpp_ttype type = CPP_STRING;
3052   bool result = cpp_interpret_string (test.m_parser,
3053                                       input_strings.address (), 5,
3054                                       &dst_string, type);
3055   ASSERT_TRUE (result);
3056   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3057   free (const_cast <unsigned char *> (dst_string.text));
3058
3059   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3060   test.m_concats.record_string_concatenation (5, input_locs);
3061
3062   location_t initial_loc = input_locs[0];
3063
3064   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3065      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3066      and expect get_source_range_for_substring to fail.
3067      However, for a string concatenation test, we can have a case
3068      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3069      but subsequent strings can be after it.
3070      Attempting to detect this within assert_char_at_range
3071      would overcomplicate the logic for the common test cases, so
3072      we detect it here.  */
3073   if (should_have_column_data_p (input_locs[0])
3074       && !should_have_column_data_p (input_locs[4]))
3075     {
3076       /* Verify that get_source_range_for_substring gracefully rejects
3077          this case.  */
3078       source_range actual_range;
3079       const char *err
3080         = get_source_range_for_char (test.m_parser, &test.m_concats,
3081                                      initial_loc, type, 0, &actual_range);
3082       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3083       return;
3084     }
3085
3086   for (int i = 0; i < 5; i++)
3087     for (int j = 0; j < 2; j++)
3088       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3089                             i + 1, 10 + j, 10 + j);
3090
3091   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3092   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3093
3094   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3095 }
3096
3097 /* Another test of string literal concatenation, this time combined with
3098    various kinds of escaped characters.  */
3099
3100 static void
3101 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3102 {
3103   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3104      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3105   const char *content
3106     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3107        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3108     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3109   lexer_test test (case_, content, NULL);
3110
3111   auto_vec <cpp_string> input_strings;
3112   location_t input_locs[4];
3113
3114   /* Verify that we get the expected tokens back.  */
3115   for (int i = 0; i < 4; i++)
3116     {
3117       const cpp_token *tok = test.get_token ();
3118       ASSERT_EQ (tok->type, CPP_STRING);
3119       input_strings.safe_push (tok->val.str);
3120       input_locs[i] = tok->src_loc;
3121     }
3122
3123   /* Verify that cpp_interpret_string works.  */
3124   cpp_string dst_string;
3125   const enum cpp_ttype type = CPP_STRING;
3126   bool result = cpp_interpret_string (test.m_parser,
3127                                       input_strings.address (), 4,
3128                                       &dst_string, type);
3129   ASSERT_TRUE (result);
3130   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3131   free (const_cast <unsigned char *> (dst_string.text));
3132
3133   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3134   test.m_concats.record_string_concatenation (4, input_locs);
3135
3136   location_t initial_loc = input_locs[0];
3137
3138   for (int i = 0; i <= 4; i++)
3139     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3140   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3141   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3142   for (int i = 7; i <= 9; i++)
3143     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3144
3145   /* NUL-terminator should use the location of the final closing quote.  */
3146   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3147
3148   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3149 }
3150
3151 /* Test of substring locations for a string literal inside a macro.  */
3152
3153 static void
3154 test_lexer_string_locations_macro (const line_table_case &case_)
3155 {
3156   /* A macro whose body is the digits 0-9, then a use of the macro.
3157      .....................0000000001111111111.22222222223.
3158      .....................1234567890123456789.01234567890.  */
3159   const char *snippet = ("#define MACRO     \"0123456789\" /* non-str */\n"
3160                          "  MACRO");
3161   lexer_test lexer (case_, snippet, NULL);
3162
3163   /* The first token handed back is padding, not the string.  */
3164   const cpp_token *token = lexer.get_token ();
3165   ASSERT_EQ (token->type, CPP_PADDING);
3166
3167   token = lexer.get_token ();
3168   ASSERT_EQ (token->type, CPP_STRING);
3169   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, token, "\"0123456789\"");
3170
3171   /* Each of the 11 ranges (ten digits and the terminator) is expected at
3172      a column of line 1, i.e. within the macro definition.  */
3173   const location_t strloc = token->src_loc;
3174   for (int idx = 0; idx < 11; ++idx)
3175     ASSERT_CHAR_AT_RANGE (lexer, strloc, CPP_STRING,
3176                           idx, 1, 20 + idx, 20 + idx);
3177   ASSERT_NUM_SUBSTRING_RANGES (lexer, strloc, CPP_STRING, 11);
3178
3179   token = lexer.get_token ();
3180   ASSERT_EQ (token->type, CPP_PADDING);
3181 }
3182
3183 /* Test of locations for a string made by stringifying a macro argument.  */
3184
3185 static void
3186 test_lexer_string_locations_stringified_macro_argument
3187   (const line_table_case &case_)
3188 {
3189   /* .....................000000000111111111122222222223.
3190      .....................123456789012345678901234567890.  */
3191   const char *snippet = ("#define MACRO(X) #X /* non-str */\n"
3192                          "MACRO(foo)\n");
3193   lexer_test lexer (case_, snippet, NULL);
3194
3195   /* Expect padding, the stringified argument, then two more paddings.  */
3196   const cpp_token *leading_pad = lexer.get_token ();
3197   ASSERT_EQ (leading_pad->type, CPP_PADDING);
3198
3199   const cpp_token *str_tok = lexer.get_token ();
3200   ASSERT_EQ (str_tok->type, CPP_STRING);
3201   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, str_tok, "\"foo\"");
3202
3203   /* Getting locations within a stringified macro argument is not
3204      supported, so check that the lookup fails gracefully.  */
3205   ASSERT_HAS_NO_SUBSTRING_RANGES (lexer, str_tok->src_loc, CPP_STRING,
3206                                   "cpp_interpret_string_1 failed");
3207
3208   for (int n = 0; n < 2; ++n)
3209     {
3210       const cpp_token *trailing_pad = lexer.get_token ();
3211       ASSERT_EQ (trailing_pad->type, CPP_PADDING);
3212     }
3213 }
3214
3215 /* Ensure that we fail gracefully when handed a location that does not
3216    belong to a string literal token.  This was seen on the following code:
3217
3218      const char a[] = " %d ";
3219      __builtin_printf (a, 0.5);
3220                        ^
3221
3222    where c-format.c erroneously took the indicated one-character
3223    location to be that of the format string, causing a read past the
3224    end of a string buffer in cpp_interpret_string_1.  */
3225
3226 static void
3227 test_lexer_string_locations_non_string (const line_table_case &case_)
3228 {
3229   /* .....................000000000111111111122222222223.
3230      .....................123456789012345678901234567890.  */
3231   const char *snippet = ("         a\n");
3232   lexer_test lexer (case_, snippet, NULL);
3233
3234   /* The token of interest is the identifier "a".  */
3235   const cpp_token *name_tok = lexer.get_token ();
3236   ASSERT_EQ (name_tok->type, CPP_NAME);
3237   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, name_tok, "a");
3238
3239   /* Asking for substring ranges makes libcpp try to interpret the name
3240      as a string literal even though it has no opening quote.  That is
3241      not detected as such, but the lookup must still fail gracefully.  */
3242   ASSERT_HAS_NO_SUBSTRING_RANGES (lexer, name_tok->src_loc, CPP_STRING,
3243                                   "cpp_interpret_string_1 failed");
3244 }
3245
3246 /* Ensure that substring information can be read for a token that
3247    starts in one linemap and ends in another.  Adapted from
3248    gcc.dg/cpp/pr69985.c.  */
3249
3250 static void
3251 test_lexer_string_locations_long_line (const line_table_case &case_)
3252 {
3253   /* .....................000000.000111111111
3254      .....................123456.789012345678.  */
3255   const char *snippet = ("/* A very long line, so that we start a new line map.  */\n"
3256                          "     \"0123456789012345678901234567890123456789"
3257                          "0123456789012345678901234567890123456789"
3258                          "0123456789012345678901234567890123456789"
3259                          "0123456789\"\n");
3260
3261   lexer_test lexer (case_, snippet, NULL);
3262
3263   /* The first token to come back should be the long string.  */
3264   const cpp_token *str_tok = lexer.get_token ();
3265   ASSERT_EQ (str_tok->type, CPP_STRING);
3266
3267   /* With column data: 130 digits plus the terminator, all on line 2.  */
3268   if (should_have_column_data_p (line_table->highest_location))
3269     {
3270       ASSERT_NUM_SUBSTRING_RANGES (lexer, str_tok->src_loc, CPP_STRING, 131);
3271       for (int idx = 0; idx < 131; ++idx)
3272         ASSERT_CHAR_AT_RANGE (lexer, str_tok->src_loc, CPP_STRING,
3273                               idx, 2, 7 + idx, 7 + idx);
3274     }
3275 }
3276
3277 /* Test of locations within a raw string that fits on a single line.  */
3278
3279 static void
3280 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3281 {
3282   /* .....................00.0000000111111111122.
3283      .....................12.3456789012345678901.  */
3284   const char *snippet = ("R\"foo(0123456789)foo\"\n");
3285   lexer_test lexer (case_, snippet, NULL);
3286
3287   /* The raw string should come back as a CPP_STRING token.  */
3288   const cpp_token *str_tok = lexer.get_token ();
3289   ASSERT_EQ (str_tok->type, CPP_STRING);
3290
3291   /* Interpreting the token should yield just the ten digits.  */
3292   const enum cpp_ttype str_type = CPP_STRING;
3293   cpp_string decoded;
3294   const bool ok = cpp_interpret_string (lexer.m_parser, &str_tok->val.str, 1,
3295                                         &decoded, str_type);
3296   ASSERT_TRUE (ok);
3297   ASSERT_STREQ ("0123456789", reinterpret_cast <const char *> (decoded.text));
3298   free (const_cast <unsigned char *> (decoded.text));
3299
3300   /* With column data: 0-9, plus the NUL terminator.  */
3301   if (should_have_column_data_p (line_table->highest_location))
3302     {
3303       ASSERT_NUM_SUBSTRING_RANGES (lexer, str_tok->src_loc, str_type, 11);
3304       for (int idx = 0; idx < 11; ++idx)
3305         ASSERT_CHAR_AT_RANGE (lexer, str_tok->src_loc, str_type,
3306                               idx, 1, 7 + idx, 7 + idx);
3307     }
3308 }
3309
3310 /* Test of locations within a raw string that spans several lines.  */
3311
3312 static void
3313 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3314 {
3315   /* .....................00.0000.
3316      .....................12.3456.  */
3317   const char *snippet = ("R\"foo(\n"
3318   /* .....................00000.
3319      .....................12345.  */
3320                          "hello\n"
3321                          "world\n"
3322   /* .....................00000.
3323      .....................12345.  */
3324                          ")foo\"\n");
3325   lexer_test lexer (case_, snippet, NULL);
3326
3327   /* The whole multi-line literal should come back as one CPP_STRING.  */
3328   const cpp_token *str_tok = lexer.get_token ();
3329   ASSERT_EQ (str_tok->type, CPP_STRING);
3330
3331   /* Interpreting the token should keep the embedded newlines.  */
3332   const enum cpp_ttype str_type = CPP_STRING;
3333   cpp_string decoded;
3334   const bool ok = cpp_interpret_string (lexer.m_parser, &str_tok->val.str, 1,
3335                                         &decoded, str_type);
3336   ASSERT_TRUE (ok);
3337   ASSERT_STREQ ("\nhello\nworld\n",
3338                 reinterpret_cast <const char *> (decoded.text));
3339   free (const_cast <unsigned char *> (decoded.text));
3340
3341   /* Raw strings containing newlines have no substring locations yet.  */
3342   if (should_have_column_data_p (line_table->highest_location))
3343     {
3344       ASSERT_HAS_NO_SUBSTRING_RANGES (lexer, str_tok->src_loc, str_tok->type,
3345                                       "range endpoints are on different lines");
3346     }
3347 }
3348
3349 /* Test of lexing an unterminated raw string.  */
3350
3351 static void
3352 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3353 {
3354   const char *snippet = "R\"ouch()ouCh\" /* etc */";
3355
3356   /* SINK is handed to the lexer_test so that errors can be counted.  */
3357   lexer_error_sink sink;
3358   lexer_test lexer (case_, snippet, &sink);
3359   lexer.m_implicitly_expect_EOF = false;
3360
3361   /* Attempting to lex the raw string should just give EOF...  */
3362   const cpp_token *eof_tok = lexer.get_token ();
3363   ASSERT_EQ (eof_tok->type, CPP_EOF);
3364   /* ...with exactly one error recorded along the way.  */
3365   ASSERT_EQ (1, sink.m_errors.length ());
3366   /* The message should be "unterminated raw string" from the "cpplib"
3367      translation domain.  It's not clear that dgettext is available on
3368      all supported hosts, so this assertion is commented-out for now:
3369        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3370                      sink.m_errors[0]);
3371   */
3372 }
3373
3374 /* Test of lexing character constants, with and without prefixes.  */
3375
3376 static void
3377 test_lexer_char_constants (const line_table_case &case_)
3378 {
3379   /* One char constant per line.
3380      .....................0000000001111111111.22222222223.
3381      .....................1234567890123456789.01234567890.  */
3382   const char *snippet = ("         'a'\n"
3383                          "        u'a'\n"
3384                          "        U'a'\n"
3385                          "        L'a'\n"
3386                          "         'abc'\n");
3387   lexer_test lexer (case_, snippet, NULL);
3388
3389   /* Line 1: the plain constant 'a', which is also interpreted to
3390      check its value and the number of characters seen.  */
3391   const cpp_token *plain_tok = lexer.get_token ();
3392   ASSERT_EQ (plain_tok->type, CPP_CHAR);
3393   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, plain_tok, "'a'");
3394
3395   int unsignedp;
3396   unsigned int chars_seen;
3397   const cppchar_t charval = cpp_interpret_charconst (lexer.m_parser, plain_tok,
3398                                                      &chars_seen, &unsignedp);
3399   ASSERT_EQ (charval, 'a');
3400   ASSERT_EQ (chars_seen, 1);
3401
3402   /* Line 2: u'a'.  */
3403   const cpp_token *char16_tok = lexer.get_token ();
3404   ASSERT_EQ (char16_tok->type, CPP_CHAR16);
3405   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, char16_tok, "u'a'");
3406
3407   /* Line 3: U'a'.  */
3408   const cpp_token *char32_tok = lexer.get_token ();
3409   ASSERT_EQ (char32_tok->type, CPP_CHAR32);
3410   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, char32_tok, "U'a'");
3411
3412   /* Line 4: L'a'.  */
3413   const cpp_token *wide_tok = lexer.get_token ();
3414   ASSERT_EQ (wide_tok->type, CPP_WCHAR);
3415   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, wide_tok, "L'a'");
3416
3417   /* Line 5: 'abc' (c-char-sequence).  */
3418   const cpp_token *multi_tok = lexer.get_token ();
3419   ASSERT_EQ (multi_tok->type, CPP_CHAR);
3420   ASSERT_TOKEN_AS_TEXT_EQ (lexer.m_parser, multi_tok, "'abc'");
3421 }
3422 /* A table of interesting location_t values, giving one axis of our test
3423    matrix.  There are 12 entries: 2 general ones and 5 around each limit.  */
3424
3425 static const location_t boundary_locations[] = {
3426   /* Zero means "don't override the default values for a new line_table".  */
3427   0,
3428
3429   /* An arbitrary non-zero value that isn't close to either of
3430      the two limits probed below.  */
3431   0x10000,
3432
3433   /* Values at and around LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3434   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3435   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3436   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3437   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3438   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3439
3440   /* Values at and around LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3441   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3442   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3443   LINE_MAP_MAX_LOCATION_WITH_COLS,
3444   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3445   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3446 };
3447
3448 /* Invoke TESTCASE once for every case in our test matrix.  */
3449
3450 void
3451 for_each_line_table_case (void (*testcase) (const line_table_case &))
3452 {
3453   /* As noted above in the description of struct line_table_case,
3454      the test matrix is made of interesting line_table situations;
3455      TESTCASE is run against each case within that matrix.  */
3456
3457   /* First axis: line_table->default_range_bits is either 0 or 5.  */
3458   static const int range_bits_choices[] = { 0, 5 };
3459   const int num_range_bits_choices
3460     = sizeof (range_bits_choices) / sizeof (range_bits_choices[0]);
3461
3462   /* Second axis: each of the "interesting" location values, used as
3463      the starting location within line_table.  */
3464   const int num_boundary_locations
3465     = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3466
3467   int cases_run = 0;
3468   for (int bits_idx = 0; bits_idx < num_range_bits_choices; ++bits_idx)
3469     for (int loc_idx = 0; loc_idx < num_boundary_locations; ++loc_idx)
3470       {
3471         line_table_case c (range_bits_choices[bits_idx],
3472                            boundary_locations[loc_idx]);
3473
3474         testcase (c);
3475         cases_run += 1;
3476       }
3477
3478   /* Verify that the whole matrix was covered: the 2 settings of
3479      default_range_bits times the 12 entries of boundary_locations.  */
3480   ASSERT_EQ (cases_run, 2 * 12);
3481 }
3482
3483 /* Run all of the selftests within this file, one after another.  */
3484
3485 void
3486 input_c_tests ()
3487 {
3488   test_should_have_column_data_p ();
3489   test_unknown_location ();
3490   test_builtins ();
3491   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3492   /* Linemap and lexer tests, likewise run for every line_table case.  */
3493   for_each_line_table_case (test_accessing_ordinary_linemaps);
3494   for_each_line_table_case (test_lexer);
3495   for_each_line_table_case (test_lexer_string_locations_simple);
3496   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3497   for_each_line_table_case (test_lexer_string_locations_hex);
3498   for_each_line_table_case (test_lexer_string_locations_oct);
3499   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3500   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3501   for_each_line_table_case (test_lexer_string_locations_ucn4);
3502   for_each_line_table_case (test_lexer_string_locations_ucn8);
3503   for_each_line_table_case (test_lexer_string_locations_wide_string);
3504   for_each_line_table_case (test_lexer_string_locations_string16);
3505   for_each_line_table_case (test_lexer_string_locations_string32);
3506   for_each_line_table_case (test_lexer_string_locations_u8);
3507   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3508   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3509   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3510   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3511   for_each_line_table_case (test_lexer_string_locations_macro);
3512   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3513   for_each_line_table_case (test_lexer_string_locations_non_string);
3514   for_each_line_table_case (test_lexer_string_locations_long_line);
3515   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3516   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3517   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3518   for_each_line_table_case (test_lexer_char_constants);
3519
3520   test_reading_source_line ();
3521 }
3522
3523 } // namespace selftest
3524
3525 #endif /* CHECKING_P */