gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2017 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   /* The file_path is the key for identifying a particular file in
  67      the cache.
  68      For libcpp-using code, the underlying buffer for this field is
  69      owned by the corresponding _cpp_file within the cpp_reader.  */
  70   const char *file_path;
  71
  72   FILE *fp;
  73
  74   /* This points to the content of the file that we've read so
  75      far.  */
  76   char *data;
  77
  78   /*  The size of the DATA array above.*/
  79   size_t size;
  80
  81   /* The number of bytes read from the underlying file so far.  This
  82      must be less (or equal) than SIZE above.  */
  83   size_t nb_read;
  84
  85   /* The index of the beginning of the current line.  */
  86   size_t line_start_idx;
  87
  88   /* The number of the previous line read.  This starts at 1.  Zero
  89      means we've read no line so far.  */
  90   size_t line_num;
  91
  92   /* This is the total number of lines of the current file.  At the
  93      moment, we try to get this information from the line map
  94      subsystem.  Note that this is just a hint.  When using the C++
  95      front-end, this hint is correct because the input file is then
  96      completely tokenized before parsing starts; so the line map knows
  97      the number of lines before compilation really starts.  For e.g,
  98      the C front-end, it can happen that we start emitting diagnostics
  99      before the line map has seen the end of the file.  */
 100   size_t total_lines;
 101
 102   /* Could this file be missing a trailing newline on its final line?
 103      Initially true (to cope with empty files), set to true/false
 104      as each line is read.  */
 105   bool missing_trailing_newline;
 106
 107   /* This is a record of the beginning and end of the lines we've seen
 108      while reading the file.  This is useful to avoid walking the data
 109      from the beginning when we are asked to read a line that is
 110      before LINE_START_IDX above.  Note that the maximum size of this
 111      record is fcache_line_record_size, so that the memory consumption
 112      doesn't explode.  We thus scale total_lines down to
 113      fcache_line_record_size.  */
 114   vec<line_info, va_heap> line_record;
 115
 116   fcache ();
 117   ~fcache ();
 118 };
 119
 120 /* Current position in real source file.  */
 121
 122 location_t input_location = UNKNOWN_LOCATION;
 123
 124 struct line_maps *line_table;
 125
 126 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 127    This needs to be a global so that it can be a GC root, and thus
 128    prevent the stashed copy from being garbage-collected if the GC runs
 129    during a line_table_test.  */
 130
 131 struct line_maps *saved_line_table;
 132
 133 static fcache *fcache_tab;
 134 static const size_t fcache_tab_size = 16;
 135 static const size_t fcache_buffer_size = 4 * 1024;
 136 static const size_t fcache_line_record_size = 100;
 137
 138 /* Expand the source location LOC into a human readable location.  If
 139    LOC resolves to a builtin location, the file name of the readable
 140    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 141    TRUE and LOC is virtual, then it is resolved to the expansion
 142    point of the involved macro.  Otherwise, it is resolved to the
 143    spelling location of the token.
 144
 145    When resolving to the spelling location of the token, if the
 146    resulting location is for a built-in location (that is, it has no
 147    associated line/column) in the context of a macro expansion, the
 148    returned location is the first one (while unwinding the macro
 149    location towards its expansion point) that is in real source
 150    code.  */
 151
 152 static expanded_location
 153 expand_location_1 (source_location loc,
 154                    bool expansion_point_p)
 155 {
 156   expanded_location xloc;
 157   const line_map_ordinary *map;
 158   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 159   tree block = NULL;
 160
 161   if (IS_ADHOC_LOC (loc))
 162     {
 163       block = LOCATION_BLOCK (loc);
 164       loc = LOCATION_LOCUS (loc);
 165     }
 166
 167   memset (&xloc, 0, sizeof (xloc));
 168
 169   if (loc >= RESERVED_LOCATION_COUNT)
 170     {
 171       if (!expansion_point_p)
 172         {
 173           /* We want to resolve LOC to its spelling location.
 174
 175              But if that spelling location is a reserved location that
 176              appears in the context of a macro expansion (like for a
 177              location for a built-in token), let's consider the first
 178              location (toward the expansion point) that is not reserved;
 179              that is, the first location that is in real source code.  */
 180           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 181                                                           loc, NULL);
 182           lrk = LRK_SPELLING_LOCATION;
 183         }
 184       loc = linemap_resolve_location (line_table, loc,
 185                                       lrk, &map);
 186       xloc = linemap_expand_location (line_table, map, loc);
 187     }
 188
 189   xloc.data = block;
 190   if (loc <= BUILTINS_LOCATION)
 191     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 192
 193   return xloc;
 194 }
 195
 196 /* Initialize the set of cache used for files accessed by caret
 197    diagnostic.  */
 198
 199 static void
 200 diagnostic_file_cache_init (void)
 201 {
 202   if (fcache_tab == NULL)
 203     fcache_tab = new fcache[fcache_tab_size];
 204 }
 205
 206 /* Free the resources used by the set of cache used for files accessed
 207    by caret diagnostic.  */
 208
 209 void
 210 diagnostic_file_cache_fini (void)
 211 {
 212   if (fcache_tab)
 213     {
 214       delete [] (fcache_tab);
 215       fcache_tab = NULL;
 216     }
 217 }
 218
 219 /* Return the total lines number that have been read so far by the
 220    line map (in the preprocessor) so far.  For languages like C++ that
 221    entirely preprocess the input file before starting to parse, this
 222    equals the actual number of lines of the file.  */
 223
 224 static size_t
 225 total_lines_num (const char *file_path)
 226 {
 227   size_t r = 0;
 228   source_location l = 0;
 229   if (linemap_get_file_highest_location (line_table, file_path, &l))
 230     {
 231       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 232       expanded_location xloc = expand_location (l);
 233       r = xloc.line;
 234     }
 235   return r;
 236 }
 237
 238 /* Lookup the cache used for the content of a given file accessed by
 239    caret diagnostic.  Return the found cached file, or NULL if no
 240    cached file was found.  */
 241
 242 static fcache*
 243 lookup_file_in_cache_tab (const char *file_path)
 244 {
 245   if (file_path == NULL)
 246     return NULL;
 247
 248   diagnostic_file_cache_init ();
 249
 250   /* This will contain the found cached file.  */
 251   fcache *r = NULL;
 252   for (unsigned i = 0; i < fcache_tab_size; ++i)
 253     {
 254       fcache *c = &fcache_tab[i];
 255       if (c->file_path && !strcmp (c->file_path, file_path))
 256         {
 257           ++c->use_count;
 258           r = c;
 259         }
 260     }
 261
 262   if (r)
 263     ++r->use_count;
 264
 265   return r;
 266 }
 267
 268 /* Purge any mention of FILENAME from the cache of files used for
 269    printing source code.  For use in selftests when working
 270    with tempfiles.  */
 271
 272 void
 273 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 274 {
 275   gcc_assert (file_path);
 276
 277   fcache *r = lookup_file_in_cache_tab (file_path);
 278   if (!r)
 279     /* Not found.  */
 280     return;
 281
 282   r->file_path = NULL;
 283   if (r->fp)
 284     fclose (r->fp);
 285   r->fp = NULL;
 286   r->nb_read = 0;
 287   r->line_start_idx = 0;
 288   r->line_num = 0;
 289   r->line_record.truncate (0);
 290   r->use_count = 0;
 291   r->total_lines = 0;
 292   r->missing_trailing_newline = true;
 293 }
 294
 295 /* Return the file cache that has been less used, recently, or the
 296    first empty one.  If HIGHEST_USE_COUNT is non-null,
 297    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 298    in the cache table.  */
 299
 300 static fcache*
 301 evicted_cache_tab_entry (unsigned *highest_use_count)
 302 {
 303   diagnostic_file_cache_init ();
 304
 305   fcache *to_evict = &fcache_tab[0];
 306   unsigned huc = to_evict->use_count;
 307   for (unsigned i = 1; i < fcache_tab_size; ++i)
 308     {
 309       fcache *c = &fcache_tab[i];
 310       bool c_is_empty = (c->file_path == NULL);
 311
 312       if (c->use_count < to_evict->use_count
 313           || (to_evict->file_path && c_is_empty))
 314         /* We evict C because it's either an entry with a lower use
 315            count or one that is empty.  */
 316         to_evict = c;
 317
 318       if (huc < c->use_count)
 319         huc = c->use_count;
 320
 321       if (c_is_empty)
 322         /* We've reached the end of the cache; subsequent elements are
 323            all empty.  */
 324         break;
 325     }
 326
 327   if (highest_use_count)
 328     *highest_use_count = huc;
 329
 330   return to_evict;
 331 }
 332
 333 /* Create the cache used for the content of a given file to be
 334    accessed by caret diagnostic.  This cache is added to an array of
 335    cache and can be retrieved by lookup_file_in_cache_tab.  This
 336    function returns the created cache.  Note that only the last
 337    fcache_tab_size files are cached.  */
 338
 339 static fcache*
 340 add_file_to_cache_tab (const char *file_path)
 341 {
 342
 343   FILE *fp = fopen (file_path, "r");
 344   if (fp == NULL)
 345     return NULL;
 346
 347   unsigned highest_use_count = 0;
 348   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 349   r->file_path = file_path;
 350   if (r->fp)
 351     fclose (r->fp);
 352   r->fp = fp;
 353   r->nb_read = 0;
 354   r->line_start_idx = 0;
 355   r->line_num = 0;
 356   r->line_record.truncate (0);
 357   /* Ensure that this cache entry doesn't get evicted next time
 358      add_file_to_cache_tab is called.  */
 359   r->use_count = ++highest_use_count;
 360   r->total_lines = total_lines_num (file_path);
 361   r->missing_trailing_newline = true;
 362
 363   return r;
 364 }
 365
 366 /* Lookup the cache used for the content of a given file accessed by
 367    caret diagnostic.  If no cached file was found, create a new cache
 368    for this file, add it to the array of cached file and return
 369    it.  */
 370
 371 static fcache*
 372 lookup_or_add_file_to_cache_tab (const char *file_path)
 373 {
 374   fcache *r = lookup_file_in_cache_tab (file_path);
 375   if (r == NULL)
 376     r = add_file_to_cache_tab (file_path);
 377   return r;
 378 }
 379
 380 /* Default constructor for a cache of file used by caret
 381    diagnostic.  */
 382
 383 fcache::fcache ()
 384 : use_count (0), file_path (NULL), fp (NULL), data (0),
 385   size (0), nb_read (0), line_start_idx (0), line_num (0),
 386   total_lines (0), missing_trailing_newline (true)
 387 {
 388   line_record.create (0);
 389 }
 390
 391 /* Destructor for a cache of file used by caret diagnostic.  */
 392
 393 fcache::~fcache ()
 394 {
 395   if (fp)
 396     {
 397       fclose (fp);
 398       fp = NULL;
 399     }
 400   if (data)
 401     {
 402       XDELETEVEC (data);
 403       data = 0;
 404     }
 405   line_record.release ();
 406 }
 407
 408 /* Returns TRUE iff the cache would need to be filled with data coming
 409    from the file.  That is, either the cache is empty or full or the
 410    current line is empty.  Note that if the cache is full, it would
 411    need to be extended and filled again.  */
 412
 413 static bool
 414 needs_read (fcache *c)
 415 {
 416   return (c->nb_read == 0
 417           || c->nb_read == c->size
 418           || (c->line_start_idx >= c->nb_read - 1));
 419 }
 420
 421 /*  Return TRUE iff the cache is full and thus needs to be
 422     extended.  */
 423
 424 static bool
 425 needs_grow (fcache *c)
 426 {
 427   return c->nb_read == c->size;
 428 }
 429
 430 /* Grow the cache if it needs to be extended.  */
 431
 432 static void
 433 maybe_grow (fcache *c)
 434 {
 435   if (!needs_grow (c))
 436     return;
 437
 438   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 439   c->data = XRESIZEVEC (char, c->data, size);
 440   c->size = size;
 441 }
 442
 443 /*  Read more data into the cache.  Extends the cache if need be.
 444     Returns TRUE iff new data could be read.  */
 445
 446 static bool
 447 read_data (fcache *c)
 448 {
 449   if (feof (c->fp) || ferror (c->fp))
 450     return false;
 451
 452   maybe_grow (c);
 453
 454   char * from = c->data + c->nb_read;
 455   size_t to_read = c->size - c->nb_read;
 456   size_t nb_read = fread (from, 1, to_read, c->fp);
 457
 458   if (ferror (c->fp))
 459     return false;
 460
 461   c->nb_read += nb_read;
 462   return !!nb_read;
 463 }
 464
 465 /* Read new data iff the cache needs to be filled with more data
 466    coming from the file FP.  Return TRUE iff the cache was filled with
 467    mode data.  */
 468
 469 static bool
 470 maybe_read_data (fcache *c)
 471 {
 472   if (!needs_read (c))
 473     return false;
 474   return read_data (c);
 475 }
 476
 477 /* Read a new line from file FP, using C as a cache for the data
 478    coming from the file.  Upon successful completion, *LINE is set to
 479    the beginning of the line found.  *LINE points directly in the
 480    line cache and is only valid until the next call of get_next_line.
 481    *LINE_LEN is set to the length of the line.  Note that the line
 482    does not contain any terminal delimiter.  This function returns
 483    true if some data was read or process from the cache, false
 484    otherwise.  Note that subsequent calls to get_next_line might
 485    make the content of *LINE invalid.  */
 486
 487 static bool
 488 get_next_line (fcache *c, char **line, ssize_t *line_len)
 489 {
 490   /* Fill the cache with data to process.  */
 491   maybe_read_data (c);
 492
 493   size_t remaining_size = c->nb_read - c->line_start_idx;
 494   if (remaining_size == 0)
 495     /* There is no more data to process.  */
 496     return false;
 497
 498   char *line_start = c->data + c->line_start_idx;
 499
 500   char *next_line_start = NULL;
 501   size_t len = 0;
 502   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 503   if (line_end == NULL)
 504     {
 505       /* We haven't found the end-of-line delimiter in the cache.
 506          Fill the cache with more data from the file and look for the
 507          '\n'.  */
 508       while (maybe_read_data (c))
 509         {
 510           line_start = c->data + c->line_start_idx;
 511           remaining_size = c->nb_read - c->line_start_idx;
 512           line_end = (char *) memchr (line_start, '\n', remaining_size);
 513           if (line_end != NULL)
 514             {
 515               next_line_start = line_end + 1;
 516               break;
 517             }
 518         }
 519       if (line_end == NULL)
 520         {
 521           /* We've loadded all the file into the cache and still no
 522              '\n'.  Let's say the line ends up at one byte passed the
 523              end of the file.  This is to stay consistent with the case
 524              of when the line ends up with a '\n' and line_end points to
 525              that terminal '\n'.  That consistency is useful below in
 526              the len calculation.  */
 527           line_end = c->data + c->nb_read ;
 528           c->missing_trailing_newline = true;
 529         }
 530       else
 531         c->missing_trailing_newline = false;
 532     }
 533   else
 534     {
 535       next_line_start = line_end + 1;
 536       c->missing_trailing_newline = false;
 537     }
 538
 539   if (ferror (c->fp))
 540     return false;
 541
 542   /* At this point, we've found the end of the of line.  It either
 543      points to the '\n' or to one byte after the last byte of the
 544      file.  */
 545   gcc_assert (line_end != NULL);
 546
 547   len = line_end - line_start;
 548
 549   if (c->line_start_idx < c->nb_read)
 550     *line = line_start;
 551
 552   ++c->line_num;
 553
 554   /* Before we update our line record, make sure the hint about the
 555      total number of lines of the file is correct.  If it's not, then
 556      we give up recording line boundaries from now on.  */
 557   bool update_line_record = true;
 558   if (c->line_num > c->total_lines)
 559     update_line_record = false;
 560
 561     /* Now update our line record so that re-reading lines from the
 562      before c->line_start_idx is faster.  */
 563   if (update_line_record
 564       && c->line_record.length () < fcache_line_record_size)
 565     {
 566       /* If the file lines fits in the line record, we just record all
 567          its lines ...*/
 568       if (c->total_lines <= fcache_line_record_size
 569           && c->line_num > c->line_record.length ())
 570         c->line_record.safe_push (fcache::line_info (c->line_num,
 571                                                  c->line_start_idx,
 572                                                  line_end - c->data));
 573       else if (c->total_lines > fcache_line_record_size)
 574         {
 575           /* ... otherwise, we just scale total_lines down to
 576              (fcache_line_record_size lines.  */
 577           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 578           if (c->line_record.length () == 0
 579               || n >= c->line_record.length ())
 580             c->line_record.safe_push (fcache::line_info (c->line_num,
 581                                                      c->line_start_idx,
 582                                                      line_end - c->data));
 583         }
 584     }
 585
 586   /* Update c->line_start_idx so that it points to the next line to be
 587      read.  */
 588   if (next_line_start)
 589     c->line_start_idx = next_line_start - c->data;
 590   else
 591     /* We didn't find any terminal '\n'.  Let's consider that the end
 592        of line is the end of the data in the cache.  The next
 593        invocation of get_next_line will either read more data from the
 594        underlying file or return false early because we've reached the
 595        end of the file.  */
 596     c->line_start_idx = c->nb_read;
 597
 598   *line_len = len;
 599
 600   return true;
 601 }
 602
 603 /* Consume the next bytes coming from the cache (or from its
 604    underlying file if there are remaining unread bytes in the file)
 605    until we reach the next end-of-line (or end-of-file).  There is no
 606    copying from the cache involved.  Return TRUE upon successful
 607    completion.  */
 608
 609 static bool
 610 goto_next_line (fcache *cache)
 611 {
 612   char *l;
 613   ssize_t len;
 614
 615   return get_next_line (cache, &l, &len);
 616 }
 617
 618 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 619    If the line was read successfully, *LINE points to the beginning
 620    of the line in the file cache and *LINE_LEN is the length of the
 621    line.  *LINE is not nul-terminated, but may contain zero bytes.
 622    *LINE is only valid until the next call of read_line_num.
 623    This function returns bool if a line was read.  */
 624
 625 static bool
 626 read_line_num (fcache *c, size_t line_num,
 627                char **line, ssize_t *line_len)
 628 {
 629   gcc_assert (line_num > 0);
 630
 631   if (line_num <= c->line_num)
 632     {
 633       /* We've been asked to read lines that are before c->line_num.
 634          So lets use our line record (if it's not empty) to try to
 635          avoid re-reading the file from the beginning again.  */
 636
 637       if (c->line_record.is_empty ())
 638         {
 639           c->line_start_idx = 0;
 640           c->line_num = 0;
 641         }
 642       else
 643         {
 644           fcache::line_info *i = NULL;
 645           if (c->total_lines <= fcache_line_record_size)
 646             {
 647               /* In languages where the input file is not totally
 648                  preprocessed up front, the c->total_lines hint
 649                  can be smaller than the number of lines of the
 650                  file.  In that case, only the first
 651                  c->total_lines have been recorded.
 652
 653                  Otherwise, the first c->total_lines we've read have
 654                  their start/end recorded here.  */
 655               i = (line_num <= c->total_lines)
 656                 ? &c->line_record[line_num - 1]
 657                 : &c->line_record[c->total_lines - 1];
 658               gcc_assert (i->line_num <= line_num);
 659             }
 660           else
 661             {
 662               /*  So the file had more lines than our line record
 663                   size.  Thus the number of lines we've recorded has
 664                   been scaled down to fcache_line_reacord_size.  Let's
 665                   pick the start/end of the recorded line that is
 666                   closest to line_num.  */
 667               size_t n = (line_num <= c->total_lines)
 668                 ? line_num * fcache_line_record_size / c->total_lines
 669                 : c ->line_record.length () - 1;
 670               if (n < c->line_record.length ())
 671                 {
 672                   i = &c->line_record[n];
 673                   gcc_assert (i->line_num <= line_num);
 674                 }
 675             }
 676
 677           if (i && i->line_num == line_num)
 678             {
 679               /* We have the start/end of the line.  */
 680               *line = c->data + i->start_pos;
 681               *line_len = i->end_pos - i->start_pos;
 682               return true;
 683             }
 684
 685           if (i)
 686             {
 687               c->line_start_idx = i->start_pos;
 688               c->line_num = i->line_num - 1;
 689             }
 690           else
 691             {
 692               c->line_start_idx = 0;
 693               c->line_num = 0;
 694             }
 695         }
 696     }
 697
 698   /*  Let's walk from line c->line_num up to line_num - 1, without
 699       copying any line.  */
 700   while (c->line_num < line_num - 1)
 701     if (!goto_next_line (c))
 702       return false;
 703
 704   /* The line we want is the next one.  Let's read and copy it back to
 705      the caller.  */
 706   return get_next_line (c, line, line_len);
 707 }
 708
 709 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 710    The line is not nul-terminated.  The returned pointer is only
 711    valid until the next call of location_get_source_line.
 712    Note that the line can contain several null characters,
 713    so LINE_LEN, if non-null, points to the actual length of the line.
 714    If the function fails, NULL is returned.  */
 715
 716 const char *
 717 location_get_source_line (const char *file_path, int line,
 718                           int *line_len)
 719 {
 720   char *buffer = NULL;
 721   ssize_t len;
 722
 723   if (line == 0)
 724     return NULL;
 725
 726   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 727   if (c == NULL)
 728     return NULL;
 729
 730   bool read = read_line_num (c, line, &buffer, &len);
 731
 732   if (read && line_len)
 733     *line_len = len;
 734
 735   return read ? buffer : NULL;
 736 }
 737
 738 /* Determine if FILE_PATH missing a trailing newline on its final line.
 739    Only valid to call once all of the file has been loaded, by
 740    requesting a line number beyond the end of the file.  */
 741
 742 bool
 743 location_missing_trailing_newline (const char *file_path)
 744 {
 745   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 746   if (c == NULL)
 747     return false;
 748
 749   return c->missing_trailing_newline;
 750 }
 751
 752 /* Test if the location originates from the spelling location of a
 753    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 754    virtual) location of a built-in token that appears in the expansion
 755    list of a macro.  Please note that this function also works on
 756    tokens that result from built-in tokens.  For instance, the
 757    function would return true if passed a token "4" that is the result
 758    of the expansion of the built-in __LINE__ macro.  */
 759 bool
 760 is_location_from_builtin_token (source_location loc)
 761 {
 762   const line_map_ordinary *map = NULL;
 763   loc = linemap_resolve_location (line_table, loc,
 764                                   LRK_SPELLING_LOCATION, &map);
 765   return loc == BUILTINS_LOCATION;
 766 }
 767
 768 /* Expand the source location LOC into a human readable location.  If
 769    LOC is virtual, it resolves to the expansion point of the involved
 770    macro.  If LOC resolves to a builtin location, the file name of the
 771    readable location is set to the string "<built-in>".  */
 772
 773 expanded_location
 774 expand_location (source_location loc)
 775 {
 776   return expand_location_1 (loc, /*expansion_point_p=*/true);
 777 }
 778
 779 /* Expand the source location LOC into a human readable location.  If
 780    LOC is virtual, it resolves to the expansion location of the
 781    relevant macro.  If LOC resolves to a builtin location, the file
 782    name of the readable location is set to the string
 783    "<built-in>".  */
 784
 785 expanded_location
 786 expand_location_to_spelling_point (source_location loc)
 787 {
 788   return expand_location_1 (loc, /*expansion_point_p=*/false);
 789 }
 790
 791 /* The rich_location class within libcpp requires a way to expand
 792    source_location instances, and relies on the client code
 793    providing a symbol named
 794      linemap_client_expand_location_to_spelling_point
 795    to do this.
 796
 797    This is the implementation for libcommon.a (all host binaries),
 798    which simply calls into expand_location_to_spelling_point.  */
 799
 800 expanded_location
 801 linemap_client_expand_location_to_spelling_point (source_location loc)
 802 {
 803   return expand_location_to_spelling_point (loc);
 804 }
 805
 806
 807 /* If LOCATION is in a system header and if it is a virtual location for
 808    a token coming from the expansion of a macro, unwind it to the
 809    location of the expansion point of the macro.  Otherwise, just return
 810    LOCATION.
 811
 812    This is used for instance when we want to emit diagnostics about a
 813    token that may be located in a macro that is itself defined in a
 814    system header, for example, for the NULL macro.  In such a case, if
 815    LOCATION were passed directly to diagnostic functions such as
 816    warning_at, the diagnostic would be suppressed (unless
 817    -Wsystem-headers).  */
 818
 819 source_location
 820 expansion_point_location_if_in_system_header (source_location location)
 821 {
 822   if (in_system_header_at (location))
 823     location = linemap_resolve_location (line_table, location,
 824                                          LRK_MACRO_EXPANSION_POINT,
 825                                          NULL);
 826   return location;
 827 }
 828
 829 /* If LOCATION is a virtual location for a token coming from the expansion
 830    of a macro, unwind to the location of the expansion point of the macro.  */
 831
 832 source_location
 833 expansion_point_location (source_location location)
 834 {
 835   return linemap_resolve_location (line_table, location,
 836                                    LRK_MACRO_EXPANSION_POINT, NULL);
 837 }
 838
 839 /* Construct a location with caret at CARET, ranging from START to
 840    finish e.g.
 841
 842                  11111111112
 843         12345678901234567890
 844      522
 845      523   return foo + bar;
 846                   ~~~~^~~~~
 847      524
 848
 849    The location's caret is at the "+", line 523 column 15, but starts
 850    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 851    of "bar" at column 19.  */
 852
 853 location_t
 854 make_location (location_t caret, location_t start, location_t finish)
 855 {
 856   location_t pure_loc = get_pure_location (caret);
 857   source_range src_range;
 858   src_range.m_start = get_start (start);
 859   src_range.m_finish = get_finish (finish);
 860   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 861                                                    pure_loc,
 862                                                    src_range,
 863                                                    NULL);
 864   return combined_loc;
 865 }
 866
 #define ONE_K 1024
 #define ONE_M (ONE_K * ONE_K)

 /* Scale a number for display.  The result is:
    - the number divided by 1024 * 1024, if it is >= 10 M (in base 2)
    - the number divided by 1024, if it is >= 10 K (in base 2) but
      lower than 10 M
    - the number itself otherwise.
    The argument is evaluated more than once.  */
 #define SCALE(x) ((unsigned long) ((x) >= 10 * ONE_M \
                                    ? (x) / ONE_M \
                                    : ((x) >= 10 * ONE_K \
                                       ? (x) / ONE_K \
                                       : (x))))

 /* The unit that goes with SCALE for a given number:
    - the character 'M' if the number is >= 10 M (in base 2)
    - the character 'k' if the number is >= 10 K (in base 2) but
      strictly lower than 10 M
    - the character ' ' if the number is strictly lower than 10 K.  */
 #define STAT_LABEL(x) \
   ((x) >= 10 * ONE_M ? 'M' : ((x) >= 10 * ONE_K ? 'k' : ' '))

 /* Expand to two arguments for a printf-like function: SIZE scaled to
    a multiple of 1K or 1M (in base 2), followed by the matching unit
    character (either k, M, or ' ').  */
 #define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 891
 892 /* Dump statistics to stderr about the memory usage of the line_table
 893    set of line maps.  This also displays some statistics about macro
 894    expansion.  */
 895
 896 void
 897 dump_line_table_statistics (void)
 898 {
 899   struct linemap_stats s;
 900   long total_used_map_size,
 901     macro_maps_size,
 902     total_allocated_map_size;
 903
 904   memset (&s, 0, sizeof (s));
 905
 906   linemap_get_statistics (line_table, &s);
 907
 908   macro_maps_size = s.macro_maps_used_size
 909     + s.macro_maps_locations_size;
 910
 911   total_allocated_map_size = s.ordinary_maps_allocated_size
 912     + s.macro_maps_allocated_size
 913     + s.macro_maps_locations_size;
 914
 915   total_used_map_size = s.ordinary_maps_used_size
 916     + s.macro_maps_used_size
 917     + s.macro_maps_locations_size;
 918
 919   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 920            s.num_expanded_macros);
 921   if (s.num_expanded_macros != 0)
 922     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 923              s.num_macro_tokens / s.num_expanded_macros);
 924   fprintf (stderr,
 925            "\nLine Table allocations during the "
 926            "compilation process\n");
 927   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 928            SCALE (s.num_ordinary_maps_used),
 929            STAT_LABEL (s.num_ordinary_maps_used));
 930   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 931            SCALE (s.ordinary_maps_used_size),
 932            STAT_LABEL (s.ordinary_maps_used_size));
 933   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 934            SCALE (s.num_ordinary_maps_allocated),
 935            STAT_LABEL (s.num_ordinary_maps_allocated));
 936   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 937            SCALE (s.ordinary_maps_allocated_size),
 938            STAT_LABEL (s.ordinary_maps_allocated_size));
 939   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 940            SCALE (s.num_macro_maps_used),
 941            STAT_LABEL (s.num_macro_maps_used));
 942   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 943            SCALE (s.macro_maps_used_size),
 944            STAT_LABEL (s.macro_maps_used_size));
 945   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 946            SCALE (s.macro_maps_locations_size),
 947            STAT_LABEL (s.macro_maps_locations_size));
 948   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 949            SCALE (macro_maps_size),
 950            STAT_LABEL (macro_maps_size));
 951   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 952            SCALE (s.duplicated_macro_maps_locations_size),
 953            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 954   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 955            SCALE (total_allocated_map_size),
 956            STAT_LABEL (total_allocated_map_size));
 957   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 958            SCALE (total_used_map_size),
 959            STAT_LABEL (total_used_map_size));
 960   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 961            SCALE (s.adhoc_table_size),
 962            STAT_LABEL (s.adhoc_table_size));
 963   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 964            s.adhoc_table_entries_used);
 965   fprintf (stderr, "optimized_ranges: %i\n",
 966            line_table->num_optimized_ranges);
 967   fprintf (stderr, "unoptimized_ranges: %i\n",
 968            line_table->num_unoptimized_ranges);
 969
 970   fprintf (stderr, "\n");
 971 }
 972
 973 /* Get location one beyond the final location in ordinary map IDX.  */
 974
 975 static source_location
 976 get_end_location (struct line_maps *set, unsigned int idx)
 977 {
 978   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
 979     return set->highest_location;
 980
 981   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
 982   return MAP_START_LOCATION (next_map);
 983 }
 984
 985 /* Helper function for write_digit_row.  */
 986
 987 static void
 988 write_digit (FILE *stream, int digit)
 989 {
 990   fputc ('0' + (digit % 10), stream);
 991 }
 992
 993 /* Helper function for dump_location_info.
 994    Write a row of numbers to STREAM, numbering a source line,
 995    giving the units, tens, hundreds etc of the column number.  */
 996
 997 static void
 998 write_digit_row (FILE *stream, int indent,
 999                  const line_map_ordinary *map,
1000                  source_location loc, int max_col, int divisor)
1001 {
1002   fprintf (stream, "%*c", indent, ' ');
1003   fprintf (stream, "|");
1004   for (int column = 1; column < max_col; column++)
1005     {
1006       source_location column_loc = loc + (column << map->m_range_bits);
1007       write_digit (stream, column_loc / divisor);
1008     }
1009   fprintf (stream, "\n");
1010 }
1011
1012 /* Write a half-closed (START) / half-open (END) interval of
1013    source_location to STREAM.  */
1014
1015 static void
1016 dump_location_range (FILE *stream,
1017                      source_location start, source_location end)
1018 {
1019   fprintf (stream,
1020            "  source_location interval: %u <= loc < %u\n",
1021            start, end);
1022 }
1023
1024 /* Write a labelled description of a half-closed (START) / half-open (END)
1025    interval of source_location to STREAM.  */
1026
1027 static void
1028 dump_labelled_location_range (FILE *stream,
1029                               const char *name,
1030                               source_location start, source_location end)
1031 {
1032   fprintf (stream, "%s\n", name);
1033   dump_location_range (stream, start, end);
1034   fprintf (stream, "\n");
1035 }
1036
1037 /* Write a visualization of the locations in the line_table to STREAM.  */
1038
1039 void
1040 dump_location_info (FILE *stream)
1041 {
1042   /* Visualize the reserved locations.  */
1043   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1044                                 0, RESERVED_LOCATION_COUNT);
1045
1046   /* Visualize the ordinary line_map instances, rendering the sources. */
1047   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1048     {
1049       source_location end_location = get_end_location (line_table, idx);
1050       /* half-closed: doesn't include this one. */
1051
1052       const line_map_ordinary *map
1053         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1054       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1055       dump_location_range (stream,
1056                            MAP_START_LOCATION (map), end_location);
1057       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1058       fprintf (stream, "  starting at line: %i\n",
1059                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1060       fprintf (stream, "  column and range bits: %i\n",
1061                map->m_column_and_range_bits);
1062       fprintf (stream, "  column bits: %i\n",
1063                map->m_column_and_range_bits - map->m_range_bits);
1064       fprintf (stream, "  range bits: %i\n",
1065                map->m_range_bits);
1066
1067       /* Render the span of source lines that this "map" covers.  */
1068       for (source_location loc = MAP_START_LOCATION (map);
1069            loc < end_location;
1070            loc += (1 << map->m_range_bits) )
1071         {
1072           gcc_assert (pure_location_p (line_table, loc) );
1073
1074           expanded_location exploc
1075             = linemap_expand_location (line_table, map, loc);
1076
1077           if (0 == exploc.column)
1078             {
1079               /* Beginning of a new source line: draw the line.  */
1080
1081               int line_size;
1082               const char *line_text = location_get_source_line (exploc.file,
1083                                                                 exploc.line,
1084                                                                 &line_size);
1085               if (!line_text)
1086                 break;
1087               fprintf (stream,
1088                        "%s:%3i|loc:%5i|%.*s\n",
1089                        exploc.file, exploc.line,
1090                        loc,
1091                        line_size, line_text);
1092
1093               /* "loc" is at column 0, which means "the whole line".
1094                  Render the locations *within* the line, by underlining
1095                  it, showing the source_location numeric values
1096                  at each column.  */
1097               int max_col = (1 << map->m_column_and_range_bits) - 1;
1098               if (max_col > line_size)
1099                 max_col = line_size + 1;
1100
1101               int indent = 14 + strlen (exploc.file);
1102
1103               /* Thousands.  */
1104               if (end_location > 999)
1105                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1106
1107               /* Hundreds.  */
1108               if (end_location > 99)
1109                 write_digit_row (stream, indent, map, loc, max_col, 100);
1110
1111               /* Tens.  */
1112               write_digit_row (stream, indent, map, loc, max_col, 10);
1113
1114               /* Units.  */
1115               write_digit_row (stream, indent, map, loc, max_col, 1);
1116             }
1117         }
1118       fprintf (stream, "\n");
1119     }
1120
1121   /* Visualize unallocated values.  */
1122   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1123                                 line_table->highest_location,
1124                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1125
1126   /* Visualize the macro line_map instances, rendering the sources. */
1127   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1128     {
1129       /* Each macro map that is allocated owns source_location values
1130          that are *lower* that the one before them.
1131          Hence it's meaningful to view them either in order of ascending
1132          source locations, or in order of ascending macro map index.  */
1133       const bool ascending_source_locations = true;
1134       unsigned int idx = (ascending_source_locations
1135                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1136                           : i);
1137       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1138       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1139                idx,
1140                linemap_map_get_macro_name (map),
1141                MACRO_MAP_NUM_MACRO_TOKENS (map));
1142       dump_location_range (stream,
1143                            map->start_location,
1144                            (map->start_location
1145                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1146       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1147               "expansion point is location %i",
1148               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1149       fprintf (stream, "  map->start_location: %u\n",
1150                map->start_location);
1151
1152       fprintf (stream, "  macro_locations:\n");
1153       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1154         {
1155           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1156           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1157
1158           /* linemap_add_macro_token encodes token numbers in an expansion
1159              by putting them after MAP_START_LOCATION. */
1160
1161           /* I'm typically seeing 4 uninitialized entries at the end of
1162              0xafafafaf.
1163              This appears to be due to macro.c:replace_args
1164              adding 2 extra args for padding tokens; presumably there may
1165              be a leading and/or trailing padding token injected,
1166              each for 2 more location slots.
1167              This would explain there being up to 4 source_locations slots
1168              that may be uninitialized.  */
1169
1170           fprintf (stream, "    %u: %u, %u\n",
1171                    i,
1172                    x,
1173                    y);
1174           if (x == y)
1175             {
1176               if (x < MAP_START_LOCATION (map))
1177                 inform (x, "token %u has x-location == y-location == %u", i, x);
1178               else
1179                 fprintf (stream,
1180                          "x-location == y-location == %u encodes token # %u\n",
1181                          x, x - MAP_START_LOCATION (map));
1182                 }
1183           else
1184             {
1185               inform (x, "token %u has x-location == %u", i, x);
1186               inform (x, "token %u has y-location == %u", i, y);
1187             }
1188         }
1189       fprintf (stream, "\n");
1190     }
1191
1192   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1193      macro map, presumably due to an off-by-one error somewhere
1194      between the logic in linemap_enter_macro and
1195      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1196   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1197                                 MAX_SOURCE_LOCATION,
1198                                 MAX_SOURCE_LOCATION + 1);
1199
1200   /* Visualize ad-hoc values.  */
1201   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1202                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1203 }
1204
1205 /* string_concat's constructor.  */
1206
1207 string_concat::string_concat (int num, location_t *locs)
1208   : m_num (num)
1209 {
1210   m_locs = ggc_vec_alloc <location_t> (num);
1211   for (int i = 0; i < num; i++)
1212     m_locs[i] = locs[i];
1213 }
1214
1215 /* string_concat_db's constructor.  */
1216
1217 string_concat_db::string_concat_db ()
1218 {
1219   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1220 }
1221
1222 /* Record that a string concatenation occurred, covering NUM
1223    string literal tokens.  LOCS is an array of size NUM, containing the
1224    locations of the tokens.  A copy of LOCS is taken.  */
1225
1226 void
1227 string_concat_db::record_string_concatenation (int num, location_t *locs)
1228 {
1229   gcc_assert (num > 1);
1230   gcc_assert (locs);
1231
1232   location_t key_loc = get_key_loc (locs[0]);
1233
1234   string_concat *concat
1235     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1236   m_table->put (key_loc, concat);
1237 }
1238
1239 /* Determine if LOC was the location of the the initial token of a
1240    concatenation of string literal tokens.
1241    If so, *OUT_NUM is written to with the number of tokens, and
1242    *OUT_LOCS with the location of an array of locations of the
1243    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1244    storage owned by the string_concat_db.
1245    Otherwise, return false.  */
1246
1247 bool
1248 string_concat_db::get_string_concatenation (location_t loc,
1249                                             int *out_num,
1250                                             location_t **out_locs)
1251 {
1252   gcc_assert (out_num);
1253   gcc_assert (out_locs);
1254
1255   location_t key_loc = get_key_loc (loc);
1256
1257   string_concat **concat = m_table->get (key_loc);
1258   if (!concat)
1259     return false;
1260
1261   *out_num = (*concat)->m_num;
1262   *out_locs =(*concat)->m_locs;
1263   return true;
1264 }
1265
1266 /* Internal function.  Canonicalize LOC into a form suitable for
1267    use as a key within the database, stripping away macro expansion,
1268    ad-hoc information, and range information, using the location of
1269    the start of LOC within an ordinary linemap.  */
1270
1271 location_t
1272 string_concat_db::get_key_loc (location_t loc)
1273 {
1274   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1275                                   NULL);
1276
1277   loc = get_range_from_loc (line_table, loc).m_start;
1278
1279   return loc;
1280 }
1281
1282 /* Helper class for use within get_substring_ranges_for_loc.
1283    An vec of cpp_string with responsibility for releasing all of the
1284    str->text for each str in the vector.  */
1285
1286 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1287 {
1288  public:
1289   auto_cpp_string_vec (int alloc)
1290     : auto_vec <cpp_string> (alloc) {}
1291
1292   ~auto_cpp_string_vec ()
1293   {
1294     /* Clean up the copies within this vec.  */
1295     int i;
1296     cpp_string *str;
1297     FOR_EACH_VEC_ELT (*this, i, str)
1298       free (const_cast <unsigned char *> (str->text));
1299   }
1300 };
1301
1302 /* Attempt to populate RANGES with source location information on the
1303    individual characters within the string literal found at STRLOC.
1304    If CONCATS is non-NULL, then any string literals that the token at
1305    STRLOC  was concatenated with are also added to RANGES.
1306
1307    Return NULL if successful, or an error message if any errors occurred (in
1308    which case RANGES may be only partially populated and should not
1309    be used).
1310
1311    This is implemented by re-parsing the relevant source line(s).  */
1312
1313 static const char *
1314 get_substring_ranges_for_loc (cpp_reader *pfile,
1315                               string_concat_db *concats,
1316                               location_t strloc,
1317                               enum cpp_ttype type,
1318                               cpp_substring_ranges &ranges)
1319 {
1320   gcc_assert (pfile);
1321
1322   if (strloc == UNKNOWN_LOCATION)
1323     return "unknown location";
1324
1325   /* Reparsing the strings requires accurate location information.
1326      If -ftrack-macro-expansion has been overridden from its default
1327      of 2, then we might have a location of a macro expansion point,
1328      rather than the location of the literal itself.
1329      Avoid this by requiring that we have full macro expansion tracking
1330      for substring locations to be available.  */
1331   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1332     return "track_macro_expansion != 2";
1333
1334   /* If #line or # 44 "file"-style directives are present, then there's
1335      no guarantee that the line numbers we have can be used to locate
1336      the strings.  For example, we might have a .i file with # directives
1337      pointing back to lines within a .c file, but the .c file might
1338      have been edited since the .i file was created.
1339      In such a case, the safest course is to disable on-demand substring
1340      locations.  */
1341   if (line_table->seen_line_directive)
1342     return "seen line directive";
1343
1344   /* If string concatenation has occurred at STRLOC, get the locations
1345      of all of the literal tokens making up the compound string.
1346      Otherwise, just use STRLOC.  */
1347   int num_locs = 1;
1348   location_t *strlocs = &strloc;
1349   if (concats)
1350     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1351
1352   auto_cpp_string_vec strs (num_locs);
1353   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1354   for (int i = 0; i < num_locs; i++)
1355     {
1356       /* Get range of strloc.  We will use it to locate the start and finish
1357          of the literal token within the line.  */
1358       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1359
1360       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1361         /* If the string is within a macro expansion, we can't get at the
1362            end location.  */
1363         return "macro expansion";
1364
1365       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1366         /* If so, we can't reliably determine where the token started within
1367            its line.  */
1368         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1369
1370       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1371         /* If so, we can't reliably determine where the token finished within
1372            its line.  */
1373         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1374
1375       expanded_location start
1376         = expand_location_to_spelling_point (src_range.m_start);
1377       expanded_location finish
1378         = expand_location_to_spelling_point (src_range.m_finish);
1379       if (start.file != finish.file)
1380         return "range endpoints are in different files";
1381       if (start.line != finish.line)
1382         return "range endpoints are on different lines";
1383       if (start.column > finish.column)
1384         return "range endpoints are reversed";
1385
1386       int line_width;
1387       const char *line = location_get_source_line (start.file, start.line,
1388                                                    &line_width);
1389       if (line == NULL)
1390         return "unable to read source line";
1391
1392       /* Determine the location of the literal (including quotes
1393          and leading prefix chars, such as the 'u' in a u""
1394          token).  */
1395       const char *literal = line + start.column - 1;
1396       int literal_length = finish.column - start.column + 1;
1397
1398       gcc_assert (line_width >= (start.column - 1 + literal_length));
1399       cpp_string from;
1400       from.len = literal_length;
1401       /* Make a copy of the literal, to avoid having to rely on
1402          the lifetime of the copy of the line within the cache.
1403          This will be released by the auto_cpp_string_vec dtor.  */
1404       from.text = XDUPVEC (unsigned char, literal, literal_length);
1405       strs.safe_push (from);
1406
1407       /* For very long lines, a new linemap could have started
1408          halfway through the token.
1409          Ensure that the loc_reader uses the linemap of the
1410          *end* of the token for its start location.  */
1411       const line_map_ordinary *final_ord_map;
1412       linemap_resolve_location (line_table, src_range.m_finish,
1413                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1414       location_t start_loc
1415         = linemap_position_for_line_and_column (line_table, final_ord_map,
1416                                                 start.line, start.column);
1417
1418       cpp_string_location_reader loc_reader (start_loc, line_table);
1419       loc_readers.safe_push (loc_reader);
1420     }
1421
1422   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1423   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1424                                                  loc_readers.address (),
1425                                                  num_locs, &ranges, type);
1426   if (err)
1427     return err;
1428
1429   /* Success: "ranges" should now contain information on the string.  */
1430   return NULL;
1431 }
1432
1433 /* Attempt to populate *OUT_LOC with source location information on the
1434    given characters within the string literal found at STRLOC.
1435    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1436    character set.
1437
1438    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1439    and string literal "012345\n789"
1440    *OUT_LOC is written to with:
1441      "012345\n789"
1442          ~^~~~~
1443
1444    If CONCATS is non-NULL, then any string literals that the token at
1445    STRLOC was concatenated with are also considered.
1446
1447    This is implemented by re-parsing the relevant source line(s).
1448
1449    Return NULL if successful, or an error message if any errors occurred.
1450    Error messages are intended for GCC developers (to help debugging) rather
1451    than for end-users.  */
1452
1453 const char *
1454 get_source_location_for_substring (cpp_reader *pfile,
1455                                    string_concat_db *concats,
1456                                    location_t strloc,
1457                                    enum cpp_ttype type,
1458                                    int caret_idx, int start_idx, int end_idx,
1459                                    source_location *out_loc)
1460 {
1461   gcc_checking_assert (caret_idx >= 0);
1462   gcc_checking_assert (start_idx >= 0);
1463   gcc_checking_assert (end_idx >= 0);
1464   gcc_assert (out_loc);
1465
1466   cpp_substring_ranges ranges;
1467   const char *err
1468     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1469   if (err)
1470     return err;
1471
1472   if (caret_idx >= ranges.get_num_ranges ())
1473     return "caret_idx out of range";
1474   if (start_idx >= ranges.get_num_ranges ())
1475     return "start_idx out of range";
1476   if (end_idx >= ranges.get_num_ranges ())
1477     return "end_idx out of range";
1478
1479   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1480                             ranges.get_range (start_idx).m_start,
1481                             ranges.get_range (end_idx).m_finish);
1482   return NULL;
1483 }
1484
1485 #if CHECKING_P
1486
1487 namespace selftest {
1488
1489 /* Selftests of location handling.  */
1490
1491 /* Attempt to populate *OUT_RANGE with source location information on the
1492    given character within the string literal found at STRLOC.
1493    CHAR_IDX refers to an offset within the execution character set.
1494    If CONCATS is non-NULL, then any string literals that the token at
1495    STRLOC was concatenated with are also considered.
1496
1497    This is implemented by re-parsing the relevant source line(s).
1498
1499    Return NULL if successful, or an error message if any errors occurred.
1500    Error messages are intended for GCC developers (to help debugging) rather
1501    than for end-users.  */
1502
1503 static const char *
1504 get_source_range_for_char (cpp_reader *pfile,
1505                            string_concat_db *concats,
1506                            location_t strloc,
1507                            enum cpp_ttype type,
1508                            int char_idx,
1509                            source_range *out_range)
1510 {
1511   gcc_checking_assert (char_idx >= 0);
1512   gcc_assert (out_range);
1513
1514   cpp_substring_ranges ranges;
1515   const char *err
1516     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1517   if (err)
1518     return err;
1519
1520   if (char_idx >= ranges.get_num_ranges ())
1521     return "char_idx out of range";
1522
1523   *out_range = ranges.get_range (char_idx);
1524   return NULL;
1525 }
1526
1527 /* As get_source_range_for_char, but write to *OUT the number
1528    of ranges that are available.  */
1529
1530 static const char *
1531 get_num_source_ranges_for_substring (cpp_reader *pfile,
1532                                      string_concat_db *concats,
1533                                      location_t strloc,
1534                                      enum cpp_ttype type,
1535                                      int *out)
1536 {
1537   gcc_assert (out);
1538
1539   cpp_substring_ranges ranges;
1540   const char *err
1541     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1542
1543   if (err)
1544     return err;
1545
1546   *out = ranges.get_num_ranges ();
1547   return NULL;
1548 }
1549
1550 /* Selftests of location handling.  */
1551
1552 /* Helper function for verifying location data: when location_t
1553    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1554    as having column 0.  */
1555
1556 static bool
1557 should_have_column_data_p (location_t loc)
1558 {
1559   if (IS_ADHOC_LOC (loc))
1560     loc = get_location_from_adhoc_loc (line_table, loc);
1561   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1562     return false;
1563   return true;
1564 }
1565
1566 /* Selftest for should_have_column_data_p.  */
1567
1568 static void
1569 test_should_have_column_data_p ()
1570 {
1571   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1572   ASSERT_TRUE
1573     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1574   ASSERT_FALSE
1575     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1576 }
1577
1578 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1579    on LOC.  */
1580
1581 static void
1582 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1583               location_t loc)
1584 {
1585   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1586   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1587   /* If location_t values are sufficiently high, then column numbers
1588      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1589      When close to the threshold, column numbers *may* be present: if
1590      the final linemap before the threshold contains a line that straddles
1591      the threshold, locations in that line have column information.  */
1592   if (should_have_column_data_p (loc))
1593     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1594 }
1595
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for source_location/location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following struct describes a particular case within our test
   matrix: the default_range_bits to use, and the location value at
   which the line_table should start (zero leaves the line_table's
   initial values alone).  */

struct line_table_case
{
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If non-zero, initial value for the line_table's highest_location
     and highest_line.  */
  int m_base_location;
};
1622
1623 /* Constructor.  Store the old value of line_table, and create a new
1624    one, using sane defaults.  */
1625
1626 line_table_test::line_table_test ()
1627 {
1628   gcc_assert (saved_line_table == NULL);
1629   saved_line_table = line_table;
1630   line_table = ggc_alloc<line_maps> ();
1631   linemap_init (line_table, BUILTINS_LOCATION);
1632   gcc_assert (saved_line_table->reallocator);
1633   line_table->reallocator = saved_line_table->reallocator;
1634   gcc_assert (saved_line_table->round_alloc_size);
1635   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1636   line_table->default_range_bits = 0;
1637 }
1638
1639 /* Constructor.  Store the old value of line_table, and create a new
1640    one, using the sitation described in CASE_.  */
1641
1642 line_table_test::line_table_test (const line_table_case &case_)
1643 {
1644   gcc_assert (saved_line_table == NULL);
1645   saved_line_table = line_table;
1646   line_table = ggc_alloc<line_maps> ();
1647   linemap_init (line_table, BUILTINS_LOCATION);
1648   gcc_assert (saved_line_table->reallocator);
1649   line_table->reallocator = saved_line_table->reallocator;
1650   gcc_assert (saved_line_table->round_alloc_size);
1651   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1652   line_table->default_range_bits = case_.m_default_range_bits;
1653   if (case_.m_base_location)
1654     {
1655       line_table->highest_location = case_.m_base_location;
1656       line_table->highest_line = case_.m_base_location;
1657     }
1658 }
1659
1660 /* Destructor.  Restore the old value of line_table.  */
1661
1662 line_table_test::~line_table_test ()
1663 {
1664   gcc_assert (saved_line_table != NULL);
1665   line_table = saved_line_table;
1666   saved_line_table = NULL;
1667 }
1668
1669 /* Verify basic operation of ordinary linemaps.  */
1670
1671 static void
1672 test_accessing_ordinary_linemaps (const line_table_case &case_)
1673 {
1674   line_table_test ltt (case_);
1675
1676   /* Build a simple linemap describing some locations. */
1677   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1678
1679   linemap_line_start (line_table, 1, 100);
1680   location_t loc_a = linemap_position_for_column (line_table, 1);
1681   location_t loc_b = linemap_position_for_column (line_table, 23);
1682
1683   linemap_line_start (line_table, 2, 100);
1684   location_t loc_c = linemap_position_for_column (line_table, 1);
1685   location_t loc_d = linemap_position_for_column (line_table, 17);
1686
1687   /* Example of a very long line.  */
1688   linemap_line_start (line_table, 3, 2000);
1689   location_t loc_e = linemap_position_for_column (line_table, 700);
1690
1691   /* Transitioning back to a short line.  */
1692   linemap_line_start (line_table, 4, 0);
1693   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1694
1695   if (should_have_column_data_p (loc_back_to_short))
1696     {
1697       /* Verify that we switched to short lines in the linemap.  */
1698       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1699       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1700     }
1701
1702   /* Example of a line that will eventually be seen to be longer
1703      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
1704      below that.  */
1705   linemap_line_start (line_table, 5, 2000);
1706
1707   location_t loc_start_of_very_long_line
1708     = linemap_position_for_column (line_table, 2000);
1709   location_t loc_too_wide
1710     = linemap_position_for_column (line_table, 4097);
1711   location_t loc_too_wide_2
1712     = linemap_position_for_column (line_table, 4098);
1713
1714   /* ...and back to a sane line length.  */
1715   linemap_line_start (line_table, 6, 100);
1716   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
1717
1718   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1719
1720   /* Multiple files.  */
1721   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1722   linemap_line_start (line_table, 1, 200);
1723   location_t loc_f = linemap_position_for_column (line_table, 150);
1724   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1725
1726   /* Verify that we can recover the location info.  */
1727   assert_loceq ("foo.c", 1, 1, loc_a);
1728   assert_loceq ("foo.c", 1, 23, loc_b);
1729   assert_loceq ("foo.c", 2, 1, loc_c);
1730   assert_loceq ("foo.c", 2, 17, loc_d);
1731   assert_loceq ("foo.c", 3, 700, loc_e);
1732   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1733
1734   /* In the very wide line, the initial location should be fully tracked.  */
1735   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
1736   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
1737      be disabled.  */
1738   assert_loceq ("foo.c", 5, 0, loc_too_wide);
1739   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
1740   /*...and column-tracking should be re-enabled for subsequent lines.  */
1741   assert_loceq ("foo.c", 6, 10, loc_sane_again);
1742
1743   assert_loceq ("bar.c", 1, 150, loc_f);
1744
1745   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1746   ASSERT_TRUE (pure_location_p (line_table, loc_a));
1747
1748   /* Verify using make_location to build a range, and extracting data
1749      back from it.  */
1750   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
1751   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
1752   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
1753   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
1754   ASSERT_EQ (loc_b, src_range.m_start);
1755   ASSERT_EQ (loc_d, src_range.m_finish);
1756 }
1757
1758 /* Verify various properties of UNKNOWN_LOCATION.  */
1759
1760 static void
1761 test_unknown_location ()
1762 {
1763   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1764   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1765   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1766 }
1767
1768 /* Verify various properties of BUILTINS_LOCATION.  */
1769
1770 static void
1771 test_builtins ()
1772 {
1773   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1774   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1775 }
1776
1777 /* Regression test for make_location.
1778    Ensure that we use pure locations for the start/finish of the range,
1779    rather than storing a packed or ad-hoc range as the start/finish.  */
1780
1781 static void
1782 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1783 {
1784   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1785      with C++ frontend.
1786      ....................0000000001111111111222.
1787      ....................1234567890123456789012.  */
1788   const char *content = "     r += !aaa == bbb;\n";
1789   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1790   line_table_test ltt (case_);
1791   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1792
1793   const location_t c11 = linemap_position_for_column (line_table, 11);
1794   const location_t c12 = linemap_position_for_column (line_table, 12);
1795   const location_t c13 = linemap_position_for_column (line_table, 13);
1796   const location_t c14 = linemap_position_for_column (line_table, 14);
1797   const location_t c21 = linemap_position_for_column (line_table, 21);
1798
1799   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1800     return;
1801
1802   /* Use column 13 for the caret location, arbitrarily, to verify that we
1803      handle start != caret.  */
1804   const location_t aaa = make_location (c13, c12, c14);
1805   ASSERT_EQ (c13, get_pure_location (aaa));
1806   ASSERT_EQ (c12, get_start (aaa));
1807   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1808   ASSERT_EQ (c14, get_finish (aaa));
1809   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1810
1811   /* Make a location using a location with a range as the start-point.  */
1812   const location_t not_aaa = make_location (c11, aaa, c14);
1813   ASSERT_EQ (c11, get_pure_location (not_aaa));
1814   /* It should use the start location of the range, not store the range
1815      itself.  */
1816   ASSERT_EQ (c12, get_start (not_aaa));
1817   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1818   ASSERT_EQ (c14, get_finish (not_aaa));
1819   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1820
1821   /* Similarly, make a location with a range as the end-point.  */
1822   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1823   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1824   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1825   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1826   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1827   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1828   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1829   /* It should use the finish location of the range, not store the range
1830      itself.  */
1831   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1832   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1833   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1834   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1835   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1836 }
1837
1838 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1839
1840 static void
1841 test_reading_source_line ()
1842 {
1843   /* Create a tempfile and write some text to it.  */
1844   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1845                         "01234567890123456789\n"
1846                         "This is the test text\n"
1847                         "This is the 3rd line");
1848
1849   /* Read back a specific line from the tempfile.  */
1850   int line_size;
1851   const char *source_line = location_get_source_line (tmp.get_filename (),
1852                                                       3, &line_size);
1853   ASSERT_TRUE (source_line != NULL);
1854   ASSERT_EQ (20, line_size);
1855   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1856                          source_line, line_size));
1857
1858   source_line = location_get_source_line (tmp.get_filename (),
1859                                           2, &line_size);
1860   ASSERT_TRUE (source_line != NULL);
1861   ASSERT_EQ (21, line_size);
1862   ASSERT_TRUE (!strncmp ("This is the test text",
1863                          source_line, line_size));
1864
1865   source_line = location_get_source_line (tmp.get_filename (),
1866                                           4, &line_size);
1867   ASSERT_TRUE (source_line == NULL);
1868 }
1869
1870 /* Tests of lexing.  */
1871
1872 /* Verify that token TOK from PARSER has cpp_token_as_text
1873    equal to EXPECTED_TEXT.  */
1874
/* Implementation note: cpp_token_as_text returns unsigned char *, hence
   the cast before the string comparison.  */
#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)               \
  SELFTEST_BEGIN_STMT                                                     \
    const char *tok_text_                                                 \
      = (const char *) cpp_token_as_text ((PARSER), (TOK));               \
    ASSERT_STREQ ((EXPECTED_TEXT), tok_text_);                            \
  SELFTEST_END_STMT
1880
1881 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1882    and ranges from EXP_START_COL to EXP_FINISH_COL.
1883    Use LOC as the effective location of the selftest.  */
1884
1885 static void
1886 assert_token_loc_eq (const location &loc,
1887                      const cpp_token *tok,
1888                      const char *exp_filename, int exp_linenum,
1889                      int exp_start_col, int exp_finish_col)
1890 {
1891   location_t tok_loc = tok->src_loc;
1892   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1893   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1894
1895   /* If location_t values are sufficiently high, then column numbers
1896      will be unavailable.  */
1897   if (!should_have_column_data_p (tok_loc))
1898     return;
1899
1900   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1901   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1902   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1903   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1904 }
1905
1906 /* Use assert_token_loc_eq to verify the TOK->src_loc, using
1907    SELFTEST_LOCATION as the effective location of the selftest.  */
1908
/* Forwards to assert_token_loc_eq, with SELFTEST_LOCATION as the location
   reported for any failure.  */
#define ASSERT_TOKEN_LOC_EQ(TOKEN, FILENAME, LINENUM, START_COL, FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION,                                    \
                       (TOKEN), (FILENAME), (LINENUM),                       \
                       (START_COL), (FINISH_COL))
1913
1914 /* Test of lexing a file using libcpp, verifying tokens and their
1915    location information.  */
1916
1917 static void
1918 test_lexer (const line_table_case &case_)
1919 {
1920   /* Create a tempfile and write some text to it.  */
1921   const char *content =
1922     /*00000000011111111112222222222333333.3333444444444.455555555556
1923       12345678901234567890123456789012345.6789012345678.901234567890.  */
1924     ("test_name /* c-style comment */\n"
1925      "                                  \"test literal\"\n"
1926      " // test c++-style comment\n"
1927      "   42\n");
1928   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
1929
1930   line_table_test ltt (case_);
1931
1932   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
1933
1934   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
1935   ASSERT_NE (fname, NULL);
1936
1937   /* Verify that we get the expected tokens back, with the correct
1938      location information.  */
1939
1940   location_t loc;
1941   const cpp_token *tok;
1942   tok = cpp_get_token_with_location (parser, &loc);
1943   ASSERT_NE (tok, NULL);
1944   ASSERT_EQ (tok->type, CPP_NAME);
1945   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
1946   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
1947
1948   tok = cpp_get_token_with_location (parser, &loc);
1949   ASSERT_NE (tok, NULL);
1950   ASSERT_EQ (tok->type, CPP_STRING);
1951   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
1952   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
1953
1954   tok = cpp_get_token_with_location (parser, &loc);
1955   ASSERT_NE (tok, NULL);
1956   ASSERT_EQ (tok->type, CPP_NUMBER);
1957   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
1958   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
1959
1960   tok = cpp_get_token_with_location (parser, &loc);
1961   ASSERT_NE (tok, NULL);
1962   ASSERT_EQ (tok->type, CPP_EOF);
1963
1964   cpp_finish (parser, NULL);
1965   cpp_destroy (parser);
1966 }
1967
1968 /* Forward decls.  */
1969
/* The options class and the test struct refer to each other, so both are
   declared ahead of their definitions.  */
class lexer_test_options;
struct lexer_test;
1972
1973 /* A class for specifying options of a lexer_test.
1974    The "apply" vfunc is called during the lexer_test constructor.  */
1975
1976 class lexer_test_options
1977 {
1978  public:
1979   virtual void apply (lexer_test &) = 0;
1980 };
1981
1982 /* Wrapper around a cpp_reader *, which calls cpp_finish and cpp_destroy
1983    in its dtor.
1984
1985    This is needed by struct lexer_test to ensure that the cleanup of the
1986    cpp_reader happens *after* the cleanup of the temp_source_file.  */
1987
1988 class cpp_reader_ptr
1989 {
1990  public:
1991   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
1992
1993   ~cpp_reader_ptr ()
1994   {
1995     cpp_finish (m_ptr, NULL);
1996     cpp_destroy (m_ptr);
1997   }
1998
1999   operator cpp_reader * () const { return m_ptr; }
2000
2001  private:
2002   cpp_reader *m_ptr;
2003 };
2004
2005 /* A struct for writing lexer tests.  */
2006
2007 struct lexer_test
2008 {
2009   lexer_test (const line_table_case &case_, const char *content,
2010               lexer_test_options *options);
2011   ~lexer_test ();
2012
2013   const cpp_token *get_token ();
2014
2015   /* The ordering of these fields matters.
2016      The line_table_test must be first, since the cpp_reader_ptr
2017      uses it.
2018      The cpp_reader must be cleaned up *after* the temp_source_file
2019      since the filenames in input.c's input cache are owned by the
2020      cpp_reader; in particular, when ~temp_source_file evicts the
2021      filename the filenames must still be alive.  */
2022   line_table_test m_ltt;
2023   cpp_reader_ptr m_parser;
2024   temp_source_file m_tempfile;
2025   string_concat_db m_concats;
2026   bool m_implicitly_expect_EOF;
2027 };
2028
2029 /* Use an EBCDIC encoding for the execution charset, specifically
2030    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2031
2032    This exercises iconv integration within libcpp.
2033    Not every build of iconv supports the given charset,
2034    so we need to flag this error and handle it gracefully.  */
2035
2036 class ebcdic_execution_charset : public lexer_test_options
2037 {
2038  public:
2039   ebcdic_execution_charset () : m_num_iconv_errors (0)
2040     {
2041       gcc_assert (s_singleton == NULL);
2042       s_singleton = this;
2043     }
2044   ~ebcdic_execution_charset ()
2045     {
2046       gcc_assert (s_singleton == this);
2047       s_singleton = NULL;
2048     }
2049
2050   void apply (lexer_test &test) FINAL OVERRIDE
2051   {
2052     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2053     cpp_opts->narrow_charset = "IBM1047";
2054
2055     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2056     callbacks->error = on_error;
2057   }
2058
2059   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2060                         int level ATTRIBUTE_UNUSED,
2061                         int reason ATTRIBUTE_UNUSED,
2062                         rich_location *richloc ATTRIBUTE_UNUSED,
2063                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2064     ATTRIBUTE_FPTR_PRINTF(5,0)
2065   {
2066     gcc_assert (s_singleton);
2067     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2068     const char *msg = "conversion from %s to %s not supported by iconv";
2069 #ifdef ENABLE_NLS
2070     msg = dgettext ("cpplib", msg);
2071 #endif
2072     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2073        when the local iconv build doesn't support the conversion.  */
2074     if (strcmp (msgid, msg) == 0)
2075       {
2076         s_singleton->m_num_iconv_errors++;
2077         return true;
2078       }
2079
2080     /* Otherwise, we have an unexpected error.  */
2081     abort ();
2082   }
2083
2084   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2085
2086  private:
2087   static ebcdic_execution_charset *s_singleton;
2088   int m_num_iconv_errors;
2089 };
2090
2091 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2092
2093 /* A lexer_test_options subclass that records a list of error
2094    messages emitted by the lexer.  */
2095
2096 class lexer_error_sink : public lexer_test_options
2097 {
2098  public:
2099   lexer_error_sink ()
2100   {
2101     gcc_assert (s_singleton == NULL);
2102     s_singleton = this;
2103   }
2104   ~lexer_error_sink ()
2105   {
2106     gcc_assert (s_singleton == this);
2107     s_singleton = NULL;
2108
2109     int i;
2110     char *str;
2111     FOR_EACH_VEC_ELT (m_errors, i, str)
2112       free (str);
2113   }
2114
2115   void apply (lexer_test &test) FINAL OVERRIDE
2116   {
2117     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2118     callbacks->error = on_error;
2119   }
2120
2121   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2122                         int level ATTRIBUTE_UNUSED,
2123                         int reason ATTRIBUTE_UNUSED,
2124                         rich_location *richloc ATTRIBUTE_UNUSED,
2125                         const char *msgid, va_list *ap)
2126     ATTRIBUTE_FPTR_PRINTF(5,0)
2127   {
2128     char *msg = xvasprintf (msgid, *ap);
2129     s_singleton->m_errors.safe_push (msg);
2130     return true;
2131   }
2132
2133   auto_vec<char *> m_errors;
2134
2135  private:
2136   static lexer_error_sink *s_singleton;
2137 };
2138
2139 lexer_error_sink *lexer_error_sink::s_singleton;
2140
2141 /* Constructor.  Override line_table with a new instance based on CASE_,
2142    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2143    start parsing the tempfile.  */
2144
2145 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2146                         lexer_test_options *options)
2147 : m_ltt (case_),
2148   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2149   /* Create a tempfile and write the text to it.  */
2150   m_tempfile (SELFTEST_LOCATION, ".c", content),
2151   m_concats (),
2152   m_implicitly_expect_EOF (true)
2153 {
2154   if (options)
2155     options->apply (*this);
2156
2157   cpp_init_iconv (m_parser);
2158
2159   /* Parse the file.  */
2160   const char *fname = cpp_read_main_file (m_parser,
2161                                           m_tempfile.get_filename ());
2162   ASSERT_NE (fname, NULL);
2163 }
2164
2165 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2166
2167 lexer_test::~lexer_test ()
2168 {
2169   location_t loc;
2170   const cpp_token *tok;
2171
2172   if (m_implicitly_expect_EOF)
2173     {
2174       tok = cpp_get_token_with_location (m_parser, &loc);
2175       ASSERT_NE (tok, NULL);
2176       ASSERT_EQ (tok->type, CPP_EOF);
2177     }
2178 }
2179
2180 /* Get the next token from m_parser.  */
2181
2182 const cpp_token *
2183 lexer_test::get_token ()
2184 {
2185   location_t loc;
2186   const cpp_token *tok;
2187
2188   tok = cpp_get_token_with_location (m_parser, &loc);
2189   ASSERT_NE (tok, NULL);
2190   return tok;
2191 }
2192
2193 /* Verify that locations within string literals are correctly handled.  */
2194
2195 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2196    using the string concatenation database for TEST.
2197
2198    Assert that the character at index IDX is on EXPECTED_LINE,
2199    and that it begins at column EXPECTED_START_COL and ends at
2200    EXPECTED_FINISH_COL (unless the locations are beyond
2201    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2202    columns).  */
2203
2204 static void
2205 assert_char_at_range (const location &loc,
2206                       lexer_test& test,
2207                       location_t strloc, enum cpp_ttype type, int idx,
2208                       int expected_line, int expected_start_col,
2209                       int expected_finish_col)
2210 {
2211   cpp_reader *pfile = test.m_parser;
2212   string_concat_db *concats = &test.m_concats;
2213
2214   source_range actual_range = source_range();
2215   const char *err
2216     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2217                                  &actual_range);
2218   if (should_have_column_data_p (strloc))
2219     ASSERT_EQ_AT (loc, NULL, err);
2220   else
2221     {
2222       ASSERT_STREQ_AT (loc,
2223                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2224                        err);
2225       return;
2226     }
2227
2228   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2229   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2230   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2231   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2232
2233   if (should_have_column_data_p (actual_range.m_start))
2234     {
2235       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2236       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2237     }
2238   if (should_have_column_data_p (actual_range.m_finish))
2239     {
2240       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2241       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2242     }
2243 }
2244
2245 /* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
2246    the effective location of any errors.  */
2247
/* Forwards to assert_char_at_range, with SELFTEST_LOCATION as the location
   reported for any failure.  */
#define ASSERT_CHAR_AT_RANGE(TEST, STRLOC, TYPE, IDX, LINE,               \
                             START_COL, FINISH_COL)                       \
  assert_char_at_range (SELFTEST_LOCATION,                                \
                        (TEST), (STRLOC), (TYPE), (IDX),                  \
                        (LINE), (START_COL), (FINISH_COL))
2253
2254 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2255    using the string concatenation database for TEST.
2256
2257    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2258
2259 static void
2260 assert_num_substring_ranges (const location &loc,
2261                              lexer_test& test,
2262                              location_t strloc,
2263                              enum cpp_ttype type,
2264                              int expected_num_ranges)
2265 {
2266   cpp_reader *pfile = test.m_parser;
2267   string_concat_db *concats = &test.m_concats;
2268
2269   int actual_num_ranges = -1;
2270   const char *err
2271     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2272                                            &actual_num_ranges);
2273   if (should_have_column_data_p (strloc))
2274     ASSERT_EQ_AT (loc, NULL, err);
2275   else
2276     {
2277       ASSERT_STREQ_AT (loc,
2278                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2279                        err);
2280       return;
2281     }
2282   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2283 }
2284
2285 /* Macro for calling assert_num_substring_ranges, supplying
2286    SELFTEST_LOCATION for the effective location of any errors.  */
2287
/* Forwards to assert_num_substring_ranges, with SELFTEST_LOCATION as the
   location reported for any failure.  */
#define ASSERT_NUM_SUBSTRING_RANGES(TEST, STRLOC, TYPE, NUM_RANGES)       \
  assert_num_substring_ranges (SELFTEST_LOCATION,                         \
                               (TEST), (STRLOC), (TYPE), (NUM_RANGES))
2292
2293
2294 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2295    returns an error (using the string concatenation database for TEST).  */
2296
2297 static void
2298 assert_has_no_substring_ranges (const location &loc,
2299                                 lexer_test& test,
2300                                 location_t strloc,
2301                                 enum cpp_ttype type,
2302                                 const char *expected_err)
2303 {
2304   cpp_reader *pfile = test.m_parser;
2305   string_concat_db *concats = &test.m_concats;
2306   cpp_substring_ranges ranges;
2307   const char *actual_err
2308     = get_substring_ranges_for_loc (pfile, concats, strloc,
2309                                     type, ranges);
2310   if (should_have_column_data_p (strloc))
2311     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2312   else
2313     ASSERT_STREQ_AT (loc,
2314                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2315                      actual_err);
2316 }
2317
/* Forwards to assert_has_no_substring_ranges, with SELFTEST_LOCATION as
   the location reported for any failure.  */
#define ASSERT_HAS_NO_SUBSTRING_RANGES(TEST, STRLOC, TYPE, EXPECTED_ERR)  \
    assert_has_no_substring_ranges (SELFTEST_LOCATION,                    \
                                    (TEST), (STRLOC), (TYPE),             \
                                    (EXPECTED_ERR))
2321
2322 /* Lex a simple string literal.  Verify the substring location data, before
2323    and after running cpp_interpret_string on it.  */
2324
2325 static void
2326 test_lexer_string_locations_simple (const line_table_case &case_)
2327 {
2328   /* Digits 0-9 (with 0 at column 10), the simple way.
2329      ....................000000000.11111111112.2222222223333333333
2330      ....................123456789.01234567890.1234567890123456789
2331      We add a trailing comment to ensure that we correctly locate
2332      the end of the string literal token.  */
2333   const char *content = "        \"0123456789\" /* not a string */\n";
2334   lexer_test test (case_, content, NULL);
2335
2336   /* Verify that we get the expected token back, with the correct
2337      location information.  */
2338   const cpp_token *tok = test.get_token ();
2339   ASSERT_EQ (tok->type, CPP_STRING);
2340   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2341   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2342
2343   /* At this point in lexing, the quote characters are treated as part of
2344      the string (they are stripped off by cpp_interpret_string).  */
2345
2346   ASSERT_EQ (tok->val.str.len, 12);
2347
2348   /* Verify that cpp_interpret_string works.  */
2349   cpp_string dst_string;
2350   const enum cpp_ttype type = CPP_STRING;
2351   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2352                                       &dst_string, type);
2353   ASSERT_TRUE (result);
2354   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2355   free (const_cast <unsigned char *> (dst_string.text));
2356
2357   /* Verify ranges of individual characters.  This no longer includes the
2358      opening quote, but does include the closing quote.  */
2359   for (int i = 0; i <= 10; i++)
2360     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2361                           10 + i, 10 + i);
2362
2363   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2364 }
2365
2366 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2367    encoding.  */
2368
2369 static void
2370 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2371 {
2372   /* EBCDIC support requires iconv.  */
2373   if (!HAVE_ICONV)
2374     return;
2375
2376   /* Digits 0-9 (with 0 at column 10), the simple way.
2377      ....................000000000.11111111112.2222222223333333333
2378      ....................123456789.01234567890.1234567890123456789
2379      We add a trailing comment to ensure that we correctly locate
2380      the end of the string literal token.  */
2381   const char *content = "        \"0123456789\" /* not a string */\n";
2382   ebcdic_execution_charset use_ebcdic;
2383   lexer_test test (case_, content, &use_ebcdic);
2384
2385   /* Verify that we get the expected token back, with the correct
2386      location information.  */
2387   const cpp_token *tok = test.get_token ();
2388   ASSERT_EQ (tok->type, CPP_STRING);
2389   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2390   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2391
2392   /* At this point in lexing, the quote characters are treated as part of
2393      the string (they are stripped off by cpp_interpret_string).  */
2394
2395   ASSERT_EQ (tok->val.str.len, 12);
2396
2397   /* The remainder of the test requires an iconv implementation that
2398      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2399   if (use_ebcdic.iconv_errors_occurred_p ())
2400     return;
2401
2402   /* Verify that cpp_interpret_string works.  */
2403   cpp_string dst_string;
2404   const enum cpp_ttype type = CPP_STRING;
2405   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2406                                       &dst_string, type);
2407   ASSERT_TRUE (result);
2408   /* We should now have EBCDIC-encoded text, specifically
2409      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2410      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2411   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2412                 (const char *)dst_string.text);
2413   free (const_cast <unsigned char *> (dst_string.text));
2414
2415   /* Verify that we don't attempt to record substring location information
2416      for such cases.  */
2417   ASSERT_HAS_NO_SUBSTRING_RANGES
2418     (test, tok->src_loc, type,
2419      "execution character set != source character set");
2420 }
2421
2422 /* Lex a string literal containing a hex-escaped character.
2423    Verify the substring location data, before and after running
2424    cpp_interpret_string on it.  */
2425
2426 static void
2427 test_lexer_string_locations_hex (const line_table_case &case_)
2428 {
2429   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2430      and with a space in place of digit 6, to terminate the escaped
2431      hex code.
2432      ....................000000000.111111.11112222.
2433      ....................123456789.012345.67890123.  */
2434   const char *content = "        \"01234\\x35 789\"\n";
2435   lexer_test test (case_, content, NULL);
2436
2437   /* Verify that we get the expected token back, with the correct
2438      location information.  */
2439   const cpp_token *tok = test.get_token ();
2440   ASSERT_EQ (tok->type, CPP_STRING);
2441   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2442   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2443
2444   /* At this point in lexing, the quote characters are treated as part of
2445      the string (they are stripped off by cpp_interpret_string).  */
2446   ASSERT_EQ (tok->val.str.len, 15);
2447
2448   /* Verify that cpp_interpret_string works.  */
2449   cpp_string dst_string;
2450   const enum cpp_ttype type = CPP_STRING;
2451   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2452                                       &dst_string, type);
2453   ASSERT_TRUE (result);
2454   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2455   free (const_cast <unsigned char *> (dst_string.text));
2456
2457   /* Verify ranges of individual characters.  This no longer includes the
2458      opening quote, but does include the closing quote.  */
2459   for (int i = 0; i <= 4; i++)
2460     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2461   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2462   for (int i = 6; i <= 10; i++)
2463     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2464
2465   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2466 }
2467
2468 /* Lex a string literal containing an octal-escaped character.
2469    Verify the substring location data after running cpp_interpret_string
2470    on it.  */
2471
2472 static void
2473 test_lexer_string_locations_oct (const line_table_case &case_)
2474 {
2475   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2476      and with a space in place of digit 6, to terminate the escaped
2477      octal code.
2478      ....................000000000.111111.11112222.2222223333333333444
2479      ....................123456789.012345.67890123.4567890123456789012  */
2480   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2481   lexer_test test (case_, content, NULL);
2482
2483   /* Verify that we get the expected token back, with the correct
2484      location information.  */
2485   const cpp_token *tok = test.get_token ();
2486   ASSERT_EQ (tok->type, CPP_STRING);
2487   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2488
2489   /* Verify that cpp_interpret_string works.  */
2490   cpp_string dst_string;
2491   const enum cpp_ttype type = CPP_STRING;
2492   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2493                                       &dst_string, type);
2494   ASSERT_TRUE (result);
2495   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2496   free (const_cast <unsigned char *> (dst_string.text));
2497
2498   /* Verify ranges of individual characters.  This no longer includes the
2499      opening quote, but does include the closing quote.  */
2500   for (int i = 0; i < 5; i++)
2501     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2502   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2503   for (int i = 6; i <= 10; i++)
2504     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2505
2506   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2507 }
2508
2509 /* Test of string literal containing letter escapes.  */
2510
2511 static void
2512 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2513 {
2514   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2515      .....................000000000.1.11111.1.1.11222.22222223333333
2516      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2517   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2518   lexer_test test (case_, content, NULL);
2519
2520   /* Verify that we get the expected tokens back.  */
2521   const cpp_token *tok = test.get_token ();
2522   ASSERT_EQ (tok->type, CPP_STRING);
2523   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2524
2525   /* Verify ranges of individual characters. */
2526   /* "\t".  */
2527   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2528                         0, 1, 10, 11);
2529   /* "foo". */
2530   for (int i = 1; i <= 3; i++)
2531     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2532                           i, 1, 11 + i, 11 + i);
2533   /* "\\" and "\n".  */
2534   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2535                         4, 1, 15, 16);
2536   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2537                         5, 1, 17, 18);
2538
2539   /* "bar" and closing quote for nul-terminator.  */
2540   for (int i = 6; i <= 9; i++)
2541     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2542                           i, 1, 13 + i, 13 + i);
2543
2544   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2545 }
2546
2547 /* Another test of a string literal containing a letter escape.
2548    Based on string seen in
2549      printf ("%-%\n");
2550    in gcc.dg/format/c90-printf-1.c.  */
2551
2552 static void
2553 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2554 {
2555   /* .....................000000000.1111.11.1111.22222222223.
2556      .....................123456789.0123.45.6789.01234567890.  */
2557   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2558   lexer_test test (case_, content, NULL);
2559
2560   /* Verify that we get the expected tokens back.  */
2561   const cpp_token *tok = test.get_token ();
2562   ASSERT_EQ (tok->type, CPP_STRING);
2563   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2564
2565   /* Verify ranges of individual characters. */
2566   /* "%-%".  */
2567   for (int i = 0; i < 3; i++)
2568     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2569                           i, 1, 10 + i, 10 + i);
2570   /* "\n".  */
2571   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2572                         3, 1, 13, 14);
2573
2574   /* Closing quote for nul-terminator.  */
2575   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2576                         4, 1, 15, 15);
2577
2578   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2579 }
2580
2581 /* Lex a string literal containing UCN 4 characters.
2582    Verify the substring location data after running cpp_interpret_string
2583    on it.  */
2584
2585 static void
2586 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2587 {
2588   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2589      as UCN 4.
2590      ....................000000000.111111.111122.222222223.33333333344444
2591      ....................123456789.012345.678901.234567890.12345678901234  */
2592   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2593   lexer_test test (case_, content, NULL);
2594
2595   /* Verify that we get the expected token back, with the correct
2596      location information.  */
2597   const cpp_token *tok = test.get_token ();
2598   ASSERT_EQ (tok->type, CPP_STRING);
2599   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2600
2601   /* Verify that cpp_interpret_string works.
2602      The string should be encoded in the execution character
2603      set.  Assuming that that is UTF-8, we should have the following:
2604      -----------  ----  -----  -------  ----------------
2605      Byte offset  Byte  Octal  Unicode  Source Column(s)
2606      -----------  ----  -----  -------  ----------------
2607      0            0x30         '0'      10
2608      1            0x31         '1'      11
2609      2            0x32         '2'      12
2610      3            0x33         '3'      13
2611      4            0x34         '4'      14
2612      5            0xE2  \342   U+2174   15-20
2613      6            0x85  \205    (cont)  15-20
2614      7            0xB4  \264    (cont)  15-20
2615      8            0xE2  \342   U+2175   21-26
2616      9            0x85  \205    (cont)  21-26
2617      10           0xB5  \265    (cont)  21-26
2618      11           0x37         '7'      27
2619      12           0x38         '8'      28
2620      13           0x39         '9'      29
2621      14           0x00                  30 (closing quote)
2622      -----------  ----  -----  -------  ---------------.  */
2623
2624   cpp_string dst_string;
2625   const enum cpp_ttype type = CPP_STRING;
2626   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2627                                       &dst_string, type);
2628   ASSERT_TRUE (result);
2629   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2630                 (const char *)dst_string.text);
2631   free (const_cast <unsigned char *> (dst_string.text));
2632
2633   /* Verify ranges of individual characters.  This no longer includes the
2634      opening quote, but does include the closing quote.
2635      '01234'.  */
2636   for (int i = 0; i <= 4; i++)
2637     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2638   /* U+2174.  */
2639   for (int i = 5; i <= 7; i++)
2640     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2641   /* U+2175.  */
2642   for (int i = 8; i <= 10; i++)
2643     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2644   /* '789' and nul terminator  */
2645   for (int i = 11; i <= 14; i++)
2646     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2647
2648   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2649 }
2650
2651 /* Lex a string literal containing UCN 8 characters.
2652    Verify the substring location data after running cpp_interpret_string
2653    on it.  */
2654
2655 static void
2656 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2657 {
2658   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2659      ....................000000000.111111.1111222222.2222333333333.344444
2660      ....................123456789.012345.6789012345.6789012345678.901234  */
2661   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2662   lexer_test test (case_, content, NULL);
2663
2664   /* Verify that we get the expected token back, with the correct
2665      location information.  */
2666   const cpp_token *tok = test.get_token ();
2667   ASSERT_EQ (tok->type, CPP_STRING);
2668   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2669                            "\"01234\\U00002174\\U00002175789\"");
2670
2671   /* Verify that cpp_interpret_string works.
2672      The UTF-8 encoding of the string is identical to that from
2673      the ucn4 testcase above; the only difference is the column
2674      locations.  */
2675   cpp_string dst_string;
2676   const enum cpp_ttype type = CPP_STRING;
2677   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2678                                       &dst_string, type);
2679   ASSERT_TRUE (result);
2680   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2681                 (const char *)dst_string.text);
2682   free (const_cast <unsigned char *> (dst_string.text));
2683
2684   /* Verify ranges of individual characters.  This no longer includes the
2685      opening quote, but does include the closing quote.
2686      '01234'.  */
2687   for (int i = 0; i <= 4; i++)
2688     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2689   /* U+2174.  */
2690   for (int i = 5; i <= 7; i++)
2691     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2692   /* U+2175.  */
2693   for (int i = 8; i <= 10; i++)
2694     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2695   /* '789' at columns 35-37  */
2696   for (int i = 11; i <= 13; i++)
2697     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2698   /* Closing quote/nul-terminator at column 38.  */
2699   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2700
2701   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2702 }
2703
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   (most significant byte first) and return it as a host integer.  The
   bytes are read one at a time through an unsigned char pointer.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t value = 0;

  /* Shift in one byte per iteration, most significant first.  */
  for (int idx = 0; idx < 4; idx++)
    value = (value << 8) | bytes[idx];

  return value;
}
2715
2716 /* Lex a wide string literal and verify that attempts to read substring
2717    location data from it fail gracefully.  */
2718
2719 static void
2720 test_lexer_string_locations_wide_string (const line_table_case &case_)
2721 {
2722   /* Digits 0-9.
2723      ....................000000000.11111111112.22222222233333
2724      ....................123456789.01234567890.12345678901234  */
2725   const char *content = "       L\"0123456789\" /* non-str */\n";
2726   lexer_test test (case_, content, NULL);
2727
2728   /* Verify that we get the expected token back, with the correct
2729      location information.  */
2730   const cpp_token *tok = test.get_token ();
2731   ASSERT_EQ (tok->type, CPP_WSTRING);
2732   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2733
2734   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2735   cpp_string dst_string;
2736   const enum cpp_ttype type = CPP_WSTRING;
2737   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2738                                       &dst_string, type);
2739   ASSERT_TRUE (result);
2740   /* The cpp_reader defaults to big-endian with
2741      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2742      now be encoded as UTF-32BE.  */
2743   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2744   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2745   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2746   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2747   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2748   free (const_cast <unsigned char *> (dst_string.text));
2749
2750   /* We don't yet support generating substring location information
2751      for L"" strings.  */
2752   ASSERT_HAS_NO_SUBSTRING_RANGES
2753     (test, tok->src_loc, type,
2754      "execution character set != source character set");
2755 }
2756
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   (most significant byte first) and return it as a host integer.  The
   bytes are read one at a time through an unsigned char pointer.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];

  return (uint16_t) ((high << 8) | low);
}
2765
2766 /* Lex a u"" string literal and verify that attempts to read substring
2767    location data from it fail gracefully.  */
2768
2769 static void
2770 test_lexer_string_locations_string16 (const line_table_case &case_)
2771 {
2772   /* Digits 0-9.
2773      ....................000000000.11111111112.22222222233333
2774      ....................123456789.01234567890.12345678901234  */
2775   const char *content = "       u\"0123456789\" /* non-str */\n";
2776   lexer_test test (case_, content, NULL);
2777
2778   /* Verify that we get the expected token back, with the correct
2779      location information.  */
2780   const cpp_token *tok = test.get_token ();
2781   ASSERT_EQ (tok->type, CPP_STRING16);
2782   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2783
2784   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2785   cpp_string dst_string;
2786   const enum cpp_ttype type = CPP_STRING16;
2787   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2788                                       &dst_string, type);
2789   ASSERT_TRUE (result);
2790
2791   /* The cpp_reader defaults to big-endian, so dst_string should
2792      now be encoded as UTF-16BE.  */
2793   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2794   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2795   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2796   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2797   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2798   free (const_cast <unsigned char *> (dst_string.text));
2799
2800   /* We don't yet support generating substring location information
2801      for L"" strings.  */
2802   ASSERT_HAS_NO_SUBSTRING_RANGES
2803     (test, tok->src_loc, type,
2804      "execution character set != source character set");
2805 }
2806
2807 /* Lex a U"" string literal and verify that attempts to read substring
2808    location data from it fail gracefully.  */
2809
2810 static void
2811 test_lexer_string_locations_string32 (const line_table_case &case_)
2812 {
2813   /* Digits 0-9.
2814      ....................000000000.11111111112.22222222233333
2815      ....................123456789.01234567890.12345678901234  */
2816   const char *content = "       U\"0123456789\" /* non-str */\n";
2817   lexer_test test (case_, content, NULL);
2818
2819   /* Verify that we get the expected token back, with the correct
2820      location information.  */
2821   const cpp_token *tok = test.get_token ();
2822   ASSERT_EQ (tok->type, CPP_STRING32);
2823   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2824
2825   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2826   cpp_string dst_string;
2827   const enum cpp_ttype type = CPP_STRING32;
2828   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2829                                       &dst_string, type);
2830   ASSERT_TRUE (result);
2831
2832   /* The cpp_reader defaults to big-endian, so dst_string should
2833      now be encoded as UTF-32BE.  */
2834   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2835   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2836   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2837   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2838   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2839   free (const_cast <unsigned char *> (dst_string.text));
2840
2841   /* We don't yet support generating substring location information
2842      for L"" strings.  */
2843   ASSERT_HAS_NO_SUBSTRING_RANGES
2844     (test, tok->src_loc, type,
2845      "execution character set != source character set");
2846 }
2847
2848 /* Lex a u8-string literal.
2849    Verify the substring location data after running cpp_interpret_string
2850    on it.  */
2851
2852 static void
2853 test_lexer_string_locations_u8 (const line_table_case &case_)
2854 {
2855   /* Digits 0-9.
2856      ....................000000000.11111111112.22222222233333
2857      ....................123456789.01234567890.12345678901234  */
2858   const char *content = "      u8\"0123456789\" /* non-str */\n";
2859   lexer_test test (case_, content, NULL);
2860
2861   /* Verify that we get the expected token back, with the correct
2862      location information.  */
2863   const cpp_token *tok = test.get_token ();
2864   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2865   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2866
2867   /* Verify that cpp_interpret_string works.  */
2868   cpp_string dst_string;
2869   const enum cpp_ttype type = CPP_STRING;
2870   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2871                                       &dst_string, type);
2872   ASSERT_TRUE (result);
2873   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2874   free (const_cast <unsigned char *> (dst_string.text));
2875
2876   /* Verify ranges of individual characters.  This no longer includes the
2877      opening quote, but does include the closing quote.  */
2878   for (int i = 0; i <= 10; i++)
2879     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2880 }
2881
2882 /* Lex a string literal containing UTF-8 source characters.
2883    Verify the substring location data after running cpp_interpret_string
2884    on it.  */
2885
2886 static void
2887 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2888 {
2889  /* This string literal is written out to the source file as UTF-8,
2890     and is of the form "before mojibake after", where "mojibake"
2891     is written as the following four unicode code points:
2892        U+6587 CJK UNIFIED IDEOGRAPH-6587
2893        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2894        U+5316 CJK UNIFIED IDEOGRAPH-5316
2895        U+3051 HIRAGANA LETTER KE.
2896      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2897      "before" and "after" are 1 byte per unicode character.
2898
2899      The numbering shown are "columns", which are *byte* numbers within
2900      the line, rather than unicode character numbers.
2901
2902      .................... 000000000.1111111.
2903      .................... 123456789.0123456.  */
2904   const char *content = ("        \"before "
2905                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2906                               UTF-8: 0xE6 0x96 0x87
2907                               C octal escaped UTF-8: \346\226\207
2908                             "column" numbers: 17-19.  */
2909                          "\346\226\207"
2910
2911                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2912                               UTF-8: 0xE5 0xAD 0x97
2913                               C octal escaped UTF-8: \345\255\227
2914                             "column" numbers: 20-22.  */
2915                          "\345\255\227"
2916
2917                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2918                               UTF-8: 0xE5 0x8C 0x96
2919                               C octal escaped UTF-8: \345\214\226
2920                             "column" numbers: 23-25.  */
2921                          "\345\214\226"
2922
2923                          /* U+3051 HIRAGANA LETTER KE
2924                               UTF-8: 0xE3 0x81 0x91
2925                               C octal escaped UTF-8: \343\201\221
2926                             "column" numbers: 26-28.  */
2927                          "\343\201\221"
2928
2929                          /* column numbers 29 onwards
2930                           2333333.33334444444444
2931                           9012345.67890123456789. */
2932                          " after\" /* non-str */\n");
2933   lexer_test test (case_, content, NULL);
2934
2935   /* Verify that we get the expected token back, with the correct
2936      location information.  */
2937   const cpp_token *tok = test.get_token ();
2938   ASSERT_EQ (tok->type, CPP_STRING);
2939   ASSERT_TOKEN_AS_TEXT_EQ
2940     (test.m_parser, tok,
2941      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2942
2943   /* Verify that cpp_interpret_string works.  */
2944   cpp_string dst_string;
2945   const enum cpp_ttype type = CPP_STRING;
2946   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2947                                       &dst_string, type);
2948   ASSERT_TRUE (result);
2949   ASSERT_STREQ
2950     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2951      (const char *)dst_string.text);
2952   free (const_cast <unsigned char *> (dst_string.text));
2953
2954   /* Verify ranges of individual characters.  This no longer includes the
2955      opening quote, but does include the closing quote.
2956      Assuming that both source and execution encodings are UTF-8, we have
2957      a run of 25 octets in each, plus the NUL terminator.  */
2958   for (int i = 0; i < 25; i++)
2959     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2960   /* NUL-terminator should use the closing quote at column 35.  */
2961   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2962
2963   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2964 }
2965
2966 /* Test of string literal concatenation.  */
2967
2968 static void
2969 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2970 {
2971   /* Digits 0-9.
2972      .....................000000000.111111.11112222222222
2973      .....................123456789.012345.67890123456789.  */
2974   const char *content = ("        \"01234\" /* non-str */\n"
2975                          "        \"56789\" /* non-str */\n");
2976   lexer_test test (case_, content, NULL);
2977
2978   location_t input_locs[2];
2979
2980   /* Verify that we get the expected tokens back.  */
2981   auto_vec <cpp_string> input_strings;
2982   const cpp_token *tok_a = test.get_token ();
2983   ASSERT_EQ (tok_a->type, CPP_STRING);
2984   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2985   input_strings.safe_push (tok_a->val.str);
2986   input_locs[0] = tok_a->src_loc;
2987
2988   const cpp_token *tok_b = test.get_token ();
2989   ASSERT_EQ (tok_b->type, CPP_STRING);
2990   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2991   input_strings.safe_push (tok_b->val.str);
2992   input_locs[1] = tok_b->src_loc;
2993
2994   /* Verify that cpp_interpret_string works.  */
2995   cpp_string dst_string;
2996   const enum cpp_ttype type = CPP_STRING;
2997   bool result = cpp_interpret_string (test.m_parser,
2998                                       input_strings.address (), 2,
2999                                       &dst_string, type);
3000   ASSERT_TRUE (result);
3001   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3002   free (const_cast <unsigned char *> (dst_string.text));
3003
3004   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3005   test.m_concats.record_string_concatenation (2, input_locs);
3006
3007   location_t initial_loc = input_locs[0];
3008
3009   /* "01234" on line 1.  */
3010   for (int i = 0; i <= 4; i++)
3011     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3012   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3013   for (int i = 5; i <= 10; i++)
3014     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3015
3016   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3017 }
3018
3019 /* Another test of string literal concatenation.  */
3020
3021 static void
3022 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3023 {
3024   /* Digits 0-9.
3025      .....................000000000.111.11111112222222
3026      .....................123456789.012.34567890123456.  */
3027   const char *content = ("        \"01\" /* non-str */\n"
3028                          "        \"23\" /* non-str */\n"
3029                          "        \"45\" /* non-str */\n"
3030                          "        \"67\" /* non-str */\n"
3031                          "        \"89\" /* non-str */\n");
3032   lexer_test test (case_, content, NULL);
3033
3034   auto_vec <cpp_string> input_strings;
3035   location_t input_locs[5];
3036
3037   /* Verify that we get the expected tokens back.  */
3038   for (int i = 0; i < 5; i++)
3039     {
3040       const cpp_token *tok = test.get_token ();
3041       ASSERT_EQ (tok->type, CPP_STRING);
3042       input_strings.safe_push (tok->val.str);
3043       input_locs[i] = tok->src_loc;
3044     }
3045
3046   /* Verify that cpp_interpret_string works.  */
3047   cpp_string dst_string;
3048   const enum cpp_ttype type = CPP_STRING;
3049   bool result = cpp_interpret_string (test.m_parser,
3050                                       input_strings.address (), 5,
3051                                       &dst_string, type);
3052   ASSERT_TRUE (result);
3053   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3054   free (const_cast <unsigned char *> (dst_string.text));
3055
3056   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3057   test.m_concats.record_string_concatenation (5, input_locs);
3058
3059   location_t initial_loc = input_locs[0];
3060
3061   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3062      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3063      and expect get_source_range_for_substring to fail.
3064      However, for a string concatenation test, we can have a case
3065      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3066      but subsequent strings can be after it.
3067      Attempting to detect this within assert_char_at_range
3068      would overcomplicate the logic for the common test cases, so
3069      we detect it here.  */
3070   if (should_have_column_data_p (input_locs[0])
3071       && !should_have_column_data_p (input_locs[4]))
3072     {
3073       /* Verify that get_source_range_for_substring gracefully rejects
3074          this case.  */
3075       source_range actual_range;
3076       const char *err
3077         = get_source_range_for_char (test.m_parser, &test.m_concats,
3078                                      initial_loc, type, 0, &actual_range);
3079       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3080       return;
3081     }
3082
3083   for (int i = 0; i < 5; i++)
3084     for (int j = 0; j < 2; j++)
3085       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3086                             i + 1, 10 + j, 10 + j);
3087
3088   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3089   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3090
3091   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3092 }
3093
3094 /* Another test of string literal concatenation, this time combined with
3095    various kinds of escaped characters.  */
3096
3097 static void
3098 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3099 {
3100   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3101      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3102   const char *content
3103     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3104        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3105     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3106   lexer_test test (case_, content, NULL);
3107
3108   auto_vec <cpp_string> input_strings;
3109   location_t input_locs[4];
3110
3111   /* Verify that we get the expected tokens back.  */
3112   for (int i = 0; i < 4; i++)
3113     {
3114       const cpp_token *tok = test.get_token ();
3115       ASSERT_EQ (tok->type, CPP_STRING);
3116       input_strings.safe_push (tok->val.str);
3117       input_locs[i] = tok->src_loc;
3118     }
3119
3120   /* Verify that cpp_interpret_string works.  */
3121   cpp_string dst_string;
3122   const enum cpp_ttype type = CPP_STRING;
3123   bool result = cpp_interpret_string (test.m_parser,
3124                                       input_strings.address (), 4,
3125                                       &dst_string, type);
3126   ASSERT_TRUE (result);
3127   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3128   free (const_cast <unsigned char *> (dst_string.text));
3129
3130   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3131   test.m_concats.record_string_concatenation (4, input_locs);
3132
3133   location_t initial_loc = input_locs[0];
3134
3135   for (int i = 0; i <= 4; i++)
3136     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3137   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3138   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3139   for (int i = 7; i <= 9; i++)
3140     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3141
3142   /* NUL-terminator should use the location of the final closing quote.  */
3143   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3144
3145   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3146 }
3147
3148 /* Test of string literal in a macro.  */
3149
3150 static void
3151 test_lexer_string_locations_macro (const line_table_case &case_)
3152 {
3153   /* Digits 0-9.
3154      .....................0000000001111111111.22222222223.
3155      .....................1234567890123456789.01234567890.  */
3156   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3157                          "  MACRO");
3158   lexer_test test (case_, content, NULL);
3159
3160   /* Verify that we get the expected tokens back.  */
3161   const cpp_token *tok = test.get_token ();
3162   ASSERT_EQ (tok->type, CPP_PADDING);
3163
3164   tok = test.get_token ();
3165   ASSERT_EQ (tok->type, CPP_STRING);
3166   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3167
3168   /* Verify ranges of individual characters.  We ought to
3169      see columns within the macro definition.  */
3170   for (int i = 0; i <= 10; i++)
3171     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3172                           i, 1, 20 + i, 20 + i);
3173
3174   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3175
3176   tok = test.get_token ();
3177   ASSERT_EQ (tok->type, CPP_PADDING);
3178 }
3179
3180 /* Test of stringification of a macro argument.  */
3181
3182 static void
3183 test_lexer_string_locations_stringified_macro_argument
3184   (const line_table_case &case_)
3185 {
3186   /* .....................000000000111111111122222222223.
3187      .....................123456789012345678901234567890.  */
3188   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3189                          "MACRO(foo)\n");
3190   lexer_test test (case_, content, NULL);
3191
3192   /* Verify that we get the expected token back.  */
3193   const cpp_token *tok = test.get_token ();
3194   ASSERT_EQ (tok->type, CPP_PADDING);
3195
3196   tok = test.get_token ();
3197   ASSERT_EQ (tok->type, CPP_STRING);
3198   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3199
3200   /* We don't support getting the location of a stringified macro
3201      argument.  Verify that it fails gracefully.  */
3202   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3203                                   "cpp_interpret_string_1 failed");
3204
3205   tok = test.get_token ();
3206   ASSERT_EQ (tok->type, CPP_PADDING);
3207
3208   tok = test.get_token ();
3209   ASSERT_EQ (tok->type, CPP_PADDING);
3210 }
3211
3212 /* Ensure that we are fail gracefully if something attempts to pass
3213    in a location that isn't a string literal token.  Seen on this code:
3214
3215      const char a[] = " %d ";
3216      __builtin_printf (a, 0.5);
3217                        ^
3218
3219    when c-format.c erroneously used the indicated one-character
3220    location as the format string location, leading to a read past the
3221    end of a string buffer in cpp_interpret_string_1.  */
3222
3223 static void
3224 test_lexer_string_locations_non_string (const line_table_case &case_)
3225 {
3226   /* .....................000000000111111111122222222223.
3227      .....................123456789012345678901234567890.  */
3228   const char *content = ("         a\n");
3229   lexer_test test (case_, content, NULL);
3230
3231   /* Verify that we get the expected token back.  */
3232   const cpp_token *tok = test.get_token ();
3233   ASSERT_EQ (tok->type, CPP_NAME);
3234   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3235
3236   /* At this point, libcpp is attempting to interpret the name as a
3237      string literal, despite it not starting with a quote.  We don't detect
3238      that, but we should at least fail gracefully.  */
3239   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3240                                   "cpp_interpret_string_1 failed");
3241 }
3242
3243 /* Ensure that we can read substring information for a token which
3244    starts in one linemap and ends in another .  Adapted from
3245    gcc.dg/cpp/pr69985.c.  */
3246
3247 static void
3248 test_lexer_string_locations_long_line (const line_table_case &case_)
3249 {
3250   /* .....................000000.000111111111
3251      .....................123456.789012346789.  */
3252   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3253                          "     \"0123456789012345678901234567890123456789"
3254                          "0123456789012345678901234567890123456789"
3255                          "0123456789012345678901234567890123456789"
3256                          "0123456789\"\n");
3257
3258   lexer_test test (case_, content, NULL);
3259
3260   /* Verify that we get the expected token back.  */
3261   const cpp_token *tok = test.get_token ();
3262   ASSERT_EQ (tok->type, CPP_STRING);
3263
3264   if (!should_have_column_data_p (line_table->highest_location))
3265     return;
3266
3267   /* Verify ranges of individual characters.  */
3268   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3269   for (int i = 0; i < 131; i++)
3270     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3271                           i, 2, 7 + i, 7 + i);
3272 }
3273
3274 /* Test of locations within a raw string that doesn't contain a newline.  */
3275
3276 static void
3277 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3278 {
3279   /* .....................00.0000000111111111122.
3280      .....................12.3456789012345678901.  */
3281   const char *content = ("R\"foo(0123456789)foo\"\n");
3282   lexer_test test (case_, content, NULL);
3283
3284   /* Verify that we get the expected token back.  */
3285   const cpp_token *tok = test.get_token ();
3286   ASSERT_EQ (tok->type, CPP_STRING);
3287
3288   /* Verify that cpp_interpret_string works.  */
3289   cpp_string dst_string;
3290   const enum cpp_ttype type = CPP_STRING;
3291   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3292                                       &dst_string, type);
3293   ASSERT_TRUE (result);
3294   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3295   free (const_cast <unsigned char *> (dst_string.text));
3296
3297   if (!should_have_column_data_p (line_table->highest_location))
3298     return;
3299
3300   /* 0-9, plus the nil terminator.  */
3301   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3302   for (int i = 0; i < 11; i++)
3303     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3304                           i, 1, 7 + i, 7 + i);
3305 }
3306
3307 /* Test of locations within a raw string that contains a newline.  */
3308
3309 static void
3310 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3311 {
3312   /* .....................00.0000.
3313      .....................12.3456.  */
3314   const char *content = ("R\"foo(\n"
3315   /* .....................00000.
3316      .....................12345.  */
3317                          "hello\n"
3318                          "world\n"
3319   /* .....................00000.
3320      .....................12345.  */
3321                          ")foo\"\n");
3322   lexer_test test (case_, content, NULL);
3323
3324   /* Verify that we get the expected token back.  */
3325   const cpp_token *tok = test.get_token ();
3326   ASSERT_EQ (tok->type, CPP_STRING);
3327
3328   /* Verify that cpp_interpret_string works.  */
3329   cpp_string dst_string;
3330   const enum cpp_ttype type = CPP_STRING;
3331   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3332                                       &dst_string, type);
3333   ASSERT_TRUE (result);
3334   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3335   free (const_cast <unsigned char *> (dst_string.text));
3336
3337   if (!should_have_column_data_p (line_table->highest_location))
3338     return;
3339
3340   /* Currently we don't support locations within raw strings that
3341      contain newlines.  */
3342   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3343                                   "range endpoints are on different lines");
3344 }
3345
3346 /* Test of parsing an unterminated raw string.  */
3347
3348 static void
3349 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3350 {
3351   const char *content = "R\"ouch()ouCh\" /* etc */";
3352
3353   lexer_error_sink errors;
3354   lexer_test test (case_, content, &errors);
3355   test.m_implicitly_expect_EOF = false;
3356
3357   /* Attempt to parse the raw string.  */
3358   const cpp_token *tok = test.get_token ();
3359   ASSERT_EQ (tok->type, CPP_EOF);
3360
3361   ASSERT_EQ (1, errors.m_errors.length ());
3362   /* We expect the message "unterminated raw string"
3363      in the "cpplib" translation domain.
3364      It's not clear that dgettext is available on all supported hosts,
3365      so this assertion is commented-out for now.
3366        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3367                      errors.m_errors[0]);
3368   */
3369 }
3370
3371 /* Test of lexing char constants.  */
3372
3373 static void
3374 test_lexer_char_constants (const line_table_case &case_)
3375 {
3376   /* Various char constants.
3377      .....................0000000001111111111.22222222223.
3378      .....................1234567890123456789.01234567890.  */
3379   const char *content = ("         'a'\n"
3380                          "        u'a'\n"
3381                          "        U'a'\n"
3382                          "        L'a'\n"
3383                          "         'abc'\n");
3384   lexer_test test (case_, content, NULL);
3385
3386   /* Verify that we get the expected tokens back.  */
3387   /* 'a'.  */
3388   const cpp_token *tok = test.get_token ();
3389   ASSERT_EQ (tok->type, CPP_CHAR);
3390   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3391
3392   unsigned int chars_seen;
3393   int unsignedp;
3394   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3395                                           &chars_seen, &unsignedp);
3396   ASSERT_EQ (cc, 'a');
3397   ASSERT_EQ (chars_seen, 1);
3398
3399   /* u'a'.  */
3400   tok = test.get_token ();
3401   ASSERT_EQ (tok->type, CPP_CHAR16);
3402   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3403
3404   /* U'a'.  */
3405   tok = test.get_token ();
3406   ASSERT_EQ (tok->type, CPP_CHAR32);
3407   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3408
3409   /* L'a'.  */
3410   tok = test.get_token ();
3411   ASSERT_EQ (tok->type, CPP_WCHAR);
3412   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3413
3414   /* 'abc' (c-char-sequence).  */
3415   tok = test.get_token ();
3416   ASSERT_EQ (tok->type, CPP_CHAR);
3417   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3418 }
3419 /* A table of interesting location_t values, giving one axis of our test
3420    matrix.  */
3421
3422 static const location_t boundary_locations[] = {
3423   /* Zero means "don't override the default values for a new line_table".  */
3424   0,
3425
3426   /* An arbitrary non-zero value that isn't close to one of
3427      the boundary values below.  */
3428   0x10000,
3429
3430   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3431   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3432   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3433   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3434   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3435   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3436
3437   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3438   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3439   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3440   LINE_MAP_MAX_LOCATION_WITH_COLS,
3441   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3442   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3443 };
3444
3445 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3446
3447 void
3448 for_each_line_table_case (void (*testcase) (const line_table_case &))
3449 {
3450   /* As noted above in the description of struct line_table_case,
3451      we want to explore a test matrix of interesting line_table
3452      situations, running various selftests for each case within the
3453      matrix.  */
3454
3455   /* Run all tests with:
3456      (a) line_table->default_range_bits == 0, and
3457      (b) line_table->default_range_bits == 5.  */
3458   int num_cases_tested = 0;
3459   for (int default_range_bits = 0; default_range_bits <= 5;
3460        default_range_bits += 5)
3461     {
3462       /* ...and use each of the "interesting" location values as
3463          the starting location within line_table.  */
3464       const int num_boundary_locations
3465         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3466       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3467         {
3468           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3469
3470           testcase (c);
3471
3472           num_cases_tested++;
3473         }
3474     }
3475
3476   /* Verify that we fully covered the test matrix.  */
3477   ASSERT_EQ (num_cases_tested, 2 * 12);
3478 }
3479
3480 /* Run all of the selftests within this file.  */
3481
3482 void
3483 input_c_tests ()
3484 {
3485   test_should_have_column_data_p ();
3486   test_unknown_location ();
3487   test_builtins ();
3488   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3489
3490   for_each_line_table_case (test_accessing_ordinary_linemaps);
3491   for_each_line_table_case (test_lexer);
3492   for_each_line_table_case (test_lexer_string_locations_simple);
3493   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3494   for_each_line_table_case (test_lexer_string_locations_hex);
3495   for_each_line_table_case (test_lexer_string_locations_oct);
3496   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3497   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3498   for_each_line_table_case (test_lexer_string_locations_ucn4);
3499   for_each_line_table_case (test_lexer_string_locations_ucn8);
3500   for_each_line_table_case (test_lexer_string_locations_wide_string);
3501   for_each_line_table_case (test_lexer_string_locations_string16);
3502   for_each_line_table_case (test_lexer_string_locations_string32);
3503   for_each_line_table_case (test_lexer_string_locations_u8);
3504   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3505   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3506   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3507   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3508   for_each_line_table_case (test_lexer_string_locations_macro);
3509   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3510   for_each_line_table_case (test_lexer_string_locations_non_string);
3511   for_each_line_table_case (test_lexer_string_locations_long_line);
3512   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3513   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3514   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3515   for_each_line_table_case (test_lexer_char_constants);
3516
3517   test_reading_source_line ();
3518 }
3519
3520 } // namespace selftest
3521
3522 #endif /* CHECKING_P */