gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2016 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   /* The file_path is the key for identifying a particular file in
  67      the cache.
  68      For libcpp-using code, the underlying buffer for this field is
  69      owned by the corresponding _cpp_file within the cpp_reader.  */
  70   const char *file_path;
  71
  72   FILE *fp;
  73
  74   /* This points to the content of the file that we've read so
  75      far.  */
  76   char *data;
  77
  78   /*  The size of the DATA array above.*/
  79   size_t size;
  80
  81   /* The number of bytes read from the underlying file so far.  This
  82      must be less (or equal) than SIZE above.  */
  83   size_t nb_read;
  84
  85   /* The index of the beginning of the current line.  */
  86   size_t line_start_idx;
  87
  88   /* The number of the previous line read.  This starts at 1.  Zero
  89      means we've read no line so far.  */
  90   size_t line_num;
  91
  92   /* This is the total number of lines of the current file.  At the
  93      moment, we try to get this information from the line map
  94      subsystem.  Note that this is just a hint.  When using the C++
  95      front-end, this hint is correct because the input file is then
  96      completely tokenized before parsing starts; so the line map knows
  97      the number of lines before compilation really starts.  For e.g,
  98      the C front-end, it can happen that we start emitting diagnostics
  99      before the line map has seen the end of the file.  */
 100   size_t total_lines;
 101
 102   /* Could this file be missing a trailing newline on its final line?
 103      Initially true (to cope with empty files), set to true/false
 104      as each line is read.  */
 105   bool missing_trailing_newline;
 106
 107   /* This is a record of the beginning and end of the lines we've seen
 108      while reading the file.  This is useful to avoid walking the data
 109      from the beginning when we are asked to read a line that is
 110      before LINE_START_IDX above.  Note that the maximum size of this
 111      record is fcache_line_record_size, so that the memory consumption
 112      doesn't explode.  We thus scale total_lines down to
 113      fcache_line_record_size.  */
 114   vec<line_info, va_heap> line_record;
 115
 116   fcache ();
 117   ~fcache ();
 118 };
 119
 120 /* Current position in real source file.  */
 121
 122 location_t input_location = UNKNOWN_LOCATION;
 123
 124 struct line_maps *line_table;
 125
 126 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 127    This needs to be a global so that it can be a GC root, and thus
 128    prevent the stashed copy from being garbage-collected if the GC runs
 129    during a line_table_test.  */
 130
 131 struct line_maps *saved_line_table;
 132
 133 static fcache *fcache_tab;
 134 static const size_t fcache_tab_size = 16;
 135 static const size_t fcache_buffer_size = 4 * 1024;
 136 static const size_t fcache_line_record_size = 100;
 137
 138 /* Expand the source location LOC into a human readable location.  If
 139    LOC resolves to a builtin location, the file name of the readable
 140    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 141    TRUE and LOC is virtual, then it is resolved to the expansion
 142    point of the involved macro.  Otherwise, it is resolved to the
 143    spelling location of the token.
 144
 145    When resolving to the spelling location of the token, if the
 146    resulting location is for a built-in location (that is, it has no
 147    associated line/column) in the context of a macro expansion, the
 148    returned location is the first one (while unwinding the macro
 149    location towards its expansion point) that is in real source
 150    code.  */
 151
 152 static expanded_location
 153 expand_location_1 (source_location loc,
 154                    bool expansion_point_p)
 155 {
 156   expanded_location xloc;
 157   const line_map_ordinary *map;
 158   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 159   tree block = NULL;
 160
 161   if (IS_ADHOC_LOC (loc))
 162     {
 163       block = LOCATION_BLOCK (loc);
 164       loc = LOCATION_LOCUS (loc);
 165     }
 166
 167   memset (&xloc, 0, sizeof (xloc));
 168
 169   if (loc >= RESERVED_LOCATION_COUNT)
 170     {
 171       if (!expansion_point_p)
 172         {
 173           /* We want to resolve LOC to its spelling location.
 174
 175              But if that spelling location is a reserved location that
 176              appears in the context of a macro expansion (like for a
 177              location for a built-in token), let's consider the first
 178              location (toward the expansion point) that is not reserved;
 179              that is, the first location that is in real source code.  */
 180           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 181                                                           loc, NULL);
 182           lrk = LRK_SPELLING_LOCATION;
 183         }
 184       loc = linemap_resolve_location (line_table, loc,
 185                                       lrk, &map);
 186       xloc = linemap_expand_location (line_table, map, loc);
 187     }
 188
 189   xloc.data = block;
 190   if (loc <= BUILTINS_LOCATION)
 191     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 192
 193   return xloc;
 194 }
 195
 196 /* Initialize the set of cache used for files accessed by caret
 197    diagnostic.  */
 198
 199 static void
 200 diagnostic_file_cache_init (void)
 201 {
 202   if (fcache_tab == NULL)
 203     fcache_tab = new fcache[fcache_tab_size];
 204 }
 205
 206 /* Free the resources used by the set of cache used for files accessed
 207    by caret diagnostic.  */
 208
 209 void
 210 diagnostic_file_cache_fini (void)
 211 {
 212   if (fcache_tab)
 213     {
 214       delete [] (fcache_tab);
 215       fcache_tab = NULL;
 216     }
 217 }
 218
 219 /* Return the total lines number that have been read so far by the
 220    line map (in the preprocessor) so far.  For languages like C++ that
 221    entirely preprocess the input file before starting to parse, this
 222    equals the actual number of lines of the file.  */
 223
 224 static size_t
 225 total_lines_num (const char *file_path)
 226 {
 227   size_t r = 0;
 228   source_location l = 0;
 229   if (linemap_get_file_highest_location (line_table, file_path, &l))
 230     {
 231       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 232       expanded_location xloc = expand_location (l);
 233       r = xloc.line;
 234     }
 235   return r;
 236 }
 237
 238 /* Lookup the cache used for the content of a given file accessed by
 239    caret diagnostic.  Return the found cached file, or NULL if no
 240    cached file was found.  */
 241
 242 static fcache*
 243 lookup_file_in_cache_tab (const char *file_path)
 244 {
 245   if (file_path == NULL)
 246     return NULL;
 247
 248   diagnostic_file_cache_init ();
 249
 250   /* This will contain the found cached file.  */
 251   fcache *r = NULL;
 252   for (unsigned i = 0; i < fcache_tab_size; ++i)
 253     {
 254       fcache *c = &fcache_tab[i];
 255       if (c->file_path && !strcmp (c->file_path, file_path))
 256         {
 257           ++c->use_count;
 258           r = c;
 259         }
 260     }
 261
 262   if (r)
 263     ++r->use_count;
 264
 265   return r;
 266 }
 267
 268 /* Purge any mention of FILENAME from the cache of files used for
 269    printing source code.  For use in selftests when working
 270    with tempfiles.  */
 271
 272 void
 273 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 274 {
 275   gcc_assert (file_path);
 276
 277   fcache *r = lookup_file_in_cache_tab (file_path);
 278   if (!r)
 279     /* Not found.  */
 280     return;
 281
 282   r->file_path = NULL;
 283   if (r->fp)
 284     fclose (r->fp);
 285   r->fp = NULL;
 286   r->nb_read = 0;
 287   r->line_start_idx = 0;
 288   r->line_num = 0;
 289   r->line_record.truncate (0);
 290   r->use_count = 0;
 291   r->total_lines = 0;
 292   r->missing_trailing_newline = true;
 293 }
 294
 295 /* Return the file cache that has been less used, recently, or the
 296    first empty one.  If HIGHEST_USE_COUNT is non-null,
 297    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 298    in the cache table.  */
 299
 300 static fcache*
 301 evicted_cache_tab_entry (unsigned *highest_use_count)
 302 {
 303   diagnostic_file_cache_init ();
 304
 305   fcache *to_evict = &fcache_tab[0];
 306   unsigned huc = to_evict->use_count;
 307   for (unsigned i = 1; i < fcache_tab_size; ++i)
 308     {
 309       fcache *c = &fcache_tab[i];
 310       bool c_is_empty = (c->file_path == NULL);
 311
 312       if (c->use_count < to_evict->use_count
 313           || (to_evict->file_path && c_is_empty))
 314         /* We evict C because it's either an entry with a lower use
 315            count or one that is empty.  */
 316         to_evict = c;
 317
 318       if (huc < c->use_count)
 319         huc = c->use_count;
 320
 321       if (c_is_empty)
 322         /* We've reached the end of the cache; subsequent elements are
 323            all empty.  */
 324         break;
 325     }
 326
 327   if (highest_use_count)
 328     *highest_use_count = huc;
 329
 330   return to_evict;
 331 }
 332
 333 /* Create the cache used for the content of a given file to be
 334    accessed by caret diagnostic.  This cache is added to an array of
 335    cache and can be retrieved by lookup_file_in_cache_tab.  This
 336    function returns the created cache.  Note that only the last
 337    fcache_tab_size files are cached.  */
 338
 339 static fcache*
 340 add_file_to_cache_tab (const char *file_path)
 341 {
 342
 343   FILE *fp = fopen (file_path, "r");
 344   if (fp == NULL)
 345     return NULL;
 346
 347   unsigned highest_use_count = 0;
 348   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 349   r->file_path = file_path;
 350   if (r->fp)
 351     fclose (r->fp);
 352   r->fp = fp;
 353   r->nb_read = 0;
 354   r->line_start_idx = 0;
 355   r->line_num = 0;
 356   r->line_record.truncate (0);
 357   /* Ensure that this cache entry doesn't get evicted next time
 358      add_file_to_cache_tab is called.  */
 359   r->use_count = ++highest_use_count;
 360   r->total_lines = total_lines_num (file_path);
 361   r->missing_trailing_newline = true;
 362
 363   return r;
 364 }
 365
 366 /* Lookup the cache used for the content of a given file accessed by
 367    caret diagnostic.  If no cached file was found, create a new cache
 368    for this file, add it to the array of cached file and return
 369    it.  */
 370
 371 static fcache*
 372 lookup_or_add_file_to_cache_tab (const char *file_path)
 373 {
 374   fcache *r = lookup_file_in_cache_tab (file_path);
 375   if (r == NULL)
 376     r = add_file_to_cache_tab (file_path);
 377   return r;
 378 }
 379
 380 /* Default constructor for a cache of file used by caret
 381    diagnostic.  */
 382
 383 fcache::fcache ()
 384 : use_count (0), file_path (NULL), fp (NULL), data (0),
 385   size (0), nb_read (0), line_start_idx (0), line_num (0),
 386   total_lines (0), missing_trailing_newline (true)
 387 {
 388   line_record.create (0);
 389 }
 390
 391 /* Destructor for a cache of file used by caret diagnostic.  */
 392
 393 fcache::~fcache ()
 394 {
 395   if (fp)
 396     {
 397       fclose (fp);
 398       fp = NULL;
 399     }
 400   if (data)
 401     {
 402       XDELETEVEC (data);
 403       data = 0;
 404     }
 405   line_record.release ();
 406 }
 407
 408 /* Returns TRUE iff the cache would need to be filled with data coming
 409    from the file.  That is, either the cache is empty or full or the
 410    current line is empty.  Note that if the cache is full, it would
 411    need to be extended and filled again.  */
 412
 413 static bool
 414 needs_read (fcache *c)
 415 {
 416   return (c->nb_read == 0
 417           || c->nb_read == c->size
 418           || (c->line_start_idx >= c->nb_read - 1));
 419 }
 420
 421 /*  Return TRUE iff the cache is full and thus needs to be
 422     extended.  */
 423
 424 static bool
 425 needs_grow (fcache *c)
 426 {
 427   return c->nb_read == c->size;
 428 }
 429
 430 /* Grow the cache if it needs to be extended.  */
 431
 432 static void
 433 maybe_grow (fcache *c)
 434 {
 435   if (!needs_grow (c))
 436     return;
 437
 438   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 439   c->data = XRESIZEVEC (char, c->data, size);
 440   c->size = size;
 441 }
 442
 443 /*  Read more data into the cache.  Extends the cache if need be.
 444     Returns TRUE iff new data could be read.  */
 445
 446 static bool
 447 read_data (fcache *c)
 448 {
 449   if (feof (c->fp) || ferror (c->fp))
 450     return false;
 451
 452   maybe_grow (c);
 453
 454   char * from = c->data + c->nb_read;
 455   size_t to_read = c->size - c->nb_read;
 456   size_t nb_read = fread (from, 1, to_read, c->fp);
 457
 458   if (ferror (c->fp))
 459     return false;
 460
 461   c->nb_read += nb_read;
 462   return !!nb_read;
 463 }
 464
 465 /* Read new data iff the cache needs to be filled with more data
 466    coming from the file FP.  Return TRUE iff the cache was filled with
 467    mode data.  */
 468
 469 static bool
 470 maybe_read_data (fcache *c)
 471 {
 472   if (!needs_read (c))
 473     return false;
 474   return read_data (c);
 475 }
 476
 477 /* Read a new line from file FP, using C as a cache for the data
 478    coming from the file.  Upon successful completion, *LINE is set to
 479    the beginning of the line found.  *LINE points directly in the
 480    line cache and is only valid until the next call of get_next_line.
 481    *LINE_LEN is set to the length of the line.  Note that the line
 482    does not contain any terminal delimiter.  This function returns
 483    true if some data was read or process from the cache, false
 484    otherwise.  Note that subsequent calls to get_next_line might
 485    make the content of *LINE invalid.  */
 486
 487 static bool
 488 get_next_line (fcache *c, char **line, ssize_t *line_len)
 489 {
 490   /* Fill the cache with data to process.  */
 491   maybe_read_data (c);
 492
 493   size_t remaining_size = c->nb_read - c->line_start_idx;
 494   if (remaining_size == 0)
 495     /* There is no more data to process.  */
 496     return false;
 497
 498   char *line_start = c->data + c->line_start_idx;
 499
 500   char *next_line_start = NULL;
 501   size_t len = 0;
 502   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 503   if (line_end == NULL)
 504     {
 505       /* We haven't found the end-of-line delimiter in the cache.
 506          Fill the cache with more data from the file and look for the
 507          '\n'.  */
 508       while (maybe_read_data (c))
 509         {
 510           line_start = c->data + c->line_start_idx;
 511           remaining_size = c->nb_read - c->line_start_idx;
 512           line_end = (char *) memchr (line_start, '\n', remaining_size);
 513           if (line_end != NULL)
 514             {
 515               next_line_start = line_end + 1;
 516               break;
 517             }
 518         }
 519       if (line_end == NULL)
 520         {
 521           /* We've loadded all the file into the cache and still no
 522              '\n'.  Let's say the line ends up at one byte passed the
 523              end of the file.  This is to stay consistent with the case
 524              of when the line ends up with a '\n' and line_end points to
 525              that terminal '\n'.  That consistency is useful below in
 526              the len calculation.  */
 527           line_end = c->data + c->nb_read ;
 528           c->missing_trailing_newline = true;
 529         }
 530       else
 531         c->missing_trailing_newline = false;
 532     }
 533   else
 534     {
 535       next_line_start = line_end + 1;
 536       c->missing_trailing_newline = false;
 537     }
 538
 539   if (ferror (c->fp))
 540     return false;
 541
 542   /* At this point, we've found the end of the of line.  It either
 543      points to the '\n' or to one byte after the last byte of the
 544      file.  */
 545   gcc_assert (line_end != NULL);
 546
 547   len = line_end - line_start;
 548
 549   if (c->line_start_idx < c->nb_read)
 550     *line = line_start;
 551
 552   ++c->line_num;
 553
 554   /* Before we update our line record, make sure the hint about the
 555      total number of lines of the file is correct.  If it's not, then
 556      we give up recording line boundaries from now on.  */
 557   bool update_line_record = true;
 558   if (c->line_num > c->total_lines)
 559     update_line_record = false;
 560
 561     /* Now update our line record so that re-reading lines from the
 562      before c->line_start_idx is faster.  */
 563   if (update_line_record
 564       && c->line_record.length () < fcache_line_record_size)
 565     {
 566       /* If the file lines fits in the line record, we just record all
 567          its lines ...*/
 568       if (c->total_lines <= fcache_line_record_size
 569           && c->line_num > c->line_record.length ())
 570         c->line_record.safe_push (fcache::line_info (c->line_num,
 571                                                  c->line_start_idx,
 572                                                  line_end - c->data));
 573       else if (c->total_lines > fcache_line_record_size)
 574         {
 575           /* ... otherwise, we just scale total_lines down to
 576              (fcache_line_record_size lines.  */
 577           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 578           if (c->line_record.length () == 0
 579               || n >= c->line_record.length ())
 580             c->line_record.safe_push (fcache::line_info (c->line_num,
 581                                                      c->line_start_idx,
 582                                                      line_end - c->data));
 583         }
 584     }
 585
 586   /* Update c->line_start_idx so that it points to the next line to be
 587      read.  */
 588   if (next_line_start)
 589     c->line_start_idx = next_line_start - c->data;
 590   else
 591     /* We didn't find any terminal '\n'.  Let's consider that the end
 592        of line is the end of the data in the cache.  The next
 593        invocation of get_next_line will either read more data from the
 594        underlying file or return false early because we've reached the
 595        end of the file.  */
 596     c->line_start_idx = c->nb_read;
 597
 598   *line_len = len;
 599
 600   return true;
 601 }
 602
 603 /* Consume the next bytes coming from the cache (or from its
 604    underlying file if there are remaining unread bytes in the file)
 605    until we reach the next end-of-line (or end-of-file).  There is no
 606    copying from the cache involved.  Return TRUE upon successful
 607    completion.  */
 608
 609 static bool
 610 goto_next_line (fcache *cache)
 611 {
 612   char *l;
 613   ssize_t len;
 614
 615   return get_next_line (cache, &l, &len);
 616 }
 617
 618 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 619    If the line was read successfully, *LINE points to the beginning
 620    of the line in the file cache and *LINE_LEN is the length of the
 621    line.  *LINE is not nul-terminated, but may contain zero bytes.
 622    *LINE is only valid until the next call of read_line_num.
 623    This function returns bool if a line was read.  */
 624
 625 static bool
 626 read_line_num (fcache *c, size_t line_num,
 627                char **line, ssize_t *line_len)
 628 {
 629   gcc_assert (line_num > 0);
 630
 631   if (line_num <= c->line_num)
 632     {
 633       /* We've been asked to read lines that are before c->line_num.
 634          So lets use our line record (if it's not empty) to try to
 635          avoid re-reading the file from the beginning again.  */
 636
 637       if (c->line_record.is_empty ())
 638         {
 639           c->line_start_idx = 0;
 640           c->line_num = 0;
 641         }
 642       else
 643         {
 644           fcache::line_info *i = NULL;
 645           if (c->total_lines <= fcache_line_record_size)
 646             {
 647               /* In languages where the input file is not totally
 648                  preprocessed up front, the c->total_lines hint
 649                  can be smaller than the number of lines of the
 650                  file.  In that case, only the first
 651                  c->total_lines have been recorded.
 652
 653                  Otherwise, the first c->total_lines we've read have
 654                  their start/end recorded here.  */
 655               i = (line_num <= c->total_lines)
 656                 ? &c->line_record[line_num - 1]
 657                 : &c->line_record[c->total_lines - 1];
 658               gcc_assert (i->line_num <= line_num);
 659             }
 660           else
 661             {
 662               /*  So the file had more lines than our line record
 663                   size.  Thus the number of lines we've recorded has
 664                   been scaled down to fcache_line_reacord_size.  Let's
 665                   pick the start/end of the recorded line that is
 666                   closest to line_num.  */
 667               size_t n = (line_num <= c->total_lines)
 668                 ? line_num * fcache_line_record_size / c->total_lines
 669                 : c ->line_record.length () - 1;
 670               if (n < c->line_record.length ())
 671                 {
 672                   i = &c->line_record[n];
 673                   gcc_assert (i->line_num <= line_num);
 674                 }
 675             }
 676
 677           if (i && i->line_num == line_num)
 678             {
 679               /* We have the start/end of the line.  */
 680               *line = c->data + i->start_pos;
 681               *line_len = i->end_pos - i->start_pos;
 682               return true;
 683             }
 684
 685           if (i)
 686             {
 687               c->line_start_idx = i->start_pos;
 688               c->line_num = i->line_num - 1;
 689             }
 690           else
 691             {
 692               c->line_start_idx = 0;
 693               c->line_num = 0;
 694             }
 695         }
 696     }
 697
 698   /*  Let's walk from line c->line_num up to line_num - 1, without
 699       copying any line.  */
 700   while (c->line_num < line_num - 1)
 701     if (!goto_next_line (c))
 702       return false;
 703
 704   /* The line we want is the next one.  Let's read and copy it back to
 705      the caller.  */
 706   return get_next_line (c, line, line_len);
 707 }
 708
 709 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 710    The line is not nul-terminated.  The returned pointer is only
 711    valid until the next call of location_get_source_line.
 712    Note that the line can contain several null characters,
 713    so LINE_LEN, if non-null, points to the actual length of the line.
 714    If the function fails, NULL is returned.  */
 715
 716 const char *
 717 location_get_source_line (const char *file_path, int line,
 718                           int *line_len)
 719 {
 720   char *buffer = NULL;
 721   ssize_t len;
 722
 723   if (line == 0)
 724     return NULL;
 725
 726   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 727   if (c == NULL)
 728     return NULL;
 729
 730   bool read = read_line_num (c, line, &buffer, &len);
 731
 732   if (read && line_len)
 733     *line_len = len;
 734
 735   return read ? buffer : NULL;
 736 }
 737
 738 /* Determine if FILE_PATH missing a trailing newline on its final line.
 739    Only valid to call once all of the file has been loaded, by
 740    requesting a line number beyond the end of the file.  */
 741
 742 bool
 743 location_missing_trailing_newline (const char *file_path)
 744 {
 745   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 746   if (c == NULL)
 747     return false;
 748
 749   return c->missing_trailing_newline;
 750 }
 751
 752 /* Test if the location originates from the spelling location of a
 753    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 754    virtual) location of a built-in token that appears in the expansion
 755    list of a macro.  Please note that this function also works on
 756    tokens that result from built-in tokens.  For instance, the
 757    function would return true if passed a token "4" that is the result
 758    of the expansion of the built-in __LINE__ macro.  */
 759 bool
 760 is_location_from_builtin_token (source_location loc)
 761 {
 762   const line_map_ordinary *map = NULL;
 763   loc = linemap_resolve_location (line_table, loc,
 764                                   LRK_SPELLING_LOCATION, &map);
 765   return loc == BUILTINS_LOCATION;
 766 }
 767
 768 /* Expand the source location LOC into a human readable location.  If
 769    LOC is virtual, it resolves to the expansion point of the involved
 770    macro.  If LOC resolves to a builtin location, the file name of the
 771    readable location is set to the string "<built-in>".  */
 772
 773 expanded_location
 774 expand_location (source_location loc)
 775 {
 776   return expand_location_1 (loc, /*expansion_point_p=*/true);
 777 }
 778
 779 /* Expand the source location LOC into a human readable location.  If
 780    LOC is virtual, it resolves to the expansion location of the
 781    relevant macro.  If LOC resolves to a builtin location, the file
 782    name of the readable location is set to the string
 783    "<built-in>".  */
 784
 785 expanded_location
 786 expand_location_to_spelling_point (source_location loc)
 787 {
 788   return expand_location_1 (loc, /*expansion_point_p=*/false);
 789 }
 790
 791 /* The rich_location class within libcpp requires a way to expand
 792    source_location instances, and relies on the client code
 793    providing a symbol named
 794      linemap_client_expand_location_to_spelling_point
 795    to do this.
 796
 797    This is the implementation for libcommon.a (all host binaries),
 798    which simply calls into expand_location_to_spelling_point.  */
 799
 800 expanded_location
 801 linemap_client_expand_location_to_spelling_point (source_location loc)
 802 {
 803   return expand_location_to_spelling_point (loc);
 804 }
 805
 806
 807 /* If LOCATION is in a system header and if it is a virtual location for
 808    a token coming from the expansion of a macro, unwind it to the
 809    location of the expansion point of the macro.  Otherwise, just return
 810    LOCATION.
 811
 812    This is used for instance when we want to emit diagnostics about a
 813    token that may be located in a macro that is itself defined in a
 814    system header, for example, for the NULL macro.  In such a case, if
 815    LOCATION were passed directly to diagnostic functions such as
 816    warning_at, the diagnostic would be suppressed (unless
 817    -Wsystem-headers).  */
 818
 819 source_location
 820 expansion_point_location_if_in_system_header (source_location location)
 821 {
 822   if (in_system_header_at (location))
 823     location = linemap_resolve_location (line_table, location,
 824                                          LRK_MACRO_EXPANSION_POINT,
 825                                          NULL);
 826   return location;
 827 }
 828
 829 /* If LOCATION is a virtual location for a token coming from the expansion
 830    of a macro, unwind to the location of the expansion point of the macro.  */
 831
 832 source_location
 833 expansion_point_location (source_location location)
 834 {
 835   return linemap_resolve_location (line_table, location,
 836                                    LRK_MACRO_EXPANSION_POINT, NULL);
 837 }
 838
 839 /* Construct a location with caret at CARET, ranging from START to
 840    finish e.g.
 841
 842                  11111111112
 843         12345678901234567890
 844      522
 845      523   return foo + bar;
 846                   ~~~~^~~~~
 847      524
 848
 849    The location's caret is at the "+", line 523 column 15, but starts
 850    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 851    of "bar" at column 19.  */
 852
 853 location_t
 854 make_location (location_t caret, location_t start, location_t finish)
 855 {
 856   location_t pure_loc = get_pure_location (caret);
 857   source_range src_range;
 858   src_range.m_start = get_start (start);
 859   src_range.m_finish = get_finish (finish);
 860   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 861                                                    pure_loc,
 862                                                    src_range,
 863                                                    NULL);
 864   return combined_loc;
 865 }
 866
 867 #define ONE_K 1024
 868 #define ONE_M (ONE_K * ONE_K)
 869
 870 /* Display a number as an integer multiple of either:
 871    - 1024, if said integer is >= to 10 K (in base 2)
 872    - 1024 * 1024, if said integer is >= 10 M in (base 2)
 873  */
 874 #define SCALE(x) ((unsigned long) ((x) < 10 * ONE_K \
 875                   ? (x) \
 876                   : ((x) < 10 * ONE_M \
 877                      ? (x) / ONE_K \
 878                      : (x) / ONE_M)))
 879
 880 /* For a given integer, display either:
 881    - the character 'k', if the number is higher than 10 K (in base 2)
 882      but strictly lower than 10 M (in base 2)
 883    - the character 'M' if the number is higher than 10 M (in base2)
 884    - the charcter ' ' if the number is strictly lower  than 10 K  */
 885 #define STAT_LABEL(x) ((x) < 10 * ONE_K ? ' ' : ((x) < 10 * ONE_M ? 'k' : 'M'))
 886
 887 /* Display an integer amount as multiple of 1K or 1M (in base 2).
 888    Display the correct unit (either k, M, or ' ') after the amout, as
 889    well.  */
 890 #define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 891
 892 /* Dump statistics to stderr about the memory usage of the line_table
 893    set of line maps.  This also displays some statistics about macro
 894    expansion.  */
 895
 896 void
 897 dump_line_table_statistics (void)
 898 {
 899   struct linemap_stats s;
 900   long total_used_map_size,
 901     macro_maps_size,
 902     total_allocated_map_size;
 903
 904   memset (&s, 0, sizeof (s));
 905
 906   linemap_get_statistics (line_table, &s);
 907
 908   macro_maps_size = s.macro_maps_used_size
 909     + s.macro_maps_locations_size;
 910
 911   total_allocated_map_size = s.ordinary_maps_allocated_size
 912     + s.macro_maps_allocated_size
 913     + s.macro_maps_locations_size;
 914
 915   total_used_map_size = s.ordinary_maps_used_size
 916     + s.macro_maps_used_size
 917     + s.macro_maps_locations_size;
 918
 919   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 920            s.num_expanded_macros);
 921   if (s.num_expanded_macros != 0)
 922     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 923              s.num_macro_tokens / s.num_expanded_macros);
 924   fprintf (stderr,
 925            "\nLine Table allocations during the "
 926            "compilation process\n");
 927   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 928            SCALE (s.num_ordinary_maps_used),
 929            STAT_LABEL (s.num_ordinary_maps_used));
 930   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 931            SCALE (s.ordinary_maps_used_size),
 932            STAT_LABEL (s.ordinary_maps_used_size));
 933   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 934            SCALE (s.num_ordinary_maps_allocated),
 935            STAT_LABEL (s.num_ordinary_maps_allocated));
 936   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 937            SCALE (s.ordinary_maps_allocated_size),
 938            STAT_LABEL (s.ordinary_maps_allocated_size));
 939   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 940            SCALE (s.num_macro_maps_used),
 941            STAT_LABEL (s.num_macro_maps_used));
 942   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 943            SCALE (s.macro_maps_used_size),
 944            STAT_LABEL (s.macro_maps_used_size));
 945   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 946            SCALE (s.macro_maps_locations_size),
 947            STAT_LABEL (s.macro_maps_locations_size));
 948   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 949            SCALE (macro_maps_size),
 950            STAT_LABEL (macro_maps_size));
 951   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 952            SCALE (s.duplicated_macro_maps_locations_size),
 953            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 954   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 955            SCALE (total_allocated_map_size),
 956            STAT_LABEL (total_allocated_map_size));
 957   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 958            SCALE (total_used_map_size),
 959            STAT_LABEL (total_used_map_size));
 960   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 961            SCALE (s.adhoc_table_size),
 962            STAT_LABEL (s.adhoc_table_size));
 963   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 964            s.adhoc_table_entries_used);
 965   fprintf (stderr, "optimized_ranges: %i\n",
 966            line_table->num_optimized_ranges);
 967   fprintf (stderr, "unoptimized_ranges: %i\n",
 968            line_table->num_unoptimized_ranges);
 969
 970   fprintf (stderr, "\n");
 971 }
 972
 973 /* Get location one beyond the final location in ordinary map IDX.  */
 974
 975 static source_location
 976 get_end_location (struct line_maps *set, unsigned int idx)
 977 {
 978   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
 979     return set->highest_location;
 980
 981   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
 982   return MAP_START_LOCATION (next_map);
 983 }
 984
 985 /* Helper function for write_digit_row.  */
 986
 987 static void
 988 write_digit (FILE *stream, int digit)
 989 {
 990   fputc ('0' + (digit % 10), stream);
 991 }
 992
 993 /* Helper function for dump_location_info.
 994    Write a row of numbers to STREAM, numbering a source line,
 995    giving the units, tens, hundreds etc of the column number.  */
 996
 997 static void
 998 write_digit_row (FILE *stream, int indent,
 999                  const line_map_ordinary *map,
1000                  source_location loc, int max_col, int divisor)
1001 {
1002   fprintf (stream, "%*c", indent, ' ');
1003   fprintf (stream, "|");
1004   for (int column = 1; column < max_col; column++)
1005     {
1006       source_location column_loc = loc + (column << map->m_range_bits);
1007       write_digit (stream, column_loc / divisor);
1008     }
1009   fprintf (stream, "\n");
1010 }
1011
1012 /* Write a half-closed (START) / half-open (END) interval of
1013    source_location to STREAM.  */
1014
1015 static void
1016 dump_location_range (FILE *stream,
1017                      source_location start, source_location end)
1018 {
1019   fprintf (stream,
1020            "  source_location interval: %u <= loc < %u\n",
1021            start, end);
1022 }
1023
1024 /* Write a labelled description of a half-closed (START) / half-open (END)
1025    interval of source_location to STREAM.  */
1026
1027 static void
1028 dump_labelled_location_range (FILE *stream,
1029                               const char *name,
1030                               source_location start, source_location end)
1031 {
1032   fprintf (stream, "%s\n", name);
1033   dump_location_range (stream, start, end);
1034   fprintf (stream, "\n");
1035 }
1036
1037 /* Write a visualization of the locations in the line_table to STREAM.  */
1038
1039 void
1040 dump_location_info (FILE *stream)
1041 {
1042   /* Visualize the reserved locations.  */
1043   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1044                                 0, RESERVED_LOCATION_COUNT);
1045
1046   /* Visualize the ordinary line_map instances, rendering the sources. */
1047   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1048     {
1049       source_location end_location = get_end_location (line_table, idx);
1050       /* half-closed: doesn't include this one. */
1051
1052       const line_map_ordinary *map
1053         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1054       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1055       dump_location_range (stream,
1056                            MAP_START_LOCATION (map), end_location);
1057       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1058       fprintf (stream, "  starting at line: %i\n",
1059                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1060       fprintf (stream, "  column and range bits: %i\n",
1061                map->m_column_and_range_bits);
1062       fprintf (stream, "  column bits: %i\n",
1063                map->m_column_and_range_bits - map->m_range_bits);
1064       fprintf (stream, "  range bits: %i\n",
1065                map->m_range_bits);
1066
1067       /* Render the span of source lines that this "map" covers.  */
1068       for (source_location loc = MAP_START_LOCATION (map);
1069            loc < end_location;
1070            loc += (1 << map->m_range_bits) )
1071         {
1072           gcc_assert (pure_location_p (line_table, loc) );
1073
1074           expanded_location exploc
1075             = linemap_expand_location (line_table, map, loc);
1076
1077           if (0 == exploc.column)
1078             {
1079               /* Beginning of a new source line: draw the line.  */
1080
1081               int line_size;
1082               const char *line_text = location_get_source_line (exploc.file,
1083                                                                 exploc.line,
1084                                                                 &line_size);
1085               if (!line_text)
1086                 break;
1087               fprintf (stream,
1088                        "%s:%3i|loc:%5i|%.*s\n",
1089                        exploc.file, exploc.line,
1090                        loc,
1091                        line_size, line_text);
1092
1093               /* "loc" is at column 0, which means "the whole line".
1094                  Render the locations *within* the line, by underlining
1095                  it, showing the source_location numeric values
1096                  at each column.  */
1097               int max_col = (1 << map->m_column_and_range_bits) - 1;
1098               if (max_col > line_size)
1099                 max_col = line_size + 1;
1100
1101               int indent = 14 + strlen (exploc.file);
1102
1103               /* Thousands.  */
1104               if (end_location > 999)
1105                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1106
1107               /* Hundreds.  */
1108               if (end_location > 99)
1109                 write_digit_row (stream, indent, map, loc, max_col, 100);
1110
1111               /* Tens.  */
1112               write_digit_row (stream, indent, map, loc, max_col, 10);
1113
1114               /* Units.  */
1115               write_digit_row (stream, indent, map, loc, max_col, 1);
1116             }
1117         }
1118       fprintf (stream, "\n");
1119     }
1120
1121   /* Visualize unallocated values.  */
1122   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1123                                 line_table->highest_location,
1124                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1125
1126   /* Visualize the macro line_map instances, rendering the sources. */
1127   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1128     {
1129       /* Each macro map that is allocated owns source_location values
1130          that are *lower* that the one before them.
1131          Hence it's meaningful to view them either in order of ascending
1132          source locations, or in order of ascending macro map index.  */
1133       const bool ascending_source_locations = true;
1134       unsigned int idx = (ascending_source_locations
1135                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1136                           : i);
1137       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1138       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1139                idx,
1140                linemap_map_get_macro_name (map),
1141                MACRO_MAP_NUM_MACRO_TOKENS (map));
1142       dump_location_range (stream,
1143                            map->start_location,
1144                            (map->start_location
1145                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1146       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1147               "expansion point is location %i",
1148               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1149       fprintf (stream, "  map->start_location: %u\n",
1150                map->start_location);
1151
1152       fprintf (stream, "  macro_locations:\n");
1153       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1154         {
1155           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1156           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1157
1158           /* linemap_add_macro_token encodes token numbers in an expansion
1159              by putting them after MAP_START_LOCATION. */
1160
1161           /* I'm typically seeing 4 uninitialized entries at the end of
1162              0xafafafaf.
1163              This appears to be due to macro.c:replace_args
1164              adding 2 extra args for padding tokens; presumably there may
1165              be a leading and/or trailing padding token injected,
1166              each for 2 more location slots.
1167              This would explain there being up to 4 source_locations slots
1168              that may be uninitialized.  */
1169
1170           fprintf (stream, "    %u: %u, %u\n",
1171                    i,
1172                    x,
1173                    y);
1174           if (x == y)
1175             {
1176               if (x < MAP_START_LOCATION (map))
1177                 inform (x, "token %u has x-location == y-location == %u", i, x);
1178               else
1179                 fprintf (stream,
1180                          "x-location == y-location == %u encodes token # %u\n",
1181                          x, x - MAP_START_LOCATION (map));
1182                 }
1183           else
1184             {
1185               inform (x, "token %u has x-location == %u", i, x);
1186               inform (x, "token %u has y-location == %u", i, y);
1187             }
1188         }
1189       fprintf (stream, "\n");
1190     }
1191
1192   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1193      macro map, presumably due to an off-by-one error somewhere
1194      between the logic in linemap_enter_macro and
1195      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1196   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1197                                 MAX_SOURCE_LOCATION,
1198                                 MAX_SOURCE_LOCATION + 1);
1199
1200   /* Visualize ad-hoc values.  */
1201   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1202                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1203 }
1204
1205 /* string_concat's constructor.  */
1206
1207 string_concat::string_concat (int num, location_t *locs)
1208   : m_num (num)
1209 {
1210   m_locs = ggc_vec_alloc <location_t> (num);
1211   for (int i = 0; i < num; i++)
1212     m_locs[i] = locs[i];
1213 }
1214
1215 /* string_concat_db's constructor.  */
1216
1217 string_concat_db::string_concat_db ()
1218 {
1219   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1220 }
1221
1222 /* Record that a string concatenation occurred, covering NUM
1223    string literal tokens.  LOCS is an array of size NUM, containing the
1224    locations of the tokens.  A copy of LOCS is taken.  */
1225
1226 void
1227 string_concat_db::record_string_concatenation (int num, location_t *locs)
1228 {
1229   gcc_assert (num > 1);
1230   gcc_assert (locs);
1231
1232   location_t key_loc = get_key_loc (locs[0]);
1233
1234   string_concat *concat
1235     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1236   m_table->put (key_loc, concat);
1237 }
1238
1239 /* Determine if LOC was the location of the the initial token of a
1240    concatenation of string literal tokens.
1241    If so, *OUT_NUM is written to with the number of tokens, and
1242    *OUT_LOCS with the location of an array of locations of the
1243    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1244    storage owned by the string_concat_db.
1245    Otherwise, return false.  */
1246
1247 bool
1248 string_concat_db::get_string_concatenation (location_t loc,
1249                                             int *out_num,
1250                                             location_t **out_locs)
1251 {
1252   gcc_assert (out_num);
1253   gcc_assert (out_locs);
1254
1255   location_t key_loc = get_key_loc (loc);
1256
1257   string_concat **concat = m_table->get (key_loc);
1258   if (!concat)
1259     return false;
1260
1261   *out_num = (*concat)->m_num;
1262   *out_locs =(*concat)->m_locs;
1263   return true;
1264 }
1265
1266 /* Internal function.  Canonicalize LOC into a form suitable for
1267    use as a key within the database, stripping away macro expansion,
1268    ad-hoc information, and range information, using the location of
1269    the start of LOC within an ordinary linemap.  */
1270
1271 location_t
1272 string_concat_db::get_key_loc (location_t loc)
1273 {
1274   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1275                                   NULL);
1276
1277   loc = get_range_from_loc (line_table, loc).m_start;
1278
1279   return loc;
1280 }
1281
1282 /* Helper class for use within get_substring_ranges_for_loc.
1283    An vec of cpp_string with responsibility for releasing all of the
1284    str->text for each str in the vector.  */
1285
1286 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1287 {
1288  public:
1289   auto_cpp_string_vec (int alloc)
1290     : auto_vec <cpp_string> (alloc) {}
1291
1292   ~auto_cpp_string_vec ()
1293   {
1294     /* Clean up the copies within this vec.  */
1295     int i;
1296     cpp_string *str;
1297     FOR_EACH_VEC_ELT (*this, i, str)
1298       free (const_cast <unsigned char *> (str->text));
1299   }
1300 };
1301
1302 /* Attempt to populate RANGES with source location information on the
1303    individual characters within the string literal found at STRLOC.
1304    If CONCATS is non-NULL, then any string literals that the token at
1305    STRLOC  was concatenated with are also added to RANGES.
1306
1307    Return NULL if successful, or an error message if any errors occurred (in
1308    which case RANGES may be only partially populated and should not
1309    be used).
1310
1311    This is implemented by re-parsing the relevant source line(s).  */
1312
1313 static const char *
1314 get_substring_ranges_for_loc (cpp_reader *pfile,
1315                               string_concat_db *concats,
1316                               location_t strloc,
1317                               enum cpp_ttype type,
1318                               cpp_substring_ranges &ranges)
1319 {
1320   gcc_assert (pfile);
1321
1322   if (strloc == UNKNOWN_LOCATION)
1323     return "unknown location";
1324
1325   /* Reparsing the strings requires accurate location information.
1326      If -ftrack-macro-expansion has been overridden from its default
1327      of 2, then we might have a location of a macro expansion point,
1328      rather than the location of the literal itself.
1329      Avoid this by requiring that we have full macro expansion tracking
1330      for substring locations to be available.  */
1331   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1332     return "track_macro_expansion != 2";
1333
1334   /* If string concatenation has occurred at STRLOC, get the locations
1335      of all of the literal tokens making up the compound string.
1336      Otherwise, just use STRLOC.  */
1337   int num_locs = 1;
1338   location_t *strlocs = &strloc;
1339   if (concats)
1340     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1341
1342   auto_cpp_string_vec strs (num_locs);
1343   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1344   for (int i = 0; i < num_locs; i++)
1345     {
1346       /* Get range of strloc.  We will use it to locate the start and finish
1347          of the literal token within the line.  */
1348       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1349
1350       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1351         /* If the string is within a macro expansion, we can't get at the
1352            end location.  */
1353         return "macro expansion";
1354
1355       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1356         /* If so, we can't reliably determine where the token started within
1357            its line.  */
1358         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1359
1360       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1361         /* If so, we can't reliably determine where the token finished within
1362            its line.  */
1363         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1364
1365       expanded_location start
1366         = expand_location_to_spelling_point (src_range.m_start);
1367       expanded_location finish
1368         = expand_location_to_spelling_point (src_range.m_finish);
1369       if (start.file != finish.file)
1370         return "range endpoints are in different files";
1371       if (start.line != finish.line)
1372         return "range endpoints are on different lines";
1373       if (start.column > finish.column)
1374         return "range endpoints are reversed";
1375
1376       int line_width;
1377       const char *line = location_get_source_line (start.file, start.line,
1378                                                    &line_width);
1379       if (line == NULL)
1380         return "unable to read source line";
1381
1382       /* Determine the location of the literal (including quotes
1383          and leading prefix chars, such as the 'u' in a u""
1384          token).  */
1385       const char *literal = line + start.column - 1;
1386       int literal_length = finish.column - start.column + 1;
1387
1388       gcc_assert (line_width >= (start.column - 1 + literal_length));
1389       cpp_string from;
1390       from.len = literal_length;
1391       /* Make a copy of the literal, to avoid having to rely on
1392          the lifetime of the copy of the line within the cache.
1393          This will be released by the auto_cpp_string_vec dtor.  */
1394       from.text = XDUPVEC (unsigned char, literal, literal_length);
1395       strs.safe_push (from);
1396
1397       /* For very long lines, a new linemap could have started
1398          halfway through the token.
1399          Ensure that the loc_reader uses the linemap of the
1400          *end* of the token for its start location.  */
1401       const line_map_ordinary *final_ord_map;
1402       linemap_resolve_location (line_table, src_range.m_finish,
1403                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1404       location_t start_loc
1405         = linemap_position_for_line_and_column (line_table, final_ord_map,
1406                                                 start.line, start.column);
1407
1408       cpp_string_location_reader loc_reader (start_loc, line_table);
1409       loc_readers.safe_push (loc_reader);
1410     }
1411
1412   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1413   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1414                                                  loc_readers.address (),
1415                                                  num_locs, &ranges, type);
1416   if (err)
1417     return err;
1418
1419   /* Success: "ranges" should now contain information on the string.  */
1420   return NULL;
1421 }
1422
1423 /* Attempt to populate *OUT_LOC with source location information on the
1424    given characters within the string literal found at STRLOC.
1425    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1426    character set.
1427
1428    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1429    and string literal "012345\n789"
1430    *OUT_LOC is written to with:
1431      "012345\n789"
1432          ~^~~~~
1433
1434    If CONCATS is non-NULL, then any string literals that the token at
1435    STRLOC was concatenated with are also considered.
1436
1437    This is implemented by re-parsing the relevant source line(s).
1438
1439    Return NULL if successful, or an error message if any errors occurred.
1440    Error messages are intended for GCC developers (to help debugging) rather
1441    than for end-users.  */
1442
1443 const char *
1444 get_source_location_for_substring (cpp_reader *pfile,
1445                                    string_concat_db *concats,
1446                                    location_t strloc,
1447                                    enum cpp_ttype type,
1448                                    int caret_idx, int start_idx, int end_idx,
1449                                    source_location *out_loc)
1450 {
1451   gcc_checking_assert (caret_idx >= 0);
1452   gcc_checking_assert (start_idx >= 0);
1453   gcc_checking_assert (end_idx >= 0);
1454   gcc_assert (out_loc);
1455
1456   cpp_substring_ranges ranges;
1457   const char *err
1458     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1459   if (err)
1460     return err;
1461
1462   if (caret_idx >= ranges.get_num_ranges ())
1463     return "caret_idx out of range";
1464   if (start_idx >= ranges.get_num_ranges ())
1465     return "start_idx out of range";
1466   if (end_idx >= ranges.get_num_ranges ())
1467     return "end_idx out of range";
1468
1469   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1470                             ranges.get_range (start_idx).m_start,
1471                             ranges.get_range (end_idx).m_finish);
1472   return NULL;
1473 }
1474
1475 #if CHECKING_P
1476
1477 namespace selftest {
1478
1479 /* Selftests of location handling.  */
1480
1481 /* Attempt to populate *OUT_RANGE with source location information on the
1482    given character within the string literal found at STRLOC.
1483    CHAR_IDX refers to an offset within the execution character set.
1484    If CONCATS is non-NULL, then any string literals that the token at
1485    STRLOC was concatenated with are also considered.
1486
1487    This is implemented by re-parsing the relevant source line(s).
1488
1489    Return NULL if successful, or an error message if any errors occurred.
1490    Error messages are intended for GCC developers (to help debugging) rather
1491    than for end-users.  */
1492
1493 static const char *
1494 get_source_range_for_char (cpp_reader *pfile,
1495                            string_concat_db *concats,
1496                            location_t strloc,
1497                            enum cpp_ttype type,
1498                            int char_idx,
1499                            source_range *out_range)
1500 {
1501   gcc_checking_assert (char_idx >= 0);
1502   gcc_assert (out_range);
1503
1504   cpp_substring_ranges ranges;
1505   const char *err
1506     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1507   if (err)
1508     return err;
1509
1510   if (char_idx >= ranges.get_num_ranges ())
1511     return "char_idx out of range";
1512
1513   *out_range = ranges.get_range (char_idx);
1514   return NULL;
1515 }
1516
1517 /* As get_source_range_for_char, but write to *OUT the number
1518    of ranges that are available.  */
1519
1520 static const char *
1521 get_num_source_ranges_for_substring (cpp_reader *pfile,
1522                                      string_concat_db *concats,
1523                                      location_t strloc,
1524                                      enum cpp_ttype type,
1525                                      int *out)
1526 {
1527   gcc_assert (out);
1528
1529   cpp_substring_ranges ranges;
1530   const char *err
1531     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1532
1533   if (err)
1534     return err;
1535
1536   *out = ranges.get_num_ranges ();
1537   return NULL;
1538 }
1539
1540 /* Selftests of location handling.  */
1541
1542 /* Helper function for verifying location data: when location_t
1543    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1544    as having column 0.  */
1545
1546 static bool
1547 should_have_column_data_p (location_t loc)
1548 {
1549   if (IS_ADHOC_LOC (loc))
1550     loc = get_location_from_adhoc_loc (line_table, loc);
1551   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1552     return false;
1553   return true;
1554 }
1555
1556 /* Selftest for should_have_column_data_p.  */
1557
1558 static void
1559 test_should_have_column_data_p ()
1560 {
1561   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1562   ASSERT_TRUE
1563     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1564   ASSERT_FALSE
1565     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1566 }
1567
1568 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1569    on LOC.  */
1570
1571 static void
1572 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1573               location_t loc)
1574 {
1575   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1576   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1577   /* If location_t values are sufficiently high, then column numbers
1578      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1579      When close to the threshold, column numbers *may* be present: if
1580      the final linemap before the threshold contains a line that straddles
1581      the threshold, locations in that line have column information.  */
1582   if (should_have_column_data_p (loc))
1583     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1584 }
1585
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for source_location/location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following struct describes a particular case within our test
   matrix.  */

struct line_table_case
{
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If non-zero, the value at which to start the line_table's
     highest_location and highest_line.  */
  int m_base_location;
};
1612
1613 /* Constructor.  Store the old value of line_table, and create a new
1614    one, using sane defaults.  */
1615
1616 line_table_test::line_table_test ()
1617 {
1618   gcc_assert (saved_line_table == NULL);
1619   saved_line_table = line_table;
1620   line_table = ggc_alloc<line_maps> ();
1621   linemap_init (line_table, BUILTINS_LOCATION);
1622   gcc_assert (saved_line_table->reallocator);
1623   line_table->reallocator = saved_line_table->reallocator;
1624   gcc_assert (saved_line_table->round_alloc_size);
1625   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1626   line_table->default_range_bits = 0;
1627 }
1628
1629 /* Constructor.  Store the old value of line_table, and create a new
1630    one, using the sitation described in CASE_.  */
1631
1632 line_table_test::line_table_test (const line_table_case &case_)
1633 {
1634   gcc_assert (saved_line_table == NULL);
1635   saved_line_table = line_table;
1636   line_table = ggc_alloc<line_maps> ();
1637   linemap_init (line_table, BUILTINS_LOCATION);
1638   gcc_assert (saved_line_table->reallocator);
1639   line_table->reallocator = saved_line_table->reallocator;
1640   gcc_assert (saved_line_table->round_alloc_size);
1641   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1642   line_table->default_range_bits = case_.m_default_range_bits;
1643   if (case_.m_base_location)
1644     {
1645       line_table->highest_location = case_.m_base_location;
1646       line_table->highest_line = case_.m_base_location;
1647     }
1648 }
1649
1650 /* Destructor.  Restore the old value of line_table.  */
1651
1652 line_table_test::~line_table_test ()
1653 {
1654   gcc_assert (saved_line_table != NULL);
1655   line_table = saved_line_table;
1656   saved_line_table = NULL;
1657 }
1658
1659 /* Verify basic operation of ordinary linemaps.  */
1660
1661 static void
1662 test_accessing_ordinary_linemaps (const line_table_case &case_)
1663 {
1664   line_table_test ltt (case_);
1665
1666   /* Build a simple linemap describing some locations. */
1667   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1668
1669   linemap_line_start (line_table, 1, 100);
1670   location_t loc_a = linemap_position_for_column (line_table, 1);
1671   location_t loc_b = linemap_position_for_column (line_table, 23);
1672
1673   linemap_line_start (line_table, 2, 100);
1674   location_t loc_c = linemap_position_for_column (line_table, 1);
1675   location_t loc_d = linemap_position_for_column (line_table, 17);
1676
1677   /* Example of a very long line.  */
1678   linemap_line_start (line_table, 3, 2000);
1679   location_t loc_e = linemap_position_for_column (line_table, 700);
1680
1681   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1682
1683   /* Multiple files.  */
1684   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1685   linemap_line_start (line_table, 1, 200);
1686   location_t loc_f = linemap_position_for_column (line_table, 150);
1687   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1688
1689   /* Verify that we can recover the location info.  */
1690   assert_loceq ("foo.c", 1, 1, loc_a);
1691   assert_loceq ("foo.c", 1, 23, loc_b);
1692   assert_loceq ("foo.c", 2, 1, loc_c);
1693   assert_loceq ("foo.c", 2, 17, loc_d);
1694   assert_loceq ("foo.c", 3, 700, loc_e);
1695   assert_loceq ("bar.c", 1, 150, loc_f);
1696
1697   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1698   ASSERT_TRUE (pure_location_p (line_table, loc_a));
1699
1700   /* Verify using make_location to build a range, and extracting data
1701      back from it.  */
1702   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
1703   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
1704   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
1705   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
1706   ASSERT_EQ (loc_b, src_range.m_start);
1707   ASSERT_EQ (loc_d, src_range.m_finish);
1708 }
1709
1710 /* Verify various properties of UNKNOWN_LOCATION.  */
1711
1712 static void
1713 test_unknown_location ()
1714 {
1715   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1716   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1717   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1718 }
1719
1720 /* Verify various properties of BUILTINS_LOCATION.  */
1721
1722 static void
1723 test_builtins ()
1724 {
1725   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1726   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1727 }
1728
1729 /* Regression test for make_location.
1730    Ensure that we use pure locations for the start/finish of the range,
1731    rather than storing a packed or ad-hoc range as the start/finish.  */
1732
1733 static void
1734 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1735 {
1736   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1737      with C++ frontend.
1738      ....................0000000001111111111222.
1739      ....................1234567890123456789012.  */
1740   const char *content = "     r += !aaa == bbb;\n";
1741   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1742   line_table_test ltt (case_);
1743   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1744
1745   const location_t c11 = linemap_position_for_column (line_table, 11);
1746   const location_t c12 = linemap_position_for_column (line_table, 12);
1747   const location_t c13 = linemap_position_for_column (line_table, 13);
1748   const location_t c14 = linemap_position_for_column (line_table, 14);
1749   const location_t c21 = linemap_position_for_column (line_table, 21);
1750
1751   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1752     return;
1753
1754   /* Use column 13 for the caret location, arbitrarily, to verify that we
1755      handle start != caret.  */
1756   const location_t aaa = make_location (c13, c12, c14);
1757   ASSERT_EQ (c13, get_pure_location (aaa));
1758   ASSERT_EQ (c12, get_start (aaa));
1759   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1760   ASSERT_EQ (c14, get_finish (aaa));
1761   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1762
1763   /* Make a location using a location with a range as the start-point.  */
1764   const location_t not_aaa = make_location (c11, aaa, c14);
1765   ASSERT_EQ (c11, get_pure_location (not_aaa));
1766   /* It should use the start location of the range, not store the range
1767      itself.  */
1768   ASSERT_EQ (c12, get_start (not_aaa));
1769   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1770   ASSERT_EQ (c14, get_finish (not_aaa));
1771   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1772
1773   /* Similarly, make a location with a range as the end-point.  */
1774   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1775   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1776   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1777   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1778   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1779   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1780   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1781   /* It should use the finish location of the range, not store the range
1782      itself.  */
1783   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1784   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1785   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1786   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1787   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1788 }
1789
1790 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1791
1792 static void
1793 test_reading_source_line ()
1794 {
1795   /* Create a tempfile and write some text to it.  */
1796   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1797                         "01234567890123456789\n"
1798                         "This is the test text\n"
1799                         "This is the 3rd line");
1800
1801   /* Read back a specific line from the tempfile.  */
1802   int line_size;
1803   const char *source_line = location_get_source_line (tmp.get_filename (),
1804                                                       3, &line_size);
1805   ASSERT_TRUE (source_line != NULL);
1806   ASSERT_EQ (20, line_size);
1807   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1808                          source_line, line_size));
1809
1810   source_line = location_get_source_line (tmp.get_filename (),
1811                                           2, &line_size);
1812   ASSERT_TRUE (source_line != NULL);
1813   ASSERT_EQ (21, line_size);
1814   ASSERT_TRUE (!strncmp ("This is the test text",
1815                          source_line, line_size));
1816
1817   source_line = location_get_source_line (tmp.get_filename (),
1818                                           4, &line_size);
1819   ASSERT_TRUE (source_line == NULL);
1820 }
1821
1822 /* Tests of lexing.  */
1823
/* Assert that cpp_token_as_text, applied to token TOK of PARSER, yields
   exactly the string EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    const char *token_spelling_                                         \
      = (const char *) cpp_token_as_text ((PARSER), (TOK));             \
    ASSERT_STREQ ((EXPECTED_TEXT), token_spelling_);                    \
  SELFTEST_END_STMT
1832
1833 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1834    and ranges from EXP_START_COL to EXP_FINISH_COL.
1835    Use LOC as the effective location of the selftest.  */
1836
1837 static void
1838 assert_token_loc_eq (const location &loc,
1839                      const cpp_token *tok,
1840                      const char *exp_filename, int exp_linenum,
1841                      int exp_start_col, int exp_finish_col)
1842 {
1843   location_t tok_loc = tok->src_loc;
1844   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1845   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1846
1847   /* If location_t values are sufficiently high, then column numbers
1848      will be unavailable.  */
1849   if (!should_have_column_data_p (tok_loc))
1850     return;
1851
1852   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1853   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1854   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1855   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1856 }
1857
/* Convenience wrapper around assert_token_loc_eq for checking
   TOK->src_loc; any failure is attributed to the place where the macro
   is used, via SELFTEST_LOCATION.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
                            EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION,                               \
                       (TOK),                                           \
                       (EXP_FILENAME),                                  \
                       (EXP_LINENUM),                                   \
                       (EXP_START_COL),                                 \
                       (EXP_FINISH_COL))
1865
1866 /* Test of lexing a file using libcpp, verifying tokens and their
1867    location information.  */
1868
1869 static void
1870 test_lexer (const line_table_case &case_)
1871 {
1872   /* Create a tempfile and write some text to it.  */
1873   const char *content =
1874     /*00000000011111111112222222222333333.3333444444444.455555555556
1875       12345678901234567890123456789012345.6789012345678.901234567890.  */
1876     ("test_name /* c-style comment */\n"
1877      "                                  \"test literal\"\n"
1878      " // test c++-style comment\n"
1879      "   42\n");
1880   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
1881
1882   line_table_test ltt (case_);
1883
1884   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
1885
1886   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
1887   ASSERT_NE (fname, NULL);
1888
1889   /* Verify that we get the expected tokens back, with the correct
1890      location information.  */
1891
1892   location_t loc;
1893   const cpp_token *tok;
1894   tok = cpp_get_token_with_location (parser, &loc);
1895   ASSERT_NE (tok, NULL);
1896   ASSERT_EQ (tok->type, CPP_NAME);
1897   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
1898   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
1899
1900   tok = cpp_get_token_with_location (parser, &loc);
1901   ASSERT_NE (tok, NULL);
1902   ASSERT_EQ (tok->type, CPP_STRING);
1903   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
1904   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
1905
1906   tok = cpp_get_token_with_location (parser, &loc);
1907   ASSERT_NE (tok, NULL);
1908   ASSERT_EQ (tok->type, CPP_NUMBER);
1909   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
1910   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
1911
1912   tok = cpp_get_token_with_location (parser, &loc);
1913   ASSERT_NE (tok, NULL);
1914   ASSERT_EQ (tok->type, CPP_EOF);
1915
1916   cpp_finish (parser, NULL);
1917   cpp_destroy (parser);
1918 }
1919
/* Forward decls.  */

struct lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor (when
   an options object is supplied), before the main file is read, so
   that subclasses can adjust the test's cpp_reader.  */

class lexer_test_options
{
 public:
  /* Subclasses may have non-trivial destructors, so give this
     polymorphic base a virtual one; destroying a subclass through a
     lexer_test_options pointer would otherwise be undefined.  */
  virtual ~lexer_test_options () {}

  /* Apply the options to the given lexer_test.  */
  virtual void apply (lexer_test &) = 0;
};
1933
1934 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
1935    in its dtor.
1936
1937    This is needed by struct lexer_test to ensure that the cleanup of the
1938    cpp_reader happens *after* the cleanup of the temp_source_file.  */
1939
1940 class cpp_reader_ptr
1941 {
1942  public:
1943   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
1944
1945   ~cpp_reader_ptr ()
1946   {
1947     cpp_finish (m_ptr, NULL);
1948     cpp_destroy (m_ptr);
1949   }
1950
1951   operator cpp_reader * () const { return m_ptr; }
1952
1953  private:
1954   cpp_reader *m_ptr;
1955 };
1956
1957 /* A struct for writing lexer tests.  */
1958
1959 struct lexer_test
1960 {
1961   lexer_test (const line_table_case &case_, const char *content,
1962               lexer_test_options *options);
1963   ~lexer_test ();
1964
1965   const cpp_token *get_token ();
1966
1967   /* The ordering of these fields matters.
1968      The line_table_test must be first, since the cpp_reader_ptr
1969      uses it.
1970      The cpp_reader must be cleaned up *after* the temp_source_file
1971      since the filenames in input.c's input cache are owned by the
1972      cpp_reader; in particular, when ~temp_source_file evicts the
1973      filename the filenames must still be alive.  */
1974   line_table_test m_ltt;
1975   cpp_reader_ptr m_parser;
1976   temp_source_file m_tempfile;
1977   string_concat_db m_concats;
1978 };
1979
1980 /* Use an EBCDIC encoding for the execution charset, specifically
1981    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
1982
1983    This exercises iconv integration within libcpp.
1984    Not every build of iconv supports the given charset,
1985    so we need to flag this error and handle it gracefully.  */
1986
1987 class ebcdic_execution_charset : public lexer_test_options
1988 {
1989  public:
1990   ebcdic_execution_charset () : m_num_iconv_errors (0)
1991     {
1992       gcc_assert (s_singleton == NULL);
1993       s_singleton = this;
1994     }
1995   ~ebcdic_execution_charset ()
1996     {
1997       gcc_assert (s_singleton == this);
1998       s_singleton = NULL;
1999     }
2000
2001   void apply (lexer_test &test) FINAL OVERRIDE
2002   {
2003     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2004     cpp_opts->narrow_charset = "IBM1047";
2005
2006     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2007     callbacks->error = on_error;
2008   }
2009
2010   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
2011                         int level ATTRIBUTE_UNUSED,
2012                         int reason ATTRIBUTE_UNUSED,
2013                         rich_location *richloc ATTRIBUTE_UNUSED,
2014                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2015     ATTRIBUTE_FPTR_PRINTF(5,0)
2016   {
2017     gcc_assert (s_singleton);
2018     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2019        when the local iconv build doesn't support the conversion.  */
2020     if (strstr (msgid, "not supported by iconv"))
2021       {
2022         s_singleton->m_num_iconv_errors++;
2023         return true;
2024       }
2025
2026     /* Otherwise, we have an unexpected error.  */
2027     abort ();
2028   }
2029
2030   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2031
2032  private:
2033   static ebcdic_execution_charset *s_singleton;
2034   int m_num_iconv_errors;
2035 };
2036
2037 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2038
2039 /* Constructor.  Override line_table with a new instance based on CASE_,
2040    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2041    start parsing the tempfile.  */
2042
2043 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2044                         lexer_test_options *options)
2045 : m_ltt (case_),
2046   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2047   /* Create a tempfile and write the text to it.  */
2048   m_tempfile (SELFTEST_LOCATION, ".c", content),
2049   m_concats ()
2050 {
2051   if (options)
2052     options->apply (*this);
2053
2054   cpp_init_iconv (m_parser);
2055
2056   /* Parse the file.  */
2057   const char *fname = cpp_read_main_file (m_parser,
2058                                           m_tempfile.get_filename ());
2059   ASSERT_NE (fname, NULL);
2060 }
2061
2062 /* Destructor.  Verify that the next token in m_parser is EOF.  */
2063
2064 lexer_test::~lexer_test ()
2065 {
2066   location_t loc;
2067   const cpp_token *tok;
2068
2069   tok = cpp_get_token_with_location (m_parser, &loc);
2070   ASSERT_NE (tok, NULL);
2071   ASSERT_EQ (tok->type, CPP_EOF);
2072 }
2073
2074 /* Get the next token from m_parser.  */
2075
2076 const cpp_token *
2077 lexer_test::get_token ()
2078 {
2079   location_t loc;
2080   const cpp_token *tok;
2081
2082   tok = cpp_get_token_with_location (m_parser, &loc);
2083   ASSERT_NE (tok, NULL);
2084   return tok;
2085 }
2086
2087 /* Verify that locations within string literals are correctly handled.  */
2088
2089 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2090    using the string concatenation database for TEST.
2091
2092    Assert that the character at index IDX is on EXPECTED_LINE,
2093    and that it begins at column EXPECTED_START_COL and ends at
2094    EXPECTED_FINISH_COL (unless the locations are beyond
2095    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2096    columns).  */
2097
2098 static void
2099 assert_char_at_range (const location &loc,
2100                       lexer_test& test,
2101                       location_t strloc, enum cpp_ttype type, int idx,
2102                       int expected_line, int expected_start_col,
2103                       int expected_finish_col)
2104 {
2105   cpp_reader *pfile = test.m_parser;
2106   string_concat_db *concats = &test.m_concats;
2107
2108   source_range actual_range;
2109   const char *err
2110     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2111                                  &actual_range);
2112   if (should_have_column_data_p (strloc))
2113     ASSERT_EQ_AT (loc, NULL, err);
2114   else
2115     {
2116       ASSERT_STREQ_AT (loc,
2117                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2118                        err);
2119       return;
2120     }
2121
2122   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2123   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2124   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2125   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2126
2127   if (should_have_column_data_p (actual_range.m_start))
2128     {
2129       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2130       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2131     }
2132   if (should_have_column_data_p (actual_range.m_finish))
2133     {
2134       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2135       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2136     }
2137 }
2138
/* Convenience wrapper around assert_char_at_range; any failure is
   attributed to the place where the macro is used, via
   SELFTEST_LOCATION.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
                             EXPECTED_START_COL, EXPECTED_FINISH_COL)   \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (LEXER_TEST),                                   \
                        (STRLOC),                                       \
                        (TYPE),                                         \
                        (IDX),                                          \
                        (EXPECTED_LINE),                                \
                        (EXPECTED_START_COL),                           \
                        (EXPECTED_FINISH_COL))
2147
2148 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2149    using the string concatenation database for TEST.
2150
2151    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2152
2153 static void
2154 assert_num_substring_ranges (const location &loc,
2155                              lexer_test& test,
2156                              location_t strloc,
2157                              enum cpp_ttype type,
2158                              int expected_num_ranges)
2159 {
2160   cpp_reader *pfile = test.m_parser;
2161   string_concat_db *concats = &test.m_concats;
2162
2163   int actual_num_ranges = -1;
2164   const char *err
2165     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2166                                            &actual_num_ranges);
2167   if (should_have_column_data_p (strloc))
2168     ASSERT_EQ_AT (loc, NULL, err);
2169   else
2170     {
2171       ASSERT_STREQ_AT (loc,
2172                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2173                        err);
2174       return;
2175     }
2176   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2177 }
2178
/* Convenience wrapper around assert_num_substring_ranges; any failure
   is attributed to the place where the macro is used, via
   SELFTEST_LOCATION.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,           \
                                    EXPECTED_NUM_RANGES)                \
  assert_num_substring_ranges (SELFTEST_LOCATION,                       \
                               (LEXER_TEST),                            \
                               (STRLOC),                                \
                               (TYPE),                                  \
                               (EXPECTED_NUM_RANGES))
2186
2187
2188 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2189    returns an error (using the string concatenation database for TEST).  */
2190
2191 static void
2192 assert_has_no_substring_ranges (const location &loc,
2193                                 lexer_test& test,
2194                                 location_t strloc,
2195                                 enum cpp_ttype type,
2196                                 const char *expected_err)
2197 {
2198   cpp_reader *pfile = test.m_parser;
2199   string_concat_db *concats = &test.m_concats;
2200   cpp_substring_ranges ranges;
2201   const char *actual_err
2202     = get_substring_ranges_for_loc (pfile, concats, strloc,
2203                                     type, ranges);
2204   if (should_have_column_data_p (strloc))
2205     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2206   else
2207     ASSERT_STREQ_AT (loc,
2208                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2209                      actual_err);
2210 }
2211
/* Convenience wrapper around assert_has_no_substring_ranges; any
   failure is attributed to the place where the macro is used, via
   SELFTEST_LOCATION.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)   \
    assert_has_no_substring_ranges (SELFTEST_LOCATION,                  \
                                    (LEXER_TEST),                       \
                                    (STRLOC),                           \
                                    (TYPE),                             \
                                    (ERR))
2215
2216 /* Lex a simple string literal.  Verify the substring location data, before
2217    and after running cpp_interpret_string on it.  */
2218
2219 static void
2220 test_lexer_string_locations_simple (const line_table_case &case_)
2221 {
2222   /* Digits 0-9 (with 0 at column 10), the simple way.
2223      ....................000000000.11111111112.2222222223333333333
2224      ....................123456789.01234567890.1234567890123456789
2225      We add a trailing comment to ensure that we correctly locate
2226      the end of the string literal token.  */
2227   const char *content = "        \"0123456789\" /* not a string */\n";
2228   lexer_test test (case_, content, NULL);
2229
2230   /* Verify that we get the expected token back, with the correct
2231      location information.  */
2232   const cpp_token *tok = test.get_token ();
2233   ASSERT_EQ (tok->type, CPP_STRING);
2234   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2235   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2236
2237   /* At this point in lexing, the quote characters are treated as part of
2238      the string (they are stripped off by cpp_interpret_string).  */
2239
2240   ASSERT_EQ (tok->val.str.len, 12);
2241
2242   /* Verify that cpp_interpret_string works.  */
2243   cpp_string dst_string;
2244   const enum cpp_ttype type = CPP_STRING;
2245   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2246                                       &dst_string, type);
2247   ASSERT_TRUE (result);
2248   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2249   free (const_cast <unsigned char *> (dst_string.text));
2250
2251   /* Verify ranges of individual characters.  This no longer includes the
2252      opening quote, but does include the closing quote.  */
2253   for (int i = 0; i <= 10; i++)
2254     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2255                           10 + i, 10 + i);
2256
2257   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2258 }
2259
2260 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2261    encoding.  */
2262
2263 static void
2264 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2265 {
2266   /* EBCDIC support requires iconv.  */
2267   if (!HAVE_ICONV)
2268     return;
2269
2270   /* Digits 0-9 (with 0 at column 10), the simple way.
2271      ....................000000000.11111111112.2222222223333333333
2272      ....................123456789.01234567890.1234567890123456789
2273      We add a trailing comment to ensure that we correctly locate
2274      the end of the string literal token.  */
2275   const char *content = "        \"0123456789\" /* not a string */\n";
2276   ebcdic_execution_charset use_ebcdic;
2277   lexer_test test (case_, content, &use_ebcdic);
2278
2279   /* Verify that we get the expected token back, with the correct
2280      location information.  */
2281   const cpp_token *tok = test.get_token ();
2282   ASSERT_EQ (tok->type, CPP_STRING);
2283   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2284   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2285
2286   /* At this point in lexing, the quote characters are treated as part of
2287      the string (they are stripped off by cpp_interpret_string).  */
2288
2289   ASSERT_EQ (tok->val.str.len, 12);
2290
2291   /* The remainder of the test requires an iconv implementation that
2292      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2293   if (use_ebcdic.iconv_errors_occurred_p ())
2294     return;
2295
2296   /* Verify that cpp_interpret_string works.  */
2297   cpp_string dst_string;
2298   const enum cpp_ttype type = CPP_STRING;
2299   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2300                                       &dst_string, type);
2301   ASSERT_TRUE (result);
2302   /* We should now have EBCDIC-encoded text, specifically
2303      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2304      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2305   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2306                 (const char *)dst_string.text);
2307   free (const_cast <unsigned char *> (dst_string.text));
2308
2309   /* Verify that we don't attempt to record substring location information
2310      for such cases.  */
2311   ASSERT_HAS_NO_SUBSTRING_RANGES
2312     (test, tok->src_loc, type,
2313      "execution character set != source character set");
2314 }
2315
2316 /* Lex a string literal containing a hex-escaped character.
2317    Verify the substring location data, before and after running
2318    cpp_interpret_string on it.  */
2319
2320 static void
2321 test_lexer_string_locations_hex (const line_table_case &case_)
2322 {
2323   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2324      and with a space in place of digit 6, to terminate the escaped
2325      hex code.
2326      ....................000000000.111111.11112222.
2327      ....................123456789.012345.67890123.  */
2328   const char *content = "        \"01234\\x35 789\"\n";
2329   lexer_test test (case_, content, NULL);
2330
2331   /* Verify that we get the expected token back, with the correct
2332      location information.  */
2333   const cpp_token *tok = test.get_token ();
2334   ASSERT_EQ (tok->type, CPP_STRING);
2335   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2336   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2337
2338   /* At this point in lexing, the quote characters are treated as part of
2339      the string (they are stripped off by cpp_interpret_string).  */
2340   ASSERT_EQ (tok->val.str.len, 15);
2341
2342   /* Verify that cpp_interpret_string works.  */
2343   cpp_string dst_string;
2344   const enum cpp_ttype type = CPP_STRING;
2345   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2346                                       &dst_string, type);
2347   ASSERT_TRUE (result);
2348   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2349   free (const_cast <unsigned char *> (dst_string.text));
2350
2351   /* Verify ranges of individual characters.  This no longer includes the
2352      opening quote, but does include the closing quote.  */
2353   for (int i = 0; i <= 4; i++)
2354     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2355   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2356   for (int i = 6; i <= 10; i++)
2357     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2358
2359   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2360 }
2361
2362 /* Lex a string literal containing an octal-escaped character.
2363    Verify the substring location data after running cpp_interpret_string
2364    on it.  */
2365
2366 static void
2367 test_lexer_string_locations_oct (const line_table_case &case_)
2368 {
2369   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2370      and with a space in place of digit 6, to terminate the escaped
2371      octal code.
2372      ....................000000000.111111.11112222.2222223333333333444
2373      ....................123456789.012345.67890123.4567890123456789012  */
2374   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2375   lexer_test test (case_, content, NULL);
2376
2377   /* Verify that we get the expected token back, with the correct
2378      location information.  */
2379   const cpp_token *tok = test.get_token ();
2380   ASSERT_EQ (tok->type, CPP_STRING);
2381   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2382
2383   /* Verify that cpp_interpret_string works.  */
2384   cpp_string dst_string;
2385   const enum cpp_ttype type = CPP_STRING;
2386   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2387                                       &dst_string, type);
2388   ASSERT_TRUE (result);
2389   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2390   free (const_cast <unsigned char *> (dst_string.text));
2391
2392   /* Verify ranges of individual characters.  This no longer includes the
2393      opening quote, but does include the closing quote.  */
2394   for (int i = 0; i < 5; i++)
2395     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2396   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2397   for (int i = 6; i <= 10; i++)
2398     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2399
2400   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2401 }
2402
2403 /* Test of string literal containing letter escapes.  */
2404
2405 static void
2406 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2407 {
2408   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2409      .....................000000000.1.11111.1.1.11222.22222223333333
2410      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2411   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2412   lexer_test test (case_, content, NULL);
2413
2414   /* Verify that we get the expected tokens back.  */
2415   const cpp_token *tok = test.get_token ();
2416   ASSERT_EQ (tok->type, CPP_STRING);
2417   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2418
2419   /* Verify ranges of individual characters. */
2420   /* "\t".  */
2421   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2422                         0, 1, 10, 11);
2423   /* "foo". */
2424   for (int i = 1; i <= 3; i++)
2425     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2426                           i, 1, 11 + i, 11 + i);
2427   /* "\\" and "\n".  */
2428   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2429                         4, 1, 15, 16);
2430   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2431                         5, 1, 17, 18);
2432
2433   /* "bar" and closing quote for nul-terminator.  */
2434   for (int i = 6; i <= 9; i++)
2435     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2436                           i, 1, 13 + i, 13 + i);
2437
2438   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2439 }
2440
2441 /* Another test of a string literal containing a letter escape.
2442    Based on string seen in
2443      printf ("%-%\n");
2444    in gcc.dg/format/c90-printf-1.c.  */
2445
2446 static void
2447 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2448 {
2449   /* .....................000000000.1111.11.1111.22222222223.
2450      .....................123456789.0123.45.6789.01234567890.  */
2451   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2452   lexer_test test (case_, content, NULL);
2453
2454   /* Verify that we get the expected tokens back.  */
2455   const cpp_token *tok = test.get_token ();
2456   ASSERT_EQ (tok->type, CPP_STRING);
2457   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2458
2459   /* Verify ranges of individual characters. */
2460   /* "%-%".  */
2461   for (int i = 0; i < 3; i++)
2462     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2463                           i, 1, 10 + i, 10 + i);
2464   /* "\n".  */
2465   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2466                         3, 1, 13, 14);
2467
2468   /* Closing quote for nul-terminator.  */
2469   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2470                         4, 1, 15, 15);
2471
2472   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2473 }
2474
2475 /* Lex a string literal containing UCN 4 characters.
2476    Verify the substring location data after running cpp_interpret_string
2477    on it.  */
2478
2479 static void
2480 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2481 {
2482   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2483      as UCN 4.
2484      ....................000000000.111111.111122.222222223.33333333344444
2485      ....................123456789.012345.678901.234567890.12345678901234  */
2486   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2487   lexer_test test (case_, content, NULL);
2488
2489   /* Verify that we get the expected token back, with the correct
2490      location information.  */
2491   const cpp_token *tok = test.get_token ();
2492   ASSERT_EQ (tok->type, CPP_STRING);
2493   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2494
2495   /* Verify that cpp_interpret_string works.
2496      The string should be encoded in the execution character
2497      set.  Assuming that that is UTF-8, we should have the following:
2498      -----------  ----  -----  -------  ----------------
2499      Byte offset  Byte  Octal  Unicode  Source Column(s)
2500      -----------  ----  -----  -------  ----------------
2501      0            0x30         '0'      10
2502      1            0x31         '1'      11
2503      2            0x32         '2'      12
2504      3            0x33         '3'      13
2505      4            0x34         '4'      14
2506      5            0xE2  \342   U+2174   15-20
2507      6            0x85  \205    (cont)  15-20
2508      7            0xB4  \264    (cont)  15-20
2509      8            0xE2  \342   U+2175   21-26
2510      9            0x85  \205    (cont)  21-26
2511      10           0xB5  \265    (cont)  21-26
2512      11           0x37         '7'      27
2513      12           0x38         '8'      28
2514      13           0x39         '9'      29
2515      14           0x00                  30 (closing quote)
2516      -----------  ----  -----  -------  ---------------.  */
2517
2518   cpp_string dst_string;
2519   const enum cpp_ttype type = CPP_STRING;
2520   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2521                                       &dst_string, type);
2522   ASSERT_TRUE (result);
2523   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2524                 (const char *)dst_string.text);
2525   free (const_cast <unsigned char *> (dst_string.text));
2526
2527   /* Verify ranges of individual characters.  This no longer includes the
2528      opening quote, but does include the closing quote.
2529      '01234'.  */
2530   for (int i = 0; i <= 4; i++)
2531     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2532   /* U+2174.  */
2533   for (int i = 5; i <= 7; i++)
2534     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2535   /* U+2175.  */
2536   for (int i = 8; i <= 10; i++)
2537     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2538   /* '789' and nul terminator  */
2539   for (int i = 11; i <= 14; i++)
2540     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2541
2542   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2543 }
2544
2545 /* Lex a string literal containing UCN 8 characters.
2546    Verify the substring location data after running cpp_interpret_string
2547    on it.  */
2548
2549 static void
2550 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2551 {
2552   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2553      ....................000000000.111111.1111222222.2222333333333.344444
2554      ....................123456789.012345.6789012345.6789012345678.901234  */
2555   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2556   lexer_test test (case_, content, NULL);
2557
2558   /* Verify that we get the expected token back, with the correct
2559      location information.  */
2560   const cpp_token *tok = test.get_token ();
2561   ASSERT_EQ (tok->type, CPP_STRING);
2562   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2563                            "\"01234\\U00002174\\U00002175789\"");
2564
2565   /* Verify that cpp_interpret_string works.
2566      The UTF-8 encoding of the string is identical to that from
2567      the ucn4 testcase above; the only difference is the column
2568      locations.  */
2569   cpp_string dst_string;
2570   const enum cpp_ttype type = CPP_STRING;
2571   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2572                                       &dst_string, type);
2573   ASSERT_TRUE (result);
2574   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2575                 (const char *)dst_string.text);
2576   free (const_cast <unsigned char *> (dst_string.text));
2577
2578   /* Verify ranges of individual characters.  This no longer includes the
2579      opening quote, but does include the closing quote.
2580      '01234'.  */
2581   for (int i = 0; i <= 4; i++)
2582     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2583   /* U+2174.  */
2584   for (int i = 5; i <= 7; i++)
2585     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2586   /* U+2175.  */
2587   for (int i = 8; i <= 10; i++)
2588     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2589   /* '789' at columns 35-37  */
2590   for (int i = 11; i <= 13; i++)
2591     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2592   /* Closing quote/nul-terminator at column 38.  */
2593   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2594
2595   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2596 }
2597
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   and return it as a host-endian value.  The bytes are accessed
   individually, most significant first, so the result does not depend
   on the byte order of the host.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t host_value = 0;

  /* Shift each successive byte in below the ones already gathered.  */
  for (unsigned int idx = 0; idx < 4; idx++)
    host_value = (host_value << 8) | (uint32_t) bytes[idx];

  return host_value;
}
2609
2610 /* Lex a wide string literal and verify that attempts to read substring
2611    location data from it fail gracefully.  */
2612
2613 static void
2614 test_lexer_string_locations_wide_string (const line_table_case &case_)
2615 {
2616   /* Digits 0-9.
2617      ....................000000000.11111111112.22222222233333
2618      ....................123456789.01234567890.12345678901234  */
2619   const char *content = "       L\"0123456789\" /* non-str */\n";
2620   lexer_test test (case_, content, NULL);
2621
2622   /* Verify that we get the expected token back, with the correct
2623      location information.  */
2624   const cpp_token *tok = test.get_token ();
2625   ASSERT_EQ (tok->type, CPP_WSTRING);
2626   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2627
2628   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2629   cpp_string dst_string;
2630   const enum cpp_ttype type = CPP_WSTRING;
2631   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2632                                       &dst_string, type);
2633   ASSERT_TRUE (result);
2634   /* The cpp_reader defaults to big-endian with
2635      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2636      now be encoded as UTF-32BE.  */
2637   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2638   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2639   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2640   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2641   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2642   free (const_cast <unsigned char *> (dst_string.text));
2643
2644   /* We don't yet support generating substring location information
2645      for L"" strings.  */
2646   ASSERT_HAS_NO_SUBSTRING_RANGES
2647     (test, tok->src_loc, type,
2648      "execution character set != source character set");
2649 }
2650
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   and return it as a host-endian value.  The bytes are accessed
   individually, so the result does not depend on the byte order of the
   host.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const uint16_t high_byte = bytes[0];
  const uint16_t low_byte = bytes[1];

  return (uint16_t) ((high_byte << 8) | low_byte);
}
2659
2660 /* Lex a u"" string literal and verify that attempts to read substring
2661    location data from it fail gracefully.  */
2662
2663 static void
2664 test_lexer_string_locations_string16 (const line_table_case &case_)
2665 {
2666   /* Digits 0-9.
2667      ....................000000000.11111111112.22222222233333
2668      ....................123456789.01234567890.12345678901234  */
2669   const char *content = "       u\"0123456789\" /* non-str */\n";
2670   lexer_test test (case_, content, NULL);
2671
2672   /* Verify that we get the expected token back, with the correct
2673      location information.  */
2674   const cpp_token *tok = test.get_token ();
2675   ASSERT_EQ (tok->type, CPP_STRING16);
2676   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2677
2678   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2679   cpp_string dst_string;
2680   const enum cpp_ttype type = CPP_STRING16;
2681   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2682                                       &dst_string, type);
2683   ASSERT_TRUE (result);
2684
2685   /* The cpp_reader defaults to big-endian, so dst_string should
2686      now be encoded as UTF-16BE.  */
2687   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2688   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2689   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2690   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2691   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2692   free (const_cast <unsigned char *> (dst_string.text));
2693
2694   /* We don't yet support generating substring location information
2695      for L"" strings.  */
2696   ASSERT_HAS_NO_SUBSTRING_RANGES
2697     (test, tok->src_loc, type,
2698      "execution character set != source character set");
2699 }
2700
2701 /* Lex a U"" string literal and verify that attempts to read substring
2702    location data from it fail gracefully.  */
2703
2704 static void
2705 test_lexer_string_locations_string32 (const line_table_case &case_)
2706 {
2707   /* Digits 0-9.
2708      ....................000000000.11111111112.22222222233333
2709      ....................123456789.01234567890.12345678901234  */
2710   const char *content = "       U\"0123456789\" /* non-str */\n";
2711   lexer_test test (case_, content, NULL);
2712
2713   /* Verify that we get the expected token back, with the correct
2714      location information.  */
2715   const cpp_token *tok = test.get_token ();
2716   ASSERT_EQ (tok->type, CPP_STRING32);
2717   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2718
2719   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2720   cpp_string dst_string;
2721   const enum cpp_ttype type = CPP_STRING32;
2722   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2723                                       &dst_string, type);
2724   ASSERT_TRUE (result);
2725
2726   /* The cpp_reader defaults to big-endian, so dst_string should
2727      now be encoded as UTF-32BE.  */
2728   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2729   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2730   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2731   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2732   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2733   free (const_cast <unsigned char *> (dst_string.text));
2734
2735   /* We don't yet support generating substring location information
2736      for L"" strings.  */
2737   ASSERT_HAS_NO_SUBSTRING_RANGES
2738     (test, tok->src_loc, type,
2739      "execution character set != source character set");
2740 }
2741
2742 /* Lex a u8-string literal.
2743    Verify the substring location data after running cpp_interpret_string
2744    on it.  */
2745
2746 static void
2747 test_lexer_string_locations_u8 (const line_table_case &case_)
2748 {
2749   /* Digits 0-9.
2750      ....................000000000.11111111112.22222222233333
2751      ....................123456789.01234567890.12345678901234  */
2752   const char *content = "      u8\"0123456789\" /* non-str */\n";
2753   lexer_test test (case_, content, NULL);
2754
2755   /* Verify that we get the expected token back, with the correct
2756      location information.  */
2757   const cpp_token *tok = test.get_token ();
2758   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2759   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2760
2761   /* Verify that cpp_interpret_string works.  */
2762   cpp_string dst_string;
2763   const enum cpp_ttype type = CPP_STRING;
2764   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2765                                       &dst_string, type);
2766   ASSERT_TRUE (result);
2767   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2768   free (const_cast <unsigned char *> (dst_string.text));
2769
2770   /* Verify ranges of individual characters.  This no longer includes the
2771      opening quote, but does include the closing quote.  */
2772   for (int i = 0; i <= 10; i++)
2773     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2774 }
2775
2776 /* Lex a string literal containing UTF-8 source characters.
2777    Verify the substring location data after running cpp_interpret_string
2778    on it.  */
2779
2780 static void
2781 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2782 {
2783  /* This string literal is written out to the source file as UTF-8,
2784     and is of the form "before mojibake after", where "mojibake"
2785     is written as the following four unicode code points:
2786        U+6587 CJK UNIFIED IDEOGRAPH-6587
2787        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2788        U+5316 CJK UNIFIED IDEOGRAPH-5316
2789        U+3051 HIRAGANA LETTER KE.
2790      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2791      "before" and "after" are 1 byte per unicode character.
2792
2793      The numbering shown are "columns", which are *byte* numbers within
2794      the line, rather than unicode character numbers.
2795
2796      .................... 000000000.1111111.
2797      .................... 123456789.0123456.  */
2798   const char *content = ("        \"before "
2799                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2800                               UTF-8: 0xE6 0x96 0x87
2801                               C octal escaped UTF-8: \346\226\207
2802                             "column" numbers: 17-19.  */
2803                          "\346\226\207"
2804
2805                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2806                               UTF-8: 0xE5 0xAD 0x97
2807                               C octal escaped UTF-8: \345\255\227
2808                             "column" numbers: 20-22.  */
2809                          "\345\255\227"
2810
2811                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2812                               UTF-8: 0xE5 0x8C 0x96
2813                               C octal escaped UTF-8: \345\214\226
2814                             "column" numbers: 23-25.  */
2815                          "\345\214\226"
2816
2817                          /* U+3051 HIRAGANA LETTER KE
2818                               UTF-8: 0xE3 0x81 0x91
2819                               C octal escaped UTF-8: \343\201\221
2820                             "column" numbers: 26-28.  */
2821                          "\343\201\221"
2822
2823                          /* column numbers 29 onwards
2824                           2333333.33334444444444
2825                           9012345.67890123456789. */
2826                          " after\" /* non-str */\n");
2827   lexer_test test (case_, content, NULL);
2828
2829   /* Verify that we get the expected token back, with the correct
2830      location information.  */
2831   const cpp_token *tok = test.get_token ();
2832   ASSERT_EQ (tok->type, CPP_STRING);
2833   ASSERT_TOKEN_AS_TEXT_EQ
2834     (test.m_parser, tok,
2835      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2836
2837   /* Verify that cpp_interpret_string works.  */
2838   cpp_string dst_string;
2839   const enum cpp_ttype type = CPP_STRING;
2840   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2841                                       &dst_string, type);
2842   ASSERT_TRUE (result);
2843   ASSERT_STREQ
2844     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2845      (const char *)dst_string.text);
2846   free (const_cast <unsigned char *> (dst_string.text));
2847
2848   /* Verify ranges of individual characters.  This no longer includes the
2849      opening quote, but does include the closing quote.
2850      Assuming that both source and execution encodings are UTF-8, we have
2851      a run of 25 octets in each, plus the NUL terminator.  */
2852   for (int i = 0; i < 25; i++)
2853     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2854   /* NUL-terminator should use the closing quote at column 35.  */
2855   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2856
2857   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2858 }
2859
2860 /* Test of string literal concatenation.  */
2861
2862 static void
2863 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2864 {
2865   /* Digits 0-9.
2866      .....................000000000.111111.11112222222222
2867      .....................123456789.012345.67890123456789.  */
2868   const char *content = ("        \"01234\" /* non-str */\n"
2869                          "        \"56789\" /* non-str */\n");
2870   lexer_test test (case_, content, NULL);
2871
2872   location_t input_locs[2];
2873
2874   /* Verify that we get the expected tokens back.  */
2875   auto_vec <cpp_string> input_strings;
2876   const cpp_token *tok_a = test.get_token ();
2877   ASSERT_EQ (tok_a->type, CPP_STRING);
2878   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2879   input_strings.safe_push (tok_a->val.str);
2880   input_locs[0] = tok_a->src_loc;
2881
2882   const cpp_token *tok_b = test.get_token ();
2883   ASSERT_EQ (tok_b->type, CPP_STRING);
2884   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2885   input_strings.safe_push (tok_b->val.str);
2886   input_locs[1] = tok_b->src_loc;
2887
2888   /* Verify that cpp_interpret_string works.  */
2889   cpp_string dst_string;
2890   const enum cpp_ttype type = CPP_STRING;
2891   bool result = cpp_interpret_string (test.m_parser,
2892                                       input_strings.address (), 2,
2893                                       &dst_string, type);
2894   ASSERT_TRUE (result);
2895   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2896   free (const_cast <unsigned char *> (dst_string.text));
2897
2898   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2899   test.m_concats.record_string_concatenation (2, input_locs);
2900
2901   location_t initial_loc = input_locs[0];
2902
2903   /* "01234" on line 1.  */
2904   for (int i = 0; i <= 4; i++)
2905     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
2906   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
2907   for (int i = 5; i <= 10; i++)
2908     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
2909
2910   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2911 }
2912
2913 /* Another test of string literal concatenation.  */
2914
2915 static void
2916 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
2917 {
2918   /* Digits 0-9.
2919      .....................000000000.111.11111112222222
2920      .....................123456789.012.34567890123456.  */
2921   const char *content = ("        \"01\" /* non-str */\n"
2922                          "        \"23\" /* non-str */\n"
2923                          "        \"45\" /* non-str */\n"
2924                          "        \"67\" /* non-str */\n"
2925                          "        \"89\" /* non-str */\n");
2926   lexer_test test (case_, content, NULL);
2927
2928   auto_vec <cpp_string> input_strings;
2929   location_t input_locs[5];
2930
2931   /* Verify that we get the expected tokens back.  */
2932   for (int i = 0; i < 5; i++)
2933     {
2934       const cpp_token *tok = test.get_token ();
2935       ASSERT_EQ (tok->type, CPP_STRING);
2936       input_strings.safe_push (tok->val.str);
2937       input_locs[i] = tok->src_loc;
2938     }
2939
2940   /* Verify that cpp_interpret_string works.  */
2941   cpp_string dst_string;
2942   const enum cpp_ttype type = CPP_STRING;
2943   bool result = cpp_interpret_string (test.m_parser,
2944                                       input_strings.address (), 5,
2945                                       &dst_string, type);
2946   ASSERT_TRUE (result);
2947   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2948   free (const_cast <unsigned char *> (dst_string.text));
2949
2950   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2951   test.m_concats.record_string_concatenation (5, input_locs);
2952
2953   location_t initial_loc = input_locs[0];
2954
2955   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
2956      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
2957      and expect get_source_range_for_substring to fail.
2958      However, for a string concatenation test, we can have a case
2959      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
2960      but subsequent strings can be after it.
2961      Attempting to detect this within assert_char_at_range
2962      would overcomplicate the logic for the common test cases, so
2963      we detect it here.  */
2964   if (should_have_column_data_p (input_locs[0])
2965       && !should_have_column_data_p (input_locs[4]))
2966     {
2967       /* Verify that get_source_range_for_substring gracefully rejects
2968          this case.  */
2969       source_range actual_range;
2970       const char *err
2971         = get_source_range_for_char (test.m_parser, &test.m_concats,
2972                                      initial_loc, type, 0, &actual_range);
2973       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
2974       return;
2975     }
2976
2977   for (int i = 0; i < 5; i++)
2978     for (int j = 0; j < 2; j++)
2979       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
2980                             i + 1, 10 + j, 10 + j);
2981
2982   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
2983   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
2984
2985   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2986 }
2987
2988 /* Another test of string literal concatenation, this time combined with
2989    various kinds of escaped characters.  */
2990
2991 static void
2992 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
2993 {
2994   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
2995      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
2996   const char *content
2997     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
2998        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
2999     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3000   lexer_test test (case_, content, NULL);
3001
3002   auto_vec <cpp_string> input_strings;
3003   location_t input_locs[4];
3004
3005   /* Verify that we get the expected tokens back.  */
3006   for (int i = 0; i < 4; i++)
3007     {
3008       const cpp_token *tok = test.get_token ();
3009       ASSERT_EQ (tok->type, CPP_STRING);
3010       input_strings.safe_push (tok->val.str);
3011       input_locs[i] = tok->src_loc;
3012     }
3013
3014   /* Verify that cpp_interpret_string works.  */
3015   cpp_string dst_string;
3016   const enum cpp_ttype type = CPP_STRING;
3017   bool result = cpp_interpret_string (test.m_parser,
3018                                       input_strings.address (), 4,
3019                                       &dst_string, type);
3020   ASSERT_TRUE (result);
3021   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3022   free (const_cast <unsigned char *> (dst_string.text));
3023
3024   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3025   test.m_concats.record_string_concatenation (4, input_locs);
3026
3027   location_t initial_loc = input_locs[0];
3028
3029   for (int i = 0; i <= 4; i++)
3030     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3031   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3032   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3033   for (int i = 7; i <= 9; i++)
3034     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3035
3036   /* NUL-terminator should use the location of the final closing quote.  */
3037   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3038
3039   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3040 }
3041
3042 /* Test of string literal in a macro.  */
3043
3044 static void
3045 test_lexer_string_locations_macro (const line_table_case &case_)
3046 {
3047   /* Digits 0-9.
3048      .....................0000000001111111111.22222222223.
3049      .....................1234567890123456789.01234567890.  */
3050   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3051                          "  MACRO");
3052   lexer_test test (case_, content, NULL);
3053
3054   /* Verify that we get the expected tokens back.  */
3055   const cpp_token *tok = test.get_token ();
3056   ASSERT_EQ (tok->type, CPP_PADDING);
3057
3058   tok = test.get_token ();
3059   ASSERT_EQ (tok->type, CPP_STRING);
3060   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3061
3062   /* Verify ranges of individual characters.  We ought to
3063      see columns within the macro definition.  */
3064   for (int i = 0; i <= 10; i++)
3065     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3066                           i, 1, 20 + i, 20 + i);
3067
3068   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3069
3070   tok = test.get_token ();
3071   ASSERT_EQ (tok->type, CPP_PADDING);
3072 }
3073
3074 /* Test of stringification of a macro argument.  */
3075
3076 static void
3077 test_lexer_string_locations_stringified_macro_argument
3078   (const line_table_case &case_)
3079 {
3080   /* .....................000000000111111111122222222223.
3081      .....................123456789012345678901234567890.  */
3082   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3083                          "MACRO(foo)\n");
3084   lexer_test test (case_, content, NULL);
3085
3086   /* Verify that we get the expected token back.  */
3087   const cpp_token *tok = test.get_token ();
3088   ASSERT_EQ (tok->type, CPP_PADDING);
3089
3090   tok = test.get_token ();
3091   ASSERT_EQ (tok->type, CPP_STRING);
3092   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3093
3094   /* We don't support getting the location of a stringified macro
3095      argument.  Verify that it fails gracefully.  */
3096   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3097                                   "cpp_interpret_string_1 failed");
3098
3099   tok = test.get_token ();
3100   ASSERT_EQ (tok->type, CPP_PADDING);
3101
3102   tok = test.get_token ();
3103   ASSERT_EQ (tok->type, CPP_PADDING);
3104 }
3105
3106 /* Ensure that we are fail gracefully if something attempts to pass
3107    in a location that isn't a string literal token.  Seen on this code:
3108
3109      const char a[] = " %d ";
3110      __builtin_printf (a, 0.5);
3111                        ^
3112
3113    when c-format.c erroneously used the indicated one-character
3114    location as the format string location, leading to a read past the
3115    end of a string buffer in cpp_interpret_string_1.  */
3116
3117 static void
3118 test_lexer_string_locations_non_string (const line_table_case &case_)
3119 {
3120   /* .....................000000000111111111122222222223.
3121      .....................123456789012345678901234567890.  */
3122   const char *content = ("         a\n");
3123   lexer_test test (case_, content, NULL);
3124
3125   /* Verify that we get the expected token back.  */
3126   const cpp_token *tok = test.get_token ();
3127   ASSERT_EQ (tok->type, CPP_NAME);
3128   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3129
3130   /* At this point, libcpp is attempting to interpret the name as a
3131      string literal, despite it not starting with a quote.  We don't detect
3132      that, but we should at least fail gracefully.  */
3133   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3134                                   "cpp_interpret_string_1 failed");
3135 }
3136
3137 /* Ensure that we can read substring information for a token which
3138    starts in one linemap and ends in another .  Adapted from
3139    gcc.dg/cpp/pr69985.c.  */
3140
3141 static void
3142 test_lexer_string_locations_long_line (const line_table_case &case_)
3143 {
3144   /* .....................000000.000111111111
3145      .....................123456.789012346789.  */
3146   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3147                          "     \"0123456789012345678901234567890123456789"
3148                          "0123456789012345678901234567890123456789"
3149                          "0123456789012345678901234567890123456789"
3150                          "0123456789\"\n");
3151
3152   lexer_test test (case_, content, NULL);
3153
3154   /* Verify that we get the expected token back.  */
3155   const cpp_token *tok = test.get_token ();
3156   ASSERT_EQ (tok->type, CPP_STRING);
3157
3158   if (!should_have_column_data_p (line_table->highest_location))
3159     return;
3160
3161   /* Verify ranges of individual characters.  */
3162   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3163   for (int i = 0; i < 131; i++)
3164     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3165                           i, 2, 7 + i, 7 + i);
3166 }
3167
3168 /* Test of locations within a raw string that doesn't contain a newline.  */
3169
3170 static void
3171 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3172 {
3173   /* .....................00.0000000111111111122.
3174      .....................12.3456789012345678901.  */
3175   const char *content = ("R\"foo(0123456789)foo\"\n");
3176   lexer_test test (case_, content, NULL);
3177
3178   /* Verify that we get the expected token back.  */
3179   const cpp_token *tok = test.get_token ();
3180   ASSERT_EQ (tok->type, CPP_STRING);
3181
3182   /* Verify that cpp_interpret_string works.  */
3183   cpp_string dst_string;
3184   const enum cpp_ttype type = CPP_STRING;
3185   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3186                                       &dst_string, type);
3187   ASSERT_TRUE (result);
3188   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3189   free (const_cast <unsigned char *> (dst_string.text));
3190
3191   if (!should_have_column_data_p (line_table->highest_location))
3192     return;
3193
3194   /* 0-9, plus the nil terminator.  */
3195   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3196   for (int i = 0; i < 11; i++)
3197     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3198                           i, 1, 7 + i, 7 + i);
3199 }
3200
3201 /* Test of locations within a raw string that contains a newline.  */
3202
3203 static void
3204 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3205 {
3206   /* .....................00.0000.
3207      .....................12.3456.  */
3208   const char *content = ("R\"foo(\n"
3209   /* .....................00000.
3210      .....................12345.  */
3211                          "hello\n"
3212                          "world\n"
3213   /* .....................00000.
3214      .....................12345.  */
3215                          ")foo\"\n");
3216   lexer_test test (case_, content, NULL);
3217
3218   /* Verify that we get the expected token back.  */
3219   const cpp_token *tok = test.get_token ();
3220   ASSERT_EQ (tok->type, CPP_STRING);
3221
3222   /* Verify that cpp_interpret_string works.  */
3223   cpp_string dst_string;
3224   const enum cpp_ttype type = CPP_STRING;
3225   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3226                                       &dst_string, type);
3227   ASSERT_TRUE (result);
3228   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3229   free (const_cast <unsigned char *> (dst_string.text));
3230
3231   if (!should_have_column_data_p (line_table->highest_location))
3232     return;
3233
3234   /* Currently we don't support locations within raw strings that
3235      contain newlines.  */
3236   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3237                                   "range endpoints are on different lines");
3238 }
3239
3240 /* Test of lexing char constants.  */
3241
3242 static void
3243 test_lexer_char_constants (const line_table_case &case_)
3244 {
3245   /* Various char constants.
3246      .....................0000000001111111111.22222222223.
3247      .....................1234567890123456789.01234567890.  */
3248   const char *content = ("         'a'\n"
3249                          "        u'a'\n"
3250                          "        U'a'\n"
3251                          "        L'a'\n"
3252                          "         'abc'\n");
3253   lexer_test test (case_, content, NULL);
3254
3255   /* Verify that we get the expected tokens back.  */
3256   /* 'a'.  */
3257   const cpp_token *tok = test.get_token ();
3258   ASSERT_EQ (tok->type, CPP_CHAR);
3259   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3260
3261   unsigned int chars_seen;
3262   int unsignedp;
3263   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3264                                           &chars_seen, &unsignedp);
3265   ASSERT_EQ (cc, 'a');
3266   ASSERT_EQ (chars_seen, 1);
3267
3268   /* u'a'.  */
3269   tok = test.get_token ();
3270   ASSERT_EQ (tok->type, CPP_CHAR16);
3271   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3272
3273   /* U'a'.  */
3274   tok = test.get_token ();
3275   ASSERT_EQ (tok->type, CPP_CHAR32);
3276   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3277
3278   /* L'a'.  */
3279   tok = test.get_token ();
3280   ASSERT_EQ (tok->type, CPP_WCHAR);
3281   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3282
3283   /* 'abc' (c-char-sequence).  */
3284   tok = test.get_token ();
3285   ASSERT_EQ (tok->type, CPP_CHAR);
3286   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3287 }
3288 /* A table of interesting location_t values, giving one axis of our test
3289    matrix.  */
3290
3291 static const location_t boundary_locations[] = {
3292   /* Zero means "don't override the default values for a new line_table".  */
3293   0,
3294
3295   /* An arbitrary non-zero value that isn't close to one of
3296      the boundary values below.  */
3297   0x10000,
3298
3299   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3300   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3301   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3302   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3303   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3304   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3305
3306   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3307   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3308   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3309   LINE_MAP_MAX_LOCATION_WITH_COLS,
3310   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3311   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3312 };
3313
3314 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3315
3316 void
3317 for_each_line_table_case (void (*testcase) (const line_table_case &))
3318 {
3319   /* As noted above in the description of struct line_table_case,
3320      we want to explore a test matrix of interesting line_table
3321      situations, running various selftests for each case within the
3322      matrix.  */
3323
3324   /* Run all tests with:
3325      (a) line_table->default_range_bits == 0, and
3326      (b) line_table->default_range_bits == 5.  */
3327   int num_cases_tested = 0;
3328   for (int default_range_bits = 0; default_range_bits <= 5;
3329        default_range_bits += 5)
3330     {
3331       /* ...and use each of the "interesting" location values as
3332          the starting location within line_table.  */
3333       const int num_boundary_locations
3334         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3335       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3336         {
3337           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3338
3339           testcase (c);
3340
3341           num_cases_tested++;
3342         }
3343     }
3344
3345   /* Verify that we fully covered the test matrix.  */
3346   ASSERT_EQ (num_cases_tested, 2 * 12);
3347 }
3348
3349 /* Run all of the selftests within this file.  */
3350
3351 void
3352 input_c_tests ()
3353 {
3354   test_should_have_column_data_p ();
3355   test_unknown_location ();
3356   test_builtins ();
3357   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3358
3359   for_each_line_table_case (test_accessing_ordinary_linemaps);
3360   for_each_line_table_case (test_lexer);
3361   for_each_line_table_case (test_lexer_string_locations_simple);
3362   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3363   for_each_line_table_case (test_lexer_string_locations_hex);
3364   for_each_line_table_case (test_lexer_string_locations_oct);
3365   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3366   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3367   for_each_line_table_case (test_lexer_string_locations_ucn4);
3368   for_each_line_table_case (test_lexer_string_locations_ucn8);
3369   for_each_line_table_case (test_lexer_string_locations_wide_string);
3370   for_each_line_table_case (test_lexer_string_locations_string16);
3371   for_each_line_table_case (test_lexer_string_locations_string32);
3372   for_each_line_table_case (test_lexer_string_locations_u8);
3373   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3374   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3375   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3376   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3377   for_each_line_table_case (test_lexer_string_locations_macro);
3378   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3379   for_each_line_table_case (test_lexer_string_locations_non_string);
3380   for_each_line_table_case (test_lexer_string_locations_long_line);
3381   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3382   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3383   for_each_line_table_case (test_lexer_char_constants);
3384
3385   test_reading_source_line ();
3386 }
3387
3388 } // namespace selftest
3389
3390 #endif /* CHECKING_P */