gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2016 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   const char *file_path;
  67
  68   FILE *fp;
  69
  70   /* This points to the content of the file that we've read so
  71      far.  */
  72   char *data;
  73
  74   /*  The size of the DATA array above.*/
  75   size_t size;
  76
  77   /* The number of bytes read from the underlying file so far.  This
  78      must be less (or equal) than SIZE above.  */
  79   size_t nb_read;
  80
  81   /* The index of the beginning of the current line.  */
  82   size_t line_start_idx;
  83
  84   /* The number of the previous line read.  This starts at 1.  Zero
  85      means we've read no line so far.  */
  86   size_t line_num;
  87
  88   /* This is the total number of lines of the current file.  At the
  89      moment, we try to get this information from the line map
  90      subsystem.  Note that this is just a hint.  When using the C++
  91      front-end, this hint is correct because the input file is then
  92      completely tokenized before parsing starts; so the line map knows
  93      the number of lines before compilation really starts.  For e.g,
  94      the C front-end, it can happen that we start emitting diagnostics
  95      before the line map has seen the end of the file.  */
  96   size_t total_lines;
  97
  98   /* Could this file be missing a trailing newline on its final line?
  99      Initially true (to cope with empty files), set to true/false
 100      as each line is read.  */
 101   bool missing_trailing_newline;
 102
 103   /* This is a record of the beginning and end of the lines we've seen
 104      while reading the file.  This is useful to avoid walking the data
 105      from the beginning when we are asked to read a line that is
 106      before LINE_START_IDX above.  Note that the maximum size of this
 107      record is fcache_line_record_size, so that the memory consumption
 108      doesn't explode.  We thus scale total_lines down to
 109      fcache_line_record_size.  */
 110   vec<line_info, va_heap> line_record;
 111
 112   fcache ();
 113   ~fcache ();
 114 };
 115
 116 /* Current position in real source file.  */
 117
 118 location_t input_location = UNKNOWN_LOCATION;
 119
 120 struct line_maps *line_table;
 121
 122 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 123    This needs to be a global so that it can be a GC root, and thus
 124    prevent the stashed copy from being garbage-collected if the GC runs
 125    during a line_table_test.  */
 126
 127 struct line_maps *saved_line_table;
 128
 129 static fcache *fcache_tab;
 130 static const size_t fcache_tab_size = 16;
 131 static const size_t fcache_buffer_size = 4 * 1024;
 132 static const size_t fcache_line_record_size = 100;
 133
 134 /* Expand the source location LOC into a human readable location.  If
 135    LOC resolves to a builtin location, the file name of the readable
 136    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 137    TRUE and LOC is virtual, then it is resolved to the expansion
 138    point of the involved macro.  Otherwise, it is resolved to the
 139    spelling location of the token.
 140
 141    When resolving to the spelling location of the token, if the
 142    resulting location is for a built-in location (that is, it has no
 143    associated line/column) in the context of a macro expansion, the
 144    returned location is the first one (while unwinding the macro
 145    location towards its expansion point) that is in real source
 146    code.  */
 147
 148 static expanded_location
 149 expand_location_1 (source_location loc,
 150                    bool expansion_point_p)
 151 {
 152   expanded_location xloc;
 153   const line_map_ordinary *map;
 154   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 155   tree block = NULL;
 156
 157   if (IS_ADHOC_LOC (loc))
 158     {
 159       block = LOCATION_BLOCK (loc);
 160       loc = LOCATION_LOCUS (loc);
 161     }
 162
 163   memset (&xloc, 0, sizeof (xloc));
 164
 165   if (loc >= RESERVED_LOCATION_COUNT)
 166     {
 167       if (!expansion_point_p)
 168         {
 169           /* We want to resolve LOC to its spelling location.
 170
 171              But if that spelling location is a reserved location that
 172              appears in the context of a macro expansion (like for a
 173              location for a built-in token), let's consider the first
 174              location (toward the expansion point) that is not reserved;
 175              that is, the first location that is in real source code.  */
 176           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 177                                                           loc, NULL);
 178           lrk = LRK_SPELLING_LOCATION;
 179         }
 180       loc = linemap_resolve_location (line_table, loc,
 181                                       lrk, &map);
 182       xloc = linemap_expand_location (line_table, map, loc);
 183     }
 184
 185   xloc.data = block;
 186   if (loc <= BUILTINS_LOCATION)
 187     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 188
 189   return xloc;
 190 }
 191
 192 /* Initialize the set of cache used for files accessed by caret
 193    diagnostic.  */
 194
 195 static void
 196 diagnostic_file_cache_init (void)
 197 {
 198   if (fcache_tab == NULL)
 199     fcache_tab = new fcache[fcache_tab_size];
 200 }
 201
 202 /* Free the resources used by the set of cache used for files accessed
 203    by caret diagnostic.  */
 204
 205 void
 206 diagnostic_file_cache_fini (void)
 207 {
 208   if (fcache_tab)
 209     {
 210       delete [] (fcache_tab);
 211       fcache_tab = NULL;
 212     }
 213 }
 214
 215 /* Return the total lines number that have been read so far by the
 216    line map (in the preprocessor) so far.  For languages like C++ that
 217    entirely preprocess the input file before starting to parse, this
 218    equals the actual number of lines of the file.  */
 219
 220 static size_t
 221 total_lines_num (const char *file_path)
 222 {
 223   size_t r = 0;
 224   source_location l = 0;
 225   if (linemap_get_file_highest_location (line_table, file_path, &l))
 226     {
 227       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 228       expanded_location xloc = expand_location (l);
 229       r = xloc.line;
 230     }
 231   return r;
 232 }
 233
 234 /* Lookup the cache used for the content of a given file accessed by
 235    caret diagnostic.  Return the found cached file, or NULL if no
 236    cached file was found.  */
 237
 238 static fcache*
 239 lookup_file_in_cache_tab (const char *file_path)
 240 {
 241   if (file_path == NULL)
 242     return NULL;
 243
 244   diagnostic_file_cache_init ();
 245
 246   /* This will contain the found cached file.  */
 247   fcache *r = NULL;
 248   for (unsigned i = 0; i < fcache_tab_size; ++i)
 249     {
 250       fcache *c = &fcache_tab[i];
 251       if (c->file_path && !strcmp (c->file_path, file_path))
 252         {
 253           ++c->use_count;
 254           r = c;
 255         }
 256     }
 257
 258   if (r)
 259     ++r->use_count;
 260
 261   return r;
 262 }
 263
 264 /* Purge any mention of FILENAME from the cache of files used for
 265    printing source code.  For use in selftests when working
 266    with tempfiles.  */
 267
 268 void
 269 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 270 {
 271   gcc_assert (file_path);
 272
 273   fcache *r = lookup_file_in_cache_tab (file_path);
 274   if (!r)
 275     /* Not found.  */
 276     return;
 277
 278   r->file_path = NULL;
 279   if (r->fp)
 280     fclose (r->fp);
 281   r->fp = NULL;
 282   r->nb_read = 0;
 283   r->line_start_idx = 0;
 284   r->line_num = 0;
 285   r->line_record.truncate (0);
 286   r->use_count = 0;
 287   r->total_lines = 0;
 288   r->missing_trailing_newline = true;
 289 }
 290
 291 /* Return the file cache that has been less used, recently, or the
 292    first empty one.  If HIGHEST_USE_COUNT is non-null,
 293    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 294    in the cache table.  */
 295
 296 static fcache*
 297 evicted_cache_tab_entry (unsigned *highest_use_count)
 298 {
 299   diagnostic_file_cache_init ();
 300
 301   fcache *to_evict = &fcache_tab[0];
 302   unsigned huc = to_evict->use_count;
 303   for (unsigned i = 1; i < fcache_tab_size; ++i)
 304     {
 305       fcache *c = &fcache_tab[i];
 306       bool c_is_empty = (c->file_path == NULL);
 307
 308       if (c->use_count < to_evict->use_count
 309           || (to_evict->file_path && c_is_empty))
 310         /* We evict C because it's either an entry with a lower use
 311            count or one that is empty.  */
 312         to_evict = c;
 313
 314       if (huc < c->use_count)
 315         huc = c->use_count;
 316
 317       if (c_is_empty)
 318         /* We've reached the end of the cache; subsequent elements are
 319            all empty.  */
 320         break;
 321     }
 322
 323   if (highest_use_count)
 324     *highest_use_count = huc;
 325
 326   return to_evict;
 327 }
 328
 329 /* Create the cache used for the content of a given file to be
 330    accessed by caret diagnostic.  This cache is added to an array of
 331    cache and can be retrieved by lookup_file_in_cache_tab.  This
 332    function returns the created cache.  Note that only the last
 333    fcache_tab_size files are cached.  */
 334
 335 static fcache*
 336 add_file_to_cache_tab (const char *file_path)
 337 {
 338
 339   FILE *fp = fopen (file_path, "r");
 340   if (fp == NULL)
 341     return NULL;
 342
 343   unsigned highest_use_count = 0;
 344   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 345   r->file_path = file_path;
 346   if (r->fp)
 347     fclose (r->fp);
 348   r->fp = fp;
 349   r->nb_read = 0;
 350   r->line_start_idx = 0;
 351   r->line_num = 0;
 352   r->line_record.truncate (0);
 353   /* Ensure that this cache entry doesn't get evicted next time
 354      add_file_to_cache_tab is called.  */
 355   r->use_count = ++highest_use_count;
 356   r->total_lines = total_lines_num (file_path);
 357   r->missing_trailing_newline = true;
 358
 359   return r;
 360 }
 361
 362 /* Lookup the cache used for the content of a given file accessed by
 363    caret diagnostic.  If no cached file was found, create a new cache
 364    for this file, add it to the array of cached file and return
 365    it.  */
 366
 367 static fcache*
 368 lookup_or_add_file_to_cache_tab (const char *file_path)
 369 {
 370   fcache *r = lookup_file_in_cache_tab (file_path);
 371   if (r == NULL)
 372     r = add_file_to_cache_tab (file_path);
 373   return r;
 374 }
 375
 376 /* Default constructor for a cache of file used by caret
 377    diagnostic.  */
 378
 379 fcache::fcache ()
 380 : use_count (0), file_path (NULL), fp (NULL), data (0),
 381   size (0), nb_read (0), line_start_idx (0), line_num (0),
 382   total_lines (0), missing_trailing_newline (true)
 383 {
 384   line_record.create (0);
 385 }
 386
 387 /* Destructor for a cache of file used by caret diagnostic.  */
 388
 389 fcache::~fcache ()
 390 {
 391   if (fp)
 392     {
 393       fclose (fp);
 394       fp = NULL;
 395     }
 396   if (data)
 397     {
 398       XDELETEVEC (data);
 399       data = 0;
 400     }
 401   line_record.release ();
 402 }
 403
 404 /* Returns TRUE iff the cache would need to be filled with data coming
 405    from the file.  That is, either the cache is empty or full or the
 406    current line is empty.  Note that if the cache is full, it would
 407    need to be extended and filled again.  */
 408
 409 static bool
 410 needs_read (fcache *c)
 411 {
 412   return (c->nb_read == 0
 413           || c->nb_read == c->size
 414           || (c->line_start_idx >= c->nb_read - 1));
 415 }
 416
 417 /*  Return TRUE iff the cache is full and thus needs to be
 418     extended.  */
 419
 420 static bool
 421 needs_grow (fcache *c)
 422 {
 423   return c->nb_read == c->size;
 424 }
 425
 426 /* Grow the cache if it needs to be extended.  */
 427
 428 static void
 429 maybe_grow (fcache *c)
 430 {
 431   if (!needs_grow (c))
 432     return;
 433
 434   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 435   c->data = XRESIZEVEC (char, c->data, size);
 436   c->size = size;
 437 }
 438
 439 /*  Read more data into the cache.  Extends the cache if need be.
 440     Returns TRUE iff new data could be read.  */
 441
 442 static bool
 443 read_data (fcache *c)
 444 {
 445   if (feof (c->fp) || ferror (c->fp))
 446     return false;
 447
 448   maybe_grow (c);
 449
 450   char * from = c->data + c->nb_read;
 451   size_t to_read = c->size - c->nb_read;
 452   size_t nb_read = fread (from, 1, to_read, c->fp);
 453
 454   if (ferror (c->fp))
 455     return false;
 456
 457   c->nb_read += nb_read;
 458   return !!nb_read;
 459 }
 460
 461 /* Read new data iff the cache needs to be filled with more data
 462    coming from the file FP.  Return TRUE iff the cache was filled with
 463    mode data.  */
 464
 465 static bool
 466 maybe_read_data (fcache *c)
 467 {
 468   if (!needs_read (c))
 469     return false;
 470   return read_data (c);
 471 }
 472
 473 /* Read a new line from file FP, using C as a cache for the data
 474    coming from the file.  Upon successful completion, *LINE is set to
 475    the beginning of the line found.  *LINE points directly in the
 476    line cache and is only valid until the next call of get_next_line.
 477    *LINE_LEN is set to the length of the line.  Note that the line
 478    does not contain any terminal delimiter.  This function returns
 479    true if some data was read or process from the cache, false
 480    otherwise.  Note that subsequent calls to get_next_line might
 481    make the content of *LINE invalid.  */
 482
 483 static bool
 484 get_next_line (fcache *c, char **line, ssize_t *line_len)
 485 {
 486   /* Fill the cache with data to process.  */
 487   maybe_read_data (c);
 488
 489   size_t remaining_size = c->nb_read - c->line_start_idx;
 490   if (remaining_size == 0)
 491     /* There is no more data to process.  */
 492     return false;
 493
 494   char *line_start = c->data + c->line_start_idx;
 495
 496   char *next_line_start = NULL;
 497   size_t len = 0;
 498   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 499   if (line_end == NULL)
 500     {
 501       /* We haven't found the end-of-line delimiter in the cache.
 502          Fill the cache with more data from the file and look for the
 503          '\n'.  */
 504       while (maybe_read_data (c))
 505         {
 506           line_start = c->data + c->line_start_idx;
 507           remaining_size = c->nb_read - c->line_start_idx;
 508           line_end = (char *) memchr (line_start, '\n', remaining_size);
 509           if (line_end != NULL)
 510             {
 511               next_line_start = line_end + 1;
 512               break;
 513             }
 514         }
 515       if (line_end == NULL)
 516         {
 517           /* We've loadded all the file into the cache and still no
 518              '\n'.  Let's say the line ends up at one byte passed the
 519              end of the file.  This is to stay consistent with the case
 520              of when the line ends up with a '\n' and line_end points to
 521              that terminal '\n'.  That consistency is useful below in
 522              the len calculation.  */
 523           line_end = c->data + c->nb_read ;
 524           c->missing_trailing_newline = true;
 525         }
 526       else
 527         c->missing_trailing_newline = false;
 528     }
 529   else
 530     {
 531       next_line_start = line_end + 1;
 532       c->missing_trailing_newline = false;
 533     }
 534
 535   if (ferror (c->fp))
 536     return false;
 537
 538   /* At this point, we've found the end of the of line.  It either
 539      points to the '\n' or to one byte after the last byte of the
 540      file.  */
 541   gcc_assert (line_end != NULL);
 542
 543   len = line_end - line_start;
 544
 545   if (c->line_start_idx < c->nb_read)
 546     *line = line_start;
 547
 548   ++c->line_num;
 549
 550   /* Before we update our line record, make sure the hint about the
 551      total number of lines of the file is correct.  If it's not, then
 552      we give up recording line boundaries from now on.  */
 553   bool update_line_record = true;
 554   if (c->line_num > c->total_lines)
 555     update_line_record = false;
 556
 557     /* Now update our line record so that re-reading lines from the
 558      before c->line_start_idx is faster.  */
 559   if (update_line_record
 560       && c->line_record.length () < fcache_line_record_size)
 561     {
 562       /* If the file lines fits in the line record, we just record all
 563          its lines ...*/
 564       if (c->total_lines <= fcache_line_record_size
 565           && c->line_num > c->line_record.length ())
 566         c->line_record.safe_push (fcache::line_info (c->line_num,
 567                                                  c->line_start_idx,
 568                                                  line_end - c->data));
 569       else if (c->total_lines > fcache_line_record_size)
 570         {
 571           /* ... otherwise, we just scale total_lines down to
 572              (fcache_line_record_size lines.  */
 573           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 574           if (c->line_record.length () == 0
 575               || n >= c->line_record.length ())
 576             c->line_record.safe_push (fcache::line_info (c->line_num,
 577                                                      c->line_start_idx,
 578                                                      line_end - c->data));
 579         }
 580     }
 581
 582   /* Update c->line_start_idx so that it points to the next line to be
 583      read.  */
 584   if (next_line_start)
 585     c->line_start_idx = next_line_start - c->data;
 586   else
 587     /* We didn't find any terminal '\n'.  Let's consider that the end
 588        of line is the end of the data in the cache.  The next
 589        invocation of get_next_line will either read more data from the
 590        underlying file or return false early because we've reached the
 591        end of the file.  */
 592     c->line_start_idx = c->nb_read;
 593
 594   *line_len = len;
 595
 596   return true;
 597 }
 598
 599 /* Consume the next bytes coming from the cache (or from its
 600    underlying file if there are remaining unread bytes in the file)
 601    until we reach the next end-of-line (or end-of-file).  There is no
 602    copying from the cache involved.  Return TRUE upon successful
 603    completion.  */
 604
 605 static bool
 606 goto_next_line (fcache *cache)
 607 {
 608   char *l;
 609   ssize_t len;
 610
 611   return get_next_line (cache, &l, &len);
 612 }
 613
 614 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 615    If the line was read successfully, *LINE points to the beginning
 616    of the line in the file cache and *LINE_LEN is the length of the
 617    line.  *LINE is not nul-terminated, but may contain zero bytes.
 618    *LINE is only valid until the next call of read_line_num.
 619    This function returns bool if a line was read.  */
 620
 621 static bool
 622 read_line_num (fcache *c, size_t line_num,
 623                char **line, ssize_t *line_len)
 624 {
 625   gcc_assert (line_num > 0);
 626
 627   if (line_num <= c->line_num)
 628     {
 629       /* We've been asked to read lines that are before c->line_num.
 630          So lets use our line record (if it's not empty) to try to
 631          avoid re-reading the file from the beginning again.  */
 632
 633       if (c->line_record.is_empty ())
 634         {
 635           c->line_start_idx = 0;
 636           c->line_num = 0;
 637         }
 638       else
 639         {
 640           fcache::line_info *i = NULL;
 641           if (c->total_lines <= fcache_line_record_size)
 642             {
 643               /* In languages where the input file is not totally
 644                  preprocessed up front, the c->total_lines hint
 645                  can be smaller than the number of lines of the
 646                  file.  In that case, only the first
 647                  c->total_lines have been recorded.
 648
 649                  Otherwise, the first c->total_lines we've read have
 650                  their start/end recorded here.  */
 651               i = (line_num <= c->total_lines)
 652                 ? &c->line_record[line_num - 1]
 653                 : &c->line_record[c->total_lines - 1];
 654               gcc_assert (i->line_num <= line_num);
 655             }
 656           else
 657             {
 658               /*  So the file had more lines than our line record
 659                   size.  Thus the number of lines we've recorded has
 660                   been scaled down to fcache_line_reacord_size.  Let's
 661                   pick the start/end of the recorded line that is
 662                   closest to line_num.  */
 663               size_t n = (line_num <= c->total_lines)
 664                 ? line_num * fcache_line_record_size / c->total_lines
 665                 : c ->line_record.length () - 1;
 666               if (n < c->line_record.length ())
 667                 {
 668                   i = &c->line_record[n];
 669                   gcc_assert (i->line_num <= line_num);
 670                 }
 671             }
 672
 673           if (i && i->line_num == line_num)
 674             {
 675               /* We have the start/end of the line.  */
 676               *line = c->data + i->start_pos;
 677               *line_len = i->end_pos - i->start_pos;
 678               return true;
 679             }
 680
 681           if (i)
 682             {
 683               c->line_start_idx = i->start_pos;
 684               c->line_num = i->line_num - 1;
 685             }
 686           else
 687             {
 688               c->line_start_idx = 0;
 689               c->line_num = 0;
 690             }
 691         }
 692     }
 693
 694   /*  Let's walk from line c->line_num up to line_num - 1, without
 695       copying any line.  */
 696   while (c->line_num < line_num - 1)
 697     if (!goto_next_line (c))
 698       return false;
 699
 700   /* The line we want is the next one.  Let's read and copy it back to
 701      the caller.  */
 702   return get_next_line (c, line, line_len);
 703 }
 704
 705 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 706    The line is not nul-terminated.  The returned pointer is only
 707    valid until the next call of location_get_source_line.
 708    Note that the line can contain several null characters,
 709    so LINE_LEN, if non-null, points to the actual length of the line.
 710    If the function fails, NULL is returned.  */
 711
 712 const char *
 713 location_get_source_line (const char *file_path, int line,
 714                           int *line_len)
 715 {
 716   char *buffer = NULL;
 717   ssize_t len;
 718
 719   if (line == 0)
 720     return NULL;
 721
 722   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 723   if (c == NULL)
 724     return NULL;
 725
 726   bool read = read_line_num (c, line, &buffer, &len);
 727
 728   if (read && line_len)
 729     *line_len = len;
 730
 731   return read ? buffer : NULL;
 732 }
 733
 734 /* Determine if FILE_PATH missing a trailing newline on its final line.
 735    Only valid to call once all of the file has been loaded, by
 736    requesting a line number beyond the end of the file.  */
 737
 738 bool
 739 location_missing_trailing_newline (const char *file_path)
 740 {
 741   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 742   if (c == NULL)
 743     return false;
 744
 745   return c->missing_trailing_newline;
 746 }
 747
 748 /* Test if the location originates from the spelling location of a
 749    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 750    virtual) location of a built-in token that appears in the expansion
 751    list of a macro.  Please note that this function also works on
 752    tokens that result from built-in tokens.  For instance, the
 753    function would return true if passed a token "4" that is the result
 754    of the expansion of the built-in __LINE__ macro.  */
 755 bool
 756 is_location_from_builtin_token (source_location loc)
 757 {
 758   const line_map_ordinary *map = NULL;
 759   loc = linemap_resolve_location (line_table, loc,
 760                                   LRK_SPELLING_LOCATION, &map);
 761   return loc == BUILTINS_LOCATION;
 762 }
 763
 764 /* Expand the source location LOC into a human readable location.  If
 765    LOC is virtual, it resolves to the expansion point of the involved
 766    macro.  If LOC resolves to a builtin location, the file name of the
 767    readable location is set to the string "<built-in>".  */
 768
 769 expanded_location
 770 expand_location (source_location loc)
 771 {
 772   return expand_location_1 (loc, /*expansion_point_p=*/true);
 773 }
 774
 775 /* Expand the source location LOC into a human readable location.  If
 776    LOC is virtual, it resolves to the expansion location of the
 777    relevant macro.  If LOC resolves to a builtin location, the file
 778    name of the readable location is set to the string
 779    "<built-in>".  */
 780
 781 expanded_location
 782 expand_location_to_spelling_point (source_location loc)
 783 {
 784   return expand_location_1 (loc, /*expansion_point_p=*/false);
 785 }
 786
 787 /* The rich_location class within libcpp requires a way to expand
 788    source_location instances, and relies on the client code
 789    providing a symbol named
 790      linemap_client_expand_location_to_spelling_point
 791    to do this.
 792
 793    This is the implementation for libcommon.a (all host binaries),
 794    which simply calls into expand_location_to_spelling_point.  */
 795
 796 expanded_location
 797 linemap_client_expand_location_to_spelling_point (source_location loc)
 798 {
 799   return expand_location_to_spelling_point (loc);
 800 }
 801
 802
 803 /* If LOCATION is in a system header and if it is a virtual location for
 804    a token coming from the expansion of a macro, unwind it to the
 805    location of the expansion point of the macro.  Otherwise, just return
 806    LOCATION.
 807
 808    This is used for instance when we want to emit diagnostics about a
 809    token that may be located in a macro that is itself defined in a
 810    system header, for example, for the NULL macro.  In such a case, if
 811    LOCATION were passed directly to diagnostic functions such as
 812    warning_at, the diagnostic would be suppressed (unless
 813    -Wsystem-headers).  */
 814
 815 source_location
 816 expansion_point_location_if_in_system_header (source_location location)
 817 {
 818   if (in_system_header_at (location))
 819     location = linemap_resolve_location (line_table, location,
 820                                          LRK_MACRO_EXPANSION_POINT,
 821                                          NULL);
 822   return location;
 823 }
 824
 825 /* If LOCATION is a virtual location for a token coming from the expansion
 826    of a macro, unwind to the location of the expansion point of the macro.  */
 827
 828 source_location
 829 expansion_point_location (source_location location)
 830 {
 831   return linemap_resolve_location (line_table, location,
 832                                    LRK_MACRO_EXPANSION_POINT, NULL);
 833 }
 834
 835 /* Construct a location with caret at CARET, ranging from START to
 836    finish e.g.
 837
 838                  11111111112
 839         12345678901234567890
 840      522
 841      523   return foo + bar;
 842                   ~~~~^~~~~
 843      524
 844
 845    The location's caret is at the "+", line 523 column 15, but starts
 846    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 847    of "bar" at column 19.  */
 848
 849 location_t
 850 make_location (location_t caret, location_t start, location_t finish)
 851 {
 852   location_t pure_loc = get_pure_location (caret);
 853   source_range src_range;
 854   src_range.m_start = get_start (start);
 855   src_range.m_finish = get_finish (finish);
 856   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 857                                                    pure_loc,
 858                                                    src_range,
 859                                                    NULL);
 860   return combined_loc;
 861 }
 862
#define ONE_K 1024
#define ONE_M (ONE_K * ONE_K)

/* Scale a number for display, yielding an unsigned long:
   - values strictly below 10 K (10 * 1024) are left unscaled,
   - values of at least 10 K but strictly below 10 M are divided
     by 1024,
   - values of at least 10 M (10 * 1024 * 1024) are divided by
     1024 * 1024.
   Note that X is evaluated more than once.  */
#define SCALE(x) ((unsigned long) ((x) < 10 * ONE_K \
                  ? (x) \
                  : ((x) < 10 * ONE_M \
                     ? (x) / ONE_K \
                     : (x) / ONE_M)))

/* For a given integer, yield the unit character matching SCALE:
   - the character ' ' if the number is strictly lower than 10 K
     (in base 2)
   - the character 'k' if the number is at least 10 K (in base 2)
     but strictly lower than 10 M (in base 2)
   - the character 'M' if the number is at least 10 M (in base 2).
   Note that X is evaluated more than once.  */
#define STAT_LABEL(x) ((x) < 10 * ONE_K ? ' ' : ((x) < 10 * ONE_M ? 'k' : 'M'))

/* Display an integer amount as a multiple of 1K or 1M (in base 2).
   Display the correct unit (either k, M, or ' ') after the amount, as
   well.  This expands to two comma-separated arguments: the scaled
   value followed by the unit character.  */
#define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 887
 888 /* Dump statistics to stderr about the memory usage of the line_table
 889    set of line maps.  This also displays some statistics about macro
 890    expansion.  */
 891
 892 void
 893 dump_line_table_statistics (void)
 894 {
 895   struct linemap_stats s;
 896   long total_used_map_size,
 897     macro_maps_size,
 898     total_allocated_map_size;
 899
 900   memset (&s, 0, sizeof (s));
 901
 902   linemap_get_statistics (line_table, &s);
 903
 904   macro_maps_size = s.macro_maps_used_size
 905     + s.macro_maps_locations_size;
 906
 907   total_allocated_map_size = s.ordinary_maps_allocated_size
 908     + s.macro_maps_allocated_size
 909     + s.macro_maps_locations_size;
 910
 911   total_used_map_size = s.ordinary_maps_used_size
 912     + s.macro_maps_used_size
 913     + s.macro_maps_locations_size;
 914
 915   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 916            s.num_expanded_macros);
 917   if (s.num_expanded_macros != 0)
 918     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 919              s.num_macro_tokens / s.num_expanded_macros);
 920   fprintf (stderr,
 921            "\nLine Table allocations during the "
 922            "compilation process\n");
 923   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 924            SCALE (s.num_ordinary_maps_used),
 925            STAT_LABEL (s.num_ordinary_maps_used));
 926   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 927            SCALE (s.ordinary_maps_used_size),
 928            STAT_LABEL (s.ordinary_maps_used_size));
 929   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 930            SCALE (s.num_ordinary_maps_allocated),
 931            STAT_LABEL (s.num_ordinary_maps_allocated));
 932   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 933            SCALE (s.ordinary_maps_allocated_size),
 934            STAT_LABEL (s.ordinary_maps_allocated_size));
 935   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 936            SCALE (s.num_macro_maps_used),
 937            STAT_LABEL (s.num_macro_maps_used));
 938   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 939            SCALE (s.macro_maps_used_size),
 940            STAT_LABEL (s.macro_maps_used_size));
 941   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 942            SCALE (s.macro_maps_locations_size),
 943            STAT_LABEL (s.macro_maps_locations_size));
 944   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 945            SCALE (macro_maps_size),
 946            STAT_LABEL (macro_maps_size));
 947   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 948            SCALE (s.duplicated_macro_maps_locations_size),
 949            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 950   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 951            SCALE (total_allocated_map_size),
 952            STAT_LABEL (total_allocated_map_size));
 953   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 954            SCALE (total_used_map_size),
 955            STAT_LABEL (total_used_map_size));
 956   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 957            SCALE (s.adhoc_table_size),
 958            STAT_LABEL (s.adhoc_table_size));
 959   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 960            s.adhoc_table_entries_used);
 961   fprintf (stderr, "optimized_ranges: %i\n",
 962            line_table->num_optimized_ranges);
 963   fprintf (stderr, "unoptimized_ranges: %i\n",
 964            line_table->num_unoptimized_ranges);
 965
 966   fprintf (stderr, "\n");
 967 }
 968
 969 /* Get location one beyond the final location in ordinary map IDX.  */
 970
 971 static source_location
 972 get_end_location (struct line_maps *set, unsigned int idx)
 973 {
 974   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
 975     return set->highest_location;
 976
 977   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
 978   return MAP_START_LOCATION (next_map);
 979 }
 980
 981 /* Helper function for write_digit_row.  */
 982
 983 static void
 984 write_digit (FILE *stream, int digit)
 985 {
 986   fputc ('0' + (digit % 10), stream);
 987 }
 988
 989 /* Helper function for dump_location_info.
 990    Write a row of numbers to STREAM, numbering a source line,
 991    giving the units, tens, hundreds etc of the column number.  */
 992
 993 static void
 994 write_digit_row (FILE *stream, int indent,
 995                  const line_map_ordinary *map,
 996                  source_location loc, int max_col, int divisor)
 997 {
 998   fprintf (stream, "%*c", indent, ' ');
 999   fprintf (stream, "|");
1000   for (int column = 1; column < max_col; column++)
1001     {
1002       source_location column_loc = loc + (column << map->m_range_bits);
1003       write_digit (stream, column_loc / divisor);
1004     }
1005   fprintf (stream, "\n");
1006 }
1007
1008 /* Write a half-closed (START) / half-open (END) interval of
1009    source_location to STREAM.  */
1010
1011 static void
1012 dump_location_range (FILE *stream,
1013                      source_location start, source_location end)
1014 {
1015   fprintf (stream,
1016            "  source_location interval: %u <= loc < %u\n",
1017            start, end);
1018 }
1019
1020 /* Write a labelled description of a half-closed (START) / half-open (END)
1021    interval of source_location to STREAM.  */
1022
1023 static void
1024 dump_labelled_location_range (FILE *stream,
1025                               const char *name,
1026                               source_location start, source_location end)
1027 {
1028   fprintf (stream, "%s\n", name);
1029   dump_location_range (stream, start, end);
1030   fprintf (stream, "\n");
1031 }
1032
1033 /* Write a visualization of the locations in the line_table to STREAM.  */
1034
1035 void
1036 dump_location_info (FILE *stream)
1037 {
1038   /* Visualize the reserved locations.  */
1039   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1040                                 0, RESERVED_LOCATION_COUNT);
1041
1042   /* Visualize the ordinary line_map instances, rendering the sources. */
1043   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1044     {
1045       source_location end_location = get_end_location (line_table, idx);
1046       /* half-closed: doesn't include this one. */
1047
1048       const line_map_ordinary *map
1049         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1050       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1051       dump_location_range (stream,
1052                            MAP_START_LOCATION (map), end_location);
1053       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1054       fprintf (stream, "  starting at line: %i\n",
1055                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1056       fprintf (stream, "  column and range bits: %i\n",
1057                map->m_column_and_range_bits);
1058       fprintf (stream, "  column bits: %i\n",
1059                map->m_column_and_range_bits - map->m_range_bits);
1060       fprintf (stream, "  range bits: %i\n",
1061                map->m_range_bits);
1062
1063       /* Render the span of source lines that this "map" covers.  */
1064       for (source_location loc = MAP_START_LOCATION (map);
1065            loc < end_location;
1066            loc += (1 << map->m_range_bits) )
1067         {
1068           gcc_assert (pure_location_p (line_table, loc) );
1069
1070           expanded_location exploc
1071             = linemap_expand_location (line_table, map, loc);
1072
1073           if (0 == exploc.column)
1074             {
1075               /* Beginning of a new source line: draw the line.  */
1076
1077               int line_size;
1078               const char *line_text = location_get_source_line (exploc.file,
1079                                                                 exploc.line,
1080                                                                 &line_size);
1081               if (!line_text)
1082                 break;
1083               fprintf (stream,
1084                        "%s:%3i|loc:%5i|%.*s\n",
1085                        exploc.file, exploc.line,
1086                        loc,
1087                        line_size, line_text);
1088
1089               /* "loc" is at column 0, which means "the whole line".
1090                  Render the locations *within* the line, by underlining
1091                  it, showing the source_location numeric values
1092                  at each column.  */
1093               int max_col = (1 << map->m_column_and_range_bits) - 1;
1094               if (max_col > line_size)
1095                 max_col = line_size + 1;
1096
1097               int indent = 14 + strlen (exploc.file);
1098
1099               /* Thousands.  */
1100               if (end_location > 999)
1101                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1102
1103               /* Hundreds.  */
1104               if (end_location > 99)
1105                 write_digit_row (stream, indent, map, loc, max_col, 100);
1106
1107               /* Tens.  */
1108               write_digit_row (stream, indent, map, loc, max_col, 10);
1109
1110               /* Units.  */
1111               write_digit_row (stream, indent, map, loc, max_col, 1);
1112             }
1113         }
1114       fprintf (stream, "\n");
1115     }
1116
1117   /* Visualize unallocated values.  */
1118   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1119                                 line_table->highest_location,
1120                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1121
1122   /* Visualize the macro line_map instances, rendering the sources. */
1123   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1124     {
1125       /* Each macro map that is allocated owns source_location values
1126          that are *lower* that the one before them.
1127          Hence it's meaningful to view them either in order of ascending
1128          source locations, or in order of ascending macro map index.  */
1129       const bool ascending_source_locations = true;
1130       unsigned int idx = (ascending_source_locations
1131                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1132                           : i);
1133       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1134       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1135                idx,
1136                linemap_map_get_macro_name (map),
1137                MACRO_MAP_NUM_MACRO_TOKENS (map));
1138       dump_location_range (stream,
1139                            map->start_location,
1140                            (map->start_location
1141                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1142       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1143               "expansion point is location %i",
1144               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1145       fprintf (stream, "  map->start_location: %u\n",
1146                map->start_location);
1147
1148       fprintf (stream, "  macro_locations:\n");
1149       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1150         {
1151           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1152           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1153
1154           /* linemap_add_macro_token encodes token numbers in an expansion
1155              by putting them after MAP_START_LOCATION. */
1156
1157           /* I'm typically seeing 4 uninitialized entries at the end of
1158              0xafafafaf.
1159              This appears to be due to macro.c:replace_args
1160              adding 2 extra args for padding tokens; presumably there may
1161              be a leading and/or trailing padding token injected,
1162              each for 2 more location slots.
1163              This would explain there being up to 4 source_locations slots
1164              that may be uninitialized.  */
1165
1166           fprintf (stream, "    %u: %u, %u\n",
1167                    i,
1168                    x,
1169                    y);
1170           if (x == y)
1171             {
1172               if (x < MAP_START_LOCATION (map))
1173                 inform (x, "token %u has x-location == y-location == %u", i, x);
1174               else
1175                 fprintf (stream,
1176                          "x-location == y-location == %u encodes token # %u\n",
1177                          x, x - MAP_START_LOCATION (map));
1178                 }
1179           else
1180             {
1181               inform (x, "token %u has x-location == %u", i, x);
1182               inform (x, "token %u has y-location == %u", i, y);
1183             }
1184         }
1185       fprintf (stream, "\n");
1186     }
1187
1188   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1189      macro map, presumably due to an off-by-one error somewhere
1190      between the logic in linemap_enter_macro and
1191      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1192   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1193                                 MAX_SOURCE_LOCATION,
1194                                 MAX_SOURCE_LOCATION + 1);
1195
1196   /* Visualize ad-hoc values.  */
1197   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1198                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1199 }
1200
1201 /* string_concat's constructor.  */
1202
1203 string_concat::string_concat (int num, location_t *locs)
1204   : m_num (num)
1205 {
1206   m_locs = ggc_vec_alloc <location_t> (num);
1207   for (int i = 0; i < num; i++)
1208     m_locs[i] = locs[i];
1209 }
1210
1211 /* string_concat_db's constructor.  */
1212
1213 string_concat_db::string_concat_db ()
1214 {
1215   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1216 }
1217
1218 /* Record that a string concatenation occurred, covering NUM
1219    string literal tokens.  LOCS is an array of size NUM, containing the
1220    locations of the tokens.  A copy of LOCS is taken.  */
1221
1222 void
1223 string_concat_db::record_string_concatenation (int num, location_t *locs)
1224 {
1225   gcc_assert (num > 1);
1226   gcc_assert (locs);
1227
1228   location_t key_loc = get_key_loc (locs[0]);
1229
1230   string_concat *concat
1231     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1232   m_table->put (key_loc, concat);
1233 }
1234
1235 /* Determine if LOC was the location of the the initial token of a
1236    concatenation of string literal tokens.
1237    If so, *OUT_NUM is written to with the number of tokens, and
1238    *OUT_LOCS with the location of an array of locations of the
1239    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1240    storage owned by the string_concat_db.
1241    Otherwise, return false.  */
1242
1243 bool
1244 string_concat_db::get_string_concatenation (location_t loc,
1245                                             int *out_num,
1246                                             location_t **out_locs)
1247 {
1248   gcc_assert (out_num);
1249   gcc_assert (out_locs);
1250
1251   location_t key_loc = get_key_loc (loc);
1252
1253   string_concat **concat = m_table->get (key_loc);
1254   if (!concat)
1255     return false;
1256
1257   *out_num = (*concat)->m_num;
1258   *out_locs =(*concat)->m_locs;
1259   return true;
1260 }
1261
1262 /* Internal function.  Canonicalize LOC into a form suitable for
1263    use as a key within the database, stripping away macro expansion,
1264    ad-hoc information, and range information, using the location of
1265    the start of LOC within an ordinary linemap.  */
1266
1267 location_t
1268 string_concat_db::get_key_loc (location_t loc)
1269 {
1270   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1271                                   NULL);
1272
1273   loc = get_range_from_loc (line_table, loc).m_start;
1274
1275   return loc;
1276 }
1277
1278 /* Helper class for use within get_substring_ranges_for_loc.
1279    An vec of cpp_string with responsibility for releasing all of the
1280    str->text for each str in the vector.  */
1281
1282 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1283 {
1284  public:
1285   auto_cpp_string_vec (int alloc)
1286     : auto_vec <cpp_string> (alloc) {}
1287
1288   ~auto_cpp_string_vec ()
1289   {
1290     /* Clean up the copies within this vec.  */
1291     int i;
1292     cpp_string *str;
1293     FOR_EACH_VEC_ELT (*this, i, str)
1294       free (const_cast <unsigned char *> (str->text));
1295   }
1296 };
1297
1298 /* Attempt to populate RANGES with source location information on the
1299    individual characters within the string literal found at STRLOC.
1300    If CONCATS is non-NULL, then any string literals that the token at
1301    STRLOC  was concatenated with are also added to RANGES.
1302
1303    Return NULL if successful, or an error message if any errors occurred (in
1304    which case RANGES may be only partially populated and should not
1305    be used).
1306
1307    This is implemented by re-parsing the relevant source line(s).  */
1308
1309 static const char *
1310 get_substring_ranges_for_loc (cpp_reader *pfile,
1311                               string_concat_db *concats,
1312                               location_t strloc,
1313                               enum cpp_ttype type,
1314                               cpp_substring_ranges &ranges)
1315 {
1316   gcc_assert (pfile);
1317
1318   if (strloc == UNKNOWN_LOCATION)
1319     return "unknown location";
1320
1321   /* If string concatenation has occurred at STRLOC, get the locations
1322      of all of the literal tokens making up the compound string.
1323      Otherwise, just use STRLOC.  */
1324   int num_locs = 1;
1325   location_t *strlocs = &strloc;
1326   if (concats)
1327     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1328
1329   auto_cpp_string_vec strs (num_locs);
1330   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1331   for (int i = 0; i < num_locs; i++)
1332     {
1333       /* Get range of strloc.  We will use it to locate the start and finish
1334          of the literal token within the line.  */
1335       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1336
1337       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1338         /* If the string is within a macro expansion, we can't get at the
1339            end location.  */
1340         return "macro expansion";
1341
1342       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1343         /* If so, we can't reliably determine where the token started within
1344            its line.  */
1345         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1346
1347       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1348         /* If so, we can't reliably determine where the token finished within
1349            its line.  */
1350         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1351
1352       expanded_location start
1353         = expand_location_to_spelling_point (src_range.m_start);
1354       expanded_location finish
1355         = expand_location_to_spelling_point (src_range.m_finish);
1356       if (start.file != finish.file)
1357         return "range endpoints are in different files";
1358       if (start.line != finish.line)
1359         return "range endpoints are on different lines";
1360       if (start.column > finish.column)
1361         return "range endpoints are reversed";
1362
1363       int line_width;
1364       const char *line = location_get_source_line (start.file, start.line,
1365                                                    &line_width);
1366       if (line == NULL)
1367         return "unable to read source line";
1368
1369       /* Determine the location of the literal (including quotes
1370          and leading prefix chars, such as the 'u' in a u""
1371          token).  */
1372       const char *literal = line + start.column - 1;
1373       int literal_length = finish.column - start.column + 1;
1374
1375       gcc_assert (line_width >= (start.column - 1 + literal_length));
1376       cpp_string from;
1377       from.len = literal_length;
1378       /* Make a copy of the literal, to avoid having to rely on
1379          the lifetime of the copy of the line within the cache.
1380          This will be released by the auto_cpp_string_vec dtor.  */
1381       from.text = XDUPVEC (unsigned char, literal, literal_length);
1382       strs.safe_push (from);
1383
1384       /* For very long lines, a new linemap could have started
1385          halfway through the token.
1386          Ensure that the loc_reader uses the linemap of the
1387          *end* of the token for its start location.  */
1388       const line_map_ordinary *final_ord_map;
1389       linemap_resolve_location (line_table, src_range.m_finish,
1390                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1391       location_t start_loc
1392         = linemap_position_for_line_and_column (line_table, final_ord_map,
1393                                                 start.line, start.column);
1394
1395       cpp_string_location_reader loc_reader (start_loc, line_table);
1396       loc_readers.safe_push (loc_reader);
1397     }
1398
1399   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1400   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1401                                                  loc_readers.address (),
1402                                                  num_locs, &ranges, type);
1403   if (err)
1404     return err;
1405
1406   /* Success: "ranges" should now contain information on the string.  */
1407   return NULL;
1408 }
1409
1410 /* Attempt to populate *OUT_LOC with source location information on the
1411    given characters within the string literal found at STRLOC.
1412    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1413    character set.
1414
1415    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1416    and string literal "012345\n789"
1417    *OUT_LOC is written to with:
1418      "012345\n789"
1419          ~^~~~~
1420
1421    If CONCATS is non-NULL, then any string literals that the token at
1422    STRLOC was concatenated with are also considered.
1423
1424    This is implemented by re-parsing the relevant source line(s).
1425
1426    Return NULL if successful, or an error message if any errors occurred.
1427    Error messages are intended for GCC developers (to help debugging) rather
1428    than for end-users.  */
1429
1430 const char *
1431 get_source_location_for_substring (cpp_reader *pfile,
1432                                    string_concat_db *concats,
1433                                    location_t strloc,
1434                                    enum cpp_ttype type,
1435                                    int caret_idx, int start_idx, int end_idx,
1436                                    source_location *out_loc)
1437 {
1438   gcc_checking_assert (caret_idx >= 0);
1439   gcc_checking_assert (start_idx >= 0);
1440   gcc_checking_assert (end_idx >= 0);
1441   gcc_assert (out_loc);
1442
1443   cpp_substring_ranges ranges;
1444   const char *err
1445     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1446   if (err)
1447     return err;
1448
1449   if (caret_idx >= ranges.get_num_ranges ())
1450     return "caret_idx out of range";
1451   if (start_idx >= ranges.get_num_ranges ())
1452     return "start_idx out of range";
1453   if (end_idx >= ranges.get_num_ranges ())
1454     return "end_idx out of range";
1455
1456   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1457                             ranges.get_range (start_idx).m_start,
1458                             ranges.get_range (end_idx).m_finish);
1459   return NULL;
1460 }
1461
1462 #if CHECKING_P
1463
1464 namespace selftest {
1465
1466 /* Selftests of location handling.  */
1467
1468 /* Attempt to populate *OUT_RANGE with source location information on the
1469    given character within the string literal found at STRLOC.
1470    CHAR_IDX refers to an offset within the execution character set.
1471    If CONCATS is non-NULL, then any string literals that the token at
1472    STRLOC was concatenated with are also considered.
1473
1474    This is implemented by re-parsing the relevant source line(s).
1475
1476    Return NULL if successful, or an error message if any errors occurred.
1477    Error messages are intended for GCC developers (to help debugging) rather
1478    than for end-users.  */
1479
1480 static const char *
1481 get_source_range_for_char (cpp_reader *pfile,
1482                            string_concat_db *concats,
1483                            location_t strloc,
1484                            enum cpp_ttype type,
1485                            int char_idx,
1486                            source_range *out_range)
1487 {
1488   gcc_checking_assert (char_idx >= 0);
1489   gcc_assert (out_range);
1490
1491   cpp_substring_ranges ranges;
1492   const char *err
1493     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1494   if (err)
1495     return err;
1496
1497   if (char_idx >= ranges.get_num_ranges ())
1498     return "char_idx out of range";
1499
1500   *out_range = ranges.get_range (char_idx);
1501   return NULL;
1502 }
1503
1504 /* As get_source_range_for_char, but write to *OUT the number
1505    of ranges that are available.  */
1506
1507 static const char *
1508 get_num_source_ranges_for_substring (cpp_reader *pfile,
1509                                      string_concat_db *concats,
1510                                      location_t strloc,
1511                                      enum cpp_ttype type,
1512                                      int *out)
1513 {
1514   gcc_assert (out);
1515
1516   cpp_substring_ranges ranges;
1517   const char *err
1518     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1519
1520   if (err)
1521     return err;
1522
1523   *out = ranges.get_num_ranges ();
1524   return NULL;
1525 }
1526
1527 /* Selftests of location handling.  */
1528
1529 /* Helper function for verifying location data: when location_t
1530    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1531    as having column 0.  */
1532
1533 static bool
1534 should_have_column_data_p (location_t loc)
1535 {
1536   if (IS_ADHOC_LOC (loc))
1537     loc = get_location_from_adhoc_loc (line_table, loc);
1538   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1539     return false;
1540   return true;
1541 }
1542
/* Selftest for should_have_column_data_p, checking a low location and
   the two locations on either side of the threshold.  */

static void
test_should_have_column_data_p ()
{
  /* The first location after the reserved ones has column data.  */
  ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
  /* The threshold value itself still has column data...  */
  ASSERT_TRUE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
  /* ...but the value immediately above it does not.  */
  ASSERT_FALSE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
}
1554
1555 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1556    on LOC.  */
1557
1558 static void
1559 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1560               location_t loc)
1561 {
1562   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1563   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1564   /* If location_t values are sufficiently high, then column numbers
1565      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1566      When close to the threshold, column numbers *may* be present: if
1567      the final linemap before the threshold contains a line that straddles
1568      the threshold, locations in that line have column information.  */
1569   if (should_have_column_data_p (loc))
1570     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1571 }
1572
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for source_location/location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following struct describes a particular case within our test
   matrix.  */

struct line_table_case
{
  /* The value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If non-zero, the value at which the line_table's highest_location
     and highest_line start out.  */
  int m_base_location;

  /* Build a case from the given DEFAULT_RANGE_BITS and BASE_LOCATION.  */
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }
};
1599
1600 /* Constructor.  Store the old value of line_table, and create a new
1601    one, using sane defaults.  */
1602
1603 line_table_test::line_table_test ()
1604 {
1605   gcc_assert (saved_line_table == NULL);
1606   saved_line_table = line_table;
1607   line_table = ggc_alloc<line_maps> ();
1608   linemap_init (line_table, BUILTINS_LOCATION);
1609   gcc_assert (saved_line_table->reallocator);
1610   line_table->reallocator = saved_line_table->reallocator;
1611   gcc_assert (saved_line_table->round_alloc_size);
1612   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1613   line_table->default_range_bits = 0;
1614 }
1615
1616 /* Constructor.  Store the old value of line_table, and create a new
1617    one, using the sitation described in CASE_.  */
1618
1619 line_table_test::line_table_test (const line_table_case &case_)
1620 {
1621   gcc_assert (saved_line_table == NULL);
1622   saved_line_table = line_table;
1623   line_table = ggc_alloc<line_maps> ();
1624   linemap_init (line_table, BUILTINS_LOCATION);
1625   gcc_assert (saved_line_table->reallocator);
1626   line_table->reallocator = saved_line_table->reallocator;
1627   gcc_assert (saved_line_table->round_alloc_size);
1628   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1629   line_table->default_range_bits = case_.m_default_range_bits;
1630   if (case_.m_base_location)
1631     {
1632       line_table->highest_location = case_.m_base_location;
1633       line_table->highest_line = case_.m_base_location;
1634     }
1635 }
1636
1637 /* Destructor.  Restore the old value of line_table.  */
1638
1639 line_table_test::~line_table_test ()
1640 {
1641   gcc_assert (saved_line_table != NULL);
1642   line_table = saved_line_table;
1643   saved_line_table = NULL;
1644 }
1645
/* Verify basic operation of ordinary linemaps: build locations in two
   files within a temporary line_table configured per CASE_, check that
   file/line/column can be recovered from them, and check that a range
   built with make_location can be taken apart again.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  /* Swap in a temporary line_table for the duration of this function.  */
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations. */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  /* Two locations on line 1 of foo.c.  */
  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  /* Two locations on line 2 of foo.c.  */
  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("bar.c", 1, 150, loc_f);

  /* A location created above is neither from a builtin token nor
     anything other than a pure location.  */
  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it.  The caret is loc_c, the range runs from loc_b to
     loc_d.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
1696
1697 /* Verify various properties of UNKNOWN_LOCATION.  */
1698
1699 static void
1700 test_unknown_location ()
1701 {
1702   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1703   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1704   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1705 }
1706
1707 /* Verify various properties of BUILTINS_LOCATION.  */
1708
1709 static void
1710 test_builtins ()
1711 {
1712   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1713   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1714 }
1715
1716 /* Regression test for make_location.
1717    Ensure that we use pure locations for the start/finish of the range,
1718    rather than storing a packed or ad-hoc range as the start/finish.  */
1719
1720 static void
1721 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1722 {
1723   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1724      with C++ frontend.
1725      ....................0000000001111111111222.
1726      ....................1234567890123456789012.  */
1727   const char *content = "     r += !aaa == bbb;\n";
1728   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1729   line_table_test ltt (case_);
1730   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1731
1732   const location_t c11 = linemap_position_for_column (line_table, 11);
1733   const location_t c12 = linemap_position_for_column (line_table, 12);
1734   const location_t c13 = linemap_position_for_column (line_table, 13);
1735   const location_t c14 = linemap_position_for_column (line_table, 14);
1736   const location_t c21 = linemap_position_for_column (line_table, 21);
1737
1738   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1739     return;
1740
1741   /* Use column 13 for the caret location, arbitrarily, to verify that we
1742      handle start != caret.  */
1743   const location_t aaa = make_location (c13, c12, c14);
1744   ASSERT_EQ (c13, get_pure_location (aaa));
1745   ASSERT_EQ (c12, get_start (aaa));
1746   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1747   ASSERT_EQ (c14, get_finish (aaa));
1748   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1749
1750   /* Make a location using a location with a range as the start-point.  */
1751   const location_t not_aaa = make_location (c11, aaa, c14);
1752   ASSERT_EQ (c11, get_pure_location (not_aaa));
1753   /* It should use the start location of the range, not store the range
1754      itself.  */
1755   ASSERT_EQ (c12, get_start (not_aaa));
1756   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1757   ASSERT_EQ (c14, get_finish (not_aaa));
1758   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1759
1760   /* Similarly, make a location with a range as the end-point.  */
1761   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1762   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1763   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1764   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1765   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1766   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1767   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1768   /* It should use the finish location of the range, not store the range
1769      itself.  */
1770   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1771   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1772   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1773   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1774   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1775 }
1776
1777 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1778
1779 static void
1780 test_reading_source_line ()
1781 {
1782   /* Create a tempfile and write some text to it.  */
1783   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1784                         "01234567890123456789\n"
1785                         "This is the test text\n"
1786                         "This is the 3rd line");
1787
1788   /* Read back a specific line from the tempfile.  */
1789   int line_size;
1790   const char *source_line = location_get_source_line (tmp.get_filename (),
1791                                                       3, &line_size);
1792   ASSERT_TRUE (source_line != NULL);
1793   ASSERT_EQ (20, line_size);
1794   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1795                          source_line, line_size));
1796
1797   source_line = location_get_source_line (tmp.get_filename (),
1798                                           2, &line_size);
1799   ASSERT_TRUE (source_line != NULL);
1800   ASSERT_EQ (21, line_size);
1801   ASSERT_TRUE (!strncmp ("This is the test text",
1802                          source_line, line_size));
1803
1804   source_line = location_get_source_line (tmp.get_filename (),
1805                                           4, &line_size);
1806   ASSERT_TRUE (source_line == NULL);
1807 }
1808
1809 /* Tests of lexing.  */
1810
/* Assert that cpp_token_as_text, applied to token TOKEN of cpp_reader
   READER, yields a string equal to EXPECTED.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(READER, TOKEN, EXPECTED)                \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((READER), (TOKEN));  \
    ASSERT_STREQ ((EXPECTED), (const char *)actual_txt);                \
  SELFTEST_END_STMT
1819
1820 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1821    and ranges from EXP_START_COL to EXP_FINISH_COL.
1822    Use LOC as the effective location of the selftest.  */
1823
1824 static void
1825 assert_token_loc_eq (const location &loc,
1826                      const cpp_token *tok,
1827                      const char *exp_filename, int exp_linenum,
1828                      int exp_start_col, int exp_finish_col)
1829 {
1830   location_t tok_loc = tok->src_loc;
1831   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1832   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1833
1834   /* If location_t values are sufficiently high, then column numbers
1835      will be unavailable.  */
1836   if (!should_have_column_data_p (tok_loc))
1837     return;
1838
1839   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1840   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1841   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1842   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1843 }
1844
/* Wrapper around assert_token_loc_eq that checks TOKEN->src_loc and
   reports any failure at the point of use (SELFTEST_LOCATION).  */

#define ASSERT_TOKEN_LOC_EQ(TOKEN, FILENAME, LINENUM, START_COL, FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOKEN), (FILENAME),               \
                       (LINENUM), (START_COL), (FINISH_COL))
1852
1853 /* Test of lexing a file using libcpp, verifying tokens and their
1854    location information.  */
1855
1856 static void
1857 test_lexer (const line_table_case &case_)
1858 {
1859   /* Create a tempfile and write some text to it.  */
1860   const char *content =
1861     /*00000000011111111112222222222333333.3333444444444.455555555556
1862       12345678901234567890123456789012345.6789012345678.901234567890.  */
1863     ("test_name /* c-style comment */\n"
1864      "                                  \"test literal\"\n"
1865      " // test c++-style comment\n"
1866      "   42\n");
1867   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
1868
1869   line_table_test ltt (case_);
1870
1871   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
1872
1873   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
1874   ASSERT_NE (fname, NULL);
1875
1876   /* Verify that we get the expected tokens back, with the correct
1877      location information.  */
1878
1879   location_t loc;
1880   const cpp_token *tok;
1881   tok = cpp_get_token_with_location (parser, &loc);
1882   ASSERT_NE (tok, NULL);
1883   ASSERT_EQ (tok->type, CPP_NAME);
1884   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
1885   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
1886
1887   tok = cpp_get_token_with_location (parser, &loc);
1888   ASSERT_NE (tok, NULL);
1889   ASSERT_EQ (tok->type, CPP_STRING);
1890   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
1891   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
1892
1893   tok = cpp_get_token_with_location (parser, &loc);
1894   ASSERT_NE (tok, NULL);
1895   ASSERT_EQ (tok->type, CPP_NUMBER);
1896   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
1897   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
1898
1899   tok = cpp_get_token_with_location (parser, &loc);
1900   ASSERT_NE (tok, NULL);
1901   ASSERT_EQ (tok->type, CPP_EOF);
1902
1903   cpp_finish (parser, NULL);
1904   cpp_destroy (parser);
1905 }
1906
1907 /* Forward decls.  */
1908
1909 struct lexer_test;
1910 class lexer_test_options;
1911
1912 /* A class for specifying options of a lexer_test.
1913    The "apply" vfunc is called during the lexer_test constructor.  */
1914
1915 class lexer_test_options
1916 {
1917  public:
1918   virtual void apply (lexer_test &) = 0;
1919 };
1920
1921 /* A struct for writing lexer tests.  */
1922
1923 struct lexer_test
1924 {
1925   lexer_test (const line_table_case &case_, const char *content,
1926               lexer_test_options *options);
1927   ~lexer_test ();
1928
1929   const cpp_token *get_token ();
1930
1931   temp_source_file m_tempfile;
1932   line_table_test m_ltt;
1933   cpp_reader *m_parser;
1934   string_concat_db m_concats;
1935 };
1936
1937 /* Use an EBCDIC encoding for the execution charset, specifically
1938    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
1939
1940    This exercises iconv integration within libcpp.
1941    Not every build of iconv supports the given charset,
1942    so we need to flag this error and handle it gracefully.  */
1943
1944 class ebcdic_execution_charset : public lexer_test_options
1945 {
1946  public:
1947   ebcdic_execution_charset () : m_num_iconv_errors (0)
1948     {
1949       gcc_assert (s_singleton == NULL);
1950       s_singleton = this;
1951     }
1952   ~ebcdic_execution_charset ()
1953     {
1954       gcc_assert (s_singleton == this);
1955       s_singleton = NULL;
1956     }
1957
1958   void apply (lexer_test &test) FINAL OVERRIDE
1959   {
1960     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
1961     cpp_opts->narrow_charset = "IBM1047";
1962
1963     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
1964     callbacks->error = on_error;
1965   }
1966
1967   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
1968                         int level ATTRIBUTE_UNUSED,
1969                         int reason ATTRIBUTE_UNUSED,
1970                         rich_location *richloc ATTRIBUTE_UNUSED,
1971                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
1972     ATTRIBUTE_FPTR_PRINTF(5,0)
1973   {
1974     gcc_assert (s_singleton);
1975     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
1976        when the local iconv build doesn't support the conversion.  */
1977     if (strstr (msgid, "not supported by iconv"))
1978       {
1979         s_singleton->m_num_iconv_errors++;
1980         return true;
1981       }
1982
1983     /* Otherwise, we have an unexpected error.  */
1984     abort ();
1985   }
1986
1987   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
1988
1989  private:
1990   static ebcdic_execution_charset *s_singleton;
1991   int m_num_iconv_errors;
1992 };
1993
1994 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
1995
1996 /* Constructor.  Override line_table with a new instance based on CASE_,
1997    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
1998    start parsing the tempfile.  */
1999
2000 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2001                         lexer_test_options *options) :
2002   /* Create a tempfile and write the text to it.  */
2003   m_tempfile (SELFTEST_LOCATION, ".c", content),
2004   m_ltt (case_),
2005   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2006   m_concats ()
2007 {
2008   if (options)
2009     options->apply (*this);
2010
2011   cpp_init_iconv (m_parser);
2012
2013   /* Parse the file.  */
2014   const char *fname = cpp_read_main_file (m_parser,
2015                                           m_tempfile.get_filename ());
2016   ASSERT_NE (fname, NULL);
2017 }
2018
2019 /* Destructor.  Verify that the next token in m_parser is EOF.  */
2020
2021 lexer_test::~lexer_test ()
2022 {
2023   location_t loc;
2024   const cpp_token *tok;
2025
2026   tok = cpp_get_token_with_location (m_parser, &loc);
2027   ASSERT_NE (tok, NULL);
2028   ASSERT_EQ (tok->type, CPP_EOF);
2029
2030   cpp_finish (m_parser, NULL);
2031   cpp_destroy (m_parser);
2032 }
2033
2034 /* Get the next token from m_parser.  */
2035
2036 const cpp_token *
2037 lexer_test::get_token ()
2038 {
2039   location_t loc;
2040   const cpp_token *tok;
2041
2042   tok = cpp_get_token_with_location (m_parser, &loc);
2043   ASSERT_NE (tok, NULL);
2044   return tok;
2045 }
2046
2047 /* Verify that locations within string literals are correctly handled.  */
2048
2049 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2050    using the string concatenation database for TEST.
2051
2052    Assert that the character at index IDX is on EXPECTED_LINE,
2053    and that it begins at column EXPECTED_START_COL and ends at
2054    EXPECTED_FINISH_COL (unless the locations are beyond
2055    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2056    columns).  */
2057
2058 static void
2059 assert_char_at_range (const location &loc,
2060                       lexer_test& test,
2061                       location_t strloc, enum cpp_ttype type, int idx,
2062                       int expected_line, int expected_start_col,
2063                       int expected_finish_col)
2064 {
2065   cpp_reader *pfile = test.m_parser;
2066   string_concat_db *concats = &test.m_concats;
2067
2068   source_range actual_range;
2069   const char *err
2070     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2071                                  &actual_range);
2072   if (should_have_column_data_p (strloc))
2073     ASSERT_EQ_AT (loc, NULL, err);
2074   else
2075     {
2076       ASSERT_STREQ_AT (loc,
2077                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2078                        err);
2079       return;
2080     }
2081
2082   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2083   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2084   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2085   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2086
2087   if (should_have_column_data_p (actual_range.m_start))
2088     {
2089       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2090       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2091     }
2092   if (should_have_column_data_p (actual_range.m_finish))
2093     {
2094       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2095       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2096     }
2097 }
2098
/* Wrapper around assert_char_at_range that reports any failure at the
   point of use (SELFTEST_LOCATION).  */

#define ASSERT_CHAR_AT_RANGE(TEST, STRLOC, TYPE, IDX, LINE,             \
                             START_COL, FINISH_COL)                     \
  assert_char_at_range (SELFTEST_LOCATION, (TEST), (STRLOC), (TYPE),    \
                        (IDX), (LINE), (START_COL), (FINISH_COL))
2107
2108 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2109    using the string concatenation database for TEST.
2110
2111    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2112
2113 static void
2114 assert_num_substring_ranges (const location &loc,
2115                              lexer_test& test,
2116                              location_t strloc,
2117                              enum cpp_ttype type,
2118                              int expected_num_ranges)
2119 {
2120   cpp_reader *pfile = test.m_parser;
2121   string_concat_db *concats = &test.m_concats;
2122
2123   int actual_num_ranges = -1;
2124   const char *err
2125     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2126                                            &actual_num_ranges);
2127   if (should_have_column_data_p (strloc))
2128     ASSERT_EQ_AT (loc, NULL, err);
2129   else
2130     {
2131       ASSERT_STREQ_AT (loc,
2132                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2133                        err);
2134       return;
2135     }
2136   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2137 }
2138
/* Wrapper around assert_num_substring_ranges that reports any failure
   at the point of use (SELFTEST_LOCATION).  */

#define ASSERT_NUM_SUBSTRING_RANGES(TEST, STRLOC, TYPE, NUM_RANGES)     \
  assert_num_substring_ranges (SELFTEST_LOCATION, (TEST), (STRLOC),     \
                               (TYPE), (NUM_RANGES))
2146
2147
2148 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2149    returns an error (using the string concatenation database for TEST).  */
2150
2151 static void
2152 assert_has_no_substring_ranges (const location &loc,
2153                                 lexer_test& test,
2154                                 location_t strloc,
2155                                 enum cpp_ttype type,
2156                                 const char *expected_err)
2157 {
2158   cpp_reader *pfile = test.m_parser;
2159   string_concat_db *concats = &test.m_concats;
2160   cpp_substring_ranges ranges;
2161   const char *actual_err
2162     = get_substring_ranges_for_loc (pfile, concats, strloc,
2163                                     type, ranges);
2164   if (should_have_column_data_p (strloc))
2165     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2166   else
2167     ASSERT_STREQ_AT (loc,
2168                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2169                      actual_err);
2170 }
2171
/* Wrapper around assert_has_no_substring_ranges that reports any
   failure at the point of use (SELFTEST_LOCATION).  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(TEST, STRLOC, TYPE, EXPECTED_ERR) \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (TEST),           \
                                    (STRLOC), (TYPE), (EXPECTED_ERR))
2175
2176 /* Lex a simple string literal.  Verify the substring location data, before
2177    and after running cpp_interpret_string on it.  */
2178
2179 static void
2180 test_lexer_string_locations_simple (const line_table_case &case_)
2181 {
2182   /* Digits 0-9 (with 0 at column 10), the simple way.
2183      ....................000000000.11111111112.2222222223333333333
2184      ....................123456789.01234567890.1234567890123456789
2185      We add a trailing comment to ensure that we correctly locate
2186      the end of the string literal token.  */
2187   const char *content = "        \"0123456789\" /* not a string */\n";
2188   lexer_test test (case_, content, NULL);
2189
2190   /* Verify that we get the expected token back, with the correct
2191      location information.  */
2192   const cpp_token *tok = test.get_token ();
2193   ASSERT_EQ (tok->type, CPP_STRING);
2194   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2195   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2196
2197   /* At this point in lexing, the quote characters are treated as part of
2198      the string (they are stripped off by cpp_interpret_string).  */
2199
2200   ASSERT_EQ (tok->val.str.len, 12);
2201
2202   /* Verify that cpp_interpret_string works.  */
2203   cpp_string dst_string;
2204   const enum cpp_ttype type = CPP_STRING;
2205   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2206                                       &dst_string, type);
2207   ASSERT_TRUE (result);
2208   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2209   free (const_cast <unsigned char *> (dst_string.text));
2210
2211   /* Verify ranges of individual characters.  This no longer includes the
2212      opening quote, but does include the closing quote.  */
2213   for (int i = 0; i <= 10; i++)
2214     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2215                           10 + i, 10 + i);
2216
2217   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2218 }
2219
2220 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2221    encoding.  */
2222
2223 static void
2224 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2225 {
2226   /* EBCDIC support requires iconv.  */
2227   if (!HAVE_ICONV)
2228     return;
2229
2230   /* Digits 0-9 (with 0 at column 10), the simple way.
2231      ....................000000000.11111111112.2222222223333333333
2232      ....................123456789.01234567890.1234567890123456789
2233      We add a trailing comment to ensure that we correctly locate
2234      the end of the string literal token.  */
2235   const char *content = "        \"0123456789\" /* not a string */\n";
2236   ebcdic_execution_charset use_ebcdic;
2237   lexer_test test (case_, content, &use_ebcdic);
2238
2239   /* Verify that we get the expected token back, with the correct
2240      location information.  */
2241   const cpp_token *tok = test.get_token ();
2242   ASSERT_EQ (tok->type, CPP_STRING);
2243   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2244   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2245
2246   /* At this point in lexing, the quote characters are treated as part of
2247      the string (they are stripped off by cpp_interpret_string).  */
2248
2249   ASSERT_EQ (tok->val.str.len, 12);
2250
2251   /* The remainder of the test requires an iconv implementation that
2252      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2253   if (use_ebcdic.iconv_errors_occurred_p ())
2254     return;
2255
2256   /* Verify that cpp_interpret_string works.  */
2257   cpp_string dst_string;
2258   const enum cpp_ttype type = CPP_STRING;
2259   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2260                                       &dst_string, type);
2261   ASSERT_TRUE (result);
2262   /* We should now have EBCDIC-encoded text, specifically
2263      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2264      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2265   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2266                 (const char *)dst_string.text);
2267   free (const_cast <unsigned char *> (dst_string.text));
2268
2269   /* Verify that we don't attempt to record substring location information
2270      for such cases.  */
2271   ASSERT_HAS_NO_SUBSTRING_RANGES
2272     (test, tok->src_loc, type,
2273      "execution character set != source character set");
2274 }
2275
2276 /* Lex a string literal containing a hex-escaped character.
2277    Verify the substring location data, before and after running
2278    cpp_interpret_string on it.  */
2279
2280 static void
2281 test_lexer_string_locations_hex (const line_table_case &case_)
2282 {
2283   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2284      and with a space in place of digit 6, to terminate the escaped
2285      hex code.
2286      ....................000000000.111111.11112222.
2287      ....................123456789.012345.67890123.  */
2288   const char *content = "        \"01234\\x35 789\"\n";
2289   lexer_test test (case_, content, NULL);
2290
2291   /* Verify that we get the expected token back, with the correct
2292      location information.  */
2293   const cpp_token *tok = test.get_token ();
2294   ASSERT_EQ (tok->type, CPP_STRING);
2295   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2296   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2297
2298   /* At this point in lexing, the quote characters are treated as part of
2299      the string (they are stripped off by cpp_interpret_string).  */
2300   ASSERT_EQ (tok->val.str.len, 15);
2301
2302   /* Verify that cpp_interpret_string works.  */
2303   cpp_string dst_string;
2304   const enum cpp_ttype type = CPP_STRING;
2305   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2306                                       &dst_string, type);
2307   ASSERT_TRUE (result);
2308   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2309   free (const_cast <unsigned char *> (dst_string.text));
2310
2311   /* Verify ranges of individual characters.  This no longer includes the
2312      opening quote, but does include the closing quote.  */
2313   for (int i = 0; i <= 4; i++)
2314     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2315   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2316   for (int i = 6; i <= 10; i++)
2317     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2318
2319   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2320 }
2321
2322 /* Lex a string literal containing an octal-escaped character.
2323    Verify the substring location data after running cpp_interpret_string
2324    on it.  */
2325
2326 static void
2327 test_lexer_string_locations_oct (const line_table_case &case_)
2328 {
2329   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2330      and with a space in place of digit 6, to terminate the escaped
2331      octal code.
2332      ....................000000000.111111.11112222.2222223333333333444
2333      ....................123456789.012345.67890123.4567890123456789012  */
2334   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2335   lexer_test test (case_, content, NULL);
2336
2337   /* Verify that we get the expected token back, with the correct
2338      location information.  */
2339   const cpp_token *tok = test.get_token ();
2340   ASSERT_EQ (tok->type, CPP_STRING);
2341   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2342
2343   /* Verify that cpp_interpret_string works.  */
2344   cpp_string dst_string;
2345   const enum cpp_ttype type = CPP_STRING;
2346   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2347                                       &dst_string, type);
2348   ASSERT_TRUE (result);
2349   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2350   free (const_cast <unsigned char *> (dst_string.text));
2351
2352   /* Verify ranges of individual characters.  This no longer includes the
2353      opening quote, but does include the closing quote.  */
2354   for (int i = 0; i < 5; i++)
2355     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2356   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2357   for (int i = 6; i <= 10; i++)
2358     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2359
2360   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2361 }
2362
2363 /* Test of string literal containing letter escapes.  */
2364
2365 static void
2366 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2367 {
2368   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2369      .....................000000000.1.11111.1.1.11222.22222223333333
2370      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2371   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2372   lexer_test test (case_, content, NULL);
2373
2374   /* Verify that we get the expected tokens back.  */
2375   const cpp_token *tok = test.get_token ();
2376   ASSERT_EQ (tok->type, CPP_STRING);
2377   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2378
2379   /* Verify ranges of individual characters. */
2380   /* "\t".  */
2381   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2382                         0, 1, 10, 11);
2383   /* "foo". */
2384   for (int i = 1; i <= 3; i++)
2385     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2386                           i, 1, 11 + i, 11 + i);
2387   /* "\\" and "\n".  */
2388   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2389                         4, 1, 15, 16);
2390   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2391                         5, 1, 17, 18);
2392
2393   /* "bar" and closing quote for nul-terminator.  */
2394   for (int i = 6; i <= 9; i++)
2395     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2396                           i, 1, 13 + i, 13 + i);
2397
2398   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2399 }
2400
2401 /* Another test of a string literal containing a letter escape.
2402    Based on string seen in
2403      printf ("%-%\n");
2404    in gcc.dg/format/c90-printf-1.c.  */
2405
2406 static void
2407 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2408 {
2409   /* .....................000000000.1111.11.1111.22222222223.
2410      .....................123456789.0123.45.6789.01234567890.  */
2411   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2412   lexer_test test (case_, content, NULL);
2413
2414   /* Verify that we get the expected tokens back.  */
2415   const cpp_token *tok = test.get_token ();
2416   ASSERT_EQ (tok->type, CPP_STRING);
2417   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2418
2419   /* Verify ranges of individual characters. */
2420   /* "%-%".  */
2421   for (int i = 0; i < 3; i++)
2422     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2423                           i, 1, 10 + i, 10 + i);
2424   /* "\n".  */
2425   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2426                         3, 1, 13, 14);
2427
2428   /* Closing quote for nul-terminator.  */
2429   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2430                         4, 1, 15, 15);
2431
2432   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2433 }
2434
2435 /* Lex a string literal containing UCN 4 characters.
2436    Verify the substring location data after running cpp_interpret_string
2437    on it.  */
2438
2439 static void
2440 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2441 {
2442   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2443      as UCN 4.
2444      ....................000000000.111111.111122.222222223.33333333344444
2445      ....................123456789.012345.678901.234567890.12345678901234  */
2446   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2447   lexer_test test (case_, content, NULL);
2448
2449   /* Verify that we get the expected token back, with the correct
2450      location information.  */
2451   const cpp_token *tok = test.get_token ();
2452   ASSERT_EQ (tok->type, CPP_STRING);
2453   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2454
2455   /* Verify that cpp_interpret_string works.
2456      The string should be encoded in the execution character
2457      set.  Assuming that that is UTF-8, we should have the following:
2458      -----------  ----  -----  -------  ----------------
2459      Byte offset  Byte  Octal  Unicode  Source Column(s)
2460      -----------  ----  -----  -------  ----------------
2461      0            0x30         '0'      10
2462      1            0x31         '1'      11
2463      2            0x32         '2'      12
2464      3            0x33         '3'      13
2465      4            0x34         '4'      14
2466      5            0xE2  \342   U+2174   15-20
2467      6            0x85  \205    (cont)  15-20
2468      7            0xB4  \264    (cont)  15-20
2469      8            0xE2  \342   U+2175   21-26
2470      9            0x85  \205    (cont)  21-26
2471      10           0xB5  \265    (cont)  21-26
2472      11           0x37         '7'      27
2473      12           0x38         '8'      28
2474      13           0x39         '9'      29
2475      14           0x00                  30 (closing quote)
2476      -----------  ----  -----  -------  ---------------.  */
2477
2478   cpp_string dst_string;
2479   const enum cpp_ttype type = CPP_STRING;
2480   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2481                                       &dst_string, type);
2482   ASSERT_TRUE (result);
2483   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2484                 (const char *)dst_string.text);
2485   free (const_cast <unsigned char *> (dst_string.text));
2486
2487   /* Verify ranges of individual characters.  This no longer includes the
2488      opening quote, but does include the closing quote.
2489      '01234'.  */
2490   for (int i = 0; i <= 4; i++)
2491     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2492   /* U+2174.  */
2493   for (int i = 5; i <= 7; i++)
2494     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2495   /* U+2175.  */
2496   for (int i = 8; i <= 10; i++)
2497     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2498   /* '789' and nul terminator  */
2499   for (int i = 11; i <= 14; i++)
2500     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2501
2502   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2503 }
2504
2505 /* Lex a string literal containing UCN 8 characters.
2506    Verify the substring location data after running cpp_interpret_string
2507    on it.  */
2508
2509 static void
2510 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2511 {
2512   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2513      ....................000000000.111111.1111222222.2222333333333.344444
2514      ....................123456789.012345.6789012345.6789012345678.901234  */
2515   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2516   lexer_test test (case_, content, NULL);
2517
2518   /* Verify that we get the expected token back, with the correct
2519      location information.  */
2520   const cpp_token *tok = test.get_token ();
2521   ASSERT_EQ (tok->type, CPP_STRING);
2522   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2523                            "\"01234\\U00002174\\U00002175789\"");
2524
2525   /* Verify that cpp_interpret_string works.
2526      The UTF-8 encoding of the string is identical to that from
2527      the ucn4 testcase above; the only difference is the column
2528      locations.  */
2529   cpp_string dst_string;
2530   const enum cpp_ttype type = CPP_STRING;
2531   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2532                                       &dst_string, type);
2533   ASSERT_TRUE (result);
2534   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2535                 (const char *)dst_string.text);
2536   free (const_cast <unsigned char *> (dst_string.text));
2537
2538   /* Verify ranges of individual characters.  This no longer includes the
2539      opening quote, but does include the closing quote.
2540      '01234'.  */
2541   for (int i = 0; i <= 4; i++)
2542     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2543   /* U+2174.  */
2544   for (int i = 5; i <= 7; i++)
2545     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2546   /* U+2175.  */
2547   for (int i = 8; i <= 10; i++)
2548     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2549   /* '789' at columns 35-37  */
2550   for (int i = 11; i <= 13; i++)
2551     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2552   /* Closing quote/nul-terminator at column 38.  */
2553   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2554
2555   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2556 }
2557
/* Read the four bytes at PTR_BE_VALUE as a big-endian (most significant
   byte first) 32-bit quantity and return it in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  /* Go through unsigned char so the result is independent of the host's
     own byte order.  */
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  uint32_t value = 0;
  for (int i = 0; i < 4; i++)
    value = (value << 8) | (uint32_t) bytes[i];
  return value;
}
2569
2570 /* Lex a wide string literal and verify that attempts to read substring
2571    location data from it fail gracefully.  */
2572
2573 static void
2574 test_lexer_string_locations_wide_string (const line_table_case &case_)
2575 {
2576   /* Digits 0-9.
2577      ....................000000000.11111111112.22222222233333
2578      ....................123456789.01234567890.12345678901234  */
2579   const char *content = "       L\"0123456789\" /* non-str */\n";
2580   lexer_test test (case_, content, NULL);
2581
2582   /* Verify that we get the expected token back, with the correct
2583      location information.  */
2584   const cpp_token *tok = test.get_token ();
2585   ASSERT_EQ (tok->type, CPP_WSTRING);
2586   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2587
2588   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2589   cpp_string dst_string;
2590   const enum cpp_ttype type = CPP_WSTRING;
2591   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2592                                       &dst_string, type);
2593   ASSERT_TRUE (result);
2594   /* The cpp_reader defaults to big-endian with
2595      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2596      now be encoded as UTF-32BE.  */
2597   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2598   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2599   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2600   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2601   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2602   free (const_cast <unsigned char *> (dst_string.text));
2603
2604   /* We don't yet support generating substring location information
2605      for L"" strings.  */
2606   ASSERT_HAS_NO_SUBSTRING_RANGES
2607     (test, tok->src_loc, type,
2608      "execution character set != source character set");
2609 }
2610
/* Read the two bytes at PTR_BE_VALUE as a big-endian (most significant
   byte first) 16-bit quantity and return it in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  /* Go through unsigned char so the result is independent of the host's
     own byte order.  */
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return (uint16_t) ((high << 8) | low);
}
2619
2620 /* Lex a u"" string literal and verify that attempts to read substring
2621    location data from it fail gracefully.  */
2622
2623 static void
2624 test_lexer_string_locations_string16 (const line_table_case &case_)
2625 {
2626   /* Digits 0-9.
2627      ....................000000000.11111111112.22222222233333
2628      ....................123456789.01234567890.12345678901234  */
2629   const char *content = "       u\"0123456789\" /* non-str */\n";
2630   lexer_test test (case_, content, NULL);
2631
2632   /* Verify that we get the expected token back, with the correct
2633      location information.  */
2634   const cpp_token *tok = test.get_token ();
2635   ASSERT_EQ (tok->type, CPP_STRING16);
2636   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2637
2638   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2639   cpp_string dst_string;
2640   const enum cpp_ttype type = CPP_STRING16;
2641   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2642                                       &dst_string, type);
2643   ASSERT_TRUE (result);
2644
2645   /* The cpp_reader defaults to big-endian, so dst_string should
2646      now be encoded as UTF-16BE.  */
2647   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2648   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2649   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2650   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2651   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2652   free (const_cast <unsigned char *> (dst_string.text));
2653
2654   /* We don't yet support generating substring location information
2655      for L"" strings.  */
2656   ASSERT_HAS_NO_SUBSTRING_RANGES
2657     (test, tok->src_loc, type,
2658      "execution character set != source character set");
2659 }
2660
2661 /* Lex a U"" string literal and verify that attempts to read substring
2662    location data from it fail gracefully.  */
2663
2664 static void
2665 test_lexer_string_locations_string32 (const line_table_case &case_)
2666 {
2667   /* Digits 0-9.
2668      ....................000000000.11111111112.22222222233333
2669      ....................123456789.01234567890.12345678901234  */
2670   const char *content = "       U\"0123456789\" /* non-str */\n";
2671   lexer_test test (case_, content, NULL);
2672
2673   /* Verify that we get the expected token back, with the correct
2674      location information.  */
2675   const cpp_token *tok = test.get_token ();
2676   ASSERT_EQ (tok->type, CPP_STRING32);
2677   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2678
2679   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2680   cpp_string dst_string;
2681   const enum cpp_ttype type = CPP_STRING32;
2682   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2683                                       &dst_string, type);
2684   ASSERT_TRUE (result);
2685
2686   /* The cpp_reader defaults to big-endian, so dst_string should
2687      now be encoded as UTF-32BE.  */
2688   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2689   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2690   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2691   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2692   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2693   free (const_cast <unsigned char *> (dst_string.text));
2694
2695   /* We don't yet support generating substring location information
2696      for L"" strings.  */
2697   ASSERT_HAS_NO_SUBSTRING_RANGES
2698     (test, tok->src_loc, type,
2699      "execution character set != source character set");
2700 }
2701
2702 /* Lex a u8-string literal.
2703    Verify the substring location data after running cpp_interpret_string
2704    on it.  */
2705
2706 static void
2707 test_lexer_string_locations_u8 (const line_table_case &case_)
2708 {
2709   /* Digits 0-9.
2710      ....................000000000.11111111112.22222222233333
2711      ....................123456789.01234567890.12345678901234  */
2712   const char *content = "      u8\"0123456789\" /* non-str */\n";
2713   lexer_test test (case_, content, NULL);
2714
2715   /* Verify that we get the expected token back, with the correct
2716      location information.  */
2717   const cpp_token *tok = test.get_token ();
2718   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2719   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2720
2721   /* Verify that cpp_interpret_string works.  */
2722   cpp_string dst_string;
2723   const enum cpp_ttype type = CPP_STRING;
2724   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2725                                       &dst_string, type);
2726   ASSERT_TRUE (result);
2727   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2728   free (const_cast <unsigned char *> (dst_string.text));
2729
2730   /* Verify ranges of individual characters.  This no longer includes the
2731      opening quote, but does include the closing quote.  */
2732   for (int i = 0; i <= 10; i++)
2733     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2734 }
2735
2736 /* Lex a string literal containing UTF-8 source characters.
2737    Verify the substring location data after running cpp_interpret_string
2738    on it.  */
2739
2740 static void
2741 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2742 {
2743  /* This string literal is written out to the source file as UTF-8,
2744     and is of the form "before mojibake after", where "mojibake"
2745     is written as the following four unicode code points:
2746        U+6587 CJK UNIFIED IDEOGRAPH-6587
2747        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2748        U+5316 CJK UNIFIED IDEOGRAPH-5316
2749        U+3051 HIRAGANA LETTER KE.
2750      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2751      "before" and "after" are 1 byte per unicode character.
2752
2753      The numbering shown are "columns", which are *byte* numbers within
2754      the line, rather than unicode character numbers.
2755
2756      .................... 000000000.1111111.
2757      .................... 123456789.0123456.  */
2758   const char *content = ("        \"before "
2759                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2760                               UTF-8: 0xE6 0x96 0x87
2761                               C octal escaped UTF-8: \346\226\207
2762                             "column" numbers: 17-19.  */
2763                          "\346\226\207"
2764
2765                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2766                               UTF-8: 0xE5 0xAD 0x97
2767                               C octal escaped UTF-8: \345\255\227
2768                             "column" numbers: 20-22.  */
2769                          "\345\255\227"
2770
2771                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2772                               UTF-8: 0xE5 0x8C 0x96
2773                               C octal escaped UTF-8: \345\214\226
2774                             "column" numbers: 23-25.  */
2775                          "\345\214\226"
2776
2777                          /* U+3051 HIRAGANA LETTER KE
2778                               UTF-8: 0xE3 0x81 0x91
2779                               C octal escaped UTF-8: \343\201\221
2780                             "column" numbers: 26-28.  */
2781                          "\343\201\221"
2782
2783                          /* column numbers 29 onwards
2784                           2333333.33334444444444
2785                           9012345.67890123456789. */
2786                          " after\" /* non-str */\n");
2787   lexer_test test (case_, content, NULL);
2788
2789   /* Verify that we get the expected token back, with the correct
2790      location information.  */
2791   const cpp_token *tok = test.get_token ();
2792   ASSERT_EQ (tok->type, CPP_STRING);
2793   ASSERT_TOKEN_AS_TEXT_EQ
2794     (test.m_parser, tok,
2795      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2796
2797   /* Verify that cpp_interpret_string works.  */
2798   cpp_string dst_string;
2799   const enum cpp_ttype type = CPP_STRING;
2800   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2801                                       &dst_string, type);
2802   ASSERT_TRUE (result);
2803   ASSERT_STREQ
2804     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2805      (const char *)dst_string.text);
2806   free (const_cast <unsigned char *> (dst_string.text));
2807
2808   /* Verify ranges of individual characters.  This no longer includes the
2809      opening quote, but does include the closing quote.
2810      Assuming that both source and execution encodings are UTF-8, we have
2811      a run of 25 octets in each, plus the NUL terminator.  */
2812   for (int i = 0; i < 25; i++)
2813     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2814   /* NUL-terminator should use the closing quote at column 35.  */
2815   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2816
2817   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2818 }
2819
2820 /* Test of string literal concatenation.  */
2821
2822 static void
2823 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2824 {
2825   /* Digits 0-9.
2826      .....................000000000.111111.11112222222222
2827      .....................123456789.012345.67890123456789.  */
2828   const char *content = ("        \"01234\" /* non-str */\n"
2829                          "        \"56789\" /* non-str */\n");
2830   lexer_test test (case_, content, NULL);
2831
2832   location_t input_locs[2];
2833
2834   /* Verify that we get the expected tokens back.  */
2835   auto_vec <cpp_string> input_strings;
2836   const cpp_token *tok_a = test.get_token ();
2837   ASSERT_EQ (tok_a->type, CPP_STRING);
2838   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2839   input_strings.safe_push (tok_a->val.str);
2840   input_locs[0] = tok_a->src_loc;
2841
2842   const cpp_token *tok_b = test.get_token ();
2843   ASSERT_EQ (tok_b->type, CPP_STRING);
2844   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2845   input_strings.safe_push (tok_b->val.str);
2846   input_locs[1] = tok_b->src_loc;
2847
2848   /* Verify that cpp_interpret_string works.  */
2849   cpp_string dst_string;
2850   const enum cpp_ttype type = CPP_STRING;
2851   bool result = cpp_interpret_string (test.m_parser,
2852                                       input_strings.address (), 2,
2853                                       &dst_string, type);
2854   ASSERT_TRUE (result);
2855   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2856   free (const_cast <unsigned char *> (dst_string.text));
2857
2858   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2859   test.m_concats.record_string_concatenation (2, input_locs);
2860
2861   location_t initial_loc = input_locs[0];
2862
2863   /* "01234" on line 1.  */
2864   for (int i = 0; i <= 4; i++)
2865     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
2866   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
2867   for (int i = 5; i <= 10; i++)
2868     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
2869
2870   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2871 }
2872
2873 /* Another test of string literal concatenation.  */
2874
2875 static void
2876 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
2877 {
2878   /* Digits 0-9.
2879      .....................000000000.111.11111112222222
2880      .....................123456789.012.34567890123456.  */
2881   const char *content = ("        \"01\" /* non-str */\n"
2882                          "        \"23\" /* non-str */\n"
2883                          "        \"45\" /* non-str */\n"
2884                          "        \"67\" /* non-str */\n"
2885                          "        \"89\" /* non-str */\n");
2886   lexer_test test (case_, content, NULL);
2887
2888   auto_vec <cpp_string> input_strings;
2889   location_t input_locs[5];
2890
2891   /* Verify that we get the expected tokens back.  */
2892   for (int i = 0; i < 5; i++)
2893     {
2894       const cpp_token *tok = test.get_token ();
2895       ASSERT_EQ (tok->type, CPP_STRING);
2896       input_strings.safe_push (tok->val.str);
2897       input_locs[i] = tok->src_loc;
2898     }
2899
2900   /* Verify that cpp_interpret_string works.  */
2901   cpp_string dst_string;
2902   const enum cpp_ttype type = CPP_STRING;
2903   bool result = cpp_interpret_string (test.m_parser,
2904                                       input_strings.address (), 5,
2905                                       &dst_string, type);
2906   ASSERT_TRUE (result);
2907   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2908   free (const_cast <unsigned char *> (dst_string.text));
2909
2910   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2911   test.m_concats.record_string_concatenation (5, input_locs);
2912
2913   location_t initial_loc = input_locs[0];
2914
2915   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
2916      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
2917      and expect get_source_range_for_substring to fail.
2918      However, for a string concatenation test, we can have a case
2919      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
2920      but subsequent strings can be after it.
2921      Attempting to detect this within assert_char_at_range
2922      would overcomplicate the logic for the common test cases, so
2923      we detect it here.  */
2924   if (should_have_column_data_p (input_locs[0])
2925       && !should_have_column_data_p (input_locs[4]))
2926     {
2927       /* Verify that get_source_range_for_substring gracefully rejects
2928          this case.  */
2929       source_range actual_range;
2930       const char *err
2931         = get_source_range_for_char (test.m_parser, &test.m_concats,
2932                                      initial_loc, type, 0, &actual_range);
2933       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
2934       return;
2935     }
2936
2937   for (int i = 0; i < 5; i++)
2938     for (int j = 0; j < 2; j++)
2939       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
2940                             i + 1, 10 + j, 10 + j);
2941
2942   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
2943   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
2944
2945   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2946 }
2947
2948 /* Another test of string literal concatenation, this time combined with
2949    various kinds of escaped characters.  */
2950
2951 static void
2952 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
2953 {
2954   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
2955      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
2956   const char *content
2957     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
2958        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
2959     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
2960   lexer_test test (case_, content, NULL);
2961
2962   auto_vec <cpp_string> input_strings;
2963   location_t input_locs[4];
2964
2965   /* Verify that we get the expected tokens back.  */
2966   for (int i = 0; i < 4; i++)
2967     {
2968       const cpp_token *tok = test.get_token ();
2969       ASSERT_EQ (tok->type, CPP_STRING);
2970       input_strings.safe_push (tok->val.str);
2971       input_locs[i] = tok->src_loc;
2972     }
2973
2974   /* Verify that cpp_interpret_string works.  */
2975   cpp_string dst_string;
2976   const enum cpp_ttype type = CPP_STRING;
2977   bool result = cpp_interpret_string (test.m_parser,
2978                                       input_strings.address (), 4,
2979                                       &dst_string, type);
2980   ASSERT_TRUE (result);
2981   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2982   free (const_cast <unsigned char *> (dst_string.text));
2983
2984   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2985   test.m_concats.record_string_concatenation (4, input_locs);
2986
2987   location_t initial_loc = input_locs[0];
2988
2989   for (int i = 0; i <= 4; i++)
2990     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
2991   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
2992   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
2993   for (int i = 7; i <= 9; i++)
2994     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
2995
2996   /* NUL-terminator should use the location of the final closing quote.  */
2997   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
2998
2999   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3000 }
3001
3002 /* Test of string literal in a macro.  */
3003
3004 static void
3005 test_lexer_string_locations_macro (const line_table_case &case_)
3006 {
3007   /* Digits 0-9.
3008      .....................0000000001111111111.22222222223.
3009      .....................1234567890123456789.01234567890.  */
3010   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3011                          "  MACRO");
3012   lexer_test test (case_, content, NULL);
3013
3014   /* Verify that we get the expected tokens back.  */
3015   const cpp_token *tok = test.get_token ();
3016   ASSERT_EQ (tok->type, CPP_PADDING);
3017
3018   tok = test.get_token ();
3019   ASSERT_EQ (tok->type, CPP_STRING);
3020   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3021
3022   /* Verify ranges of individual characters.  We ought to
3023      see columns within the macro definition.  */
3024   for (int i = 0; i <= 10; i++)
3025     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3026                           i, 1, 20 + i, 20 + i);
3027
3028   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3029
3030   tok = test.get_token ();
3031   ASSERT_EQ (tok->type, CPP_PADDING);
3032 }
3033
3034 /* Test of stringification of a macro argument.  */
3035
3036 static void
3037 test_lexer_string_locations_stringified_macro_argument
3038   (const line_table_case &case_)
3039 {
3040   /* .....................000000000111111111122222222223.
3041      .....................123456789012345678901234567890.  */
3042   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3043                          "MACRO(foo)\n");
3044   lexer_test test (case_, content, NULL);
3045
3046   /* Verify that we get the expected token back.  */
3047   const cpp_token *tok = test.get_token ();
3048   ASSERT_EQ (tok->type, CPP_PADDING);
3049
3050   tok = test.get_token ();
3051   ASSERT_EQ (tok->type, CPP_STRING);
3052   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3053
3054   /* We don't support getting the location of a stringified macro
3055      argument.  Verify that it fails gracefully.  */
3056   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3057                                   "cpp_interpret_string_1 failed");
3058
3059   tok = test.get_token ();
3060   ASSERT_EQ (tok->type, CPP_PADDING);
3061
3062   tok = test.get_token ();
3063   ASSERT_EQ (tok->type, CPP_PADDING);
3064 }
3065
3066 /* Ensure that we are fail gracefully if something attempts to pass
3067    in a location that isn't a string literal token.  Seen on this code:
3068
3069      const char a[] = " %d ";
3070      __builtin_printf (a, 0.5);
3071                        ^
3072
3073    when c-format.c erroneously used the indicated one-character
3074    location as the format string location, leading to a read past the
3075    end of a string buffer in cpp_interpret_string_1.  */
3076
3077 static void
3078 test_lexer_string_locations_non_string (const line_table_case &case_)
3079 {
3080   /* .....................000000000111111111122222222223.
3081      .....................123456789012345678901234567890.  */
3082   const char *content = ("         a\n");
3083   lexer_test test (case_, content, NULL);
3084
3085   /* Verify that we get the expected token back.  */
3086   const cpp_token *tok = test.get_token ();
3087   ASSERT_EQ (tok->type, CPP_NAME);
3088   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3089
3090   /* At this point, libcpp is attempting to interpret the name as a
3091      string literal, despite it not starting with a quote.  We don't detect
3092      that, but we should at least fail gracefully.  */
3093   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3094                                   "cpp_interpret_string_1 failed");
3095 }
3096
3097 /* Ensure that we can read substring information for a token which
3098    starts in one linemap and ends in another .  Adapted from
3099    gcc.dg/cpp/pr69985.c.  */
3100
3101 static void
3102 test_lexer_string_locations_long_line (const line_table_case &case_)
3103 {
3104   /* .....................000000.000111111111
3105      .....................123456.789012346789.  */
3106   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3107                          "     \"0123456789012345678901234567890123456789"
3108                          "0123456789012345678901234567890123456789"
3109                          "0123456789012345678901234567890123456789"
3110                          "0123456789\"\n");
3111
3112   lexer_test test (case_, content, NULL);
3113
3114   /* Verify that we get the expected token back.  */
3115   const cpp_token *tok = test.get_token ();
3116   ASSERT_EQ (tok->type, CPP_STRING);
3117
3118   if (!should_have_column_data_p (line_table->highest_location))
3119     return;
3120
3121   /* Verify ranges of individual characters.  */
3122   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3123   for (int i = 0; i < 131; i++)
3124     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3125                           i, 2, 7 + i, 7 + i);
3126 }
3127
3128 /* Test of lexing char constants.  */
3129
3130 static void
3131 test_lexer_char_constants (const line_table_case &case_)
3132 {
3133   /* Various char constants.
3134      .....................0000000001111111111.22222222223.
3135      .....................1234567890123456789.01234567890.  */
3136   const char *content = ("         'a'\n"
3137                          "        u'a'\n"
3138                          "        U'a'\n"
3139                          "        L'a'\n"
3140                          "         'abc'\n");
3141   lexer_test test (case_, content, NULL);
3142
3143   /* Verify that we get the expected tokens back.  */
3144   /* 'a'.  */
3145   const cpp_token *tok = test.get_token ();
3146   ASSERT_EQ (tok->type, CPP_CHAR);
3147   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3148
3149   unsigned int chars_seen;
3150   int unsignedp;
3151   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3152                                           &chars_seen, &unsignedp);
3153   ASSERT_EQ (cc, 'a');
3154   ASSERT_EQ (chars_seen, 1);
3155
3156   /* u'a'.  */
3157   tok = test.get_token ();
3158   ASSERT_EQ (tok->type, CPP_CHAR16);
3159   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3160
3161   /* U'a'.  */
3162   tok = test.get_token ();
3163   ASSERT_EQ (tok->type, CPP_CHAR32);
3164   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3165
3166   /* L'a'.  */
3167   tok = test.get_token ();
3168   ASSERT_EQ (tok->type, CPP_WCHAR);
3169   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3170
3171   /* 'abc' (c-char-sequence).  */
3172   tok = test.get_token ();
3173   ASSERT_EQ (tok->type, CPP_CHAR);
3174   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3175 }
3176 /* A table of interesting location_t values, giving one axis of our test
3177    matrix.  */
3178
3179 static const location_t boundary_locations[] = {
3180   /* Zero means "don't override the default values for a new line_table".  */
3181   0,
3182
3183   /* An arbitrary non-zero value that isn't close to one of
3184      the boundary values below.  */
3185   0x10000,
3186
3187   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3188   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3189   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3190   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3191   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3192   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3193
3194   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3195   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3196   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3197   LINE_MAP_MAX_LOCATION_WITH_COLS,
3198   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3199   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3200 };
3201
3202 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3203
3204 void
3205 for_each_line_table_case (void (*testcase) (const line_table_case &))
3206 {
3207   /* As noted above in the description of struct line_table_case,
3208      we want to explore a test matrix of interesting line_table
3209      situations, running various selftests for each case within the
3210      matrix.  */
3211
3212   /* Run all tests with:
3213      (a) line_table->default_range_bits == 0, and
3214      (b) line_table->default_range_bits == 5.  */
3215   int num_cases_tested = 0;
3216   for (int default_range_bits = 0; default_range_bits <= 5;
3217        default_range_bits += 5)
3218     {
3219       /* ...and use each of the "interesting" location values as
3220          the starting location within line_table.  */
3221       const int num_boundary_locations
3222         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3223       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3224         {
3225           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3226
3227           testcase (c);
3228
3229           num_cases_tested++;
3230         }
3231     }
3232
3233   /* Verify that we fully covered the test matrix.  */
3234   ASSERT_EQ (num_cases_tested, 2 * 12);
3235 }
3236
3237 /* Run all of the selftests within this file.  */
3238
3239 void
3240 input_c_tests ()
3241 {
3242   test_should_have_column_data_p ();
3243   test_unknown_location ();
3244   test_builtins ();
3245   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3246
3247   for_each_line_table_case (test_accessing_ordinary_linemaps);
3248   for_each_line_table_case (test_lexer);
3249   for_each_line_table_case (test_lexer_string_locations_simple);
3250   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3251   for_each_line_table_case (test_lexer_string_locations_hex);
3252   for_each_line_table_case (test_lexer_string_locations_oct);
3253   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3254   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3255   for_each_line_table_case (test_lexer_string_locations_ucn4);
3256   for_each_line_table_case (test_lexer_string_locations_ucn8);
3257   for_each_line_table_case (test_lexer_string_locations_wide_string);
3258   for_each_line_table_case (test_lexer_string_locations_string16);
3259   for_each_line_table_case (test_lexer_string_locations_string32);
3260   for_each_line_table_case (test_lexer_string_locations_u8);
3261   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3262   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3263   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3264   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3265   for_each_line_table_case (test_lexer_string_locations_macro);
3266   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3267   for_each_line_table_case (test_lexer_string_locations_non_string);
3268   for_each_line_table_case (test_lexer_string_locations_long_line);
3269   for_each_line_table_case (test_lexer_char_constants);
3270
3271   test_reading_source_line ();
3272 }
3273
3274 } // namespace selftest
3275
3276 #endif /* CHECKING_P */