gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2021 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* Input charset configuration.  */
  33 static const char *default_charset_callback (const char *)
  34 {
  35   return nullptr;
  36 }
  37
  38 void
  39 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  40                                       bool should_skip_bom)
  41 {
  42   in_context.ccb = (ccb ? ccb : default_charset_callback);
  43   in_context.should_skip_bom = should_skip_bom;
  44 }
  45
  46 /* This is a cache used by get_next_line to store the content of a
  47    file to be searched for file lines.  */
  48 class file_cache_slot
  49 {
  50 public:
  51   file_cache_slot ();
  52   ~file_cache_slot ();
  53
  54   bool read_line_num (size_t line_num,
  55                       char ** line, ssize_t *line_len);
  56
  57   /* Accessors.  */
  58   const char *get_file_path () const { return m_file_path; }
  59   unsigned get_use_count () const { return m_use_count; }
  60   bool missing_trailing_newline_p () const
  61   {
  62     return m_missing_trailing_newline;
  63   }
  64
  65   void inc_use_count () { m_use_count++; }
  66
  67   bool create (const file_cache::input_context &in_context,
  68                const char *file_path, FILE *fp, unsigned highest_use_count);
  69   void evict ();
  70
  71  private:
  72   /* These are information used to store a line boundary.  */
  73   class line_info
  74   {
  75   public:
  76     /* The line number.  It starts from 1.  */
  77     size_t line_num;
  78
  79     /* The position (byte count) of the beginning of the line,
  80        relative to the file data pointer.  This starts at zero.  */
  81     size_t start_pos;
  82
  83     /* The position (byte count) of the last byte of the line.  This
  84        normally points to the '\n' character, or to one byte after the
  85        last byte of the file, if the file doesn't contain a '\n'
  86        character.  */
  87     size_t end_pos;
  88
  89     line_info (size_t l, size_t s, size_t e)
  90       : line_num (l), start_pos (s), end_pos (e)
  91     {}
  92
  93     line_info ()
  94       :line_num (0), start_pos (0), end_pos (0)
  95     {}
  96   };
  97
  98   bool needs_read_p () const;
  99   bool needs_grow_p () const;
 100   void maybe_grow ();
 101   bool read_data ();
 102   bool maybe_read_data ();
 103   bool get_next_line (char **line, ssize_t *line_len);
 104   bool read_next_line (char ** line, ssize_t *line_len);
 105   bool goto_next_line ();
 106
 107   static const size_t buffer_size = 4 * 1024;
 108   static const size_t line_record_size = 100;
 109
 110   /* The number of time this file has been accessed.  This is used
 111      to designate which file cache to evict from the cache
 112      array.  */
 113   unsigned m_use_count;
 114
 115   /* The file_path is the key for identifying a particular file in
 116      the cache.
 117      For libcpp-using code, the underlying buffer for this field is
 118      owned by the corresponding _cpp_file within the cpp_reader.  */
 119   const char *m_file_path;
 120
 121   FILE *m_fp;
 122
 123   /* This points to the content of the file that we've read so
 124      far.  */
 125   char *m_data;
 126
 127   /* The allocated buffer to be freed may start a little earlier than DATA,
 128      e.g. if a UTF8 BOM was skipped at the beginning.  */
 129   int m_alloc_offset;
 130
 131   /*  The size of the DATA array above.*/
 132   size_t m_size;
 133
 134   /* The number of bytes read from the underlying file so far.  This
 135      must be less (or equal) than SIZE above.  */
 136   size_t m_nb_read;
 137
 138   /* The index of the beginning of the current line.  */
 139   size_t m_line_start_idx;
 140
 141   /* The number of the previous line read.  This starts at 1.  Zero
 142      means we've read no line so far.  */
 143   size_t m_line_num;
 144
 145   /* This is the total number of lines of the current file.  At the
 146      moment, we try to get this information from the line map
 147      subsystem.  Note that this is just a hint.  When using the C++
 148      front-end, this hint is correct because the input file is then
 149      completely tokenized before parsing starts; so the line map knows
 150      the number of lines before compilation really starts.  For e.g,
 151      the C front-end, it can happen that we start emitting diagnostics
 152      before the line map has seen the end of the file.  */
 153   size_t m_total_lines;
 154
 155   /* Could this file be missing a trailing newline on its final line?
 156      Initially true (to cope with empty files), set to true/false
 157      as each line is read.  */
 158   bool m_missing_trailing_newline;
 159
 160   /* This is a record of the beginning and end of the lines we've seen
 161      while reading the file.  This is useful to avoid walking the data
 162      from the beginning when we are asked to read a line that is
 163      before LINE_START_IDX above.  Note that the maximum size of this
 164      record is line_record_size, so that the memory consumption
 165      doesn't explode.  We thus scale total_lines down to
 166      line_record_size.  */
 167   vec<line_info, va_heap> m_line_record;
 168
 169   void offset_buffer (int offset)
 170   {
 171     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
 172                 : (size_t) offset <= m_size);
 173     gcc_assert (m_data);
 174     m_alloc_offset += offset;
 175     m_data += offset;
 176     m_size -= offset;
 177   }
 178
 179 };
 180
/* Current position in real source file.  */

location_t input_location = UNKNOWN_LOCATION;

/* The line map set that the routines in this file hand to the
   linemap_* functions when resolving and expanding locations.  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;
 193
 194 /* Expand the source location LOC into a human readable location.  If
 195    LOC resolves to a builtin location, the file name of the readable
 196    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 197    TRUE and LOC is virtual, then it is resolved to the expansion
 198    point of the involved macro.  Otherwise, it is resolved to the
 199    spelling location of the token.
 200
 201    When resolving to the spelling location of the token, if the
 202    resulting location is for a built-in location (that is, it has no
 203    associated line/column) in the context of a macro expansion, the
 204    returned location is the first one (while unwinding the macro
 205    location towards its expansion point) that is in real source
 206    code.
 207
 208    ASPECT controls which part of the location to use.  */
 209
 210 static expanded_location
 211 expand_location_1 (location_t loc,
 212                    bool expansion_point_p,
 213                    enum location_aspect aspect)
 214 {
 215   expanded_location xloc;
 216   const line_map_ordinary *map;
 217   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 218   tree block = NULL;
 219
 220   if (IS_ADHOC_LOC (loc))
 221     {
 222       block = LOCATION_BLOCK (loc);
 223       loc = LOCATION_LOCUS (loc);
 224     }
 225
 226   memset (&xloc, 0, sizeof (xloc));
 227
 228   if (loc >= RESERVED_LOCATION_COUNT)
 229     {
 230       if (!expansion_point_p)
 231         {
 232           /* We want to resolve LOC to its spelling location.
 233
 234              But if that spelling location is a reserved location that
 235              appears in the context of a macro expansion (like for a
 236              location for a built-in token), let's consider the first
 237              location (toward the expansion point) that is not reserved;
 238              that is, the first location that is in real source code.  */
 239           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 240                                                           loc, NULL);
 241           lrk = LRK_SPELLING_LOCATION;
 242         }
 243       loc = linemap_resolve_location (line_table, loc, lrk, &map);
 244
 245       /* loc is now either in an ordinary map, or is a reserved location.
 246          If it is a compound location, the caret is in a spelling location,
 247          but the start/finish might still be a virtual location.
 248          Depending of what the caller asked for, we may need to recurse
 249          one level in order to resolve any virtual locations in the
 250          end-points.  */
 251       switch (aspect)
 252         {
 253         default:
 254           gcc_unreachable ();
 255           /* Fall through.  */
 256         case LOCATION_ASPECT_CARET:
 257           break;
 258         case LOCATION_ASPECT_START:
 259           {
 260             location_t start = get_start (loc);
 261             if (start != loc)
 262               return expand_location_1 (start, expansion_point_p, aspect);
 263           }
 264           break;
 265         case LOCATION_ASPECT_FINISH:
 266           {
 267             location_t finish = get_finish (loc);
 268             if (finish != loc)
 269               return expand_location_1 (finish, expansion_point_p, aspect);
 270           }
 271           break;
 272         }
 273       xloc = linemap_expand_location (line_table, map, loc);
 274     }
 275
 276   xloc.data = block;
 277   if (loc <= BUILTINS_LOCATION)
 278     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 279
 280   return xloc;
 281 }
 282
 283 /* Initialize the set of cache used for files accessed by caret
 284    diagnostic.  */
 285
 286 static void
 287 diagnostic_file_cache_init (void)
 288 {
 289   gcc_assert (global_dc);
 290   if (global_dc->m_file_cache == NULL)
 291     global_dc->m_file_cache = new file_cache ();
 292 }
 293
 294 /* Free the resources used by the set of cache used for files accessed
 295    by caret diagnostic.  */
 296
 297 void
 298 diagnostic_file_cache_fini (void)
 299 {
 300   if (global_dc->m_file_cache)
 301     {
 302       delete global_dc->m_file_cache;
 303       global_dc->m_file_cache = NULL;
 304     }
 305 }
 306
 307 /* Return the total lines number that have been read so far by the
 308    line map (in the preprocessor) so far.  For languages like C++ that
 309    entirely preprocess the input file before starting to parse, this
 310    equals the actual number of lines of the file.  */
 311
 312 static size_t
 313 total_lines_num (const char *file_path)
 314 {
 315   size_t r = 0;
 316   location_t l = 0;
 317   if (linemap_get_file_highest_location (line_table, file_path, &l))
 318     {
 319       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 320       expanded_location xloc = expand_location (l);
 321       r = xloc.line;
 322     }
 323   return r;
 324 }
 325
 326 /* Lookup the cache used for the content of a given file accessed by
 327    caret diagnostic.  Return the found cached file, or NULL if no
 328    cached file was found.  */
 329
 330 file_cache_slot *
 331 file_cache::lookup_file (const char *file_path)
 332 {
 333   gcc_assert (file_path);
 334
 335   /* This will contain the found cached file.  */
 336   file_cache_slot *r = NULL;
 337   for (unsigned i = 0; i < num_file_slots; ++i)
 338     {
 339       file_cache_slot *c = &m_file_slots[i];
 340       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 341         {
 342           c->inc_use_count ();
 343           r = c;
 344         }
 345     }
 346
 347   if (r)
 348     r->inc_use_count ();
 349
 350   return r;
 351 }
 352
 353 /* Purge any mention of FILENAME from the cache of files used for
 354    printing source code.  For use in selftests when working
 355    with tempfiles.  */
 356
 357 void
 358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 359 {
 360   gcc_assert (file_path);
 361
 362   if (!global_dc->m_file_cache)
 363     return;
 364
 365   global_dc->m_file_cache->forcibly_evict_file (file_path);
 366 }
 367
 368 void
 369 file_cache::forcibly_evict_file (const char *file_path)
 370 {
 371   gcc_assert (file_path);
 372
 373   file_cache_slot *r = lookup_file (file_path);
 374   if (!r)
 375     /* Not found.  */
 376     return;
 377
 378   r->evict ();
 379 }
 380
 381 void
 382 file_cache_slot::evict ()
 383 {
 384   m_file_path = NULL;
 385   if (m_fp)
 386     fclose (m_fp);
 387   m_fp = NULL;
 388   m_nb_read = 0;
 389   m_line_start_idx = 0;
 390   m_line_num = 0;
 391   m_line_record.truncate (0);
 392   m_use_count = 0;
 393   m_total_lines = 0;
 394   m_missing_trailing_newline = true;
 395 }
 396
 397 /* Return the file cache that has been less used, recently, or the
 398    first empty one.  If HIGHEST_USE_COUNT is non-null,
 399    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 400    in the cache table.  */
 401
 402 file_cache_slot*
 403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 404 {
 405   diagnostic_file_cache_init ();
 406
 407   file_cache_slot *to_evict = &m_file_slots[0];
 408   unsigned huc = to_evict->get_use_count ();
 409   for (unsigned i = 1; i < num_file_slots; ++i)
 410     {
 411       file_cache_slot *c = &m_file_slots[i];
 412       bool c_is_empty = (c->get_file_path () == NULL);
 413
 414       if (c->get_use_count () < to_evict->get_use_count ()
 415           || (to_evict->get_file_path () && c_is_empty))
 416         /* We evict C because it's either an entry with a lower use
 417            count or one that is empty.  */
 418         to_evict = c;
 419
 420       if (huc < c->get_use_count ())
 421         huc = c->get_use_count ();
 422
 423       if (c_is_empty)
 424         /* We've reached the end of the cache; subsequent elements are
 425            all empty.  */
 426         break;
 427     }
 428
 429   if (highest_use_count)
 430     *highest_use_count = huc;
 431
 432   return to_evict;
 433 }
 434
 435 /* Create the cache used for the content of a given file to be
 436    accessed by caret diagnostic.  This cache is added to an array of
 437    cache and can be retrieved by lookup_file_in_cache_tab.  This
 438    function returns the created cache.  Note that only the last
 439    num_file_slots files are cached.  */
 440
 441 file_cache_slot*
 442 file_cache::add_file (const char *file_path)
 443 {
 444
 445   FILE *fp = fopen (file_path, "r");
 446   if (fp == NULL)
 447     return NULL;
 448
 449   unsigned highest_use_count = 0;
 450   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 451   if (!r->create (in_context, file_path, fp, highest_use_count))
 452     return NULL;
 453   return r;
 454 }
 455
 456 /* Populate this slot for use on FILE_PATH and FP, dropping any
 457    existing cached content within it.  */
 458
 459 bool
 460 file_cache_slot::create (const file_cache::input_context &in_context,
 461                          const char *file_path, FILE *fp,
 462                          unsigned highest_use_count)
 463 {
 464   m_file_path = file_path;
 465   if (m_fp)
 466     fclose (m_fp);
 467   m_fp = fp;
 468   if (m_alloc_offset)
 469     offset_buffer (-m_alloc_offset);
 470   m_nb_read = 0;
 471   m_line_start_idx = 0;
 472   m_line_num = 0;
 473   m_line_record.truncate (0);
 474   /* Ensure that this cache entry doesn't get evicted next time
 475      add_file_to_cache_tab is called.  */
 476   m_use_count = ++highest_use_count;
 477   m_total_lines = total_lines_num (file_path);
 478   m_missing_trailing_newline = true;
 479
 480
 481   /* Check the input configuration to determine if we need to do any
 482      transformations, such as charset conversion or BOM skipping.  */
 483   if (const char *input_charset = in_context.ccb (file_path))
 484     {
 485       /* Need a full-blown conversion of the input charset.  */
 486       fclose (m_fp);
 487       m_fp = NULL;
 488       const cpp_converted_source cs
 489         = cpp_get_converted_source (file_path, input_charset);
 490       if (!cs.data)
 491         return false;
 492       if (m_data)
 493         XDELETEVEC (m_data);
 494       m_data = cs.data;
 495       m_nb_read = m_size = cs.len;
 496       m_alloc_offset = cs.data - cs.to_free;
 497     }
 498   else if (in_context.should_skip_bom)
 499     {
 500       if (read_data ())
 501         {
 502           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 503           offset_buffer (offset);
 504           m_nb_read -= offset;
 505         }
 506     }
 507
 508   return true;
 509 }
 510
 511 /* file_cache's ctor.  */
 512
 513 file_cache::file_cache ()
 514 : m_file_slots (new file_cache_slot[num_file_slots])
 515 {
 516   initialize_input_context (nullptr, false);
 517 }
 518
 519 /* file_cache's dtor.  */
 520
 521 file_cache::~file_cache ()
 522 {
 523   delete[] m_file_slots;
 524 }
 525
 526 /* Lookup the cache used for the content of a given file accessed by
 527    caret diagnostic.  If no cached file was found, create a new cache
 528    for this file, add it to the array of cached file and return
 529    it.  */
 530
 531 file_cache_slot*
 532 file_cache::lookup_or_add_file (const char *file_path)
 533 {
 534   file_cache_slot *r = lookup_file (file_path);
 535   if (r == NULL)
 536     r = add_file (file_path);
 537   return r;
 538 }
 539
 540 /* Default constructor for a cache of file used by caret
 541    diagnostic.  */
 542
 543 file_cache_slot::file_cache_slot ()
 544 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
 545   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
 546   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 547 {
 548   m_line_record.create (0);
 549 }
 550
 551 /* Destructor for a cache of file used by caret diagnostic.  */
 552
 553 file_cache_slot::~file_cache_slot ()
 554 {
 555   if (m_fp)
 556     {
 557       fclose (m_fp);
 558       m_fp = NULL;
 559     }
 560   if (m_data)
 561     {
 562       offset_buffer (-m_alloc_offset);
 563       XDELETEVEC (m_data);
 564       m_data = 0;
 565     }
 566   m_line_record.release ();
 567 }
 568
 569 /* Returns TRUE iff the cache would need to be filled with data coming
 570    from the file.  That is, either the cache is empty or full or the
 571    current line is empty.  Note that if the cache is full, it would
 572    need to be extended and filled again.  */
 573
 574 bool
 575 file_cache_slot::needs_read_p () const
 576 {
 577   return m_fp && (m_nb_read == 0
 578           || m_nb_read == m_size
 579           || (m_line_start_idx >= m_nb_read - 1));
 580 }
 581
 582 /*  Return TRUE iff the cache is full and thus needs to be
 583     extended.  */
 584
 585 bool
 586 file_cache_slot::needs_grow_p () const
 587 {
 588   return m_nb_read == m_size;
 589 }
 590
 591 /* Grow the cache if it needs to be extended.  */
 592
 593 void
 594 file_cache_slot::maybe_grow ()
 595 {
 596   if (!needs_grow_p ())
 597     return;
 598
 599   if (!m_data)
 600     {
 601       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 602       m_size = buffer_size;
 603       m_data = XNEWVEC (char, m_size);
 604     }
 605   else
 606     {
 607       const int offset = m_alloc_offset;
 608       offset_buffer (-offset);
 609       m_size *= 2;
 610       m_data = XRESIZEVEC (char, m_data, m_size);
 611       offset_buffer (offset);
 612     }
 613 }
 614
 615 /*  Read more data into the cache.  Extends the cache if need be.
 616     Returns TRUE iff new data could be read.  */
 617
 618 bool
 619 file_cache_slot::read_data ()
 620 {
 621   if (feof (m_fp) || ferror (m_fp))
 622     return false;
 623
 624   maybe_grow ();
 625
 626   char * from = m_data + m_nb_read;
 627   size_t to_read = m_size - m_nb_read;
 628   size_t nb_read = fread (from, 1, to_read, m_fp);
 629
 630   if (ferror (m_fp))
 631     return false;
 632
 633   m_nb_read += nb_read;
 634   return !!nb_read;
 635 }
 636
 637 /* Read new data iff the cache needs to be filled with more data
 638    coming from the file FP.  Return TRUE iff the cache was filled with
 639    mode data.  */
 640
 641 bool
 642 file_cache_slot::maybe_read_data ()
 643 {
 644   if (!needs_read_p ())
 645     return false;
 646   return read_data ();
 647 }
 648
 649 /* Read a new line from file FP, using C as a cache for the data
 650    coming from the file.  Upon successful completion, *LINE is set to
 651    the beginning of the line found.  *LINE points directly in the
 652    line cache and is only valid until the next call of get_next_line.
 653    *LINE_LEN is set to the length of the line.  Note that the line
 654    does not contain any terminal delimiter.  This function returns
 655    true if some data was read or process from the cache, false
 656    otherwise.  Note that subsequent calls to get_next_line might
 657    make the content of *LINE invalid.  */
 658
 659 bool
 660 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
 661 {
 662   /* Fill the cache with data to process.  */
 663   maybe_read_data ();
 664
 665   size_t remaining_size = m_nb_read - m_line_start_idx;
 666   if (remaining_size == 0)
 667     /* There is no more data to process.  */
 668     return false;
 669
 670   char *line_start = m_data + m_line_start_idx;
 671
 672   char *next_line_start = NULL;
 673   size_t len = 0;
 674   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 675   if (line_end == NULL)
 676     {
 677       /* We haven't found the end-of-line delimiter in the cache.
 678          Fill the cache with more data from the file and look for the
 679          '\n'.  */
 680       while (maybe_read_data ())
 681         {
 682           line_start = m_data + m_line_start_idx;
 683           remaining_size = m_nb_read - m_line_start_idx;
 684           line_end = (char *) memchr (line_start, '\n', remaining_size);
 685           if (line_end != NULL)
 686             {
 687               next_line_start = line_end + 1;
 688               break;
 689             }
 690         }
 691       if (line_end == NULL)
 692         {
 693           /* We've loadded all the file into the cache and still no
 694              '\n'.  Let's say the line ends up at one byte passed the
 695              end of the file.  This is to stay consistent with the case
 696              of when the line ends up with a '\n' and line_end points to
 697              that terminal '\n'.  That consistency is useful below in
 698              the len calculation.  */
 699           line_end = m_data + m_nb_read ;
 700           m_missing_trailing_newline = true;
 701         }
 702       else
 703         m_missing_trailing_newline = false;
 704     }
 705   else
 706     {
 707       next_line_start = line_end + 1;
 708       m_missing_trailing_newline = false;
 709     }
 710
 711   if (m_fp && ferror (m_fp))
 712     return false;
 713
 714   /* At this point, we've found the end of the of line.  It either
 715      points to the '\n' or to one byte after the last byte of the
 716      file.  */
 717   gcc_assert (line_end != NULL);
 718
 719   len = line_end - line_start;
 720
 721   if (m_line_start_idx < m_nb_read)
 722     *line = line_start;
 723
 724   ++m_line_num;
 725
 726   /* Before we update our line record, make sure the hint about the
 727      total number of lines of the file is correct.  If it's not, then
 728      we give up recording line boundaries from now on.  */
 729   bool update_line_record = true;
 730   if (m_line_num > m_total_lines)
 731     update_line_record = false;
 732
 733     /* Now update our line record so that re-reading lines from the
 734      before m_line_start_idx is faster.  */
 735   if (update_line_record
 736       && m_line_record.length () < line_record_size)
 737     {
 738       /* If the file lines fits in the line record, we just record all
 739          its lines ...*/
 740       if (m_total_lines <= line_record_size
 741           && m_line_num > m_line_record.length ())
 742         m_line_record.safe_push
 743           (file_cache_slot::line_info (m_line_num,
 744                                        m_line_start_idx,
 745                                        line_end - m_data));
 746       else if (m_total_lines > line_record_size)
 747         {
 748           /* ... otherwise, we just scale total_lines down to
 749              (line_record_size lines.  */
 750           size_t n = (m_line_num * line_record_size) / m_total_lines;
 751           if (m_line_record.length () == 0
 752               || n >= m_line_record.length ())
 753             m_line_record.safe_push
 754               (file_cache_slot::line_info (m_line_num,
 755                                            m_line_start_idx,
 756                                            line_end - m_data));
 757         }
 758     }
 759
 760   /* Update m_line_start_idx so that it points to the next line to be
 761      read.  */
 762   if (next_line_start)
 763     m_line_start_idx = next_line_start - m_data;
 764   else
 765     /* We didn't find any terminal '\n'.  Let's consider that the end
 766        of line is the end of the data in the cache.  The next
 767        invocation of get_next_line will either read more data from the
 768        underlying file or return false early because we've reached the
 769        end of the file.  */
 770     m_line_start_idx = m_nb_read;
 771
 772   *line_len = len;
 773
 774   return true;
 775 }
 776
 777 /* Consume the next bytes coming from the cache (or from its
 778    underlying file if there are remaining unread bytes in the file)
 779    until we reach the next end-of-line (or end-of-file).  There is no
 780    copying from the cache involved.  Return TRUE upon successful
 781    completion.  */
 782
 783 bool
 784 file_cache_slot::goto_next_line ()
 785 {
 786   char *l;
 787   ssize_t len;
 788
 789   return get_next_line (&l, &len);
 790 }
 791
 792 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 793    If the line was read successfully, *LINE points to the beginning
 794    of the line in the file cache and *LINE_LEN is the length of the
 795    line.  *LINE is not nul-terminated, but may contain zero bytes.
 796    *LINE is only valid until the next call of read_line_num.
 797    This function returns bool if a line was read.  */
 798
 799 bool
 800 file_cache_slot::read_line_num (size_t line_num,
 801                        char ** line, ssize_t *line_len)
 802 {
 803   gcc_assert (line_num > 0);
 804
 805   if (line_num <= m_line_num)
 806     {
 807       /* We've been asked to read lines that are before m_line_num.
 808          So lets use our line record (if it's not empty) to try to
 809          avoid re-reading the file from the beginning again.  */
 810
 811       if (m_line_record.is_empty ())
 812         {
 813           m_line_start_idx = 0;
 814           m_line_num = 0;
 815         }
 816       else
 817         {
 818           file_cache_slot::line_info *i = NULL;
 819           if (m_total_lines <= line_record_size)
 820             {
 821               /* In languages where the input file is not totally
 822                  preprocessed up front, the m_total_lines hint
 823                  can be smaller than the number of lines of the
 824                  file.  In that case, only the first
 825                  m_total_lines have been recorded.
 826
 827                  Otherwise, the first m_total_lines we've read have
 828                  their start/end recorded here.  */
 829               i = (line_num <= m_total_lines)
 830                 ? &m_line_record[line_num - 1]
 831                 : &m_line_record[m_total_lines - 1];
 832               gcc_assert (i->line_num <= line_num);
 833             }
 834           else
 835             {
 836               /*  So the file had more lines than our line record
 837                   size.  Thus the number of lines we've recorded has
 838                   been scaled down to line_record_size.  Let's
 839                   pick the start/end of the recorded line that is
 840                   closest to line_num.  */
 841               size_t n = (line_num <= m_total_lines)
 842                 ? line_num * line_record_size / m_total_lines
 843                 : m_line_record.length () - 1;
 844               if (n < m_line_record.length ())
 845                 {
 846                   i = &m_line_record[n];
 847                   gcc_assert (i->line_num <= line_num);
 848                 }
 849             }
 850
 851           if (i && i->line_num == line_num)
 852             {
 853               /* We have the start/end of the line.  */
 854               *line = m_data + i->start_pos;
 855               *line_len = i->end_pos - i->start_pos;
 856               return true;
 857             }
 858
 859           if (i)
 860             {
 861               m_line_start_idx = i->start_pos;
 862               m_line_num = i->line_num - 1;
 863             }
 864           else
 865             {
 866               m_line_start_idx = 0;
 867               m_line_num = 0;
 868             }
 869         }
 870     }
 871
 872   /*  Let's walk from line m_line_num up to line_num - 1, without
 873       copying any line.  */
 874   while (m_line_num < line_num - 1)
 875     if (!goto_next_line ())
 876       return false;
 877
 878   /* The line we want is the next one.  Let's read and copy it back to
 879      the caller.  */
 880   return get_next_line (line, line_len);
 881 }
 882
 883 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 884    The line is not nul-terminated.  The returned pointer is only
 885    valid until the next call of location_get_source_line.
 886    Note that the line can contain several null characters,
 887    so the returned value's length has the actual length of the line.
 888    If the function fails, a NULL char_span is returned.  */
 889
 890 char_span
 891 location_get_source_line (const char *file_path, int line)
 892 {
 893   char *buffer = NULL;
 894   ssize_t len;
 895
 896   if (line == 0)
 897     return char_span (NULL, 0);
 898
 899   if (file_path == NULL)
 900     return char_span (NULL, 0);
 901
 902   diagnostic_file_cache_init ();
 903
 904   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 905   if (c == NULL)
 906     return char_span (NULL, 0);
 907
 908   bool read = c->read_line_num (line, &buffer, &len);
 909   if (!read)
 910     return char_span (NULL, 0);
 911
 912   return char_span (buffer, len);
 913 }
 914
 915 /* Return true if the final line of FILE_PATH lacks a trailing newline.
 916    The answer is only meaningful once the whole file has been loaded,
 917    e.g. by requesting a line number beyond the end of the file.  */
 918
 919 bool
 920 location_missing_trailing_newline (const char *file_path)
 921 {
 922   diagnostic_file_cache_init ();
 923
 924   /* Without a cache slot there is nothing to inspect; report "no".  */
 925   file_cache_slot *slot
 926     = global_dc->m_file_cache->lookup_or_add_file (file_path);
 927
 928   return slot ? slot->missing_trailing_newline_p () : false;
 929 }
 930
 931 /* Test if the location originates from the spelling location of a
 932    built-in token.  That is, return TRUE if LOC is a (possibly
 933    virtual) location of a built-in token that appears in the expansion
 934    list of a macro.  Please note that this function also works on
 935    tokens that result from built-in tokens.  For instance, the
 936    function would return true if passed a token "4" that is the result
 937    of the expansion of the built-in __LINE__ macro.  */
 938 bool
 939 is_location_from_builtin_token (location_t loc)
 940 {
 941   /* Only the resolved location matters, so no map is requested.  */
 942   const location_t spelling_loc
 943     = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION, NULL);
 944   return spelling_loc == BUILTINS_LOCATION;
 945 }
 946
 947 /* Turn the source location LOC into a human readable location, using
 948    the caret aspect.  A virtual LOC is resolved to the expansion point
 949    of the macro involved.  If LOC resolves to a builtin location, the
 950    file name of the result is the string "<built-in>".  */
 951
 952 expanded_location
 953 expand_location (location_t loc)
 954 {
 955   const bool expansion_point_p = true;
 956   return expand_location_1 (loc, expansion_point_p, LOCATION_ASPECT_CARET);
 957 }
 958
 959 /* Turn the source location LOC into a human readable location for the
 960    given ASPECT.  Unlike expand_location, this asks expand_location_1
 961    not to use the macro expansion point (presumably yielding the point
 962    where the token is spelled).  If LOC resolves to a builtin location,
 963    the file name of the result is the string "<built-in>".  */
 964
 965 expanded_location
 966 expand_location_to_spelling_point (location_t loc, enum location_aspect aspect)
 967 {
 968   const bool expansion_point_p = false;
 969   return expand_location_1 (loc, expansion_point_p, aspect);
 970 }
 971
 972 /* The rich_location class within libcpp needs a way to expand
 973    location_t instances, and relies on the client code providing a
 974    symbol named
 975      linemap_client_expand_location_to_spelling_point
 976    to do so.  This is the implementation for libcommon.a (all host
 977    binaries); it simply forwards LOC and ASPECT to expand_location_1
 978    with expansion_point_p set to false.  */
 979
 980 expanded_location
 981 linemap_client_expand_location_to_spelling_point (location_t loc,
 982                                                   enum location_aspect aspect)
 983 {
 984   const bool expansion_point_p = false;
 985   return expand_location_1 (loc, expansion_point_p, aspect);
 986 }
 987
 988
 989 /* Return LOCATION unchanged unless it lies in a system header, in
 990    which case it is resolved to the expansion point of the macro it
 991    comes from (LRK_MACRO_EXPANSION_POINT).
 992
 993    This is useful when emitting diagnostics about a token that may be
 994    located in a macro that is itself defined in a system header, for
 995    example the NULL macro.  In such a case, if LOCATION were passed
 996    directly to diagnostic functions such as warning_at, the
 997    diagnostic would be suppressed (unless -Wsystem-headers is given).
 998    Unwinding to the expansion point, which is typically in user code,
 999    avoids that.  */
1000
1001 location_t
1002 expansion_point_location_if_in_system_header (location_t location)
1003 {
1004   if (!in_system_header_at (location))
1005     return location;
1006
1007   return linemap_resolve_location (line_table, location,
1008                                    LRK_MACRO_EXPANSION_POINT, NULL);
1009 }
1010
1011 /* Resolve LOCATION with LRK_MACRO_EXPANSION_POINT: a virtual location
1012    from a macro expansion is unwound to the macro's expansion point.  */
1013
1014 location_t
1015 expansion_point_location (location_t location)
1016 {
1017   const auto kind = LRK_MACRO_EXPANSION_POINT;
1018   return linemap_resolve_location (line_table, location, kind, NULL);
1019 }
1020
1021 /* Build a location whose caret is at CARET and whose range runs from
1022    START to FINISH, e.g.
1023
1024                  11111111112
1025         12345678901234567890
1026      522
1027      523   return foo + bar;
1028                   ~~~~^~~~~
1029      524
1030
1031    Here the caret is at the "+", line 523 column 15; the range starts
1032    earlier, at the "f" of "foo" at column 11, and finishes at the "r"
1033    of "bar" at column 19.  */
1034
1035 location_t
1036 make_location (location_t caret, location_t start, location_t finish)
1037 {
1038   /* Reduce CARET with get_pure_location before combining.  */
1039   const location_t caret_loc = get_pure_location (caret);
1040
1041   /* The range runs from the start of START to the finish of FINISH.  */
1042   source_range range;
1043   range.m_start = get_start (start);
1044   range.m_finish = get_finish (finish);
1045
1046   return COMBINE_LOCATION_DATA (line_table, caret_loc, range, NULL);
1047 }
1048
1049 /* Build a location with its caret at CARET and with range SRC_RANGE.  */
1050
1051 location_t
1052 make_location (location_t caret, source_range src_range)
1053 {
1054   return COMBINE_LOCATION_DATA (line_table, get_pure_location (caret),
1055                                 src_range, NULL);
1056 }
1057
1058 /* EXPLOC stores its column in byte units; return that column converted
1059    to display units under POLICY.  This requires reading the source line
1060    to measure its display width.  If the line cannot be read (NULL span),
1061    the conversion falls back to the byte column, as desired.  */
1062 int
1063 location_compute_display_column (expanded_location exploc,
1064                                  const cpp_char_column_policy &policy)
1065 {
1066   /* No file name, line or column: nothing to measure.  */
1067   if (!exploc.file || !*exploc.file || !exploc.line || !exploc.column)
1068     return exploc.column;
1069   char_span src_line = location_get_source_line (exploc.file, exploc.line);
1070   return cpp_byte_column_to_display_column (src_line.get_buffer (),
1071                                             src_line.length (),
1072                                             exploc.column, policy);
1073 }
1074
1075 /* Print to stderr the memory-usage statistics that
1076    linemap_get_statistics reports for line_table, together with some
1077    figures about macro expansion.  */
1078
1079 void
1080 dump_line_table_statistics (void)
1081 {
1082   struct linemap_stats s;
1083   memset (&s, 0, sizeof (s));
1084   linemap_get_statistics (line_table, &s);
1085
1086   /* Macro maps in use plus the location arrays that go with them.  */
1087   const long macro_maps_size
1088     = s.macro_maps_used_size + s.macro_maps_locations_size;
1089
1090   /* Everything allocated for ordinary and macro maps.  */
1091   const long total_allocated_map_size
1092     = (s.ordinary_maps_allocated_size
1093        + s.macro_maps_allocated_size
1094        + s.macro_maps_locations_size);
1095
1096   /* The part of the above that is actually in use.  */
1097   const long total_used_map_size
1098     = (s.ordinary_maps_used_size
1099        + s.macro_maps_used_size
1100        + s.macro_maps_locations_size);
1101
1102   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1103            s.num_expanded_macros);
1104   if (s.num_expanded_macros != 0)
1105     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1106              s.num_macro_tokens / s.num_expanded_macros);
1107   fprintf (stderr,
1108            "\nLine Table allocations during the "
1109            "compilation process\n");
1110   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1111            SIZE_AMOUNT (s.num_ordinary_maps_used));
1112   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1113            SIZE_AMOUNT (s.ordinary_maps_used_size));
1114   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1115            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1116   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1117            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1118   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1119            SIZE_AMOUNT (s.num_macro_maps_used));
1120   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1121            SIZE_AMOUNT (s.macro_maps_used_size));
1122   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1123            SIZE_AMOUNT (s.macro_maps_locations_size));
1124   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1125            SIZE_AMOUNT (macro_maps_size));
1126   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1127            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1128   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1129            SIZE_AMOUNT (total_allocated_map_size));
1130   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1131            SIZE_AMOUNT (total_used_map_size));
1132   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1133            SIZE_AMOUNT (s.adhoc_table_size));
1134   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1135            SIZE_AMOUNT (s.adhoc_table_entries_used));
1136   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1137            SIZE_AMOUNT (line_table->num_optimized_ranges));
1138   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1139            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1140
1141   fprintf (stderr, "\n");
1142 }
1143
1144 /* Return the end (exclusive) of the locations of ordinary map IDX in SET.  */
1145
1146 static location_t
1147 get_end_location (class line_maps *set, unsigned int idx)
1148 {
1149   /* The last map in use has no successor; it ends at highest_location.  */
1150   const bool is_last_map = (idx == LINEMAPS_ORDINARY_USED (set) - 1);
1151   if (is_last_map)
1152     return set->highest_location;
1153   return MAP_START_LOCATION (LINEMAPS_ORDINARY_MAP_AT (set, idx + 1));
1154 }
1155
1156 /* Helper for write_digit_row: write the units digit of DIGIT to STREAM.  */
1157 static void
1158 write_digit (FILE *stream, int digit)
1159 {
1160   const int glyph = '0' + (digit % 10);
1161   fputc (glyph, stream);
1162 }
1163
1164 /* Helper function for dump_location_info.
1165    Write to STREAM one row of digits under a source line: after INDENT
1166    columns and a '|', one digit of (location / DIVISOR) per column.  */
1167
1168 static void
1169 write_digit_row (FILE *stream, int indent,
1170                  const line_map_ordinary *map,
1171                  location_t loc, int max_col, int divisor)
1172 {
1173   fprintf (stream, "%*c", indent, ' ');
1174   fprintf (stream, "|");
1175   /* Column N lives at LOC plus N shifted past the map's range bits.  */
1176   for (int col = 1; col < max_col; col++)
1177     write_digit (stream, (loc + (col << map->m_range_bits)) / divisor);
1178
1179   /* Terminate the row.  */
1180   fprintf (stream, "\n");
1181 }
1182
1183 /* Write to STREAM one line describing the interval of location_t
1184    values that starts at START and stops just before END.  */
1185
1186 static void
1187 dump_location_range (FILE *stream, location_t start, location_t end)
1188 {
1189   /* START is printed as the inclusive bound, END as the exclusive
1190      bound.  */
1191   fprintf (stream, "  location_t interval: %u <= loc < %u\n",
1192            start, end);
1193 }
1194
1195 /* Write to STREAM the label NAME followed by a description of the
1196    interval of location_t from START (inclusive) to END (exclusive).  */
1197
1198 static void
1199 dump_labelled_location_range (FILE *stream, const char *name,
1200                               location_t start, location_t end)
1201 {
1202   /* Heading line, then the interval, then a blank separator line.  */
1203   fprintf (stream, "%s\n", name);
1204   dump_location_range (stream, start, end);
1205   fprintf (stream, "\n");
1206 }
1207
1208 /* Write a visualization of the locations in the line_table to STREAM:
1209    reserved, ordinary, unallocated, macro, MAX_LOCATION_T and ad-hoc.  */
1210 void
1211 dump_location_info (FILE *stream)
1212 {
1213   /* Visualize the reserved locations.  */
1214   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1215                                 0, RESERVED_LOCATION_COUNT);
1216
1217   /* Visualize the ordinary line_map instances, rendering the sources. */
1218   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1219     {
1220       location_t end_location = get_end_location (line_table, idx);
1221       /* half-closed: doesn't include this one. */
1222
1223       const line_map_ordinary *map
1224         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1225       fprintf (stream, "ORDINARY MAP: %u\n", idx);
1226       dump_location_range (stream,
1227                            MAP_START_LOCATION (map), end_location);
1228       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1229       fprintf (stream, "  starting at line: %i\n",
1230                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1231       fprintf (stream, "  column and range bits: %i\n",
1232                map->m_column_and_range_bits);
1233       fprintf (stream, "  column bits: %i\n",
1234                map->m_column_and_range_bits - map->m_range_bits);
1235       fprintf (stream, "  range bits: %i\n",
1236                map->m_range_bits);
1237       const char * reason;
1238       switch (map->reason) {
1239       case LC_ENTER:
1240         reason = "LC_ENTER";
1241         break;
1242       case LC_LEAVE:
1243         reason = "LC_LEAVE";
1244         break;
1245       case LC_RENAME:
1246         reason = "LC_RENAME";
1247         break;
1248       case LC_RENAME_VERBATIM:
1249         reason = "LC_RENAME_VERBATIM";
1250         break;
1251       case LC_ENTER_MACRO:
1252         reason = "LC_ENTER_MACRO";
1253         break;
1254       default:
1255         reason = "Unknown";
1256       }
1257       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1258
1259       const line_map_ordinary *includer_map
1260         = linemap_included_from_linemap (line_table, map);
1261       fprintf (stream, "  included from location: %d",
1262                linemap_included_from (map));
1263       if (includer_map) {
1264         fprintf (stream, " (in ordinary map %d)",
1265                  int (includer_map - line_table->info_ordinary.maps));
1266       }
1267       fprintf (stream, "\n");
1268
1269       /* Render the span of source lines that this "map" covers.  */
1270       for (location_t loc = MAP_START_LOCATION (map);
1271            loc < end_location;
1272            loc += (1 << map->m_range_bits) )
1273         {
1274           gcc_assert (pure_location_p (line_table, loc) );
1275
1276           expanded_location exploc
1277             = linemap_expand_location (line_table, map, loc);
1278
1279           if (exploc.column == 0)
1280             {
1281               /* Beginning of a new source line: draw the line.  */
1282
1283               char_span line_text = location_get_source_line (exploc.file,
1284                                                               exploc.line);
1285               if (!line_text)
1286                 break;
1287               fprintf (stream,
1288                        "%s:%3i|loc:%5u|%.*s\n",
1289                        exploc.file, exploc.line,
1290                        loc,
1291                        (int)line_text.length (), line_text.get_buffer ());
1292
1293               /* "loc" is at column 0, which means "the whole line".
1294                  Render the locations *within* the line, by underlining
1295                  it, showing the location_t numeric values
1296                  at each column.  */
1297               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1298               if (max_col > line_text.length ())
1299                 max_col = line_text.length () + 1;
1300
1301               int len_lnum = num_digits (exploc.line);
1302               if (len_lnum < 3)
1303                 len_lnum = 3;
1304               int len_loc = num_digits (loc);
1305               if (len_loc < 5)
1306                 len_loc = 5;
1307
1308               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1309
1310               /* Thousands.  */
1311               if (end_location > 999)
1312                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1313
1314               /* Hundreds.  */
1315               if (end_location > 99)
1316                 write_digit_row (stream, indent, map, loc, max_col, 100);
1317
1318               /* Tens.  */
1319               write_digit_row (stream, indent, map, loc, max_col, 10);
1320
1321               /* Units.  */
1322               write_digit_row (stream, indent, map, loc, max_col, 1);
1323             }
1324         }
1325       fprintf (stream, "\n");
1326     }
1327
1328   /* Visualize unallocated values.  */
1329   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1330                                 line_table->highest_location,
1331                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1332
1333   /* Visualize the macro line_map instances, rendering the sources. */
1334   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1335     {
1336       /* Each macro map that is allocated owns location_t values
1337          that are *lower* than the one before them.
1338          Hence it's meaningful to view them either in order of ascending
1339          source locations, or in order of ascending macro map index.  */
1340       const bool ascending_location_ts = true;
1341       unsigned int idx = (ascending_location_ts
1342                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1343                           : i);
1344       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1345       fprintf (stream, "MACRO %u: %s (%u tokens)\n",
1346                idx,
1347                linemap_map_get_macro_name (map),
1348                MACRO_MAP_NUM_MACRO_TOKENS (map));
1349       dump_location_range (stream,
1350                            map->start_location,
1351                            (map->start_location
1352                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1353       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1354               "expansion point is location %i",
1355               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1356       fprintf (stream, "  map->start_location: %u\n",
1357                map->start_location);
1358
1359       fprintf (stream, "  macro_locations:\n");
1360       for (unsigned int tok = 0; tok < MACRO_MAP_NUM_MACRO_TOKENS (map); tok++)
1361         {
1362           location_t x = MACRO_MAP_LOCATIONS (map)[2 * tok];
1363           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * tok) + 1];
1364
1365           /* linemap_add_macro_token encodes token numbers in an expansion
1366              by putting them after MAP_START_LOCATION. */
1367
1368           /* I'm typically seeing 4 uninitialized entries at the end of
1369              0xafafafaf.
1370              This appears to be due to macro.c:replace_args
1371              adding 2 extra args for padding tokens; presumably there may
1372              be a leading and/or trailing padding token injected,
1373              each for 2 more location slots.
1374              This would explain there being up to 4 location_ts slots
1375              that may be uninitialized.  */
1376
1377           fprintf (stream, "    %u: %u, %u\n",
1378                    tok,
1379                    x,
1380                    y);
1381           if (x == y)
1382             {
1383               if (x < MAP_START_LOCATION (map))
1384                 inform (x, "token %u has %<x-location == y-location == %u%>",
1385                         tok, x);
1386               else
1387                 fprintf (stream,
1388                          "x-location == y-location == %u encodes token # %u\n",
1389                          x, x - MAP_START_LOCATION (map));
1390             }
1391           else
1392             {
1393               inform (x, "token %u has %<x-location == %u%>", tok, x);
1394               inform (x, "token %u has %<y-location == %u%>", tok, y);
1395             }
1396         }
1397       fprintf (stream, "\n");
1398     }
1399
1400   /* It appears that MAX_LOCATION_T itself is never assigned to a
1401      macro map, presumably due to an off-by-one error somewhere
1402      between the logic in linemap_enter_macro and
1403      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1404   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1405                                 MAX_LOCATION_T,
1406                                 MAX_LOCATION_T + 1);
1407
1408   /* Visualize ad-hoc values.  */
1409   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1410                                 MAX_LOCATION_T + 1, UINT_MAX);
1411 }
1412
1413 /* string_concat's constructor: store NUM and a copy of LOCS[0..NUM).  */
1414 string_concat::string_concat (int num, location_t *locs)
1415   : m_num (num)
1416 {
1417   location_t *copy = ggc_vec_alloc <location_t> (num);
1418   for (int idx = 0; idx < num; idx++)
1419     copy[idx] = locs[idx];
1420   m_locs = copy;
1421 }
1422
1423 /* string_concat_db's constructor: create the table via create_ggc (64).  */
1424
1425 string_concat_db::string_concat_db ()
1426   : m_table (hash_map <location_hash, string_concat *>::create_ggc (64))
1427 {
1428 }
1429
1430 /* Record that a string concatenation covering NUM string literal
1431    tokens occurred.  LOCS is an array of NUM token locations; a copy of
1432    it is taken.  The entry is keyed on the location of the first token.  */
1433
1434 void
1435 string_concat_db::record_string_concatenation (int num, location_t *locs)
1436 {
1437   gcc_assert (num > 1);
1438   gcc_assert (locs);
1439
1440   const location_t key = get_key_loc (locs[0]);
1441   /* Nothing is recorded for 'RESERVED_LOCATION_P (key)' key values:
1442      any data recorded under such a key would be overwritten by a
1443      subsequent call that maps to the same key.  */
1444   if (!RESERVED_LOCATION_P (key))
1445     {
1446       void *storage = ggc_alloc <string_concat> ();
1447       string_concat *concat = new (storage) string_concat (num, locs);
1448       m_table->put (key, concat);
1449     }
1450 }
1451
1452 /* Look up whether LOC was the location of the initial token of a
1453    recorded concatenation of string literal tokens.
1454    If so, store the number of tokens in *OUT_NUM, store in *OUT_LOCS
1455    a pointer to the array of the tokens' locations, and return true.
1456    *OUT_LOCS is a borrowed pointer to storage owned by the
1457    string_concat_db.
1458    Otherwise, return false and leave both outputs untouched.  */
1459
1460 bool
1461 string_concat_db::get_string_concatenation (location_t loc,
1462                                             int *out_num,
1463                                             location_t **out_locs)
1464 {
1465   gcc_assert (out_num);
1466   gcc_assert (out_locs);
1467
1468   const location_t key = get_key_loc (loc);
1469   /* Reserved keys are never stored by record_string_concatenation,
1470      so there is nothing to look up for them.  */
1471   if (RESERVED_LOCATION_P (key))
1472     return false;
1473
1474   string_concat **slot = m_table->get (key);
1475   if (slot == NULL)
1476     return false;
1477   const string_concat *found = *slot;
1478   *out_num = found->m_num;
1479   *out_locs = found->m_locs;
1480   return true;
1481 }
1482
1483 /* Internal function.  Canonicalize LOC into a form suitable for use
1484    as a key within the database: resolve it to its spelling location
1485    (stripping away macro expansion), then take the start of that
1486    location's range (dropping ad-hoc and range information).  */
1487
1488 location_t
1489 string_concat_db::get_key_loc (location_t loc)
1490 {
1491   /* Step 1: where the token was spelled.  */
1492   const location_t spelling_loc
1493     = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION, NULL);
1494
1495   /* Step 2: keep only the start of its range.  */
1496   return get_range_from_loc (line_table, spelling_loc).m_start;
1497 }
1498
1499 /* Helper class for use within get_substring_ranges_for_loc.
1500    A vec of cpp_string that takes responsibility for freeing the
1501    str->text of each str in the vector when it is destroyed.  */
1502
1503 class auto_cpp_string_vec : public auto_vec <cpp_string>
1504 {
1505  public:
1506   /* ALLOC is forwarded to the auto_vec constructor.  */
1507   auto_cpp_string_vec (int alloc) : auto_vec <cpp_string> (alloc) {}
1508
1509   /* Free the text buffer of every string held by the vec.  */
1510   ~auto_cpp_string_vec ()
1511   {
1512     int ix;
1513     cpp_string *elt;
1514     FOR_EACH_VEC_ELT (*this, ix, elt)
1515       free (const_cast <unsigned char *> (elt->text));
1516   }
1517 };
1518
1519 /* Attempt to populate RANGES with source location information on the
1520    individual characters within the string literal found at STRLOC.
1521    If CONCATS is non-NULL, then any string literals that the token at
1522    STRLOC was concatenated with are also added to RANGES.
1523    PFILE must be non-NULL; TYPE is passed through to
1524    cpp_interpret_string_ranges along with the re-read literal text.
1525    Return NULL if successful, or an error message if any errors occurred (in
1526    which case RANGES may be only partially populated and should not
1527    be used).
1528    This is implemented by re-parsing the relevant source line(s).  */
1529
1530 static const char *
1531 get_substring_ranges_for_loc (cpp_reader *pfile,
1532                               string_concat_db *concats,
1533                               location_t strloc,
1534                               enum cpp_ttype type,
1535                               cpp_substring_ranges &ranges)
1536 {
1537   gcc_assert (pfile);
1538
1539   if (strloc == UNKNOWN_LOCATION)
1540     return "unknown location";
1541
1542   /* Reparsing the strings requires accurate location information.
1543      If -ftrack-macro-expansion has been overridden from its default
1544      of 2, then we might have a location of a macro expansion point,
1545      rather than the location of the literal itself.
1546      Avoid this by requiring that we have full macro expansion tracking
1547      for substring locations to be available.  */
1548   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1549     return "track_macro_expansion != 2";
1550
1551   /* If #line or # 44 "file"-style directives are present, then there's
1552      no guarantee that the line numbers we have can be used to locate
1553      the strings.  For example, we might have a .i file with # directives
1554      pointing back to lines within a .c file, but the .c file might
1555      have been edited since the .i file was created.
1556      In such a case, the safest course is to disable on-demand substring
1557      locations.  */
1558   if (line_table->seen_line_directive)
1559     return "seen line directive";
1560
1561   /* If string concatenation has occurred at STRLOC, get the locations
1562      of all of the literal tokens making up the compound string.
1563      Otherwise, just use STRLOC (the defaults below are left untouched).  */
1564   int num_locs = 1;
1565   location_t *strlocs = &strloc;
1566   if (concats)
1567     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1568   /* STRS holds a copy of each literal; LOC_READERS one reader per literal.  */
1569   auto_cpp_string_vec strs (num_locs);
1570   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1571   for (int i = 0; i < num_locs; i++)
1572     {
1573       /* Get range of strloc.  We will use it to locate the start and finish
1574          of the literal token within the line.  */
1575       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1576
1577       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1578         {
1579           /* If the string token was within a macro expansion, then we can
1580              cope with it for the simple case where we have a single token.
1581              Otherwise, bail out.  */
1582           if (src_range.m_start != src_range.m_finish)
1583             return "macro expansion";
1584         }
1585       else
1586         {
1587           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1588             /* If so, we can't reliably determine where the token started within
1589                its line.  */
1590             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1591
1592           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1593             /* If so, we can't reliably determine where the token finished
1594                within its line.  */
1595             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1596         }
1597       /* Both endpoints must be on one line of one file, in order.  */
1598       expanded_location start
1599         = expand_location_to_spelling_point (src_range.m_start,
1600                                              LOCATION_ASPECT_START);
1601       expanded_location finish
1602         = expand_location_to_spelling_point (src_range.m_finish,
1603                                              LOCATION_ASPECT_FINISH);
1604       if (start.file != finish.file)
1605         return "range endpoints are in different files";
1606       if (start.line != finish.line)
1607         return "range endpoints are on different lines";
1608       if (start.column > finish.column)
1609         return "range endpoints are reversed";
1610
1611       char_span line = location_get_source_line (start.file, start.line);
1612       if (!line)
1613         return "unable to read source line";
1614
1615       /* Determine the length of the literal (including quotes
1616          and leading prefix chars, such as the 'u' in a u""
1617          token); both endpoint columns are inclusive.  */
1618       size_t literal_length = finish.column - start.column + 1;
1619
1620       /* Ensure that we don't crash if we got the wrong location.  */
1621       if (start.column < 1)
1622         return "zero start column";
1623       if (line.length () < (start.column - 1 + literal_length))
1624         return "line is not wide enough";
1625
1626       char_span literal = line.subspan (start.column - 1, literal_length);
1627
1628       cpp_string from;
1629       from.len = literal_length;
1630       /* Make a copy of the literal, to avoid having to rely on
1631          the lifetime of the copy of the line within the cache.
1632          This will be released by the auto_cpp_string_vec dtor.  */
1633       from.text = (unsigned char *)literal.xstrdup ();
1634       strs.safe_push (from);
1635
1636       /* For very long lines, a new linemap could have started
1637          halfway through the token.
1638          Ensure that the loc_reader uses the linemap of the
1639          *end* of the token for its start location.  */
1640       const line_map_ordinary *start_ord_map;
1641       linemap_resolve_location (line_table, src_range.m_start,
1642                                 LRK_SPELLING_LOCATION, &start_ord_map);
1643       const line_map_ordinary *final_ord_map;
1644       linemap_resolve_location (line_table, src_range.m_finish,
1645                                 LRK_SPELLING_LOCATION, &final_ord_map);
1646       if (start_ord_map == NULL || final_ord_map == NULL)
1647         return "failed to get ordinary maps";
1648       /* Bulletproofing.  We ought to only have different ordinary maps
1649          for start vs finish due to line-length jumps.  */
1650       if (start_ord_map != final_ord_map
1651           && start_ord_map->to_file != final_ord_map->to_file)
1652         return "start and finish are spelled in different ordinary maps";
1653       /* The file from linemap_resolve_location ought to match that from
1654          expand_location_to_spelling_point.  */
1655       if (start_ord_map->to_file != start.file)
1656         return "mismatching file after resolving linemap";
1657
1658       location_t start_loc
1659         = linemap_position_for_line_and_column (line_table, final_ord_map,
1660                                                 start.line, start.column);
1661
1662       cpp_string_location_reader loc_reader (start_loc, line_table);
1663       loc_readers.safe_push (loc_reader);
1664     }
1665
1666   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1667   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1668                                                  loc_readers.address (),
1669                                                  num_locs, &ranges, type);
1670   if (err)
1671     return err;
1672
1673   /* Success: "ranges" should now contain information on the string.  */
1674   return NULL;
1675 }
1676
1677 /* Attempt to populate *OUT_LOC with source location information on the
1678    given characters within the string literal found at STRLOC.
1679    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1680    character set.
1681
1682    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1683    and string literal "012345\n789"
1684    *OUT_LOC is written to with:
1685      "012345\n789"
1686          ~^~~~~
1687
1688    If CONCATS is non-NULL, then any string literals that the token at
1689    STRLOC was concatenated with are also considered.
1690    This is implemented by re-parsing the relevant source line(s).
1691
1692    Return NULL if successful, or an error message (meant for GCC developers,
1693    not end-users) on failure, including when an index is negative or is
1694    not below the number of ranges found.  */
1695
1696 const char *
1697 get_location_within_string (cpp_reader *pfile,
1698                             string_concat_db *concats,
1699                             location_t strloc,
1700                             enum cpp_ttype type,
1701                             int caret_idx, int start_idx, int end_idx,
1702                             location_t *out_loc)
1703 {
1704   gcc_checking_assert (caret_idx >= 0);
1705   gcc_checking_assert (start_idx >= 0);
1706   gcc_checking_assert (end_idx >= 0);
1707   gcc_assert (out_loc);
1708
1709   cpp_substring_ranges ranges;
1710   const char *err
1711     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1712   if (err)
1713     return err;
1714
1715   /* The checking asserts above may be compiled out, so also reject
1716      negative indices here rather than indexing RANGES out of bounds.  */
1717   if (caret_idx < 0 || caret_idx >= ranges.get_num_ranges ())
1718     return "caret_idx out of range";
1719   if (start_idx < 0 || start_idx >= ranges.get_num_ranges ())
1720     return "start_idx out of range";
1721   if (end_idx < 0 || end_idx >= ranges.get_num_ranges ())
1722     return "end_idx out of range";
1723   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1724                             ranges.get_range (start_idx).m_start,
1725                             ranges.get_range (end_idx).m_finish);
1726   return NULL;
1727 }
1728
1729 #if CHECKING_P
1730
1731 namespace selftest {
1732
1733 /* Selftests of location handling.  */
1734
1735 /* Look up the source range of a single character of the string literal
1736    found at STRLOC, storing it in *OUT_RANGE.
1737    CHAR_IDX is an offset within the execution character set.
1738    When CONCATS is non-NULL, string literals that were concatenated with
1739    the token at STRLOC are taken into account as well.
1740
1741    The lookup works by re-parsing the relevant source line(s).
1742
1743    On success NULL is returned; otherwise the result is an error message,
1744    which is aimed at GCC developers (as a debugging aid), not at
1745    end-users.  */
1746
1747 static const char *
1748 get_source_range_for_char (cpp_reader *pfile, string_concat_db *concats,
1749                            location_t strloc, enum cpp_ttype type,
1750                            int char_idx, source_range *out_range)
1751 {
1752   gcc_checking_assert (char_idx >= 0);
1753   gcc_assert (out_range);
1754
1755   /* Gather the ranges of every character, bailing out on failure.  */
1756   cpp_substring_ranges char_ranges;
1757   if (const char *failure
1758         = get_substring_ranges_for_loc (pfile, concats, strloc, type,
1759                                         char_ranges))
1760     return failure;
1761
1762   /* Only then pick out the requested character.  */
1763   const bool idx_in_bounds = char_idx < char_ranges.get_num_ranges ();
1764   if (!idx_in_bounds)
1765     return "char_idx out of range";
1766
1767   *out_range = char_ranges.get_range (char_idx);
1768   return NULL;
1769 }
1770
1771 /* Count the source ranges available for the string literal token(s) at
1772    STRLOC (taking CONCATS into account when it is non-NULL) and store the
1773    count in *OUT.  Return NULL on success, or else an error message, in
1774    which case *OUT is left untouched.  */
1775
1776 static const char *
1777 get_num_source_ranges_for_substring (cpp_reader *pfile,
1778                                      string_concat_db *concats,
1779                                      location_t strloc,
1780                                      enum cpp_ttype type,
1781                                      int *out)
1782 {
1783   gcc_assert (out);
1784
1785   cpp_substring_ranges found;
1786   const char *const failure
1787     = get_substring_ranges_for_loc (pfile, concats, strloc, type, found);
1788   if (failure == NULL)
1789     *out = found.get_num_ranges ();
1790
1791   return failure;
1792 }
1793
1794 /* Selftests of location handling.  */
1795
1796 /* Selftest for compare() on linenum_type: check it at both extremes of
1797    the values used here, i.e. line 0 and line 0xffffffff.  */
1798
1799 static void
1800 test_linenum_comparisons ()
1801 {
1802   const linenum_type lowest (0), highest (0xffffffff);
1803   /* Each extreme compares equal to itself...  */
1804   ASSERT_EQ (0, compare (lowest, lowest));
1805   ASSERT_EQ (0, compare (highest, highest));
1806   /* ...and the two are ordered the right way round.  */
1807   ASSERT_GT (compare (highest, lowest), 0);
1808   ASSERT_LT (compare (lowest, highest), 0);
1809 }
1810
1811 /* Return true if LOC is expected to carry column information.
1812    location_t values above LINE_MAP_MAX_LOCATION_WITH_COLS are treated as
1813    having column 0; ad-hoc locations are looked up in line_table first.  */
1814
1815 static bool
1816 should_have_column_data_p (location_t loc)
1817 {
1818   const location_t plain_loc
1819     = (IS_ADHOC_LOC (loc)
1820        ? get_location_from_adhoc_loc (line_table, loc)
1821        : loc);
1822   return plain_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS;
1823 }
1824
1825 /* Check should_have_column_data_p on both sides of its threshold.  */
1826
1827 static void
1828 test_should_have_column_data_p ()
1829 {
1830   const location_t last_loc_with_cols = LINE_MAP_MAX_LOCATION_WITH_COLS;
1831
1832   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1833   ASSERT_TRUE (should_have_column_data_p (last_loc_with_cols));
1834   ASSERT_FALSE (should_have_column_data_p (last_loc_with_cols + 1));
1835 }
1836
1837 /* Assert that LOC expands to file EXP_FILENAME and line EXP_LINENUM, and,
1838    when LOC can carry column data, to column EXP_COLNUM.  */
1839
1840 static void
1841 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1842               location_t loc)
1843 {
1844   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1845   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1846   /* Sufficiently high location_t values lose their column numbers, so
1847      LOCATION_COLUMN (loc) is 0 for them and is not checked.  Close to
1848      the threshold columns *may* still be present: locations in a line
1849      straddling the threshold in the final linemap before it keep them.  */
1850   if (!should_have_column_data_p (loc))
1851     return;
1852   ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1853 }
1854
1855 /* Various selftests involve constructing a line table and one or more
1856    line maps within it.
1857
1858    For maximum test coverage we want to run these tests in a variety
1859    of situations:
1860    - line_table->default_range_bits: some frontends use a non-zero value
1861      and others use zero
1862    - the fallback modes within line-map.c: there are various threshold
1863      values for location_t beyond which line-map.c changes behavior
1864      (disabling of the range-packing optimization, disabling of
1865      column-tracking).  We can exercise these by starting the line_table
1866      at interesting values at or near these thresholds.
1867
1868    The following class describes a particular case within our test
1869    matrix.  */
1870
1871 class line_table_case
1872 {
1873 public:
1874   line_table_case (int default_range_bits, int base_location)
1875   : m_default_range_bits (default_range_bits),
1876     m_base_location (base_location)
1877   {}
1878
1879   int m_default_range_bits;  /* Stored into line_table->default_range_bits.  */
1880   int m_base_location;  /* If nonzero, seeds highest_location/highest_line.  */
1881 };
1882
1883 /* Default constructor: save line_table and install a fresh, sane one.  */
1884
1885 line_table_test::line_table_test ()
1886 {
1887   gcc_assert (saved_line_table == NULL);
1888   line_maps *const outgoing = line_table;
1889   saved_line_table = outgoing;
1890   line_table = ggc_alloc<line_maps> ();
1891   linemap_init (line_table, BUILTINS_LOCATION);
1892   gcc_assert (outgoing->reallocator);
1893   line_table->reallocator = outgoing->reallocator;
1894   gcc_assert (outgoing->round_alloc_size);
1895   line_table->round_alloc_size = outgoing->round_alloc_size;
1896   line_table->default_range_bits = 0;
1897 }
1898
1899 /* Constructor.  Store the old value of line_table, and create a new
1900    one, using the situation described in CASE_.
1901
1902    All of the common setup (saving the old table, allocating and
1903    initializing the new one, copying the allocation hooks) is delegated
1904    to the default constructor; only the fields that CASE_ controls are
1905    overridden here.  */
1906
1907 line_table_test::line_table_test (const line_table_case &case_)
1908 : line_table_test ()
1909 {
1910   line_table->default_range_bits = case_.m_default_range_bits;
1911
1912   /* A zero base location means "keep the values set by linemap_init".  */
1913   if (const int base_location = case_.m_base_location)
1914     {
1915       line_table->highest_location = base_location;
1916       line_table->highest_line = base_location;
1917     }
1918 }
1919
1920 /* Destructor.  Restore the old line_table and clear saved_line_table.  */
1921
1922 line_table_test::~line_table_test ()
1923 {
1924   gcc_assert (saved_line_table != NULL);  /* Set by the constructors.  */
1925   line_table = saved_line_table;
1926   saved_line_table = NULL;  /* So that the constructors' assert passes.  */
1927 }
1928
1929 /* Selftest of the basic operation of ordinary linemaps.  */
1930
1931 static void
1932 test_accessing_ordinary_linemaps (const line_table_case &case_)
1933 {
1934   line_table_test table_guard (case_);
1935
1936   /* Describe some locations within "foo.c" via a simple linemap.  */
1937   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1938
1939   linemap_line_start (line_table, 1, 100);
1940   location_t foo_1_1 = linemap_position_for_column (line_table, 1);
1941   location_t foo_1_23 = linemap_position_for_column (line_table, 23);
1942
1943   linemap_line_start (line_table, 2, 100);
1944   location_t foo_2_1 = linemap_position_for_column (line_table, 1);
1945   location_t foo_2_17 = linemap_position_for_column (line_table, 17);
1946
1947   /* A very long line.  */
1948   linemap_line_start (line_table, 3, 2000);
1949   location_t foo_3_700 = linemap_position_for_column (line_table, 700);
1950
1951   /* Back to a short line again.  */
1952   linemap_line_start (line_table, 4, 0);
1953   location_t foo_4_100 = linemap_position_for_column (line_table, 100);
1954
1955   if (should_have_column_data_p (foo_4_100))
1956     {
1957       /* The linemap should have switched to short lines.  */
1958       line_map_ordinary *last = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1959       ASSERT_EQ (7, last->m_column_and_range_bits - last->m_range_bits);
1960     }
1961
1962   /* A line whose initially seen width is below
1963      LINE_MAP_MAX_COLUMN_NUMBER, but which will eventually be seen to be
1964      longer than that.  */
1965   linemap_line_start (line_table, 5, 2000);
1966
1967   location_t foo_5_2000
1968     = linemap_position_for_column (line_table, 2000);
1969   location_t foo_5_4097
1970     = linemap_position_for_column (line_table, 4097);
1971   location_t foo_5_4098
1972     = linemap_position_for_column (line_table, 4098);
1973
1974   /* ...followed by a line of a sane length.  */
1975   linemap_line_start (line_table, 6, 100);
1976   location_t foo_6_10 = linemap_position_for_column (line_table, 10);
1977
1978   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1979
1980   /* A second file.  */
1981   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1982   linemap_line_start (line_table, 1, 200);
1983   location_t bar_1_150 = linemap_position_for_column (line_table, 150);
1984   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1985
1986   /* Every location should expand back to what went in.  */
1987   assert_loceq ("foo.c", 1, 1, foo_1_1);
1988   assert_loceq ("foo.c", 1, 23, foo_1_23);
1989   assert_loceq ("foo.c", 2, 1, foo_2_1);
1990   assert_loceq ("foo.c", 2, 17, foo_2_17);
1991   assert_loceq ("foo.c", 3, 700, foo_3_700);
1992   assert_loceq ("foo.c", 4, 100, foo_4_100);
1993
1994   /* The start of the very wide line should be fully tracked...  */
1995   assert_loceq ("foo.c", 5, 2000, foo_5_2000);
1996   /* ...whereas beyond LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
1997      be disabled...  */
1998   assert_loceq ("foo.c", 5, 0, foo_5_4097);
1999   assert_loceq ("foo.c", 5, 0, foo_5_4098);
2000   /* ...only to be re-enabled for subsequent lines.  */
2001   assert_loceq ("foo.c", 6, 10, foo_6_10);
2002
2003   assert_loceq ("bar.c", 1, 150, bar_1_150);
2004
2005   ASSERT_FALSE (is_location_from_builtin_token (foo_1_1));
2006   ASSERT_TRUE (pure_location_p (line_table, foo_1_1));
2007
2008   /* Build a range with make_location and check that its pieces can be
2009      extracted again.  */
2010   location_t combined = make_location (foo_2_1, foo_1_23, foo_2_17);
2011   ASSERT_FALSE (pure_location_p (line_table, combined));
2012   ASSERT_EQ (foo_2_1, get_location_from_adhoc_loc (line_table, combined));
2013   source_range extent = get_range_from_loc (line_table, combined);
2014   ASSERT_EQ (foo_1_23, extent.m_start);
2015   ASSERT_EQ (foo_2_17, extent.m_finish);
2016 }
2017
2018 /* UNKNOWN_LOCATION should expand to no file, line 0 and column 0.  */
2019
2020 static void
2021 test_unknown_location ()
2022 {
2023   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2024   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2025   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2026 }
2027
2028 /* Check how BUILTINS_LOCATION is classified and how it expands.  */
2029
2030 static void
2031 test_builtins ()
2032 {
2033   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2034   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
2035 }
2036
2037 /* Regression test for make_location: the start/finish of the range it
2038    builds must be stored as pure locations, even when it is handed a
2039    packed or ad-hoc range as the start/finish.  */
2040
2041 static void
2042 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2043 {
2044   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2045      with C++ frontend.
2046      ....................0000000001111111111222.
2047      ....................1234567890123456789012.  */
2048   const char *snippet = "     r += !aaa == bbb;\n";
2049   temp_source_file src (SELFTEST_LOCATION, ".C", snippet);
2050   line_table_test guard (case_);
2051   linemap_add (line_table, LC_ENTER, false, src.get_filename (), 1);
2052
2053   const location_t col11 = linemap_position_for_column (line_table, 11);
2054   const location_t col12 = linemap_position_for_column (line_table, 12);
2055   const location_t col13 = linemap_position_for_column (line_table, 13);
2056   const location_t col14 = linemap_position_for_column (line_table, 14);
2057   const location_t col21 = linemap_position_for_column (line_table, 21);
2058
2059   if (col21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2060     return;
2061
2062   /* The caret is put at column 13, arbitrarily, so that start != caret
2063      gets covered.  */
2064   const location_t rng_aaa = make_location (col13, col12, col14);
2065   ASSERT_EQ (col13, get_pure_location (rng_aaa));
2066   ASSERT_EQ (col12, get_start (rng_aaa));
2067   ASSERT_FALSE (IS_ADHOC_LOC (get_start (rng_aaa)));
2068   ASSERT_EQ (col14, get_finish (rng_aaa));
2069   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (rng_aaa)));
2070
2071   /* Now use a location that has a range as the start-point.  */
2072   const location_t rng_not_aaa = make_location (col11, rng_aaa, col14);
2073   ASSERT_EQ (col11, get_pure_location (rng_not_aaa));
2074   /* The start of that range should have been used, rather than the
2075      range itself.  */
2076   ASSERT_EQ (col12, get_start (rng_not_aaa));
2077   ASSERT_FALSE (IS_ADHOC_LOC (get_start (rng_not_aaa)));
2078   ASSERT_EQ (col14, get_finish (rng_not_aaa));
2079   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (rng_not_aaa)));
2080
2081   /* Likewise for a location that has a range as the end-point.  */
2082   const location_t rng_eq = make_location (col12, col12, col21);
2083   ASSERT_EQ (col12, get_pure_location (rng_eq));
2084   ASSERT_EQ (col12, get_start (rng_eq));
2085   ASSERT_FALSE (IS_ADHOC_LOC (get_start (rng_eq)));
2086   ASSERT_EQ (col21, get_finish (rng_eq));
2087   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (rng_eq)));
2088   const location_t rng_not_eq = make_location (col11, col12, rng_eq);
2089   /* The finish of that range should have been used, rather than the
2090      range itself.  */
2091   ASSERT_EQ (col11, get_pure_location (rng_not_eq));
2092   ASSERT_EQ (col12, get_start (rng_not_eq));
2093   ASSERT_FALSE (IS_ADHOC_LOC (get_start (rng_not_eq)));
2094   ASSERT_EQ (col21, get_finish (rng_not_eq));
2095   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (rng_not_eq)));
2096 }
2097
2098 /* Selftest of reading source lines (e.g. for caret-based diagnostics).  */
2099
2100 static void
2101 test_reading_source_line ()
2102 {
2103   /* A tempfile with three lines, the last one lacking a newline.  */
2104   temp_source_file src (SELFTEST_LOCATION, ".txt",
2105                         "01234567890123456789\n"
2106                         "This is the test text\n"
2107                         "This is the 3rd line");
2108
2109   /* Lines are requested in the order 3, 2, 4; there is no line 4.  */
2110   char_span third = location_get_source_line (src.get_filename (), 3);
2111   ASSERT_TRUE (third);
2112   ASSERT_TRUE (third.get_buffer () != NULL);
2113   ASSERT_EQ (20, third.length ());
2114   ASSERT_TRUE (strncmp ("This is the 3rd line", third.get_buffer (),
2115                         third.length ()) == 0);
2116
2117   char_span second = location_get_source_line (src.get_filename (), 2);
2118   ASSERT_TRUE (second);
2119   ASSERT_TRUE (second.get_buffer () != NULL);
2120   ASSERT_EQ (21, second.length ());
2121   ASSERT_TRUE (strncmp ("This is the test text", second.get_buffer (),
2122                         second.length ()) == 0);
2123
2124   char_span missing = location_get_source_line (src.get_filename (), 4);
2125   ASSERT_FALSE (missing);
2126   ASSERT_TRUE (missing.get_buffer () == NULL);
2127 }
2128
2129 /* Tests of lexing.  */
2130
2131 /* Assert that cpp_token_as_text, called on token TOK from PARSER, yields
2132    a string equal to EXPECTED_TEXT (compared using ASSERT_STREQ).  */
2133
2134 #define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
2135   SELFTEST_BEGIN_STMT                                                   \
2136     unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
2137     ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
2138   SELFTEST_END_STMT
2139
2140 /* Check the location recorded for TOK: its src_loc must lie in file
2141    EXP_FILENAME on line EXP_LINENUM, and its range must start at column
2142    EXP_START_COL and finish at column EXP_FINISH_COL.
2143    Failures are reported against LOC, the selftest's effective location.  */
2144
2145 static void
2146 assert_token_loc_eq (const location &loc, const cpp_token *tok,
2147                      const char *exp_filename, int exp_linenum,
2148                      int exp_start_col, int exp_finish_col)
2149 {
2150   const location_t where = tok->src_loc;
2151   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (where));
2152   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (where));
2153
2154   /* Columns can only be checked while location_t values are low enough
2155      for column numbers to be available.  */
2156   if (should_have_column_data_p (where))
2157     {
2158       const source_range extent = get_range_from_loc (line_table, where);
2159       ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (where));
2160       ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (extent.m_start));
2161       ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (extent.m_finish));
2162     }
2163 }
2164
2165 /* Call assert_token_loc_eq on TOK and the expected values, passing
2166    SELFTEST_LOCATION as the effective location of the selftest.  */
2167
2168 #define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
2169                             EXP_START_COL, EXP_FINISH_COL) \
2170   assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
2171                        (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2172
2173 /* Lex a small source file with libcpp and check each token's kind, text
2174    and location information.  */
2175
2176 static void
2177 test_lexer (const line_table_case &case_)
2178 {
2179   /* The text to lex, which is written to a tempfile.  */
2180   const char *text =
2181     /*00000000011111111112222222222333333.3333444444444.455555555556
2182       12345678901234567890123456789012345.6789012345678.901234567890.  */
2183     ("test_name /* c-style comment */\n"
2184      "                                  \"test literal\"\n"
2185      " // test c++-style comment\n"
2186      "   42\n");
2187   temp_source_file src (SELFTEST_LOCATION, ".txt", text);
2188
2189   line_table_test guard (case_);
2190
2191   cpp_reader *reader = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2192
2193   const char *main_file = cpp_read_main_file (reader, src.get_filename ());
2194   ASSERT_NE (main_file, NULL);
2195
2196   /* The tokens expected back are a name, a string literal and a number,
2197      each with the right text and location, followed by EOF.  */
2198
2199   location_t tok_loc;
2200   const cpp_token *token;
2201   token = cpp_get_token_with_location (reader, &tok_loc);
2202   ASSERT_NE (token, NULL);
2203   ASSERT_EQ (token->type, CPP_NAME);
2204   ASSERT_TOKEN_AS_TEXT_EQ (reader, token, "test_name");
2205   ASSERT_TOKEN_LOC_EQ (token, src.get_filename (), 1, 1, 9);
2206
2207   token = cpp_get_token_with_location (reader, &tok_loc);
2208   ASSERT_NE (token, NULL);
2209   ASSERT_EQ (token->type, CPP_STRING);
2210   ASSERT_TOKEN_AS_TEXT_EQ (reader, token, "\"test literal\"");
2211   ASSERT_TOKEN_LOC_EQ (token, src.get_filename (), 2, 35, 48);
2212
2213   token = cpp_get_token_with_location (reader, &tok_loc);
2214   ASSERT_NE (token, NULL);
2215   ASSERT_EQ (token->type, CPP_NUMBER);
2216   ASSERT_TOKEN_AS_TEXT_EQ (reader, token, "42");
2217   ASSERT_TOKEN_LOC_EQ (token, src.get_filename (), 4, 4, 5);
2218
2219   token = cpp_get_token_with_location (reader, &tok_loc);
2220   ASSERT_NE (token, NULL);
2221   ASSERT_EQ (token->type, CPP_EOF);
2222
2223   cpp_finish (reader, NULL);
2224   cpp_destroy (reader);
2225 }
2226
2227 /* Forward decls.  */
2228 class lexer_test;
2229 class lexer_test_options;
2230
2231 /* Interface for specifying options of a lexer_test: the "apply" vfunc is
2232    called during the lexer_test constructor.  The destructor is virtual so
2233    that destroying a subclass through this base class is well-defined.  */
2234 class lexer_test_options
2235 {
2236  public:
2237   virtual ~lexer_test_options () {}
2238   virtual void apply (lexer_test &) = 0;
2239 };
2240
2241 /* Owning wrapper around a cpp_reader *, which calls cpp_finish and
2242    cpp_destroy on it in its dtor.  Copying is disabled, since a copy
2243    would finish and destroy the same cpp_reader a second time.
2244
2245    This is needed by class lexer_test to ensure that the cleanup of the
2246    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2247 class cpp_reader_ptr
2248 {
2249  public:
2250   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2251   cpp_reader_ptr (const cpp_reader_ptr &) = delete;
2252   cpp_reader_ptr &operator= (const cpp_reader_ptr &) = delete;
2253   ~cpp_reader_ptr ()
2254   {
2255     cpp_finish (m_ptr, NULL);
2256     cpp_destroy (m_ptr);
2257   }
2258
2259   operator cpp_reader * () const { return m_ptr; }
2260  private:
2261   cpp_reader *m_ptr;
2262 };
2263
2264 /* Fixture class for lexer tests: lexes a tempfile with its own cpp_reader.  */
2265
2266 class lexer_test
2267 {
2268 public:
2269   lexer_test (const line_table_case &case_, const char *content,
2270               lexer_test_options *options);
2271   ~lexer_test ();
2272
2273   const cpp_token *get_token ();
2274
2275   /* The ordering of these fields matters.
2276      The line_table_test must be first, since the cpp_reader_ptr
2277      is created on top of the line_table that it installs.
2278      The cpp_reader must be cleaned up *after* the temp_source_file
2279      since the filenames in input.c's input cache are owned by the
2280      cpp_reader; in particular, when ~temp_source_file evicts the
2281      filename the filenames must still be alive.  */
2282   line_table_test m_ltt;
2283   cpp_reader_ptr m_parser;
2284   temp_source_file m_tempfile;
2285   string_concat_db m_concats;
2286   bool m_implicitly_expect_EOF;  /* If set, the dtor asserts EOF is next.  */
2287 };
2288
2289 /* A lexer_test_options subclass that switches the execution charset to
2290    an EBCDIC encoding, namely IBM1047 (aka "EBCDIC 1047", or
2291    "Code page 1047"), in order to exercise iconv integration within libcpp.
2292
2293    Not every build of iconv supports that charset, so the resulting
2294    diagnostic is recorded here and can be queried by the test.  */
2295
2296 class ebcdic_execution_charset : public lexer_test_options
2297 {
2298  public:
2299   /* At most one instance may be live at a time, as on_diagnostic is a
2300      static callback that finds its instance through s_singleton.  */
2301   ebcdic_execution_charset () : m_num_iconv_errors (0)
2302   {
2303     gcc_assert (s_singleton == NULL);
2304     s_singleton = this;
2305   }
2306   ~ebcdic_execution_charset ()
2307   {
2308     gcc_assert (s_singleton == this);
2309     s_singleton = NULL;
2310   }
2311
2312   /* Select the charset for TEST's reader and hook up on_diagnostic.  */
2313   void apply (lexer_test &test) FINAL OVERRIDE
2314   {
2315     cpp_get_options (test.m_parser)->narrow_charset = "IBM1047";
2316     cpp_get_callbacks (test.m_parser)->diagnostic = on_diagnostic;
2317   }
2318
2319   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2320                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2321                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2322                              rich_location *richloc ATTRIBUTE_UNUSED,
2323                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2324     ATTRIBUTE_FPTR_PRINTF(5,0)
2325   {
2326     gcc_assert (s_singleton);
2327     /* Keep this string away from exgettext; it is translated in libcpp.  */
2328     const char *unsupported
2329       = "conversion from %s to %s not supported by iconv";
2330 #ifdef ENABLE_NLS
2331     unsupported = dgettext ("cpplib", unsupported);
2332 #endif
2333     /* The only diagnostic expected here is the one emitted by
2334        libcpp/charset.c:init_iconv_desc when the local iconv build
2335        doesn't support the conversion; anything else is a failure.  */
2336     if (strcmp (msgid, unsupported) != 0)
2337       abort ();
2338
2339     /* Record the iconv error.  */
2340     s_singleton->m_num_iconv_errors++;
2341     return true;
2342   }
2343
2344   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2345
2346  private:
2347   static ebcdic_execution_charset *s_singleton;
2348   int m_num_iconv_errors;
2349 };
2350
2351 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2352
2353 /* A lexer_test_options subclass that collects, as formatted strings, the
2354    diagnostics emitted by the lexer.  */
2355
2356 class lexer_diagnostic_sink : public lexer_test_options
2357 {
2358  public:
2359   lexer_diagnostic_sink ()
2360   {
2361     gcc_assert (s_singleton == NULL);
2362     s_singleton = this;
2363   }
2364   ~lexer_diagnostic_sink ()
2365   {
2366     /* Release the messages, which were allocated by xvasprintf.  */
2367     int ix;
2368     char *text;
2369     FOR_EACH_VEC_ELT (m_diagnostics, ix, text)
2370       free (text);
2371
2372     gcc_assert (s_singleton == this);
2373     s_singleton = NULL;
2374   }
2375
2376   /* Route the diagnostics of TEST's reader to on_diagnostic.  */
2377   void apply (lexer_test &test) FINAL OVERRIDE
2378   {
2379     cpp_get_callbacks (test.m_parser)->diagnostic = on_diagnostic;
2380   }
2381
2382   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2383                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2384                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2385                              rich_location *richloc ATTRIBUTE_UNUSED,
2386                              const char *msgid, va_list *ap)
2387     ATTRIBUTE_FPTR_PRINTF(5,0)
2388   {
2389     s_singleton->m_diagnostics.safe_push (xvasprintf (msgid, *ap));
2390     return true;
2391   }
2392
2393   auto_vec<char *> m_diagnostics;
2394
2395  private:
2396   static lexer_diagnostic_sink *s_singleton;
2397 };
2398
2399 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2400
2401 /* Constructor.  Install a new line_table built from CASE_, create a
2402    cpp_reader on top of it, write CONTENT to a tempfile, let OPTIONS (if
2403    any) customize the test, and start parsing the tempfile.  */
2404
2405 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2406                         lexer_test_options *options)
2407 : m_ltt (case_),
2408   /* This picks up the line_table that m_ltt has just installed.  */
2409   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2410   m_tempfile (SELFTEST_LOCATION, ".c", content),
2411   m_concats (),
2412   m_implicitly_expect_EOF (true)
2413 {
2414   /* Let OPTIONS adjust the reader before iconv setup and parsing.  */
2415   if (options != NULL)
2416     options->apply (*this);
2417   cpp_init_iconv (m_parser);
2418
2419   /* Parse the file.  */
2420   const char *const main_file
2421     = cpp_read_main_file (m_parser, m_tempfile.get_filename ());
2422   ASSERT_NE (main_file, NULL);
2423 }
2424
2425 /* Destructor.  Unless m_implicitly_expect_EOF has been cleared, verify
2426    that the next token in m_parser is EOF.  */
2427
2428 lexer_test::~lexer_test ()
2429 {
2430   if (!m_implicitly_expect_EOF)
2431     return;
2432
2433   /* Only EOF should be left to lex at this point.  */
2434   location_t eof_loc;
2435   const cpp_token *last = cpp_get_token_with_location (m_parser, &eof_loc);
2436   ASSERT_NE (last, NULL);
2437   ASSERT_EQ (last->type, CPP_EOF);
2438 }
2439
2440 /* Lex and return the next token from m_parser, asserting that a token
2441    was obtained; its location is not reported separately.  */
2442
2443 const cpp_token *
2444 lexer_test::get_token ()
2445 {
2446   location_t ignored_loc;
2447   const cpp_token *const next
2448     = cpp_get_token_with_location (m_parser, &ignored_loc);
2449   ASSERT_NE (next, NULL);
2450   return next;
2451 }
2452
2453 /* Verify that locations within string literals are correctly handled.  */
2454
2455 /* Verify get_source_range_for_char for the token(s) at STRLOC, of kind
2456    TYPE, using the cpp_reader and string concatenation database of TEST.
2457
2458    Assert that the character at index IDX lies wholly on EXPECTED_LINE,
2459    beginning at column EXPECTED_START_COL and ending at column
2460    EXPECTED_FINISH_COL (columns are not checked for locations beyond
2461    LINE_MAP_MAX_LOCATION_WITH_COLS).  Failures are reported against LOC,
2462    the effective location of the selftest.  */
2463
2464 static void
2465 assert_char_at_range (const location &loc,
2466                       lexer_test& test,
2467                       location_t strloc, enum cpp_ttype type, int idx,
2468                       int expected_line, int expected_start_col,
2469                       int expected_finish_col)
2470 {
2471   cpp_reader *reader = test.m_parser;
2472   string_concat_db *concat_db = &test.m_concats;
2473
2474   source_range found = source_range ();
2475   const char *failure
2476     = get_source_range_for_char (reader, concat_db, strloc, type, idx,
2477                                  &found);
2478
2479   /* Without column data for STRLOC the lookup is expected to fail, with
2480      a specific message.  */
2481   if (!should_have_column_data_p (strloc))
2482     {
2483       ASSERT_STREQ_AT (loc,
2484                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2485                        failure);
2486       return;
2487     }
2488   ASSERT_EQ_AT (loc, NULL, failure);
2489
2490   /* The character has to start and finish on the expected line.  */
2491   ASSERT_EQ_AT (loc, expected_line, LOCATION_LINE (found.m_start));
2492   ASSERT_EQ_AT (loc, expected_line, LOCATION_LINE (found.m_finish));
2493
2494   /* Each column is only checked if the corresponding location has
2495      column data.  */
2496   if (should_have_column_data_p (found.m_start))
2497     ASSERT_EQ_AT (loc, expected_start_col,
2498                   LOCATION_COLUMN (found.m_start));
2499
2500   if (should_have_column_data_p (found.m_finish))
2501     ASSERT_EQ_AT (loc, expected_finish_col,
2502                   LOCATION_COLUMN (found.m_finish));
2503 }
2504
/* Wrapper around assert_char_at_range that supplies SELFTEST_LOCATION
   as the effective location for any failures.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,              \
                             EXPECTED_LINE, EXPECTED_START_COL,          \
                             EXPECTED_FINISH_COL)                        \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (LEXER_TEST), (STRLOC), (TYPE), (IDX),          \
                        (EXPECTED_LINE), (EXPECTED_START_COL),          \
                        (EXPECTED_FINISH_COL))
2513
2514 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2515    using the string concatenation database for TEST.
2516
2517    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2518
2519 static void
2520 assert_num_substring_ranges (const location &loc,
2521                              lexer_test& test,
2522                              location_t strloc,
2523                              enum cpp_ttype type,
2524                              int expected_num_ranges)
2525 {
2526   cpp_reader *pfile = test.m_parser;
2527   string_concat_db *concats = &test.m_concats;
2528
2529   int actual_num_ranges = -1;
2530   const char *err
2531     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2532                                            &actual_num_ranges);
2533   if (should_have_column_data_p (strloc))
2534     ASSERT_EQ_AT (loc, NULL, err);
2535   else
2536     {
2537       ASSERT_STREQ_AT (loc,
2538                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2539                        err);
2540       return;
2541     }
2542   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2543 }
2544
/* Wrapper around assert_num_substring_ranges that supplies
   SELFTEST_LOCATION as the effective location for any failures.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,            \
                                    EXPECTED_NUM_RANGES)                 \
  assert_num_substring_ranges (SELFTEST_LOCATION,                       \
                               (LEXER_TEST), (STRLOC), (TYPE),          \
                               (EXPECTED_NUM_RANGES))
2552
2553
2554 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2555    returns an error (using the string concatenation database for TEST).  */
2556
2557 static void
2558 assert_has_no_substring_ranges (const location &loc,
2559                                 lexer_test& test,
2560                                 location_t strloc,
2561                                 enum cpp_ttype type,
2562                                 const char *expected_err)
2563 {
2564   cpp_reader *pfile = test.m_parser;
2565   string_concat_db *concats = &test.m_concats;
2566   cpp_substring_ranges ranges;
2567   const char *actual_err
2568     = get_substring_ranges_for_loc (pfile, concats, strloc,
2569                                     type, ranges);
2570   if (should_have_column_data_p (strloc))
2571     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2572   else
2573     ASSERT_STREQ_AT (loc,
2574                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2575                      actual_err);
2576 }
2577
/* Wrapper around assert_has_no_substring_ranges that supplies
   SELFTEST_LOCATION as the effective location for any failures.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
  assert_has_no_substring_ranges (SELFTEST_LOCATION,                    \
                                  (LEXER_TEST), (STRLOC), (TYPE),       \
                                  (ERR))
2581
2582 /* Lex a simple string literal.  Verify the substring location data, before
2583    and after running cpp_interpret_string on it.  */
2584
2585 static void
2586 test_lexer_string_locations_simple (const line_table_case &case_)
2587 {
2588   /* Digits 0-9 (with 0 at column 10), the simple way.
2589      ....................000000000.11111111112.2222222223333333333
2590      ....................123456789.01234567890.1234567890123456789
2591      We add a trailing comment to ensure that we correctly locate
2592      the end of the string literal token.  */
2593   const char *content = "        \"0123456789\" /* not a string */\n";
2594   lexer_test test (case_, content, NULL);
2595
2596   /* Verify that we get the expected token back, with the correct
2597      location information.  */
2598   const cpp_token *tok = test.get_token ();
2599   ASSERT_EQ (tok->type, CPP_STRING);
2600   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2601   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2602
2603   /* At this point in lexing, the quote characters are treated as part of
2604      the string (they are stripped off by cpp_interpret_string).  */
2605
2606   ASSERT_EQ (tok->val.str.len, 12);
2607
2608   /* Verify that cpp_interpret_string works.  */
2609   cpp_string dst_string;
2610   const enum cpp_ttype type = CPP_STRING;
2611   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2612                                       &dst_string, type);
2613   ASSERT_TRUE (result);
2614   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2615   free (const_cast <unsigned char *> (dst_string.text));
2616
2617   /* Verify ranges of individual characters.  This no longer includes the
2618      opening quote, but does include the closing quote.  */
2619   for (int i = 0; i <= 10; i++)
2620     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2621                           10 + i, 10 + i);
2622
2623   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2624 }
2625
2626 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2627    encoding.  */
2628
2629 static void
2630 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2631 {
2632   /* EBCDIC support requires iconv.  */
2633   if (!HAVE_ICONV)
2634     return;
2635
2636   /* Digits 0-9 (with 0 at column 10), the simple way.
2637      ....................000000000.11111111112.2222222223333333333
2638      ....................123456789.01234567890.1234567890123456789
2639      We add a trailing comment to ensure that we correctly locate
2640      the end of the string literal token.  */
2641   const char *content = "        \"0123456789\" /* not a string */\n";
2642   ebcdic_execution_charset use_ebcdic;
2643   lexer_test test (case_, content, &use_ebcdic);
2644
2645   /* Verify that we get the expected token back, with the correct
2646      location information.  */
2647   const cpp_token *tok = test.get_token ();
2648   ASSERT_EQ (tok->type, CPP_STRING);
2649   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2650   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2651
2652   /* At this point in lexing, the quote characters are treated as part of
2653      the string (they are stripped off by cpp_interpret_string).  */
2654
2655   ASSERT_EQ (tok->val.str.len, 12);
2656
2657   /* The remainder of the test requires an iconv implementation that
2658      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2659   if (use_ebcdic.iconv_errors_occurred_p ())
2660     return;
2661
2662   /* Verify that cpp_interpret_string works.  */
2663   cpp_string dst_string;
2664   const enum cpp_ttype type = CPP_STRING;
2665   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2666                                       &dst_string, type);
2667   ASSERT_TRUE (result);
2668   /* We should now have EBCDIC-encoded text, specifically
2669      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2670      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2671   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2672                 (const char *)dst_string.text);
2673   free (const_cast <unsigned char *> (dst_string.text));
2674
2675   /* Verify that we don't attempt to record substring location information
2676      for such cases.  */
2677   ASSERT_HAS_NO_SUBSTRING_RANGES
2678     (test, tok->src_loc, type,
2679      "execution character set != source character set");
2680 }
2681
2682 /* Lex a string literal containing a hex-escaped character.
2683    Verify the substring location data, before and after running
2684    cpp_interpret_string on it.  */
2685
2686 static void
2687 test_lexer_string_locations_hex (const line_table_case &case_)
2688 {
2689   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2690      and with a space in place of digit 6, to terminate the escaped
2691      hex code.
2692      ....................000000000.111111.11112222.
2693      ....................123456789.012345.67890123.  */
2694   const char *content = "        \"01234\\x35 789\"\n";
2695   lexer_test test (case_, content, NULL);
2696
2697   /* Verify that we get the expected token back, with the correct
2698      location information.  */
2699   const cpp_token *tok = test.get_token ();
2700   ASSERT_EQ (tok->type, CPP_STRING);
2701   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2702   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2703
2704   /* At this point in lexing, the quote characters are treated as part of
2705      the string (they are stripped off by cpp_interpret_string).  */
2706   ASSERT_EQ (tok->val.str.len, 15);
2707
2708   /* Verify that cpp_interpret_string works.  */
2709   cpp_string dst_string;
2710   const enum cpp_ttype type = CPP_STRING;
2711   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2712                                       &dst_string, type);
2713   ASSERT_TRUE (result);
2714   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2715   free (const_cast <unsigned char *> (dst_string.text));
2716
2717   /* Verify ranges of individual characters.  This no longer includes the
2718      opening quote, but does include the closing quote.  */
2719   for (int i = 0; i <= 4; i++)
2720     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2721   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2722   for (int i = 6; i <= 10; i++)
2723     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2724
2725   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2726 }
2727
2728 /* Lex a string literal containing an octal-escaped character.
2729    Verify the substring location data after running cpp_interpret_string
2730    on it.  */
2731
2732 static void
2733 test_lexer_string_locations_oct (const line_table_case &case_)
2734 {
2735   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2736      and with a space in place of digit 6, to terminate the escaped
2737      octal code.
2738      ....................000000000.111111.11112222.2222223333333333444
2739      ....................123456789.012345.67890123.4567890123456789012  */
2740   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2741   lexer_test test (case_, content, NULL);
2742
2743   /* Verify that we get the expected token back, with the correct
2744      location information.  */
2745   const cpp_token *tok = test.get_token ();
2746   ASSERT_EQ (tok->type, CPP_STRING);
2747   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2748
2749   /* Verify that cpp_interpret_string works.  */
2750   cpp_string dst_string;
2751   const enum cpp_ttype type = CPP_STRING;
2752   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2753                                       &dst_string, type);
2754   ASSERT_TRUE (result);
2755   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2756   free (const_cast <unsigned char *> (dst_string.text));
2757
2758   /* Verify ranges of individual characters.  This no longer includes the
2759      opening quote, but does include the closing quote.  */
2760   for (int i = 0; i < 5; i++)
2761     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2762   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2763   for (int i = 6; i <= 10; i++)
2764     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2765
2766   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2767 }
2768
2769 /* Test of string literal containing letter escapes.  */
2770
2771 static void
2772 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2773 {
2774   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2775      .....................000000000.1.11111.1.1.11222.22222223333333
2776      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2777   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2778   lexer_test test (case_, content, NULL);
2779
2780   /* Verify that we get the expected tokens back.  */
2781   const cpp_token *tok = test.get_token ();
2782   ASSERT_EQ (tok->type, CPP_STRING);
2783   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2784
2785   /* Verify ranges of individual characters. */
2786   /* "\t".  */
2787   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2788                         0, 1, 10, 11);
2789   /* "foo". */
2790   for (int i = 1; i <= 3; i++)
2791     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2792                           i, 1, 11 + i, 11 + i);
2793   /* "\\" and "\n".  */
2794   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2795                         4, 1, 15, 16);
2796   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2797                         5, 1, 17, 18);
2798
2799   /* "bar" and closing quote for nul-terminator.  */
2800   for (int i = 6; i <= 9; i++)
2801     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2802                           i, 1, 13 + i, 13 + i);
2803
2804   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2805 }
2806
2807 /* Another test of a string literal containing a letter escape.
2808    Based on string seen in
2809      printf ("%-%\n");
2810    in gcc.dg/format/c90-printf-1.c.  */
2811
2812 static void
2813 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2814 {
2815   /* .....................000000000.1111.11.1111.22222222223.
2816      .....................123456789.0123.45.6789.01234567890.  */
2817   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2818   lexer_test test (case_, content, NULL);
2819
2820   /* Verify that we get the expected tokens back.  */
2821   const cpp_token *tok = test.get_token ();
2822   ASSERT_EQ (tok->type, CPP_STRING);
2823   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2824
2825   /* Verify ranges of individual characters. */
2826   /* "%-%".  */
2827   for (int i = 0; i < 3; i++)
2828     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2829                           i, 1, 10 + i, 10 + i);
2830   /* "\n".  */
2831   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2832                         3, 1, 13, 14);
2833
2834   /* Closing quote for nul-terminator.  */
2835   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2836                         4, 1, 15, 15);
2837
2838   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2839 }
2840
2841 /* Lex a string literal containing UCN 4 characters.
2842    Verify the substring location data after running cpp_interpret_string
2843    on it.  */
2844
2845 static void
2846 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2847 {
2848   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2849      as UCN 4.
2850      ....................000000000.111111.111122.222222223.33333333344444
2851      ....................123456789.012345.678901.234567890.12345678901234  */
2852   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2853   lexer_test test (case_, content, NULL);
2854
2855   /* Verify that we get the expected token back, with the correct
2856      location information.  */
2857   const cpp_token *tok = test.get_token ();
2858   ASSERT_EQ (tok->type, CPP_STRING);
2859   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2860
2861   /* Verify that cpp_interpret_string works.
2862      The string should be encoded in the execution character
2863      set.  Assuming that is UTF-8, we should have the following:
2864      -----------  ----  -----  -------  ----------------
2865      Byte offset  Byte  Octal  Unicode  Source Column(s)
2866      -----------  ----  -----  -------  ----------------
2867      0            0x30         '0'      10
2868      1            0x31         '1'      11
2869      2            0x32         '2'      12
2870      3            0x33         '3'      13
2871      4            0x34         '4'      14
2872      5            0xE2  \342   U+2174   15-20
2873      6            0x85  \205    (cont)  15-20
2874      7            0xB4  \264    (cont)  15-20
2875      8            0xE2  \342   U+2175   21-26
2876      9            0x85  \205    (cont)  21-26
2877      10           0xB5  \265    (cont)  21-26
2878      11           0x37         '7'      27
2879      12           0x38         '8'      28
2880      13           0x39         '9'      29
2881      14           0x00                  30 (closing quote)
2882      -----------  ----  -----  -------  ---------------.  */
2883
2884   cpp_string dst_string;
2885   const enum cpp_ttype type = CPP_STRING;
2886   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2887                                       &dst_string, type);
2888   ASSERT_TRUE (result);
2889   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2890                 (const char *)dst_string.text);
2891   free (const_cast <unsigned char *> (dst_string.text));
2892
2893   /* Verify ranges of individual characters.  This no longer includes the
2894      opening quote, but does include the closing quote.
2895      '01234'.  */
2896   for (int i = 0; i <= 4; i++)
2897     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2898   /* U+2174.  */
2899   for (int i = 5; i <= 7; i++)
2900     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2901   /* U+2175.  */
2902   for (int i = 8; i <= 10; i++)
2903     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2904   /* '789' and nul terminator  */
2905   for (int i = 11; i <= 14; i++)
2906     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2907
2908   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2909 }
2910
2911 /* Lex a string literal containing UCN 8 characters.
2912    Verify the substring location data after running cpp_interpret_string
2913    on it.  */
2914
2915 static void
2916 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2917 {
2918   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2919      ....................000000000.111111.1111222222.2222333333333.344444
2920      ....................123456789.012345.6789012345.6789012345678.901234  */
2921   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2922   lexer_test test (case_, content, NULL);
2923
2924   /* Verify that we get the expected token back, with the correct
2925      location information.  */
2926   const cpp_token *tok = test.get_token ();
2927   ASSERT_EQ (tok->type, CPP_STRING);
2928   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2929                            "\"01234\\U00002174\\U00002175789\"");
2930
2931   /* Verify that cpp_interpret_string works.
2932      The UTF-8 encoding of the string is identical to that from
2933      the ucn4 testcase above; the only difference is the column
2934      locations.  */
2935   cpp_string dst_string;
2936   const enum cpp_ttype type = CPP_STRING;
2937   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2938                                       &dst_string, type);
2939   ASSERT_TRUE (result);
2940   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2941                 (const char *)dst_string.text);
2942   free (const_cast <unsigned char *> (dst_string.text));
2943
2944   /* Verify ranges of individual characters.  This no longer includes the
2945      opening quote, but does include the closing quote.
2946      '01234'.  */
2947   for (int i = 0; i <= 4; i++)
2948     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2949   /* U+2174.  */
2950   for (int i = 5; i <= 7; i++)
2951     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2952   /* U+2175.  */
2953   for (int i = 8; i <= 10; i++)
2954     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2955   /* '789' at columns 35-37  */
2956   for (int i = 11; i <= 13; i++)
2957     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2958   /* Closing quote/nul-terminator at column 38.  */
2959   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2960
2961   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2962 }
2963
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   and return it as a host-endian value.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  uint32_t host_value = 0;
  /* Most significant byte first: shift in one byte at a time.  */
  for (unsigned int byte_idx = 0; byte_idx < 4; byte_idx++)
    host_value = (host_value << 8) | (uint32_t) bytes[byte_idx];
  return host_value;
}
2975
2976 /* Lex a wide string literal and verify that attempts to read substring
2977    location data from it fail gracefully.  */
2978
2979 static void
2980 test_lexer_string_locations_wide_string (const line_table_case &case_)
2981 {
2982   /* Digits 0-9.
2983      ....................000000000.11111111112.22222222233333
2984      ....................123456789.01234567890.12345678901234  */
2985   const char *content = "       L\"0123456789\" /* non-str */\n";
2986   lexer_test test (case_, content, NULL);
2987
2988   /* Verify that we get the expected token back, with the correct
2989      location information.  */
2990   const cpp_token *tok = test.get_token ();
2991   ASSERT_EQ (tok->type, CPP_WSTRING);
2992   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2993
2994   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2995   cpp_string dst_string;
2996   const enum cpp_ttype type = CPP_WSTRING;
2997   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2998                                       &dst_string, type);
2999   ASSERT_TRUE (result);
3000   /* The cpp_reader defaults to big-endian with
3001      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3002      now be encoded as UTF-32BE.  */
3003   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3004   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3005   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3006   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3007   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3008   free (const_cast <unsigned char *> (dst_string.text));
3009
3010   /* We don't yet support generating substring location information
3011      for L"" strings.  */
3012   ASSERT_HAS_NO_SUBSTRING_RANGES
3013     (test, tok->src_loc, type,
3014      "execution character set != source character set");
3015 }
3016
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   and return it as a host-endian value.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  const unsigned int high_byte = bytes[0];
  const unsigned int low_byte = bytes[1];
  return (uint16_t) ((high_byte << 8) | low_byte);
}
3025
3026 /* Lex a u"" string literal and verify that attempts to read substring
3027    location data from it fail gracefully.  */
3028
3029 static void
3030 test_lexer_string_locations_string16 (const line_table_case &case_)
3031 {
3032   /* Digits 0-9.
3033      ....................000000000.11111111112.22222222233333
3034      ....................123456789.01234567890.12345678901234  */
3035   const char *content = "       u\"0123456789\" /* non-str */\n";
3036   lexer_test test (case_, content, NULL);
3037
3038   /* Verify that we get the expected token back, with the correct
3039      location information.  */
3040   const cpp_token *tok = test.get_token ();
3041   ASSERT_EQ (tok->type, CPP_STRING16);
3042   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3043
3044   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3045   cpp_string dst_string;
3046   const enum cpp_ttype type = CPP_STRING16;
3047   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3048                                       &dst_string, type);
3049   ASSERT_TRUE (result);
3050
3051   /* The cpp_reader defaults to big-endian, so dst_string should
3052      now be encoded as UTF-16BE.  */
3053   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3054   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3055   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3056   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3057   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3058   free (const_cast <unsigned char *> (dst_string.text));
3059
3060   /* We don't yet support generating substring location information
3061      for L"" strings.  */
3062   ASSERT_HAS_NO_SUBSTRING_RANGES
3063     (test, tok->src_loc, type,
3064      "execution character set != source character set");
3065 }
3066
3067 /* Lex a U"" string literal and verify that attempts to read substring
3068    location data from it fail gracefully.  */
3069
3070 static void
3071 test_lexer_string_locations_string32 (const line_table_case &case_)
3072 {
3073   /* Digits 0-9.
3074      ....................000000000.11111111112.22222222233333
3075      ....................123456789.01234567890.12345678901234  */
3076   const char *content = "       U\"0123456789\" /* non-str */\n";
3077   lexer_test test (case_, content, NULL);
3078
3079   /* Verify that we get the expected token back, with the correct
3080      location information.  */
3081   const cpp_token *tok = test.get_token ();
3082   ASSERT_EQ (tok->type, CPP_STRING32);
3083   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3084
3085   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3086   cpp_string dst_string;
3087   const enum cpp_ttype type = CPP_STRING32;
3088   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3089                                       &dst_string, type);
3090   ASSERT_TRUE (result);
3091
3092   /* The cpp_reader defaults to big-endian, so dst_string should
3093      now be encoded as UTF-32BE.  */
3094   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3095   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3096   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3097   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3098   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3099   free (const_cast <unsigned char *> (dst_string.text));
3100
3101   /* We don't yet support generating substring location information
3102      for L"" strings.  */
3103   ASSERT_HAS_NO_SUBSTRING_RANGES
3104     (test, tok->src_loc, type,
3105      "execution character set != source character set");
3106 }
3107
3108 /* Lex a u8-string literal.
3109    Verify the substring location data after running cpp_interpret_string
3110    on it.  */
3111
3112 static void
3113 test_lexer_string_locations_u8 (const line_table_case &case_)
3114 {
3115   /* Digits 0-9.
3116      ....................000000000.11111111112.22222222233333
3117      ....................123456789.01234567890.12345678901234  */
3118   const char *content = "      u8\"0123456789\" /* non-str */\n";
3119   lexer_test test (case_, content, NULL);
3120
3121   /* Verify that we get the expected token back, with the correct
3122      location information.  */
3123   const cpp_token *tok = test.get_token ();
3124   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3125   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3126
3127   /* Verify that cpp_interpret_string works.  */
3128   cpp_string dst_string;
3129   const enum cpp_ttype type = CPP_STRING;
3130   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3131                                       &dst_string, type);
3132   ASSERT_TRUE (result);
3133   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3134   free (const_cast <unsigned char *> (dst_string.text));
3135
3136   /* Verify ranges of individual characters.  This no longer includes the
3137      opening quote, but does include the closing quote.  */
3138   for (int i = 0; i <= 10; i++)
3139     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3140 }
3141
3142 /* Lex a string literal containing UTF-8 source characters.
3143    Verify the substring location data after running cpp_interpret_string
3144    on it.  */
3145
3146 static void
3147 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3148 {
3149  /* This string literal is written out to the source file as UTF-8,
3150     and is of the form "before mojibake after", where "mojibake"
3151     is written as the following four unicode code points:
3152        U+6587 CJK UNIFIED IDEOGRAPH-6587
3153        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3154        U+5316 CJK UNIFIED IDEOGRAPH-5316
3155        U+3051 HIRAGANA LETTER KE.
3156      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3157      "before" and "after" are 1 byte per unicode character.
3158
3159      The numbering shown are "columns", which are *byte* numbers within
3160      the line, rather than unicode character numbers.
3161
3162      .................... 000000000.1111111.
3163      .................... 123456789.0123456.  */
3164   const char *content = ("        \"before "
3165                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3166                               UTF-8: 0xE6 0x96 0x87
3167                               C octal escaped UTF-8: \346\226\207
3168                             "column" numbers: 17-19.  */
3169                          "\346\226\207"
3170
3171                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3172                               UTF-8: 0xE5 0xAD 0x97
3173                               C octal escaped UTF-8: \345\255\227
3174                             "column" numbers: 20-22.  */
3175                          "\345\255\227"
3176
3177                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3178                               UTF-8: 0xE5 0x8C 0x96
3179                               C octal escaped UTF-8: \345\214\226
3180                             "column" numbers: 23-25.  */
3181                          "\345\214\226"
3182
3183                          /* U+3051 HIRAGANA LETTER KE
3184                               UTF-8: 0xE3 0x81 0x91
3185                               C octal escaped UTF-8: \343\201\221
3186                             "column" numbers: 26-28.  */
3187                          "\343\201\221"
3188
3189                          /* column numbers 29 onwards
3190                           2333333.33334444444444
3191                           9012345.67890123456789. */
3192                          " after\" /* non-str */\n");
3193   lexer_test test (case_, content, NULL);
3194
3195   /* Verify that we get the expected token back, with the correct
3196      location information.  */
3197   const cpp_token *tok = test.get_token ();
3198   ASSERT_EQ (tok->type, CPP_STRING);
3199   ASSERT_TOKEN_AS_TEXT_EQ
3200     (test.m_parser, tok,
3201      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3202
3203   /* Verify that cpp_interpret_string works.  */
3204   cpp_string dst_string;
3205   const enum cpp_ttype type = CPP_STRING;
3206   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3207                                       &dst_string, type);
3208   ASSERT_TRUE (result);
3209   ASSERT_STREQ
3210     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3211      (const char *)dst_string.text);
3212   free (const_cast <unsigned char *> (dst_string.text));
3213
3214   /* Verify ranges of individual characters.  This no longer includes the
3215      opening quote, but does include the closing quote.
3216      Assuming that both source and execution encodings are UTF-8, we have
3217      a run of 25 octets in each, plus the NUL terminator.  */
3218   for (int i = 0; i < 25; i++)
3219     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3220   /* NUL-terminator should use the closing quote at column 35.  */
3221   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3222
3223   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3224 }
3225
3226 /* Test of string literal concatenation.  */
3227
3228 static void
3229 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3230 {
3231   /* Digits 0-9.
3232      .....................000000000.111111.11112222222222
3233      .....................123456789.012345.67890123456789.  */
3234   const char *content = ("        \"01234\" /* non-str */\n"
3235                          "        \"56789\" /* non-str */\n");
3236   lexer_test test (case_, content, NULL);
3237
3238   location_t input_locs[2];
3239
3240   /* Verify that we get the expected tokens back.  */
3241   auto_vec <cpp_string> input_strings;
3242   const cpp_token *tok_a = test.get_token ();
3243   ASSERT_EQ (tok_a->type, CPP_STRING);
3244   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3245   input_strings.safe_push (tok_a->val.str);
3246   input_locs[0] = tok_a->src_loc;
3247
3248   const cpp_token *tok_b = test.get_token ();
3249   ASSERT_EQ (tok_b->type, CPP_STRING);
3250   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3251   input_strings.safe_push (tok_b->val.str);
3252   input_locs[1] = tok_b->src_loc;
3253
3254   /* Verify that cpp_interpret_string works.  */
3255   cpp_string dst_string;
3256   const enum cpp_ttype type = CPP_STRING;
3257   bool result = cpp_interpret_string (test.m_parser,
3258                                       input_strings.address (), 2,
3259                                       &dst_string, type);
3260   ASSERT_TRUE (result);
3261   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3262   free (const_cast <unsigned char *> (dst_string.text));
3263
3264   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3265   test.m_concats.record_string_concatenation (2, input_locs);
3266
3267   location_t initial_loc = input_locs[0];
3268
3269   /* "01234" on line 1.  */
3270   for (int i = 0; i <= 4; i++)
3271     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3272   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3273   for (int i = 5; i <= 10; i++)
3274     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3275
3276   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3277 }
3278
3279 /* Another test of string literal concatenation.  */
3280
3281 static void
3282 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3283 {
3284   /* Digits 0-9.
3285      .....................000000000.111.11111112222222
3286      .....................123456789.012.34567890123456.  */
3287   const char *content = ("        \"01\" /* non-str */\n"
3288                          "        \"23\" /* non-str */\n"
3289                          "        \"45\" /* non-str */\n"
3290                          "        \"67\" /* non-str */\n"
3291                          "        \"89\" /* non-str */\n");
3292   lexer_test test (case_, content, NULL);
3293
3294   auto_vec <cpp_string> input_strings;
3295   location_t input_locs[5];
3296
3297   /* Verify that we get the expected tokens back.  */
3298   for (int i = 0; i < 5; i++)
3299     {
3300       const cpp_token *tok = test.get_token ();
3301       ASSERT_EQ (tok->type, CPP_STRING);
3302       input_strings.safe_push (tok->val.str);
3303       input_locs[i] = tok->src_loc;
3304     }
3305
3306   /* Verify that cpp_interpret_string works.  */
3307   cpp_string dst_string;
3308   const enum cpp_ttype type = CPP_STRING;
3309   bool result = cpp_interpret_string (test.m_parser,
3310                                       input_strings.address (), 5,
3311                                       &dst_string, type);
3312   ASSERT_TRUE (result);
3313   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3314   free (const_cast <unsigned char *> (dst_string.text));
3315
3316   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3317   test.m_concats.record_string_concatenation (5, input_locs);
3318
3319   location_t initial_loc = input_locs[0];
3320
3321   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3322      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3323      and expect get_source_range_for_substring to fail.
3324      However, for a string concatenation test, we can have a case
3325      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3326      but subsequent strings can be after it.
3327      Attempting to detect this within assert_char_at_range
3328      would overcomplicate the logic for the common test cases, so
3329      we detect it here.  */
3330   if (should_have_column_data_p (input_locs[0])
3331       && !should_have_column_data_p (input_locs[4]))
3332     {
3333       /* Verify that get_source_range_for_substring gracefully rejects
3334          this case.  */
3335       source_range actual_range;
3336       const char *err
3337         = get_source_range_for_char (test.m_parser, &test.m_concats,
3338                                      initial_loc, type, 0, &actual_range);
3339       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3340       return;
3341     }
3342
3343   for (int i = 0; i < 5; i++)
3344     for (int j = 0; j < 2; j++)
3345       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3346                             i + 1, 10 + j, 10 + j);
3347
3348   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3349   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3350
3351   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3352 }
3353
3354 /* Another test of string literal concatenation, this time combined with
3355    various kinds of escaped characters.  */
3356
3357 static void
3358 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3359 {
3360   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3361      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3362   const char *content
3363     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3364        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3365     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3366   lexer_test test (case_, content, NULL);
3367
3368   auto_vec <cpp_string> input_strings;
3369   location_t input_locs[4];
3370
3371   /* Verify that we get the expected tokens back.  */
3372   for (int i = 0; i < 4; i++)
3373     {
3374       const cpp_token *tok = test.get_token ();
3375       ASSERT_EQ (tok->type, CPP_STRING);
3376       input_strings.safe_push (tok->val.str);
3377       input_locs[i] = tok->src_loc;
3378     }
3379
3380   /* Verify that cpp_interpret_string works.  */
3381   cpp_string dst_string;
3382   const enum cpp_ttype type = CPP_STRING;
3383   bool result = cpp_interpret_string (test.m_parser,
3384                                       input_strings.address (), 4,
3385                                       &dst_string, type);
3386   ASSERT_TRUE (result);
3387   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3388   free (const_cast <unsigned char *> (dst_string.text));
3389
3390   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3391   test.m_concats.record_string_concatenation (4, input_locs);
3392
3393   location_t initial_loc = input_locs[0];
3394
3395   for (int i = 0; i <= 4; i++)
3396     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3397   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3398   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3399   for (int i = 7; i <= 9; i++)
3400     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3401
3402   /* NUL-terminator should use the location of the final closing quote.  */
3403   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3404
3405   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3406 }
3407
3408 /* Test of string literal in a macro.  */
3409
3410 static void
3411 test_lexer_string_locations_macro (const line_table_case &case_)
3412 {
3413   /* Digits 0-9.
3414      .....................0000000001111111111.22222222223.
3415      .....................1234567890123456789.01234567890.  */
3416   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3417                          "  MACRO");
3418   lexer_test test (case_, content, NULL);
3419
3420   /* Verify that we get the expected tokens back.  */
3421   const cpp_token *tok = test.get_token ();
3422   ASSERT_EQ (tok->type, CPP_PADDING);
3423
3424   tok = test.get_token ();
3425   ASSERT_EQ (tok->type, CPP_STRING);
3426   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3427
3428   /* Verify ranges of individual characters.  We ought to
3429      see columns within the macro definition.  */
3430   for (int i = 0; i <= 10; i++)
3431     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3432                           i, 1, 20 + i, 20 + i);
3433
3434   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3435
3436   tok = test.get_token ();
3437   ASSERT_EQ (tok->type, CPP_PADDING);
3438 }
3439
3440 /* Test of stringification of a macro argument.  */
3441
3442 static void
3443 test_lexer_string_locations_stringified_macro_argument
3444   (const line_table_case &case_)
3445 {
3446   /* .....................000000000111111111122222222223.
3447      .....................123456789012345678901234567890.  */
3448   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3449                          "MACRO(foo)\n");
3450   lexer_test test (case_, content, NULL);
3451
3452   /* Verify that we get the expected token back.  */
3453   const cpp_token *tok = test.get_token ();
3454   ASSERT_EQ (tok->type, CPP_PADDING);
3455
3456   tok = test.get_token ();
3457   ASSERT_EQ (tok->type, CPP_STRING);
3458   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3459
3460   /* We don't support getting the location of a stringified macro
3461      argument.  Verify that it fails gracefully.  */
3462   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3463                                   "cpp_interpret_string_1 failed");
3464
3465   tok = test.get_token ();
3466   ASSERT_EQ (tok->type, CPP_PADDING);
3467
3468   tok = test.get_token ();
3469   ASSERT_EQ (tok->type, CPP_PADDING);
3470 }
3471
3472 /* Ensure that we are fail gracefully if something attempts to pass
3473    in a location that isn't a string literal token.  Seen on this code:
3474
3475      const char a[] = " %d ";
3476      __builtin_printf (a, 0.5);
3477                        ^
3478
3479    when c-format.c erroneously used the indicated one-character
3480    location as the format string location, leading to a read past the
3481    end of a string buffer in cpp_interpret_string_1.  */
3482
3483 static void
3484 test_lexer_string_locations_non_string (const line_table_case &case_)
3485 {
3486   /* .....................000000000111111111122222222223.
3487      .....................123456789012345678901234567890.  */
3488   const char *content = ("         a\n");
3489   lexer_test test (case_, content, NULL);
3490
3491   /* Verify that we get the expected token back.  */
3492   const cpp_token *tok = test.get_token ();
3493   ASSERT_EQ (tok->type, CPP_NAME);
3494   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3495
3496   /* At this point, libcpp is attempting to interpret the name as a
3497      string literal, despite it not starting with a quote.  We don't detect
3498      that, but we should at least fail gracefully.  */
3499   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3500                                   "cpp_interpret_string_1 failed");
3501 }
3502
3503 /* Ensure that we can read substring information for a token which
3504    starts in one linemap and ends in another .  Adapted from
3505    gcc.dg/cpp/pr69985.c.  */
3506
3507 static void
3508 test_lexer_string_locations_long_line (const line_table_case &case_)
3509 {
3510   /* .....................000000.000111111111
3511      .....................123456.789012346789.  */
3512   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3513                          "     \"0123456789012345678901234567890123456789"
3514                          "0123456789012345678901234567890123456789"
3515                          "0123456789012345678901234567890123456789"
3516                          "0123456789\"\n");
3517
3518   lexer_test test (case_, content, NULL);
3519
3520   /* Verify that we get the expected token back.  */
3521   const cpp_token *tok = test.get_token ();
3522   ASSERT_EQ (tok->type, CPP_STRING);
3523
3524   if (!should_have_column_data_p (line_table->highest_location))
3525     return;
3526
3527   /* Verify ranges of individual characters.  */
3528   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3529   for (int i = 0; i < 131; i++)
3530     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3531                           i, 2, 7 + i, 7 + i);
3532 }
3533
3534 /* Test of locations within a raw string that doesn't contain a newline.  */
3535
3536 static void
3537 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3538 {
3539   /* .....................00.0000000111111111122.
3540      .....................12.3456789012345678901.  */
3541   const char *content = ("R\"foo(0123456789)foo\"\n");
3542   lexer_test test (case_, content, NULL);
3543
3544   /* Verify that we get the expected token back.  */
3545   const cpp_token *tok = test.get_token ();
3546   ASSERT_EQ (tok->type, CPP_STRING);
3547
3548   /* Verify that cpp_interpret_string works.  */
3549   cpp_string dst_string;
3550   const enum cpp_ttype type = CPP_STRING;
3551   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3552                                       &dst_string, type);
3553   ASSERT_TRUE (result);
3554   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3555   free (const_cast <unsigned char *> (dst_string.text));
3556
3557   if (!should_have_column_data_p (line_table->highest_location))
3558     return;
3559
3560   /* 0-9, plus the nil terminator.  */
3561   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3562   for (int i = 0; i < 11; i++)
3563     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3564                           i, 1, 7 + i, 7 + i);
3565 }
3566
3567 /* Test of locations within a raw string that contains a newline.  */
3568
3569 static void
3570 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3571 {
3572   /* .....................00.0000.
3573      .....................12.3456.  */
3574   const char *content = ("R\"foo(\n"
3575   /* .....................00000.
3576      .....................12345.  */
3577                          "hello\n"
3578                          "world\n"
3579   /* .....................00000.
3580      .....................12345.  */
3581                          ")foo\"\n");
3582   lexer_test test (case_, content, NULL);
3583
3584   /* Verify that we get the expected token back.  */
3585   const cpp_token *tok = test.get_token ();
3586   ASSERT_EQ (tok->type, CPP_STRING);
3587
3588   /* Verify that cpp_interpret_string works.  */
3589   cpp_string dst_string;
3590   const enum cpp_ttype type = CPP_STRING;
3591   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3592                                       &dst_string, type);
3593   ASSERT_TRUE (result);
3594   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3595   free (const_cast <unsigned char *> (dst_string.text));
3596
3597   if (!should_have_column_data_p (line_table->highest_location))
3598     return;
3599
3600   /* Currently we don't support locations within raw strings that
3601      contain newlines.  */
3602   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3603                                   "range endpoints are on different lines");
3604 }
3605
3606 /* Test of parsing an unterminated raw string.  */
3607
3608 static void
3609 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3610 {
3611   const char *content = "R\"ouch()ouCh\" /* etc */";
3612
3613   lexer_diagnostic_sink diagnostics;
3614   lexer_test test (case_, content, &diagnostics);
3615   test.m_implicitly_expect_EOF = false;
3616
3617   /* Attempt to parse the raw string.  */
3618   const cpp_token *tok = test.get_token ();
3619   ASSERT_EQ (tok->type, CPP_EOF);
3620
3621   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3622   /* We expect the message "unterminated raw string"
3623      in the "cpplib" translation domain.
3624      It's not clear that dgettext is available on all supported hosts,
3625      so this assertion is commented-out for now.
3626        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3627                      diagnostics.m_diagnostics[0]);
3628   */
3629 }
3630
3631 /* Test of lexing char constants.  */
3632
3633 static void
3634 test_lexer_char_constants (const line_table_case &case_)
3635 {
3636   /* Various char constants.
3637      .....................0000000001111111111.22222222223.
3638      .....................1234567890123456789.01234567890.  */
3639   const char *content = ("         'a'\n"
3640                          "        u'a'\n"
3641                          "        U'a'\n"
3642                          "        L'a'\n"
3643                          "         'abc'\n");
3644   lexer_test test (case_, content, NULL);
3645
3646   /* Verify that we get the expected tokens back.  */
3647   /* 'a'.  */
3648   const cpp_token *tok = test.get_token ();
3649   ASSERT_EQ (tok->type, CPP_CHAR);
3650   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3651
3652   unsigned int chars_seen;
3653   int unsignedp;
3654   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3655                                           &chars_seen, &unsignedp);
3656   ASSERT_EQ (cc, 'a');
3657   ASSERT_EQ (chars_seen, 1);
3658
3659   /* u'a'.  */
3660   tok = test.get_token ();
3661   ASSERT_EQ (tok->type, CPP_CHAR16);
3662   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3663
3664   /* U'a'.  */
3665   tok = test.get_token ();
3666   ASSERT_EQ (tok->type, CPP_CHAR32);
3667   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3668
3669   /* L'a'.  */
3670   tok = test.get_token ();
3671   ASSERT_EQ (tok->type, CPP_WCHAR);
3672   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3673
3674   /* 'abc' (c-char-sequence).  */
3675   tok = test.get_token ();
3676   ASSERT_EQ (tok->type, CPP_CHAR);
3677   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3678 }
3679 /* A table of interesting location_t values, giving one axis of our test
3680    matrix.  */
3681
3682 static const location_t boundary_locations[] = {
3683   /* Zero means "don't override the default values for a new line_table".  */
3684   0,
3685
3686   /* An arbitrary non-zero value that isn't close to one of
3687      the boundary values below.  */
3688   0x10000,
3689
3690   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3691   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3692   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3693   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3694   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3695   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3696
3697   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3698   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3699   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3700   LINE_MAP_MAX_LOCATION_WITH_COLS,
3701   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3702   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3703 };
3704
3705 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3706
3707 void
3708 for_each_line_table_case (void (*testcase) (const line_table_case &))
3709 {
3710   /* As noted above in the description of struct line_table_case,
3711      we want to explore a test matrix of interesting line_table
3712      situations, running various selftests for each case within the
3713      matrix.  */
3714
3715   /* Run all tests with:
3716      (a) line_table->default_range_bits == 0, and
3717      (b) line_table->default_range_bits == 5.  */
3718   int num_cases_tested = 0;
3719   for (int default_range_bits = 0; default_range_bits <= 5;
3720        default_range_bits += 5)
3721     {
3722       /* ...and use each of the "interesting" location values as
3723          the starting location within line_table.  */
3724       const int num_boundary_locations
3725         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3726       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3727         {
3728           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3729
3730           testcase (c);
3731
3732           num_cases_tested++;
3733         }
3734     }
3735
3736   /* Verify that we fully covered the test matrix.  */
3737   ASSERT_EQ (num_cases_tested, 2 * 12);
3738 }
3739
3740 /* Verify that when presented with a consecutive pair of locations with
3741    a very large line offset, we don't attempt to consolidate them into
3742    a single ordinary linemap where the line offsets within the line map
3743    would lead to overflow (PR lto/88147).  */
3744
3745 static void
3746 test_line_offset_overflow ()
3747 {
3748   line_table_test ltt (line_table_case (5, 0));
3749
3750   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3751   linemap_line_start (line_table, 1, 100);
3752   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3753   assert_loceq ("foo.c", 2578, 0, loc_a);
3754
3755   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3756   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3757   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3758
3759   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3760   assert_loceq ("foo.c", 404198, 0, loc_b);
3761
3762   /* We should have started a new linemap, rather than attempting to store
3763      a very large line offset.  */
3764   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3765   ASSERT_NE (ordmap_a, ordmap_b);
3766 }
3767
3768 void test_cpp_utf8 ()
3769 {
3770   const int def_tabstop = 8;
3771   cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3772
3773   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3774   {
3775     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3776     ASSERT_EQ (8, w_bad);
3777     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3778     ASSERT_EQ (5, w_ctrl);
3779   }
3780
3781   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3782   {
3783     const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3784     ASSERT_EQ (1, w_pi);
3785     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3786     ASSERT_EQ (2, w_emoji);
3787     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3788                                                         policy);
3789     ASSERT_EQ (1, w_umlaut_precomposed);
3790     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3791                                                       policy);
3792     ASSERT_EQ (1, w_umlaut_combining);
3793     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3794     ASSERT_EQ (2, w_han);
3795     const int w_ascii = cpp_display_width ("GCC", 3, policy);
3796     ASSERT_EQ (3, w_ascii);
3797     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3798                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
3799                                            24, policy);
3800     ASSERT_EQ (18, w_mixed);
3801   }
3802
3803   /* Verify that display width properly expands tabs.  */
3804   {
3805     const char *tstr = "\tabc\td";
3806     ASSERT_EQ (6, cpp_display_width (tstr, 6,
3807                                      cpp_char_column_policy (1, cpp_wcwidth)));
3808     ASSERT_EQ (10, cpp_display_width (tstr, 6,
3809                                       cpp_char_column_policy (3, cpp_wcwidth)));
3810     ASSERT_EQ (17, cpp_display_width (tstr, 6,
3811                                       cpp_char_column_policy (8, cpp_wcwidth)));
3812     ASSERT_EQ (1,
3813                cpp_display_column_to_byte_column
3814                  (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
3815   }
3816
3817   /* Verify that cpp_byte_column_to_display_column can go past the end,
3818      and similar edge cases.  */
3819   {
3820     const char *str
3821       /* Display columns.
3822          111111112345  */
3823       = "\xcf\x80 abc";
3824       /* 111122223456
3825          Byte columns.  */
3826
3827     ASSERT_EQ (5, cpp_display_width (str, 6, policy));
3828     ASSERT_EQ (105,
3829                cpp_byte_column_to_display_column (str, 6, 106, policy));
3830     ASSERT_EQ (10000,
3831                cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
3832     ASSERT_EQ (0,
3833                cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
3834   }
3835
3836   /* Verify that cpp_display_column_to_byte_column can go past the end,
3837      and similar edge cases, and check invertibility.  */
3838   {
3839     const char *str
3840       /* Display columns.
3841          000000000000000000000000000000000000011
3842          111111112222222234444444455555555678901  */
3843       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3844       /* 000000000000000000000000000000000111111
3845          111122223333444456666777788889999012345
3846          Byte columns.  */
3847     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
3848     ASSERT_EQ (15,
3849                cpp_display_column_to_byte_column (str, 15, 11, policy));
3850     ASSERT_EQ (115,
3851                cpp_display_column_to_byte_column (str, 15, 111, policy));
3852     ASSERT_EQ (10000,
3853                cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
3854     ASSERT_EQ (0,
3855                cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
3856
3857     /* Verify that we do not interrupt a UTF-8 sequence.  */
3858     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
3859
3860     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3861       {
3862         const int disp_col
3863           = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
3864         const int byte_col2
3865           = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
3866
3867         /* If we ask for the display column in the middle of a UTF-8
3868            sequence, it will return the length of the partial sequence,
3869            matching the behavior of GCC before display column support.
3870            Otherwise check the round trip was successful.  */
3871         if (byte_col < 4)
3872           ASSERT_EQ (byte_col, disp_col);
3873         else if (byte_col >= 6 && byte_col < 9)
3874           ASSERT_EQ (3 + (byte_col - 5), disp_col);
3875         else
3876           ASSERT_EQ (byte_col2, byte_col);
3877       }
3878   }
3879
3880 }
3881
3882 /* Run all of the selftests within this file.  */
3883
3884 void
3885 input_c_tests ()
3886 {
3887   test_linenum_comparisons ();
3888   test_should_have_column_data_p ();
3889   test_unknown_location ();
3890   test_builtins ();
3891   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3892
3893   for_each_line_table_case (test_accessing_ordinary_linemaps);
3894   for_each_line_table_case (test_lexer);
3895   for_each_line_table_case (test_lexer_string_locations_simple);
3896   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3897   for_each_line_table_case (test_lexer_string_locations_hex);
3898   for_each_line_table_case (test_lexer_string_locations_oct);
3899   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3900   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3901   for_each_line_table_case (test_lexer_string_locations_ucn4);
3902   for_each_line_table_case (test_lexer_string_locations_ucn8);
3903   for_each_line_table_case (test_lexer_string_locations_wide_string);
3904   for_each_line_table_case (test_lexer_string_locations_string16);
3905   for_each_line_table_case (test_lexer_string_locations_string32);
3906   for_each_line_table_case (test_lexer_string_locations_u8);
3907   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3908   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3909   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3910   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3911   for_each_line_table_case (test_lexer_string_locations_macro);
3912   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3913   for_each_line_table_case (test_lexer_string_locations_non_string);
3914   for_each_line_table_case (test_lexer_string_locations_long_line);
3915   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3916   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3917   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3918   for_each_line_table_case (test_lexer_char_constants);
3919
3920   test_reading_source_line ();
3921
3922   test_line_offset_overflow ();
3923
3924   test_cpp_utf8 ();
3925 }
3926
3927 } // namespace selftest
3928
3929 #endif /* CHECKING_P */