gcc/input.cc

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2022 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* Input charset configuration.  */
  33 static const char *default_charset_callback (const char *)
  34 {
  35   return nullptr;
  36 }
  37
  38 void
  39 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  40                                       bool should_skip_bom)
  41 {
  42   in_context.ccb = (ccb ? ccb : default_charset_callback);
  43   in_context.should_skip_bom = should_skip_bom;
  44 }
  45
  46 /* This is a cache used by get_next_line to store the content of a
  47    file to be searched for file lines.  */
  48 class file_cache_slot
  49 {
  50 public:
  51   file_cache_slot ();
  52   ~file_cache_slot ();
  53
  54   bool read_line_num (size_t line_num,
  55                       char ** line, ssize_t *line_len);
  56
  57   /* Accessors.  */
  58   const char *get_file_path () const { return m_file_path; }
  59   unsigned get_use_count () const { return m_use_count; }
  60   bool missing_trailing_newline_p () const
  61   {
  62     return m_missing_trailing_newline;
  63   }
  64
  65   void inc_use_count () { m_use_count++; }
  66
  67   bool create (const file_cache::input_context &in_context,
  68                const char *file_path, FILE *fp, unsigned highest_use_count);
  69   void evict ();
  70
  71  private:
  72   /* These are information used to store a line boundary.  */
  73   class line_info
  74   {
  75   public:
  76     /* The line number.  It starts from 1.  */
  77     size_t line_num;
  78
  79     /* The position (byte count) of the beginning of the line,
  80        relative to the file data pointer.  This starts at zero.  */
  81     size_t start_pos;
  82
  83     /* The position (byte count) of the last byte of the line.  This
  84        normally points to the '\n' character, or to one byte after the
  85        last byte of the file, if the file doesn't contain a '\n'
  86        character.  */
  87     size_t end_pos;
  88
  89     line_info (size_t l, size_t s, size_t e)
  90       : line_num (l), start_pos (s), end_pos (e)
  91     {}
  92
  93     line_info ()
  94       :line_num (0), start_pos (0), end_pos (0)
  95     {}
  96   };
  97
  98   bool needs_read_p () const;
  99   bool needs_grow_p () const;
 100   void maybe_grow ();
 101   bool read_data ();
 102   bool maybe_read_data ();
 103   bool get_next_line (char **line, ssize_t *line_len);
 104   bool read_next_line (char ** line, ssize_t *line_len);
 105   bool goto_next_line ();
 106
 107   static const size_t buffer_size = 4 * 1024;
 108   static const size_t line_record_size = 100;
 109
 110   /* The number of time this file has been accessed.  This is used
 111      to designate which file cache to evict from the cache
 112      array.  */
 113   unsigned m_use_count;
 114
 115   /* The file_path is the key for identifying a particular file in
 116      the cache.
 117      For libcpp-using code, the underlying buffer for this field is
 118      owned by the corresponding _cpp_file within the cpp_reader.  */
 119   const char *m_file_path;
 120
 121   FILE *m_fp;
 122
 123   /* This points to the content of the file that we've read so
 124      far.  */
 125   char *m_data;
 126
 127   /* The allocated buffer to be freed may start a little earlier than DATA,
 128      e.g. if a UTF8 BOM was skipped at the beginning.  */
 129   int m_alloc_offset;
 130
 131   /*  The size of the DATA array above.*/
 132   size_t m_size;
 133
 134   /* The number of bytes read from the underlying file so far.  This
 135      must be less (or equal) than SIZE above.  */
 136   size_t m_nb_read;
 137
 138   /* The index of the beginning of the current line.  */
 139   size_t m_line_start_idx;
 140
 141   /* The number of the previous line read.  This starts at 1.  Zero
 142      means we've read no line so far.  */
 143   size_t m_line_num;
 144
 145   /* This is the total number of lines of the current file.  At the
 146      moment, we try to get this information from the line map
 147      subsystem.  Note that this is just a hint.  When using the C++
 148      front-end, this hint is correct because the input file is then
 149      completely tokenized before parsing starts; so the line map knows
 150      the number of lines before compilation really starts.  For e.g,
 151      the C front-end, it can happen that we start emitting diagnostics
 152      before the line map has seen the end of the file.  */
 153   size_t m_total_lines;
 154
 155   /* Could this file be missing a trailing newline on its final line?
 156      Initially true (to cope with empty files), set to true/false
 157      as each line is read.  */
 158   bool m_missing_trailing_newline;
 159
 160   /* This is a record of the beginning and end of the lines we've seen
 161      while reading the file.  This is useful to avoid walking the data
 162      from the beginning when we are asked to read a line that is
 163      before LINE_START_IDX above.  Note that the maximum size of this
 164      record is line_record_size, so that the memory consumption
 165      doesn't explode.  We thus scale total_lines down to
 166      line_record_size.  */
 167   vec<line_info, va_heap> m_line_record;
 168
 169   void offset_buffer (int offset)
 170   {
 171     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
 172                 : (size_t) offset <= m_size);
 173     gcc_assert (m_data);
 174     m_alloc_offset += offset;
 175     m_data += offset;
 176     m_size -= offset;
 177   }
 178
 179 };
 180
 181 /* Current position in real source file.  */
 182
 183 location_t input_location = UNKNOWN_LOCATION;
 184
 185 class line_maps *line_table;
 186
 187 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 188    This needs to be a global so that it can be a GC root, and thus
 189    prevent the stashed copy from being garbage-collected if the GC runs
 190    during a line_table_test.  */
 191
 192 class line_maps *saved_line_table;
 193
 194 /* Expand the source location LOC into a human readable location.  If
 195    LOC resolves to a builtin location, the file name of the readable
 196    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 197    TRUE and LOC is virtual, then it is resolved to the expansion
 198    point of the involved macro.  Otherwise, it is resolved to the
 199    spelling location of the token.
 200
 201    When resolving to the spelling location of the token, if the
 202    resulting location is for a built-in location (that is, it has no
 203    associated line/column) in the context of a macro expansion, the
 204    returned location is the first one (while unwinding the macro
 205    location towards its expansion point) that is in real source
 206    code.
 207
 208    ASPECT controls which part of the location to use.  */
 209
 210 static expanded_location
 211 expand_location_1 (location_t loc,
 212                    bool expansion_point_p,
 213                    enum location_aspect aspect)
 214 {
 215   expanded_location xloc;
 216   const line_map_ordinary *map;
 217   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 218   tree block = NULL;
 219
 220   if (IS_ADHOC_LOC (loc))
 221     {
 222       block = LOCATION_BLOCK (loc);
 223       loc = LOCATION_LOCUS (loc);
 224     }
 225
 226   memset (&xloc, 0, sizeof (xloc));
 227
 228   if (loc >= RESERVED_LOCATION_COUNT)
 229     {
 230       if (!expansion_point_p)
 231         {
 232           /* We want to resolve LOC to its spelling location.
 233
 234              But if that spelling location is a reserved location that
 235              appears in the context of a macro expansion (like for a
 236              location for a built-in token), let's consider the first
 237              location (toward the expansion point) that is not reserved;
 238              that is, the first location that is in real source code.  */
 239           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 240                                                           loc, NULL);
 241           lrk = LRK_SPELLING_LOCATION;
 242         }
 243       loc = linemap_resolve_location (line_table, loc, lrk, &map);
 244
 245       /* loc is now either in an ordinary map, or is a reserved location.
 246          If it is a compound location, the caret is in a spelling location,
 247          but the start/finish might still be a virtual location.
 248          Depending of what the caller asked for, we may need to recurse
 249          one level in order to resolve any virtual locations in the
 250          end-points.  */
 251       switch (aspect)
 252         {
 253         default:
 254           gcc_unreachable ();
 255           /* Fall through.  */
 256         case LOCATION_ASPECT_CARET:
 257           break;
 258         case LOCATION_ASPECT_START:
 259           {
 260             location_t start = get_start (loc);
 261             if (start != loc)
 262               return expand_location_1 (start, expansion_point_p, aspect);
 263           }
 264           break;
 265         case LOCATION_ASPECT_FINISH:
 266           {
 267             location_t finish = get_finish (loc);
 268             if (finish != loc)
 269               return expand_location_1 (finish, expansion_point_p, aspect);
 270           }
 271           break;
 272         }
 273       xloc = linemap_expand_location (line_table, map, loc);
 274     }
 275
 276   xloc.data = block;
 277   if (loc <= BUILTINS_LOCATION)
 278     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 279
 280   return xloc;
 281 }
 282
 283 /* Initialize the set of cache used for files accessed by caret
 284    diagnostic.  */
 285
 286 static void
 287 diagnostic_file_cache_init (void)
 288 {
 289   gcc_assert (global_dc);
 290   if (global_dc->m_file_cache == NULL)
 291     global_dc->m_file_cache = new file_cache ();
 292 }
 293
 294 /* Free the resources used by the set of cache used for files accessed
 295    by caret diagnostic.  */
 296
 297 void
 298 diagnostic_file_cache_fini (void)
 299 {
 300   if (global_dc->m_file_cache)
 301     {
 302       delete global_dc->m_file_cache;
 303       global_dc->m_file_cache = NULL;
 304     }
 305 }
 306
 307 /* Return the total lines number that have been read so far by the
 308    line map (in the preprocessor) so far.  For languages like C++ that
 309    entirely preprocess the input file before starting to parse, this
 310    equals the actual number of lines of the file.  */
 311
 312 static size_t
 313 total_lines_num (const char *file_path)
 314 {
 315   size_t r = 0;
 316   location_t l = 0;
 317   if (linemap_get_file_highest_location (line_table, file_path, &l))
 318     {
 319       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 320       expanded_location xloc = expand_location (l);
 321       r = xloc.line;
 322     }
 323   return r;
 324 }
 325
 326 /* Lookup the cache used for the content of a given file accessed by
 327    caret diagnostic.  Return the found cached file, or NULL if no
 328    cached file was found.  */
 329
 330 file_cache_slot *
 331 file_cache::lookup_file (const char *file_path)
 332 {
 333   gcc_assert (file_path);
 334
 335   /* This will contain the found cached file.  */
 336   file_cache_slot *r = NULL;
 337   for (unsigned i = 0; i < num_file_slots; ++i)
 338     {
 339       file_cache_slot *c = &m_file_slots[i];
 340       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 341         {
 342           c->inc_use_count ();
 343           r = c;
 344         }
 345     }
 346
 347   if (r)
 348     r->inc_use_count ();
 349
 350   return r;
 351 }
 352
 353 /* Purge any mention of FILENAME from the cache of files used for
 354    printing source code.  For use in selftests when working
 355    with tempfiles.  */
 356
 357 void
 358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 359 {
 360   gcc_assert (file_path);
 361
 362   if (!global_dc->m_file_cache)
 363     return;
 364
 365   global_dc->m_file_cache->forcibly_evict_file (file_path);
 366 }
 367
 368 void
 369 file_cache::forcibly_evict_file (const char *file_path)
 370 {
 371   gcc_assert (file_path);
 372
 373   file_cache_slot *r = lookup_file (file_path);
 374   if (!r)
 375     /* Not found.  */
 376     return;
 377
 378   r->evict ();
 379 }
 380
 381 void
 382 file_cache_slot::evict ()
 383 {
 384   m_file_path = NULL;
 385   if (m_fp)
 386     fclose (m_fp);
 387   m_fp = NULL;
 388   m_nb_read = 0;
 389   m_line_start_idx = 0;
 390   m_line_num = 0;
 391   m_line_record.truncate (0);
 392   m_use_count = 0;
 393   m_total_lines = 0;
 394   m_missing_trailing_newline = true;
 395 }
 396
 397 /* Return the file cache that has been less used, recently, or the
 398    first empty one.  If HIGHEST_USE_COUNT is non-null,
 399    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 400    in the cache table.  */
 401
 402 file_cache_slot*
 403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 404 {
 405   diagnostic_file_cache_init ();
 406
 407   file_cache_slot *to_evict = &m_file_slots[0];
 408   unsigned huc = to_evict->get_use_count ();
 409   for (unsigned i = 1; i < num_file_slots; ++i)
 410     {
 411       file_cache_slot *c = &m_file_slots[i];
 412       bool c_is_empty = (c->get_file_path () == NULL);
 413
 414       if (c->get_use_count () < to_evict->get_use_count ()
 415           || (to_evict->get_file_path () && c_is_empty))
 416         /* We evict C because it's either an entry with a lower use
 417            count or one that is empty.  */
 418         to_evict = c;
 419
 420       if (huc < c->get_use_count ())
 421         huc = c->get_use_count ();
 422
 423       if (c_is_empty)
 424         /* We've reached the end of the cache; subsequent elements are
 425            all empty.  */
 426         break;
 427     }
 428
 429   if (highest_use_count)
 430     *highest_use_count = huc;
 431
 432   return to_evict;
 433 }
 434
 435 /* Create the cache used for the content of a given file to be
 436    accessed by caret diagnostic.  This cache is added to an array of
 437    cache and can be retrieved by lookup_file_in_cache_tab.  This
 438    function returns the created cache.  Note that only the last
 439    num_file_slots files are cached.  */
 440
 441 file_cache_slot*
 442 file_cache::add_file (const char *file_path)
 443 {
 444
 445   FILE *fp = fopen (file_path, "r");
 446   if (fp == NULL)
 447     return NULL;
 448
 449   unsigned highest_use_count = 0;
 450   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 451   if (!r->create (in_context, file_path, fp, highest_use_count))
 452     return NULL;
 453   return r;
 454 }
 455
 456 /* Populate this slot for use on FILE_PATH and FP, dropping any
 457    existing cached content within it.  */
 458
 459 bool
 460 file_cache_slot::create (const file_cache::input_context &in_context,
 461                          const char *file_path, FILE *fp,
 462                          unsigned highest_use_count)
 463 {
 464   m_file_path = file_path;
 465   if (m_fp)
 466     fclose (m_fp);
 467   m_fp = fp;
 468   if (m_alloc_offset)
 469     offset_buffer (-m_alloc_offset);
 470   m_nb_read = 0;
 471   m_line_start_idx = 0;
 472   m_line_num = 0;
 473   m_line_record.truncate (0);
 474   /* Ensure that this cache entry doesn't get evicted next time
 475      add_file_to_cache_tab is called.  */
 476   m_use_count = ++highest_use_count;
 477   m_total_lines = total_lines_num (file_path);
 478   m_missing_trailing_newline = true;
 479
 480
 481   /* Check the input configuration to determine if we need to do any
 482      transformations, such as charset conversion or BOM skipping.  */
 483   if (const char *input_charset = in_context.ccb (file_path))
 484     {
 485       /* Need a full-blown conversion of the input charset.  */
 486       fclose (m_fp);
 487       m_fp = NULL;
 488       const cpp_converted_source cs
 489         = cpp_get_converted_source (file_path, input_charset);
 490       if (!cs.data)
 491         return false;
 492       if (m_data)
 493         XDELETEVEC (m_data);
 494       m_data = cs.data;
 495       m_nb_read = m_size = cs.len;
 496       m_alloc_offset = cs.data - cs.to_free;
 497     }
 498   else if (in_context.should_skip_bom)
 499     {
 500       if (read_data ())
 501         {
 502           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 503           offset_buffer (offset);
 504           m_nb_read -= offset;
 505         }
 506     }
 507
 508   return true;
 509 }
 510
 511 /* file_cache's ctor.  */
 512
 513 file_cache::file_cache ()
 514 : m_file_slots (new file_cache_slot[num_file_slots])
 515 {
 516   initialize_input_context (nullptr, false);
 517 }
 518
 519 /* file_cache's dtor.  */
 520
 521 file_cache::~file_cache ()
 522 {
 523   delete[] m_file_slots;
 524 }
 525
 526 /* Lookup the cache used for the content of a given file accessed by
 527    caret diagnostic.  If no cached file was found, create a new cache
 528    for this file, add it to the array of cached file and return
 529    it.  */
 530
 531 file_cache_slot*
 532 file_cache::lookup_or_add_file (const char *file_path)
 533 {
 534   file_cache_slot *r = lookup_file (file_path);
 535   if (r == NULL)
 536     r = add_file (file_path);
 537   return r;
 538 }
 539
 540 /* Default constructor for a cache of file used by caret
 541    diagnostic.  */
 542
 543 file_cache_slot::file_cache_slot ()
 544 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
 545   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
 546   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 547 {
 548   m_line_record.create (0);
 549 }
 550
 551 /* Destructor for a cache of file used by caret diagnostic.  */
 552
 553 file_cache_slot::~file_cache_slot ()
 554 {
 555   if (m_fp)
 556     {
 557       fclose (m_fp);
 558       m_fp = NULL;
 559     }
 560   if (m_data)
 561     {
 562       offset_buffer (-m_alloc_offset);
 563       XDELETEVEC (m_data);
 564       m_data = 0;
 565     }
 566   m_line_record.release ();
 567 }
 568
 569 /* Returns TRUE iff the cache would need to be filled with data coming
 570    from the file.  That is, either the cache is empty or full or the
 571    current line is empty.  Note that if the cache is full, it would
 572    need to be extended and filled again.  */
 573
 574 bool
 575 file_cache_slot::needs_read_p () const
 576 {
 577   return m_fp && (m_nb_read == 0
 578           || m_nb_read == m_size
 579           || (m_line_start_idx >= m_nb_read - 1));
 580 }
 581
 582 /*  Return TRUE iff the cache is full and thus needs to be
 583     extended.  */
 584
 585 bool
 586 file_cache_slot::needs_grow_p () const
 587 {
 588   return m_nb_read == m_size;
 589 }
 590
 591 /* Grow the cache if it needs to be extended.  */
 592
 593 void
 594 file_cache_slot::maybe_grow ()
 595 {
 596   if (!needs_grow_p ())
 597     return;
 598
 599   if (!m_data)
 600     {
 601       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 602       m_size = buffer_size;
 603       m_data = XNEWVEC (char, m_size);
 604     }
 605   else
 606     {
 607       const int offset = m_alloc_offset;
 608       offset_buffer (-offset);
 609       m_size *= 2;
 610       m_data = XRESIZEVEC (char, m_data, m_size);
 611       offset_buffer (offset);
 612     }
 613 }
 614
 615 /*  Read more data into the cache.  Extends the cache if need be.
 616     Returns TRUE iff new data could be read.  */
 617
 618 bool
 619 file_cache_slot::read_data ()
 620 {
 621   if (feof (m_fp) || ferror (m_fp))
 622     return false;
 623
 624   maybe_grow ();
 625
 626   char * from = m_data + m_nb_read;
 627   size_t to_read = m_size - m_nb_read;
 628   size_t nb_read = fread (from, 1, to_read, m_fp);
 629
 630   if (ferror (m_fp))
 631     return false;
 632
 633   m_nb_read += nb_read;
 634   return !!nb_read;
 635 }
 636
 637 /* Read new data iff the cache needs to be filled with more data
 638    coming from the file FP.  Return TRUE iff the cache was filled with
 639    mode data.  */
 640
 641 bool
 642 file_cache_slot::maybe_read_data ()
 643 {
 644   if (!needs_read_p ())
 645     return false;
 646   return read_data ();
 647 }
 648
 649 /* Helper function for file_cache_slot::get_next_line (), to find the end of
 650    the next line.  Returns with the memchr convention, i.e. nullptr if a line
 651    terminator was not found.  We need to determine line endings in the same
 652    manner that libcpp does: any of \n, \r\n, or \r is a line ending.  */
 653
 654 static char *
 655 find_end_of_line (char *s, size_t len)
 656 {
 657   for (const auto end = s + len; s != end; ++s)
 658     {
 659       if (*s == '\n')
 660         return s;
 661       if (*s == '\r')
 662         {
 663           const auto next = s + 1;
 664           if (next == end)
 665             {
 666               /* Don't find the line ending if \r is the very last character
 667                  in the buffer; we do not know if it's the end of the file or
 668                  just the end of what has been read so far, and we wouldn't
 669                  want to break in the middle of what's actually a \r\n
 670                  sequence.  Instead, we will handle the case of a file ending
 671                  in a \r later.  */
 672               break;
 673             }
 674           return (*next == '\n' ? next : s);
 675         }
 676     }
 677   return nullptr;
 678 }
 679
 680 /* Read a new line from file FP, using C as a cache for the data
 681    coming from the file.  Upon successful completion, *LINE is set to
 682    the beginning of the line found.  *LINE points directly in the
 683    line cache and is only valid until the next call of get_next_line.
 684    *LINE_LEN is set to the length of the line.  Note that the line
 685    does not contain any terminal delimiter.  This function returns
 686    true if some data was read or process from the cache, false
 687    otherwise.  Note that subsequent calls to get_next_line might
 688    make the content of *LINE invalid.  */
 689
 690 bool
 691 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
 692 {
 693   /* Fill the cache with data to process.  */
 694   maybe_read_data ();
 695
 696   size_t remaining_size = m_nb_read - m_line_start_idx;
 697   if (remaining_size == 0)
 698     /* There is no more data to process.  */
 699     return false;
 700
 701   char *line_start = m_data + m_line_start_idx;
 702
 703   char *next_line_start = NULL;
 704   size_t len = 0;
 705   char *line_end = find_end_of_line (line_start, remaining_size);
 706   if (line_end == NULL)
 707     {
 708       /* We haven't found an end-of-line delimiter in the cache.
 709          Fill the cache with more data from the file and look again.  */
 710       while (maybe_read_data ())
 711         {
 712           line_start = m_data + m_line_start_idx;
 713           remaining_size = m_nb_read - m_line_start_idx;
 714           line_end = find_end_of_line (line_start, remaining_size);
 715           if (line_end != NULL)
 716             {
 717               next_line_start = line_end + 1;
 718               break;
 719             }
 720         }
 721       if (line_end == NULL)
 722         {
 723           /* We've loaded all the file into the cache and still no
 724              terminator.  Let's say the line ends up at one byte past the
 725              end of the file.  This is to stay consistent with the case
 726              of when the line ends up with a terminator and line_end points to
 727              that.  That consistency is useful below in the len calculation.
 728
 729              If the file ends in a \r, we didn't identify it as a line
 730              terminator above, so do that now instead.  */
 731           line_end = m_data + m_nb_read;
 732           if (m_nb_read && line_end[-1] == '\r')
 733             {
 734               --line_end;
 735               m_missing_trailing_newline = false;
 736             }
 737           else
 738             m_missing_trailing_newline = true;
 739         }
 740       else
 741         m_missing_trailing_newline = false;
 742     }
 743   else
 744     {
 745       next_line_start = line_end + 1;
 746       m_missing_trailing_newline = false;
 747     }
 748
 749   if (m_fp && ferror (m_fp))
 750     return false;
 751
 752   /* At this point, we've found the end of the of line.  It either points to
 753      the line terminator or to one byte after the last byte of the file.  */
 754   gcc_assert (line_end != NULL);
 755
 756   len = line_end - line_start;
 757
 758   if (m_line_start_idx < m_nb_read)
 759     *line = line_start;
 760
 761   ++m_line_num;
 762
 763   /* Before we update our line record, make sure the hint about the
 764      total number of lines of the file is correct.  If it's not, then
 765      we give up recording line boundaries from now on.  */
 766   bool update_line_record = true;
 767   if (m_line_num > m_total_lines)
 768     update_line_record = false;
 769
 770     /* Now update our line record so that re-reading lines from the
 771      before m_line_start_idx is faster.  */
 772   if (update_line_record
 773       && m_line_record.length () < line_record_size)
 774     {
 775       /* If the file lines fits in the line record, we just record all
 776          its lines ...*/
 777       if (m_total_lines <= line_record_size
 778           && m_line_num > m_line_record.length ())
 779         m_line_record.safe_push
 780           (file_cache_slot::line_info (m_line_num,
 781                                        m_line_start_idx,
 782                                        line_end - m_data));
 783       else if (m_total_lines > line_record_size)
 784         {
 785           /* ... otherwise, we just scale total_lines down to
 786              (line_record_size lines.  */
 787           size_t n = (m_line_num * line_record_size) / m_total_lines;
 788           if (m_line_record.length () == 0
 789               || n >= m_line_record.length ())
 790             m_line_record.safe_push
 791               (file_cache_slot::line_info (m_line_num,
 792                                            m_line_start_idx,
 793                                            line_end - m_data));
 794         }
 795     }
 796
 797   /* Update m_line_start_idx so that it points to the next line to be
 798      read.  */
 799   if (next_line_start)
 800     m_line_start_idx = next_line_start - m_data;
 801   else
 802     /* We didn't find any terminal '\n'.  Let's consider that the end
 803        of line is the end of the data in the cache.  The next
 804        invocation of get_next_line will either read more data from the
 805        underlying file or return false early because we've reached the
 806        end of the file.  */
 807     m_line_start_idx = m_nb_read;
 808
 809   *line_len = len;
 810
 811   return true;
 812 }
 813
 814 /* Consume the next bytes coming from the cache (or from its
 815    underlying file if there are remaining unread bytes in the file)
 816    until we reach the next end-of-line (or end-of-file).  There is no
 817    copying from the cache involved.  Return TRUE upon successful
 818    completion.  */
 819
 820 bool
 821 file_cache_slot::goto_next_line ()
 822 {
 823   char *l;
 824   ssize_t len;
 825
 826   return get_next_line (&l, &len);
 827 }
 828
 829 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 830    If the line was read successfully, *LINE points to the beginning
 831    of the line in the file cache and *LINE_LEN is the length of the
 832    line.  *LINE is not nul-terminated, but may contain zero bytes.
 833    *LINE is only valid until the next call of read_line_num.
 834    This function returns bool if a line was read.  */
 835
 836 bool
 837 file_cache_slot::read_line_num (size_t line_num,
 838                        char ** line, ssize_t *line_len)
 839 {
 840   gcc_assert (line_num > 0);
 841
 842   if (line_num <= m_line_num)
 843     {
 844       /* We've been asked to read lines that are before m_line_num.
 845          So lets use our line record (if it's not empty) to try to
 846          avoid re-reading the file from the beginning again.  */
 847
 848       if (m_line_record.is_empty ())
 849         {
 850           m_line_start_idx = 0;
 851           m_line_num = 0;
 852         }
 853       else
 854         {
 855           file_cache_slot::line_info *i = NULL;
 856           if (m_total_lines <= line_record_size)
 857             {
 858               /* In languages where the input file is not totally
 859                  preprocessed up front, the m_total_lines hint
 860                  can be smaller than the number of lines of the
 861                  file.  In that case, only the first
 862                  m_total_lines have been recorded.
 863
 864                  Otherwise, the first m_total_lines we've read have
 865                  their start/end recorded here.  */
 866               i = (line_num <= m_total_lines)
 867                 ? &m_line_record[line_num - 1]
 868                 : &m_line_record[m_total_lines - 1];
 869               gcc_assert (i->line_num <= line_num);
 870             }
 871           else
 872             {
 873               /*  So the file had more lines than our line record
 874                   size.  Thus the number of lines we've recorded has
 875                   been scaled down to line_record_size.  Let's
 876                   pick the start/end of the recorded line that is
 877                   closest to line_num.  */
 878               size_t n = (line_num <= m_total_lines)
 879                 ? line_num * line_record_size / m_total_lines
 880                 : m_line_record.length () - 1;
 881               if (n < m_line_record.length ())
 882                 {
 883                   i = &m_line_record[n];
 884                   gcc_assert (i->line_num <= line_num);
 885                 }
 886             }
 887
 888           if (i && i->line_num == line_num)
 889             {
 890               /* We have the start/end of the line.  */
 891               *line = m_data + i->start_pos;
 892               *line_len = i->end_pos - i->start_pos;
 893               return true;
 894             }
 895
 896           if (i)
 897             {
 898               m_line_start_idx = i->start_pos;
 899               m_line_num = i->line_num - 1;
 900             }
 901           else
 902             {
 903               m_line_start_idx = 0;
 904               m_line_num = 0;
 905             }
 906         }
 907     }
 908
 909   /*  Let's walk from line m_line_num up to line_num - 1, without
 910       copying any line.  */
 911   while (m_line_num < line_num - 1)
 912     if (!goto_next_line ())
 913       return false;
 914
 915   /* The line we want is the next one.  Let's read and copy it back to
 916      the caller.  */
 917   return get_next_line (line, line_len);
 918 }
 919
 920 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 921    The line is not nul-terminated.  The returned pointer is only
 922    valid until the next call of location_get_source_line.
 923    Note that the line can contain several null characters,
 924    so the returned value's length has the actual length of the line.
 925    If the function fails, a NULL char_span is returned.  */
 926
 927 char_span
 928 location_get_source_line (const char *file_path, int line)
 929 {
 930   char *buffer = NULL;
 931   ssize_t len;
 932
 933   if (line == 0)
 934     return char_span (NULL, 0);
 935
 936   if (file_path == NULL)
 937     return char_span (NULL, 0);
 938
 939   diagnostic_file_cache_init ();
 940
 941   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 942   if (c == NULL)
 943     return char_span (NULL, 0);
 944
 945   bool read = c->read_line_num (line, &buffer, &len);
 946   if (!read)
 947     return char_span (NULL, 0);
 948
 949   return char_span (buffer, len);
 950 }
 951
 952 /* Determine if FILE_PATH missing a trailing newline on its final line.
 953    Only valid to call once all of the file has been loaded, by
 954    requesting a line number beyond the end of the file.  */
 955
 956 bool
 957 location_missing_trailing_newline (const char *file_path)
 958 {
 959   diagnostic_file_cache_init ();
 960
 961   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 962   if (c == NULL)
 963     return false;
 964
 965   return c->missing_trailing_newline_p ();
 966 }
 967
 968 /* Test if the location originates from the spelling location of a
 969    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 970    virtual) location of a built-in token that appears in the expansion
 971    list of a macro.  Please note that this function also works on
 972    tokens that result from built-in tokens.  For instance, the
 973    function would return true if passed a token "4" that is the result
 974    of the expansion of the built-in __LINE__ macro.  */
 975 bool
 976 is_location_from_builtin_token (location_t loc)
 977 {
 978   const line_map_ordinary *map = NULL;
 979   loc = linemap_resolve_location (line_table, loc,
 980                                   LRK_SPELLING_LOCATION, &map);
 981   return loc == BUILTINS_LOCATION;
 982 }
 983
 984 /* Expand the source location LOC into a human readable location.  If
 985    LOC is virtual, it resolves to the expansion point of the involved
 986    macro.  If LOC resolves to a builtin location, the file name of the
 987    readable location is set to the string "<built-in>".  */
 988
 989 expanded_location
 990 expand_location (location_t loc)
 991 {
 992   return expand_location_1 (loc, /*expansion_point_p=*/true,
 993                             LOCATION_ASPECT_CARET);
 994 }
 995
 996 /* Expand the source location LOC into a human readable location.  If
 997    LOC is virtual, it resolves to the expansion location of the
 998    relevant macro.  If LOC resolves to a builtin location, the file
 999    name of the readable location is set to the string
1000    "<built-in>".  */
1001
1002 expanded_location
1003 expand_location_to_spelling_point (location_t loc,
1004                                    enum location_aspect aspect)
1005 {
1006   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1007 }
1008
1009 /* The rich_location class within libcpp requires a way to expand
1010    location_t instances, and relies on the client code
1011    providing a symbol named
1012      linemap_client_expand_location_to_spelling_point
1013    to do this.
1014
1015    This is the implementation for libcommon.a (all host binaries),
1016    which simply calls into expand_location_1.  */
1017
1018 expanded_location
1019 linemap_client_expand_location_to_spelling_point (location_t loc,
1020                                                   enum location_aspect aspect)
1021 {
1022   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1023 }
1024
1025
1026 /* If LOCATION is in a system header and if it is a virtual location
1027    for a token coming from the expansion of a macro, unwind it to
1028    the location of the expansion point of the macro.  If the expansion
1029    point is also in a system header return the original LOCATION.
1030    Otherwise, return the location of the expansion point.
1031
1032    This is used for instance when we want to emit diagnostics about a
1033    token that may be located in a macro that is itself defined in a
1034    system header, for example, for the NULL macro.  In such a case, if
1035    LOCATION were passed directly to diagnostic functions such as
1036    warning_at, the diagnostic would be suppressed (unless
1037    -Wsystem-headers).  */
1038
1039 location_t
1040 expansion_point_location_if_in_system_header (location_t location)
1041 {
1042   if (!in_system_header_at (location))
1043     return location;
1044
1045   location_t xloc = linemap_resolve_location (line_table, location,
1046                                               LRK_MACRO_EXPANSION_POINT,
1047                                               NULL);
1048   return in_system_header_at (xloc) ? location : xloc;
1049 }
1050
1051 /* If LOCATION is a virtual location for a token coming from the expansion
1052    of a macro, unwind to the location of the expansion point of the macro.  */
1053
1054 location_t
1055 expansion_point_location (location_t location)
1056 {
1057   return linemap_resolve_location (line_table, location,
1058                                    LRK_MACRO_EXPANSION_POINT, NULL);
1059 }
1060
1061 /* Construct a location with caret at CARET, ranging from START to
1062    finish e.g.
1063
1064                  11111111112
1065         12345678901234567890
1066      522
1067      523   return foo + bar;
1068                   ~~~~^~~~~
1069      524
1070
1071    The location's caret is at the "+", line 523 column 15, but starts
1072    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1073    of "bar" at column 19.  */
1074
1075 location_t
1076 make_location (location_t caret, location_t start, location_t finish)
1077 {
1078   location_t pure_loc = get_pure_location (caret);
1079   source_range src_range;
1080   src_range.m_start = get_start (start);
1081   src_range.m_finish = get_finish (finish);
1082   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1083                                                    pure_loc,
1084                                                    src_range,
1085                                                    NULL);
1086   return combined_loc;
1087 }
1088
1089 /* Same as above, but taking a source range rather than two locations.  */
1090
1091 location_t
1092 make_location (location_t caret, source_range src_range)
1093 {
1094   location_t pure_loc = get_pure_location (caret);
1095   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
1096 }
1097
1098 /* An expanded_location stores the column in byte units.  This function
1099    converts that column to display units.  That requires reading the associated
1100    source line in order to calculate the display width.  If that cannot be done
1101    for any reason, then returns the byte column as a fallback.  */
1102 int
1103 location_compute_display_column (expanded_location exploc,
1104                                  const cpp_char_column_policy &policy)
1105 {
1106   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1107     return exploc.column;
1108   char_span line = location_get_source_line (exploc.file, exploc.line);
1109   /* If line is NULL, this function returns exploc.column which is the
1110      desired fallback.  */
1111   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1112                                             exploc.column, policy);
1113 }
1114
1115 /* Dump statistics to stderr about the memory usage of the line_table
1116    set of line maps.  This also displays some statistics about macro
1117    expansion.  */
1118
1119 void
1120 dump_line_table_statistics (void)
1121 {
1122   struct linemap_stats s;
1123   long total_used_map_size,
1124     macro_maps_size,
1125     total_allocated_map_size;
1126
1127   memset (&s, 0, sizeof (s));
1128
1129   linemap_get_statistics (line_table, &s);
1130
1131   macro_maps_size = s.macro_maps_used_size
1132     + s.macro_maps_locations_size;
1133
1134   total_allocated_map_size = s.ordinary_maps_allocated_size
1135     + s.macro_maps_allocated_size
1136     + s.macro_maps_locations_size;
1137
1138   total_used_map_size = s.ordinary_maps_used_size
1139     + s.macro_maps_used_size
1140     + s.macro_maps_locations_size;
1141
1142   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1143            s.num_expanded_macros);
1144   if (s.num_expanded_macros != 0)
1145     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1146              s.num_macro_tokens / s.num_expanded_macros);
1147   fprintf (stderr,
1148            "\nLine Table allocations during the "
1149            "compilation process\n");
1150   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1151            SIZE_AMOUNT (s.num_ordinary_maps_used));
1152   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1153            SIZE_AMOUNT (s.ordinary_maps_used_size));
1154   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1155            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1156   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1157            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1158   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1159            SIZE_AMOUNT (s.num_macro_maps_used));
1160   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1161            SIZE_AMOUNT (s.macro_maps_used_size));
1162   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1163            SIZE_AMOUNT (s.macro_maps_locations_size));
1164   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1165            SIZE_AMOUNT (macro_maps_size));
1166   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1167            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1168   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1169            SIZE_AMOUNT (total_allocated_map_size));
1170   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1171            SIZE_AMOUNT (total_used_map_size));
1172   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1173            SIZE_AMOUNT (s.adhoc_table_size));
1174   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1175            SIZE_AMOUNT (s.adhoc_table_entries_used));
1176   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1177            SIZE_AMOUNT (line_table->num_optimized_ranges));
1178   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1179            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1180
1181   fprintf (stderr, "\n");
1182 }
1183
1184 /* Get location one beyond the final location in ordinary map IDX.  */
1185
1186 static location_t
1187 get_end_location (class line_maps *set, unsigned int idx)
1188 {
1189   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1190     return set->highest_location;
1191
1192   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1193   return MAP_START_LOCATION (next_map);
1194 }
1195
/* Helper function for write_digit_row.
   Write the units digit of DIGIT (i.e. DIGIT modulo 10) to STREAM as a
   single character.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1203
1204 /* Helper function for dump_location_info.
1205    Write a row of numbers to STREAM, numbering a source line,
1206    giving the units, tens, hundreds etc of the column number.  */
1207
1208 static void
1209 write_digit_row (FILE *stream, int indent,
1210                  const line_map_ordinary *map,
1211                  location_t loc, int max_col, int divisor)
1212 {
1213   fprintf (stream, "%*c", indent, ' ');
1214   fprintf (stream, "|");
1215   for (int column = 1; column < max_col; column++)
1216     {
1217       location_t column_loc = loc + (column << map->m_range_bits);
1218       write_digit (stream, column_loc / divisor);
1219     }
1220   fprintf (stream, "\n");
1221 }
1222
1223 /* Write a half-closed (START) / half-open (END) interval of
1224    location_t to STREAM.  */
1225
1226 static void
1227 dump_location_range (FILE *stream,
1228                      location_t start, location_t end)
1229 {
1230   fprintf (stream,
1231            "  location_t interval: %u <= loc < %u\n",
1232            start, end);
1233 }
1234
1235 /* Write a labelled description of a half-closed (START) / half-open (END)
1236    interval of location_t to STREAM.  */
1237
1238 static void
1239 dump_labelled_location_range (FILE *stream,
1240                               const char *name,
1241                               location_t start, location_t end)
1242 {
1243   fprintf (stream, "%s\n", name);
1244   dump_location_range (stream, start, end);
1245   fprintf (stream, "\n");
1246 }
1247
1248 /* Write a visualization of the locations in the line_table to STREAM.  */
1249
1250 void
1251 dump_location_info (FILE *stream)
1252 {
1253   /* Visualize the reserved locations.  */
1254   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1255                                 0, RESERVED_LOCATION_COUNT);
1256
1257   /* Visualize the ordinary line_map instances, rendering the sources. */
1258   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1259     {
1260       location_t end_location = get_end_location (line_table, idx);
1261       /* half-closed: doesn't include this one. */
1262
1263       const line_map_ordinary *map
1264         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1265       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1266       dump_location_range (stream,
1267                            MAP_START_LOCATION (map), end_location);
1268       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1269       fprintf (stream, "  starting at line: %i\n",
1270                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1271       fprintf (stream, "  column and range bits: %i\n",
1272                map->m_column_and_range_bits);
1273       fprintf (stream, "  column bits: %i\n",
1274                map->m_column_and_range_bits - map->m_range_bits);
1275       fprintf (stream, "  range bits: %i\n",
1276                map->m_range_bits);
1277       const char * reason;
1278       switch (map->reason) {
1279       case LC_ENTER:
1280         reason = "LC_ENTER";
1281         break;
1282       case LC_LEAVE:
1283         reason = "LC_LEAVE";
1284         break;
1285       case LC_RENAME:
1286         reason = "LC_RENAME";
1287         break;
1288       case LC_RENAME_VERBATIM:
1289         reason = "LC_RENAME_VERBATIM";
1290         break;
1291       case LC_ENTER_MACRO:
1292         reason = "LC_RENAME_MACRO";
1293         break;
1294       default:
1295         reason = "Unknown";
1296       }
1297       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1298
1299       const line_map_ordinary *includer_map
1300         = linemap_included_from_linemap (line_table, map);
1301       fprintf (stream, "  included from location: %d",
1302                linemap_included_from (map));
1303       if (includer_map) {
1304         fprintf (stream, " (in ordinary map %d)",
1305                  int (includer_map - line_table->info_ordinary.maps));
1306       }
1307       fprintf (stream, "\n");
1308
1309       /* Render the span of source lines that this "map" covers.  */
1310       for (location_t loc = MAP_START_LOCATION (map);
1311            loc < end_location;
1312            loc += (1 << map->m_range_bits) )
1313         {
1314           gcc_assert (pure_location_p (line_table, loc) );
1315
1316           expanded_location exploc
1317             = linemap_expand_location (line_table, map, loc);
1318
1319           if (exploc.column == 0)
1320             {
1321               /* Beginning of a new source line: draw the line.  */
1322
1323               char_span line_text = location_get_source_line (exploc.file,
1324                                                               exploc.line);
1325               if (!line_text)
1326                 break;
1327               fprintf (stream,
1328                        "%s:%3i|loc:%5i|%.*s\n",
1329                        exploc.file, exploc.line,
1330                        loc,
1331                        (int)line_text.length (), line_text.get_buffer ());
1332
1333               /* "loc" is at column 0, which means "the whole line".
1334                  Render the locations *within* the line, by underlining
1335                  it, showing the location_t numeric values
1336                  at each column.  */
1337               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1338               if (max_col > line_text.length ())
1339                 max_col = line_text.length () + 1;
1340
1341               int len_lnum = num_digits (exploc.line);
1342               if (len_lnum < 3)
1343                 len_lnum = 3;
1344               int len_loc = num_digits (loc);
1345               if (len_loc < 5)
1346                 len_loc = 5;
1347
1348               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1349
1350               /* Thousands.  */
1351               if (end_location > 999)
1352                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1353
1354               /* Hundreds.  */
1355               if (end_location > 99)
1356                 write_digit_row (stream, indent, map, loc, max_col, 100);
1357
1358               /* Tens.  */
1359               write_digit_row (stream, indent, map, loc, max_col, 10);
1360
1361               /* Units.  */
1362               write_digit_row (stream, indent, map, loc, max_col, 1);
1363             }
1364         }
1365       fprintf (stream, "\n");
1366     }
1367
1368   /* Visualize unallocated values.  */
1369   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1370                                 line_table->highest_location,
1371                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1372
1373   /* Visualize the macro line_map instances, rendering the sources. */
1374   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1375     {
1376       /* Each macro map that is allocated owns location_t values
1377          that are *lower* that the one before them.
1378          Hence it's meaningful to view them either in order of ascending
1379          source locations, or in order of ascending macro map index.  */
1380       const bool ascending_location_ts = true;
1381       unsigned int idx = (ascending_location_ts
1382                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1383                           : i);
1384       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1385       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1386                idx,
1387                linemap_map_get_macro_name (map),
1388                MACRO_MAP_NUM_MACRO_TOKENS (map));
1389       dump_location_range (stream,
1390                            map->start_location,
1391                            (map->start_location
1392                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1393       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1394               "expansion point is location %i",
1395               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1396       fprintf (stream, "  map->start_location: %u\n",
1397                map->start_location);
1398
1399       fprintf (stream, "  macro_locations:\n");
1400       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1401         {
1402           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1403           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1404
1405           /* linemap_add_macro_token encodes token numbers in an expansion
1406              by putting them after MAP_START_LOCATION. */
1407
1408           /* I'm typically seeing 4 uninitialized entries at the end of
1409              0xafafafaf.
1410              This appears to be due to macro.cc:replace_args
1411              adding 2 extra args for padding tokens; presumably there may
1412              be a leading and/or trailing padding token injected,
1413              each for 2 more location slots.
1414              This would explain there being up to 4 location_ts slots
1415              that may be uninitialized.  */
1416
1417           fprintf (stream, "    %u: %u, %u\n",
1418                    i,
1419                    x,
1420                    y);
1421           if (x == y)
1422             {
1423               if (x < MAP_START_LOCATION (map))
1424                 inform (x, "token %u has %<x-location == y-location == %u%>",
1425                         i, x);
1426               else
1427                 fprintf (stream,
1428                          "x-location == y-location == %u encodes token # %u\n",
1429                          x, x - MAP_START_LOCATION (map));
1430                 }
1431           else
1432             {
1433               inform (x, "token %u has %<x-location == %u%>", i, x);
1434               inform (x, "token %u has %<y-location == %u%>", i, y);
1435             }
1436         }
1437       fprintf (stream, "\n");
1438     }
1439
1440   /* It appears that MAX_LOCATION_T itself is never assigned to a
1441      macro map, presumably due to an off-by-one error somewhere
1442      between the logic in linemap_enter_macro and
1443      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1444   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1445                                 MAX_LOCATION_T,
1446                                 MAX_LOCATION_T + 1);
1447
1448   /* Visualize ad-hoc values.  */
1449   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1450                                 MAX_LOCATION_T + 1, UINT_MAX);
1451 }
1452
1453 /* string_concat's constructor.  */
1454
1455 string_concat::string_concat (int num, location_t *locs)
1456   : m_num (num)
1457 {
1458   m_locs = ggc_vec_alloc <location_t> (num);
1459   for (int i = 0; i < num; i++)
1460     m_locs[i] = locs[i];
1461 }
1462
1463 /* string_concat_db's constructor.  */
1464
1465 string_concat_db::string_concat_db ()
1466 {
1467   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1468 }
1469
1470 /* Record that a string concatenation occurred, covering NUM
1471    string literal tokens.  LOCS is an array of size NUM, containing the
1472    locations of the tokens.  A copy of LOCS is taken.  */
1473
1474 void
1475 string_concat_db::record_string_concatenation (int num, location_t *locs)
1476 {
1477   gcc_assert (num > 1);
1478   gcc_assert (locs);
1479
1480   location_t key_loc = get_key_loc (locs[0]);
1481   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1482      any data now recorded under key 'key_loc' would be overwritten by a
1483      subsequent call with the same key 'key_loc'.  */
1484   if (RESERVED_LOCATION_P (key_loc))
1485     return;
1486
1487   string_concat *concat
1488     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1489   m_table->put (key_loc, concat);
1490 }
1491
1492 /* Determine if LOC was the location of the initial token of a
1493    concatenation of string literal tokens.
1494    If so, *OUT_NUM is written to with the number of tokens, and
1495    *OUT_LOCS with the location of an array of locations of the
1496    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1497    storage owned by the string_concat_db.
1498    Otherwise, return false.  */
1499
1500 bool
1501 string_concat_db::get_string_concatenation (location_t loc,
1502                                             int *out_num,
1503                                             location_t **out_locs)
1504 {
1505   gcc_assert (out_num);
1506   gcc_assert (out_locs);
1507
1508   location_t key_loc = get_key_loc (loc);
1509   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1510      discussion in 'string_concat_db::record_string_concatenation'.  */
1511   if (RESERVED_LOCATION_P (key_loc))
1512     return false;
1513
1514   string_concat **concat = m_table->get (key_loc);
1515   if (!concat)
1516     return false;
1517
1518   *out_num = (*concat)->m_num;
1519   *out_locs =(*concat)->m_locs;
1520   return true;
1521 }
1522
1523 /* Internal function.  Canonicalize LOC into a form suitable for
1524    use as a key within the database, stripping away macro expansion,
1525    ad-hoc information, and range information, using the location of
1526    the start of LOC within an ordinary linemap.  */
1527
1528 location_t
1529 string_concat_db::get_key_loc (location_t loc)
1530 {
1531   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1532                                   NULL);
1533
1534   loc = get_range_from_loc (line_table, loc).m_start;
1535
1536   return loc;
1537 }
1538
1539 /* Helper class for use within get_substring_ranges_for_loc.
1540    An vec of cpp_string with responsibility for releasing all of the
1541    str->text for each str in the vector.  */
1542
1543 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1544 {
1545  public:
1546   auto_cpp_string_vec (int alloc)
1547     : auto_vec <cpp_string> (alloc) {}
1548
1549   ~auto_cpp_string_vec ()
1550   {
1551     /* Clean up the copies within this vec.  */
1552     int i;
1553     cpp_string *str;
1554     FOR_EACH_VEC_ELT (*this, i, str)
1555       free (const_cast <unsigned char *> (str->text));
1556   }
1557 };
1558
1559 /* Attempt to populate RANGES with source location information on the
1560    individual characters within the string literal found at STRLOC.
1561    If CONCATS is non-NULL, then any string literals that the token at
1562    STRLOC  was concatenated with are also added to RANGES.
1563
1564    Return NULL if successful, or an error message if any errors occurred (in
1565    which case RANGES may be only partially populated and should not
1566    be used).
1567
1568    This is implemented by re-parsing the relevant source line(s).  */
1569
1570 static const char *
1571 get_substring_ranges_for_loc (cpp_reader *pfile,
1572                               string_concat_db *concats,
1573                               location_t strloc,
1574                               enum cpp_ttype type,
1575                               cpp_substring_ranges &ranges)
1576 {
1577   gcc_assert (pfile);
1578
1579   if (strloc == UNKNOWN_LOCATION)
1580     return "unknown location";
1581
1582   /* Reparsing the strings requires accurate location information.
1583      If -ftrack-macro-expansion has been overridden from its default
1584      of 2, then we might have a location of a macro expansion point,
1585      rather than the location of the literal itself.
1586      Avoid this by requiring that we have full macro expansion tracking
1587      for substring locations to be available.  */
1588   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1589     return "track_macro_expansion != 2";
1590
1591   /* If #line or # 44 "file"-style directives are present, then there's
1592      no guarantee that the line numbers we have can be used to locate
1593      the strings.  For example, we might have a .i file with # directives
1594      pointing back to lines within a .c file, but the .c file might
1595      have been edited since the .i file was created.
1596      In such a case, the safest course is to disable on-demand substring
1597      locations.  */
1598   if (line_table->seen_line_directive)
1599     return "seen line directive";
1600
1601   /* If string concatenation has occurred at STRLOC, get the locations
1602      of all of the literal tokens making up the compound string.
1603      Otherwise, just use STRLOC.  */
1604   int num_locs = 1;
1605   location_t *strlocs = &strloc;
1606   if (concats)
1607     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1608
1609   auto_cpp_string_vec strs (num_locs);
1610   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1611   for (int i = 0; i < num_locs; i++)
1612     {
1613       /* Get range of strloc.  We will use it to locate the start and finish
1614          of the literal token within the line.  */
1615       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1616
1617       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1618         {
1619           /* If the string token was within a macro expansion, then we can
1620              cope with it for the simple case where we have a single token.
1621              Otherwise, bail out.  */
1622           if (src_range.m_start != src_range.m_finish)
1623             return "macro expansion";
1624         }
1625       else
1626         {
1627           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1628             /* If so, we can't reliably determine where the token started within
1629                its line.  */
1630             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1631
1632           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1633             /* If so, we can't reliably determine where the token finished
1634                within its line.  */
1635             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1636         }
1637
1638       expanded_location start
1639         = expand_location_to_spelling_point (src_range.m_start,
1640                                              LOCATION_ASPECT_START);
1641       expanded_location finish
1642         = expand_location_to_spelling_point (src_range.m_finish,
1643                                              LOCATION_ASPECT_FINISH);
1644       if (start.file != finish.file)
1645         return "range endpoints are in different files";
1646       if (start.line != finish.line)
1647         return "range endpoints are on different lines";
1648       if (start.column > finish.column)
1649         return "range endpoints are reversed";
1650
1651       char_span line = location_get_source_line (start.file, start.line);
1652       if (!line)
1653         return "unable to read source line";
1654
1655       /* Determine the location of the literal (including quotes
1656          and leading prefix chars, such as the 'u' in a u""
1657          token).  */
1658       size_t literal_length = finish.column - start.column + 1;
1659
1660       /* Ensure that we don't crash if we got the wrong location.  */
1661       if (start.column < 1)
1662         return "zero start column";
1663       if (line.length () < (start.column - 1 + literal_length))
1664         return "line is not wide enough";
1665
1666       char_span literal = line.subspan (start.column - 1, literal_length);
1667
1668       cpp_string from;
1669       from.len = literal_length;
1670       /* Make a copy of the literal, to avoid having to rely on
1671          the lifetime of the copy of the line within the cache.
1672          This will be released by the auto_cpp_string_vec dtor.  */
1673       from.text = (unsigned char *)literal.xstrdup ();
1674       strs.safe_push (from);
1675
1676       /* For very long lines, a new linemap could have started
1677          halfway through the token.
1678          Ensure that the loc_reader uses the linemap of the
1679          *end* of the token for its start location.  */
1680       const line_map_ordinary *start_ord_map;
1681       linemap_resolve_location (line_table, src_range.m_start,
1682                                 LRK_SPELLING_LOCATION, &start_ord_map);
1683       const line_map_ordinary *final_ord_map;
1684       linemap_resolve_location (line_table, src_range.m_finish,
1685                                 LRK_SPELLING_LOCATION, &final_ord_map);
1686       if (start_ord_map == NULL || final_ord_map == NULL)
1687         return "failed to get ordinary maps";
1688       /* Bulletproofing.  We ought to only have different ordinary maps
1689          for start vs finish due to line-length jumps.  */
1690       if (start_ord_map != final_ord_map
1691           && start_ord_map->to_file != final_ord_map->to_file)
1692         return "start and finish are spelled in different ordinary maps";
1693       /* The file from linemap_resolve_location ought to match that from
1694          expand_location_to_spelling_point.  */
1695       if (start_ord_map->to_file != start.file)
1696         return "mismatching file after resolving linemap";
1697
1698       location_t start_loc
1699         = linemap_position_for_line_and_column (line_table, final_ord_map,
1700                                                 start.line, start.column);
1701
1702       cpp_string_location_reader loc_reader (start_loc, line_table);
1703       loc_readers.safe_push (loc_reader);
1704     }
1705
1706   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1707   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1708                                                  loc_readers.address (),
1709                                                  num_locs, &ranges, type);
1710   if (err)
1711     return err;
1712
1713   /* Success: "ranges" should now contain information on the string.  */
1714   return NULL;
1715 }
1716
1717 /* Attempt to populate *OUT_LOC with source location information on the
1718    given characters within the string literal found at STRLOC.
1719    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1720    character set.
1721
1722    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1723    and string literal "012345\n789"
1724    *OUT_LOC is written to with:
1725      "012345\n789"
1726          ~^~~~~
1727
1728    If CONCATS is non-NULL, then any string literals that the token at
1729    STRLOC was concatenated with are also considered.
1730
1731    This is implemented by re-parsing the relevant source line(s).
1732
1733    Return NULL if successful, or an error message if any errors occurred.
1734    Error messages are intended for GCC developers (to help debugging) rather
1735    than for end-users.  */
1736
1737 const char *
1738 get_location_within_string (cpp_reader *pfile,
1739                             string_concat_db *concats,
1740                             location_t strloc,
1741                             enum cpp_ttype type,
1742                             int caret_idx, int start_idx, int end_idx,
1743                             location_t *out_loc)
1744 {
1745   gcc_checking_assert (caret_idx >= 0);
1746   gcc_checking_assert (start_idx >= 0);
1747   gcc_checking_assert (end_idx >= 0);
1748   gcc_assert (out_loc);
1749
1750   cpp_substring_ranges ranges;
1751   const char *err
1752     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1753   if (err)
1754     return err;
1755
1756   if (caret_idx >= ranges.get_num_ranges ())
1757     return "caret_idx out of range";
1758   if (start_idx >= ranges.get_num_ranges ())
1759     return "start_idx out of range";
1760   if (end_idx >= ranges.get_num_ranges ())
1761     return "end_idx out of range";
1762
1763   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1764                             ranges.get_range (start_idx).m_start,
1765                             ranges.get_range (end_idx).m_finish);
1766   return NULL;
1767 }
1768
1769 #if CHECKING_P
1770
1771 namespace selftest {
1772
1773 /* Selftests of location handling.  */
1774
1775 /* Attempt to populate *OUT_RANGE with source location information on the
1776    given character within the string literal found at STRLOC.
1777    CHAR_IDX refers to an offset within the execution character set.
1778    If CONCATS is non-NULL, then any string literals that the token at
1779    STRLOC was concatenated with are also considered.
1780
1781    This is implemented by re-parsing the relevant source line(s).
1782
1783    Return NULL if successful, or an error message if any errors occurred.
1784    Error messages are intended for GCC developers (to help debugging) rather
1785    than for end-users.  */
1786
1787 static const char *
1788 get_source_range_for_char (cpp_reader *pfile,
1789                            string_concat_db *concats,
1790                            location_t strloc,
1791                            enum cpp_ttype type,
1792                            int char_idx,
1793                            source_range *out_range)
1794 {
1795   gcc_checking_assert (char_idx >= 0);
1796   gcc_assert (out_range);
1797
1798   cpp_substring_ranges ranges;
1799   const char *err
1800     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1801   if (err)
1802     return err;
1803
1804   if (char_idx >= ranges.get_num_ranges ())
1805     return "char_idx out of range";
1806
1807   *out_range = ranges.get_range (char_idx);
1808   return NULL;
1809 }
1810
1811 /* As get_source_range_for_char, but write to *OUT the number
1812    of ranges that are available.  */
1813
1814 static const char *
1815 get_num_source_ranges_for_substring (cpp_reader *pfile,
1816                                      string_concat_db *concats,
1817                                      location_t strloc,
1818                                      enum cpp_ttype type,
1819                                      int *out)
1820 {
1821   gcc_assert (out);
1822
1823   cpp_substring_ranges ranges;
1824   const char *err
1825     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1826
1827   if (err)
1828     return err;
1829
1830   *out = ranges.get_num_ranges ();
1831   return NULL;
1832 }
1833
1834 /* Selftests of location handling.  */
1835
1836 /* Verify that compare() on linenum_type handles comparisons over the full
1837    range of the type.  */
1838
1839 static void
1840 test_linenum_comparisons ()
1841 {
1842   linenum_type min_line (0);
1843   linenum_type max_line (0xffffffff);
1844   ASSERT_EQ (0, compare (min_line, min_line));
1845   ASSERT_EQ (0, compare (max_line, max_line));
1846
1847   ASSERT_GT (compare (max_line, min_line), 0);
1848   ASSERT_LT (compare (min_line, max_line), 0);
1849 }
1850
1851 /* Helper function for verifying location data: when location_t
1852    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1853    as having column 0.  */
1854
1855 static bool
1856 should_have_column_data_p (location_t loc)
1857 {
1858   if (IS_ADHOC_LOC (loc))
1859     loc = get_location_from_adhoc_loc (line_table, loc);
1860   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1861     return false;
1862   return true;
1863 }
1864
1865 /* Selftest for should_have_column_data_p.  */
1866
1867 static void
1868 test_should_have_column_data_p ()
1869 {
1870   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1871   ASSERT_TRUE
1872     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1873   ASSERT_FALSE
1874     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1875 }
1876
1877 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1878    on LOC.  */
1879
1880 static void
1881 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1882               location_t loc)
1883 {
1884   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1885   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1886   /* If location_t values are sufficiently high, then column numbers
1887      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1888      When close to the threshold, column numbers *may* be present: if
1889      the final linemap before the threshold contains a line that straddles
1890      the threshold, locations in that line have column information.  */
1891   if (should_have_column_data_p (loc))
1892     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1893 }
1894
/* Many of the selftests build a line table and one or more line maps
   within it.

   To maximize test coverage we want to run these tests in a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
     and others use zero
   - the fallback modes within line-map.cc: there are various threshold
     values for location_t beyond which line-map.cc changes behavior
     (disabling of the range-packing optimization, disabling of
     column-tracking).  We can exercise these by starting the line_table
     at interesting values at or near these thresholds.

   An instance of the following class describes one particular case
   within that test matrix.  */

class line_table_case
{
public:
  line_table_case (int range_bits, int base_loc)
  : m_default_range_bits (range_bits),
    m_base_location (base_loc)
  {}

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* Value at which to start the line_table; zero leaves the line_table
     at its initial state.  */
  int m_base_location;
};
1922
1923 /* Constructor.  Store the old value of line_table, and create a new
1924    one, using sane defaults.  */
1925
1926 line_table_test::line_table_test ()
1927 {
1928   gcc_assert (saved_line_table == NULL);
1929   saved_line_table = line_table;
1930   line_table = ggc_alloc<line_maps> ();
1931   linemap_init (line_table, BUILTINS_LOCATION);
1932   gcc_assert (saved_line_table->reallocator);
1933   line_table->reallocator = saved_line_table->reallocator;
1934   gcc_assert (saved_line_table->round_alloc_size);
1935   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1936   line_table->default_range_bits = 0;
1937 }
1938
1939 /* Constructor.  Store the old value of line_table, and create a new
1940    one, using the sitation described in CASE_.  */
1941
1942 line_table_test::line_table_test (const line_table_case &case_)
1943 {
1944   gcc_assert (saved_line_table == NULL);
1945   saved_line_table = line_table;
1946   line_table = ggc_alloc<line_maps> ();
1947   linemap_init (line_table, BUILTINS_LOCATION);
1948   gcc_assert (saved_line_table->reallocator);
1949   line_table->reallocator = saved_line_table->reallocator;
1950   gcc_assert (saved_line_table->round_alloc_size);
1951   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1952   line_table->default_range_bits = case_.m_default_range_bits;
1953   if (case_.m_base_location)
1954     {
1955       line_table->highest_location = case_.m_base_location;
1956       line_table->highest_line = case_.m_base_location;
1957     }
1958 }
1959
1960 /* Destructor.  Restore the old value of line_table.  */
1961
1962 line_table_test::~line_table_test ()
1963 {
1964   gcc_assert (saved_line_table != NULL);
1965   line_table = saved_line_table;
1966   saved_line_table = NULL;
1967 }
1968
1969 /* Verify basic operation of ordinary linemaps.  */
1970
1971 static void
1972 test_accessing_ordinary_linemaps (const line_table_case &case_)
1973 {
1974   line_table_test ltt (case_);
1975
1976   /* Build a simple linemap describing some locations. */
1977   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1978
1979   linemap_line_start (line_table, 1, 100);
1980   location_t loc_a = linemap_position_for_column (line_table, 1);
1981   location_t loc_b = linemap_position_for_column (line_table, 23);
1982
1983   linemap_line_start (line_table, 2, 100);
1984   location_t loc_c = linemap_position_for_column (line_table, 1);
1985   location_t loc_d = linemap_position_for_column (line_table, 17);
1986
1987   /* Example of a very long line.  */
1988   linemap_line_start (line_table, 3, 2000);
1989   location_t loc_e = linemap_position_for_column (line_table, 700);
1990
1991   /* Transitioning back to a short line.  */
1992   linemap_line_start (line_table, 4, 0);
1993   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1994
1995   if (should_have_column_data_p (loc_back_to_short))
1996     {
1997       /* Verify that we switched to short lines in the linemap.  */
1998       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1999       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2000     }
2001
2002   /* Example of a line that will eventually be seen to be longer
2003      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2004      below that.  */
2005   linemap_line_start (line_table, 5, 2000);
2006
2007   location_t loc_start_of_very_long_line
2008     = linemap_position_for_column (line_table, 2000);
2009   location_t loc_too_wide
2010     = linemap_position_for_column (line_table, 4097);
2011   location_t loc_too_wide_2
2012     = linemap_position_for_column (line_table, 4098);
2013
2014   /* ...and back to a sane line length.  */
2015   linemap_line_start (line_table, 6, 100);
2016   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2017
2018   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2019
2020   /* Multiple files.  */
2021   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
2022   linemap_line_start (line_table, 1, 200);
2023   location_t loc_f = linemap_position_for_column (line_table, 150);
2024   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2025
2026   /* Verify that we can recover the location info.  */
2027   assert_loceq ("foo.c", 1, 1, loc_a);
2028   assert_loceq ("foo.c", 1, 23, loc_b);
2029   assert_loceq ("foo.c", 2, 1, loc_c);
2030   assert_loceq ("foo.c", 2, 17, loc_d);
2031   assert_loceq ("foo.c", 3, 700, loc_e);
2032   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
2033
2034   /* In the very wide line, the initial location should be fully tracked.  */
2035   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
2036   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2037      be disabled.  */
2038   assert_loceq ("foo.c", 5, 0, loc_too_wide);
2039   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2040   /*...and column-tracking should be re-enabled for subsequent lines.  */
2041   assert_loceq ("foo.c", 6, 10, loc_sane_again);
2042
2043   assert_loceq ("bar.c", 1, 150, loc_f);
2044
2045   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2046   ASSERT_TRUE (pure_location_p (line_table, loc_a));
2047
2048   /* Verify using make_location to build a range, and extracting data
2049      back from it.  */
2050   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2051   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2052   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2053   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2054   ASSERT_EQ (loc_b, src_range.m_start);
2055   ASSERT_EQ (loc_d, src_range.m_finish);
2056 }
2057
2058 /* Verify various properties of UNKNOWN_LOCATION.  */
2059
2060 static void
2061 test_unknown_location ()
2062 {
2063   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2064   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2065   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2066 }
2067
2068 /* Verify various properties of BUILTINS_LOCATION.  */
2069
2070 static void
2071 test_builtins ()
2072 {
2073   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
2074   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2075 }
2076
2077 /* Regression test for make_location.
2078    Ensure that we use pure locations for the start/finish of the range,
2079    rather than storing a packed or ad-hoc range as the start/finish.  */
2080
2081 static void
2082 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2083 {
2084   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2085      with C++ frontend.
2086      ....................0000000001111111111222.
2087      ....................1234567890123456789012.  */
2088   const char *content = "     r += !aaa == bbb;\n";
2089   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2090   line_table_test ltt (case_);
2091   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2092
2093   const location_t c11 = linemap_position_for_column (line_table, 11);
2094   const location_t c12 = linemap_position_for_column (line_table, 12);
2095   const location_t c13 = linemap_position_for_column (line_table, 13);
2096   const location_t c14 = linemap_position_for_column (line_table, 14);
2097   const location_t c21 = linemap_position_for_column (line_table, 21);
2098
2099   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2100     return;
2101
2102   /* Use column 13 for the caret location, arbitrarily, to verify that we
2103      handle start != caret.  */
2104   const location_t aaa = make_location (c13, c12, c14);
2105   ASSERT_EQ (c13, get_pure_location (aaa));
2106   ASSERT_EQ (c12, get_start (aaa));
2107   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2108   ASSERT_EQ (c14, get_finish (aaa));
2109   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2110
2111   /* Make a location using a location with a range as the start-point.  */
2112   const location_t not_aaa = make_location (c11, aaa, c14);
2113   ASSERT_EQ (c11, get_pure_location (not_aaa));
2114   /* It should use the start location of the range, not store the range
2115      itself.  */
2116   ASSERT_EQ (c12, get_start (not_aaa));
2117   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2118   ASSERT_EQ (c14, get_finish (not_aaa));
2119   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2120
2121   /* Similarly, make a location with a range as the end-point.  */
2122   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2123   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2124   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2125   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2126   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2127   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2128   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2129   /* It should use the finish location of the range, not store the range
2130      itself.  */
2131   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2132   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2133   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2134   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2135   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2136 }
2137
2138 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
2139
2140 static void
2141 test_reading_source_line ()
2142 {
2143   /* Create a tempfile and write some text to it.  */
2144   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2145                         "01234567890123456789\n"
2146                         "This is the test text\n"
2147                         "This is the 3rd line");
2148
2149   /* Read back a specific line from the tempfile.  */
2150   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2151   ASSERT_TRUE (source_line);
2152   ASSERT_TRUE (source_line.get_buffer () != NULL);
2153   ASSERT_EQ (20, source_line.length ());
2154   ASSERT_TRUE (!strncmp ("This is the 3rd line",
2155                          source_line.get_buffer (), source_line.length ()));
2156
2157   source_line = location_get_source_line (tmp.get_filename (), 2);
2158   ASSERT_TRUE (source_line);
2159   ASSERT_TRUE (source_line.get_buffer () != NULL);
2160   ASSERT_EQ (21, source_line.length ());
2161   ASSERT_TRUE (!strncmp ("This is the test text",
2162                          source_line.get_buffer (), source_line.length ()));
2163
2164   source_line = location_get_source_line (tmp.get_filename (), 4);
2165   ASSERT_FALSE (source_line);
2166   ASSERT_TRUE (source_line.get_buffer () == NULL);
2167 }
2168
2169 /* Tests of lexing.  */
2170
/* Assert that cpp_token_as_text for token TOK of PARSER gives a string
   equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)     \
  SELFTEST_BEGIN_STMT                                           \
    const unsigned char *actual_txt                             \
      = cpp_token_as_text ((PARSER), (TOK));                    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);   \
  SELFTEST_END_STMT
2179
2180 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2181    and ranges from EXP_START_COL to EXP_FINISH_COL.
2182    Use LOC as the effective location of the selftest.  */
2183
2184 static void
2185 assert_token_loc_eq (const location &loc,
2186                      const cpp_token *tok,
2187                      const char *exp_filename, int exp_linenum,
2188                      int exp_start_col, int exp_finish_col)
2189 {
2190   location_t tok_loc = tok->src_loc;
2191   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2192   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2193
2194   /* If location_t values are sufficiently high, then column numbers
2195      will be unavailable.  */
2196   if (!should_have_column_data_p (tok_loc))
2197     return;
2198
2199   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2200   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2201   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2202   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2203 }
2204
/* Check TOK->src_loc via assert_token_loc_eq, with SELFTEST_LOCATION
   (i.e. the point of use of this macro) as the effective location of
   the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,     \
                            EXP_START_COL, EXP_FINISH_COL)      \
  assert_token_loc_eq (SELFTEST_LOCATION,                       \
                       (TOK),                                   \
                       (EXP_FILENAME), (EXP_LINENUM),           \
                       (EXP_START_COL), (EXP_FINISH_COL))
2212
2213 /* Test of lexing a file using libcpp, verifying tokens and their
2214    location information.  */
2215
2216 static void
2217 test_lexer (const line_table_case &case_)
2218 {
2219   /* Create a tempfile and write some text to it.  */
2220   const char *content =
2221     /*00000000011111111112222222222333333.3333444444444.455555555556
2222       12345678901234567890123456789012345.6789012345678.901234567890.  */
2223     ("test_name /* c-style comment */\n"
2224      "                                  \"test literal\"\n"
2225      " // test c++-style comment\n"
2226      "   42\n");
2227   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2228
2229   line_table_test ltt (case_);
2230
2231   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2232
2233   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2234   ASSERT_NE (fname, NULL);
2235
2236   /* Verify that we get the expected tokens back, with the correct
2237      location information.  */
2238
2239   location_t loc;
2240   const cpp_token *tok;
2241   tok = cpp_get_token_with_location (parser, &loc);
2242   ASSERT_NE (tok, NULL);
2243   ASSERT_EQ (tok->type, CPP_NAME);
2244   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2245   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2246
2247   tok = cpp_get_token_with_location (parser, &loc);
2248   ASSERT_NE (tok, NULL);
2249   ASSERT_EQ (tok->type, CPP_STRING);
2250   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2251   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2252
2253   tok = cpp_get_token_with_location (parser, &loc);
2254   ASSERT_NE (tok, NULL);
2255   ASSERT_EQ (tok->type, CPP_NUMBER);
2256   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2257   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2258
2259   tok = cpp_get_token_with_location (parser, &loc);
2260   ASSERT_NE (tok, NULL);
2261   ASSERT_EQ (tok->type, CPP_EOF);
2262
2263   cpp_finish (parser, NULL);
2264   cpp_destroy (parser);
2265 }
2266
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.  */

class lexer_test_options
{
 public:
  /* This is a polymorphic base class, so give it a virtual destructor:
     destroying a subclass instance through a lexer_test_options pointer
     is then well-defined.  */
  virtual ~lexer_test_options () {}

  /* Hook for subclasses to configure the lexer_test that is under
     construction.  */
  virtual void apply (lexer_test &) = 0;
};
2280
2281 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2282    in its dtor.
2283
2284    This is needed by struct lexer_test to ensure that the cleanup of the
2285    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2286
2287 class cpp_reader_ptr
2288 {
2289  public:
2290   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2291
2292   ~cpp_reader_ptr ()
2293   {
2294     cpp_finish (m_ptr, NULL);
2295     cpp_destroy (m_ptr);
2296   }
2297
2298   operator cpp_reader * () const { return m_ptr; }
2299
2300  private:
2301   cpp_reader *m_ptr;
2302 };
2303
2304 /* A struct for writing lexer tests.  */
2305
2306 class lexer_test
2307 {
2308 public:
2309   lexer_test (const line_table_case &case_, const char *content,
2310               lexer_test_options *options);
2311   ~lexer_test ();
2312
2313   const cpp_token *get_token ();
2314
2315   /* The ordering of these fields matters.
2316      The line_table_test must be first, since the cpp_reader_ptr
2317      uses it.
2318      The cpp_reader must be cleaned up *after* the temp_source_file
2319      since the filenames in input.cc's input cache are owned by the
2320      cpp_reader; in particular, when ~temp_source_file evicts the
2321      filename the filenames must still be alive.  */
2322   line_table_test m_ltt;
2323   cpp_reader_ptr m_parser;
2324   temp_source_file m_tempfile;
2325   string_concat_db m_concats;
2326   bool m_implicitly_expect_EOF;
2327 };
2328
2329 /* Use an EBCDIC encoding for the execution charset, specifically
2330    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2331
2332    This exercises iconv integration within libcpp.
2333    Not every build of iconv supports the given charset,
2334    so we need to flag this error and handle it gracefully.  */
2335
2336 class ebcdic_execution_charset : public lexer_test_options
2337 {
2338  public:
2339   ebcdic_execution_charset () : m_num_iconv_errors (0)
2340     {
2341       gcc_assert (s_singleton == NULL);
2342       s_singleton = this;
2343     }
2344   ~ebcdic_execution_charset ()
2345     {
2346       gcc_assert (s_singleton == this);
2347       s_singleton = NULL;
2348     }
2349
2350   void apply (lexer_test &test) final override
2351   {
2352     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2353     cpp_opts->narrow_charset = "IBM1047";
2354
2355     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2356     callbacks->diagnostic = on_diagnostic;
2357   }
2358
2359   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2360                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2361                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2362                              rich_location *richloc ATTRIBUTE_UNUSED,
2363                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2364     ATTRIBUTE_FPTR_PRINTF(5,0)
2365   {
2366     gcc_assert (s_singleton);
2367     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2368     const char *msg = "conversion from %s to %s not supported by iconv";
2369 #ifdef ENABLE_NLS
2370     msg = dgettext ("cpplib", msg);
2371 #endif
2372     /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2373        when the local iconv build doesn't support the conversion.  */
2374     if (strcmp (msgid, msg) == 0)
2375       {
2376         s_singleton->m_num_iconv_errors++;
2377         return true;
2378       }
2379
2380     /* Otherwise, we have an unexpected error.  */
2381     abort ();
2382   }
2383
2384   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2385
2386  private:
2387   static ebcdic_execution_charset *s_singleton;
2388   int m_num_iconv_errors;
2389 };
2390
2391 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2392
2393 /* A lexer_test_options subclass that records a list of diagnostic
2394    messages emitted by the lexer.  */
2395
2396 class lexer_diagnostic_sink : public lexer_test_options
2397 {
2398  public:
2399   lexer_diagnostic_sink ()
2400   {
2401     gcc_assert (s_singleton == NULL);
2402     s_singleton = this;
2403   }
2404   ~lexer_diagnostic_sink ()
2405   {
2406     gcc_assert (s_singleton == this);
2407     s_singleton = NULL;
2408
2409     int i;
2410     char *str;
2411     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2412       free (str);
2413   }
2414
2415   void apply (lexer_test &test) final override
2416   {
2417     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2418     callbacks->diagnostic = on_diagnostic;
2419   }
2420
2421   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2422                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2423                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2424                              rich_location *richloc ATTRIBUTE_UNUSED,
2425                              const char *msgid, va_list *ap)
2426     ATTRIBUTE_FPTR_PRINTF(5,0)
2427   {
2428     char *msg = xvasprintf (msgid, *ap);
2429     s_singleton->m_diagnostics.safe_push (msg);
2430     return true;
2431   }
2432
2433   auto_vec<char *> m_diagnostics;
2434
2435  private:
2436   static lexer_diagnostic_sink *s_singleton;
2437 };
2438
2439 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2440
2441 /* Constructor.  Override line_table with a new instance based on CASE_,
2442    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2443    start parsing the tempfile.  */
2444
2445 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2446                         lexer_test_options *options)
2447 : m_ltt (case_),
2448   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2449   /* Create a tempfile and write the text to it.  */
2450   m_tempfile (SELFTEST_LOCATION, ".c", content),
2451   m_concats (),
2452   m_implicitly_expect_EOF (true)
2453 {
2454   if (options)
2455     options->apply (*this);
2456
2457   cpp_init_iconv (m_parser);
2458
2459   /* Parse the file.  */
2460   const char *fname = cpp_read_main_file (m_parser,
2461                                           m_tempfile.get_filename ());
2462   ASSERT_NE (fname, NULL);
2463 }
2464
2465 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2466
2467 lexer_test::~lexer_test ()
2468 {
2469   location_t loc;
2470   const cpp_token *tok;
2471
2472   if (m_implicitly_expect_EOF)
2473     {
2474       tok = cpp_get_token_with_location (m_parser, &loc);
2475       ASSERT_NE (tok, NULL);
2476       ASSERT_EQ (tok->type, CPP_EOF);
2477     }
2478 }
2479
2480 /* Get the next token from m_parser.  */
2481
2482 const cpp_token *
2483 lexer_test::get_token ()
2484 {
2485   location_t loc;
2486   const cpp_token *tok;
2487
2488   tok = cpp_get_token_with_location (m_parser, &loc);
2489   ASSERT_NE (tok, NULL);
2490   return tok;
2491 }
2492
2493 /* Verify that locations within string literals are correctly handled.  */
2494
2495 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2496    using the string concatenation database for TEST.
2497
2498    Assert that the character at index IDX is on EXPECTED_LINE,
2499    and that it begins at column EXPECTED_START_COL and ends at
2500    EXPECTED_FINISH_COL (unless the locations are beyond
2501    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2502    columns).  */
2503
2504 static void
2505 assert_char_at_range (const location &loc,
2506                       lexer_test& test,
2507                       location_t strloc, enum cpp_ttype type, int idx,
2508                       int expected_line, int expected_start_col,
2509                       int expected_finish_col)
2510 {
2511   cpp_reader *pfile = test.m_parser;
2512   string_concat_db *concats = &test.m_concats;
2513
2514   source_range actual_range = source_range();
2515   const char *err
2516     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2517                                  &actual_range);
2518   if (should_have_column_data_p (strloc))
2519     ASSERT_EQ_AT (loc, NULL, err);
2520   else
2521     {
2522       ASSERT_STREQ_AT (loc,
2523                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2524                        err);
2525       return;
2526     }
2527
2528   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2529   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2530   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2531   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2532
2533   if (should_have_column_data_p (actual_range.m_start))
2534     {
2535       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2536       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2537     }
2538   if (should_have_column_data_p (actual_range.m_finish))
2539     {
2540       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2541       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2542     }
2543 }
2544
/* Convenience wrapper around assert_char_at_range: forwards every
   argument unchanged and passes SELFTEST_LOCATION as the location that
   any failure is reported against.  */

#define ASSERT_CHAR_AT_RANGE(TEST_OBJ, STR_LOC, TOK_TYPE, CHAR_IDX,     \
                             LINE_NUM, START_COL, FINISH_COL)           \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (TEST_OBJ), (STR_LOC), (TOK_TYPE), (CHAR_IDX),  \
                        (LINE_NUM), (START_COL), (FINISH_COL))
2553
2554 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2555    using the string concatenation database for TEST.
2556
2557    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2558
2559 static void
2560 assert_num_substring_ranges (const location &loc,
2561                              lexer_test& test,
2562                              location_t strloc,
2563                              enum cpp_ttype type,
2564                              int expected_num_ranges)
2565 {
2566   cpp_reader *pfile = test.m_parser;
2567   string_concat_db *concats = &test.m_concats;
2568
2569   int actual_num_ranges = -1;
2570   const char *err
2571     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2572                                            &actual_num_ranges);
2573   if (should_have_column_data_p (strloc))
2574     ASSERT_EQ_AT (loc, NULL, err);
2575   else
2576     {
2577       ASSERT_STREQ_AT (loc,
2578                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2579                        err);
2580       return;
2581     }
2582   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2583 }
2584
/* Convenience wrapper around assert_num_substring_ranges: forwards every
   argument unchanged and passes SELFTEST_LOCATION as the location that
   any failure is reported against.  */

#define ASSERT_NUM_SUBSTRING_RANGES(TEST_OBJ, STR_LOC, TOK_TYPE, NUM_RANGES) \
  assert_num_substring_ranges (SELFTEST_LOCATION, (TEST_OBJ), (STR_LOC),     \
                               (TOK_TYPE), (NUM_RANGES))
2592
2593
2594 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2595    returns an error (using the string concatenation database for TEST).  */
2596
2597 static void
2598 assert_has_no_substring_ranges (const location &loc,
2599                                 lexer_test& test,
2600                                 location_t strloc,
2601                                 enum cpp_ttype type,
2602                                 const char *expected_err)
2603 {
2604   cpp_reader *pfile = test.m_parser;
2605   string_concat_db *concats = &test.m_concats;
2606   cpp_substring_ranges ranges;
2607   const char *actual_err
2608     = get_substring_ranges_for_loc (pfile, concats, strloc,
2609                                     type, ranges);
2610   if (should_have_column_data_p (strloc))
2611     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2612   else
2613     ASSERT_STREQ_AT (loc,
2614                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2615                      actual_err);
2616 }
2617
/* Convenience wrapper around assert_has_no_substring_ranges: forwards
   every argument unchanged and passes SELFTEST_LOCATION as the location
   that any failure is reported against.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(TEST_OBJ, STR_LOC, TOK_TYPE, ERR_MSG) \
  assert_has_no_substring_ranges (SELFTEST_LOCATION, (TEST_OBJ),             \
                                  (STR_LOC), (TOK_TYPE), (ERR_MSG))
2621
2622 /* Lex a simple string literal.  Verify the substring location data, before
2623    and after running cpp_interpret_string on it.  */
2624
2625 static void
2626 test_lexer_string_locations_simple (const line_table_case &case_)
2627 {
2628   /* Digits 0-9 (with 0 at column 10), the simple way.
2629      ....................000000000.11111111112.2222222223333333333
2630      ....................123456789.01234567890.1234567890123456789
2631      We add a trailing comment to ensure that we correctly locate
2632      the end of the string literal token.  */
2633   const char *content = "        \"0123456789\" /* not a string */\n";
2634   lexer_test test (case_, content, NULL);
2635
2636   /* Verify that we get the expected token back, with the correct
2637      location information.  */
2638   const cpp_token *tok = test.get_token ();
2639   ASSERT_EQ (tok->type, CPP_STRING);
2640   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2641   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2642
2643   /* At this point in lexing, the quote characters are treated as part of
2644      the string (they are stripped off by cpp_interpret_string).  */
2645
2646   ASSERT_EQ (tok->val.str.len, 12);
2647
2648   /* Verify that cpp_interpret_string works.  */
2649   cpp_string dst_string;
2650   const enum cpp_ttype type = CPP_STRING;
2651   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2652                                       &dst_string, type);
2653   ASSERT_TRUE (result);
2654   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2655   free (const_cast <unsigned char *> (dst_string.text));
2656
2657   /* Verify ranges of individual characters.  This no longer includes the
2658      opening quote, but does include the closing quote.  */
2659   for (int i = 0; i <= 10; i++)
2660     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2661                           10 + i, 10 + i);
2662
2663   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2664 }
2665
2666 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2667    encoding.  */
2668
2669 static void
2670 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2671 {
2672   /* EBCDIC support requires iconv.  */
2673   if (!HAVE_ICONV)
2674     return;
2675
2676   /* Digits 0-9 (with 0 at column 10), the simple way.
2677      ....................000000000.11111111112.2222222223333333333
2678      ....................123456789.01234567890.1234567890123456789
2679      We add a trailing comment to ensure that we correctly locate
2680      the end of the string literal token.  */
2681   const char *content = "        \"0123456789\" /* not a string */\n";
2682   ebcdic_execution_charset use_ebcdic;
2683   lexer_test test (case_, content, &use_ebcdic);
2684
2685   /* Verify that we get the expected token back, with the correct
2686      location information.  */
2687   const cpp_token *tok = test.get_token ();
2688   ASSERT_EQ (tok->type, CPP_STRING);
2689   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2690   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2691
2692   /* At this point in lexing, the quote characters are treated as part of
2693      the string (they are stripped off by cpp_interpret_string).  */
2694
2695   ASSERT_EQ (tok->val.str.len, 12);
2696
2697   /* The remainder of the test requires an iconv implementation that
2698      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2699   if (use_ebcdic.iconv_errors_occurred_p ())
2700     return;
2701
2702   /* Verify that cpp_interpret_string works.  */
2703   cpp_string dst_string;
2704   const enum cpp_ttype type = CPP_STRING;
2705   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2706                                       &dst_string, type);
2707   ASSERT_TRUE (result);
2708   /* We should now have EBCDIC-encoded text, specifically
2709      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2710      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2711   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2712                 (const char *)dst_string.text);
2713   free (const_cast <unsigned char *> (dst_string.text));
2714
2715   /* Verify that we don't attempt to record substring location information
2716      for such cases.  */
2717   ASSERT_HAS_NO_SUBSTRING_RANGES
2718     (test, tok->src_loc, type,
2719      "execution character set != source character set");
2720 }
2721
2722 /* Lex a string literal containing a hex-escaped character.
2723    Verify the substring location data, before and after running
2724    cpp_interpret_string on it.  */
2725
2726 static void
2727 test_lexer_string_locations_hex (const line_table_case &case_)
2728 {
2729   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2730      and with a space in place of digit 6, to terminate the escaped
2731      hex code.
2732      ....................000000000.111111.11112222.
2733      ....................123456789.012345.67890123.  */
2734   const char *content = "        \"01234\\x35 789\"\n";
2735   lexer_test test (case_, content, NULL);
2736
2737   /* Verify that we get the expected token back, with the correct
2738      location information.  */
2739   const cpp_token *tok = test.get_token ();
2740   ASSERT_EQ (tok->type, CPP_STRING);
2741   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2742   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2743
2744   /* At this point in lexing, the quote characters are treated as part of
2745      the string (they are stripped off by cpp_interpret_string).  */
2746   ASSERT_EQ (tok->val.str.len, 15);
2747
2748   /* Verify that cpp_interpret_string works.  */
2749   cpp_string dst_string;
2750   const enum cpp_ttype type = CPP_STRING;
2751   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2752                                       &dst_string, type);
2753   ASSERT_TRUE (result);
2754   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2755   free (const_cast <unsigned char *> (dst_string.text));
2756
2757   /* Verify ranges of individual characters.  This no longer includes the
2758      opening quote, but does include the closing quote.  */
2759   for (int i = 0; i <= 4; i++)
2760     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2761   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2762   for (int i = 6; i <= 10; i++)
2763     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2764
2765   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2766 }
2767
2768 /* Lex a string literal containing an octal-escaped character.
2769    Verify the substring location data after running cpp_interpret_string
2770    on it.  */
2771
2772 static void
2773 test_lexer_string_locations_oct (const line_table_case &case_)
2774 {
2775   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2776      and with a space in place of digit 6, to terminate the escaped
2777      octal code.
2778      ....................000000000.111111.11112222.2222223333333333444
2779      ....................123456789.012345.67890123.4567890123456789012  */
2780   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2781   lexer_test test (case_, content, NULL);
2782
2783   /* Verify that we get the expected token back, with the correct
2784      location information.  */
2785   const cpp_token *tok = test.get_token ();
2786   ASSERT_EQ (tok->type, CPP_STRING);
2787   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2788
2789   /* Verify that cpp_interpret_string works.  */
2790   cpp_string dst_string;
2791   const enum cpp_ttype type = CPP_STRING;
2792   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2793                                       &dst_string, type);
2794   ASSERT_TRUE (result);
2795   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2796   free (const_cast <unsigned char *> (dst_string.text));
2797
2798   /* Verify ranges of individual characters.  This no longer includes the
2799      opening quote, but does include the closing quote.  */
2800   for (int i = 0; i < 5; i++)
2801     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2802   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2803   for (int i = 6; i <= 10; i++)
2804     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2805
2806   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2807 }
2808
2809 /* Test of string literal containing letter escapes.  */
2810
2811 static void
2812 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2813 {
2814   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2815      .....................000000000.1.11111.1.1.11222.22222223333333
2816      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2817   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2818   lexer_test test (case_, content, NULL);
2819
2820   /* Verify that we get the expected tokens back.  */
2821   const cpp_token *tok = test.get_token ();
2822   ASSERT_EQ (tok->type, CPP_STRING);
2823   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2824
2825   /* Verify ranges of individual characters. */
2826   /* "\t".  */
2827   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2828                         0, 1, 10, 11);
2829   /* "foo". */
2830   for (int i = 1; i <= 3; i++)
2831     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2832                           i, 1, 11 + i, 11 + i);
2833   /* "\\" and "\n".  */
2834   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2835                         4, 1, 15, 16);
2836   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2837                         5, 1, 17, 18);
2838
2839   /* "bar" and closing quote for nul-terminator.  */
2840   for (int i = 6; i <= 9; i++)
2841     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2842                           i, 1, 13 + i, 13 + i);
2843
2844   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2845 }
2846
2847 /* Another test of a string literal containing a letter escape.
2848    Based on string seen in
2849      printf ("%-%\n");
2850    in gcc.dg/format/c90-printf-1.c.  */
2851
2852 static void
2853 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2854 {
2855   /* .....................000000000.1111.11.1111.22222222223.
2856      .....................123456789.0123.45.6789.01234567890.  */
2857   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2858   lexer_test test (case_, content, NULL);
2859
2860   /* Verify that we get the expected tokens back.  */
2861   const cpp_token *tok = test.get_token ();
2862   ASSERT_EQ (tok->type, CPP_STRING);
2863   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2864
2865   /* Verify ranges of individual characters. */
2866   /* "%-%".  */
2867   for (int i = 0; i < 3; i++)
2868     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2869                           i, 1, 10 + i, 10 + i);
2870   /* "\n".  */
2871   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2872                         3, 1, 13, 14);
2873
2874   /* Closing quote for nul-terminator.  */
2875   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2876                         4, 1, 15, 15);
2877
2878   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2879 }
2880
2881 /* Lex a string literal containing UCN 4 characters.
2882    Verify the substring location data after running cpp_interpret_string
2883    on it.  */
2884
2885 static void
2886 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2887 {
2888   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2889      as UCN 4.
2890      ....................000000000.111111.111122.222222223.33333333344444
2891      ....................123456789.012345.678901.234567890.12345678901234  */
2892   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2893   lexer_test test (case_, content, NULL);
2894
2895   /* Verify that we get the expected token back, with the correct
2896      location information.  */
2897   const cpp_token *tok = test.get_token ();
2898   ASSERT_EQ (tok->type, CPP_STRING);
2899   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2900
2901   /* Verify that cpp_interpret_string works.
2902      The string should be encoded in the execution character
2903      set.  Assuming that is UTF-8, we should have the following:
2904      -----------  ----  -----  -------  ----------------
2905      Byte offset  Byte  Octal  Unicode  Source Column(s)
2906      -----------  ----  -----  -------  ----------------
2907      0            0x30         '0'      10
2908      1            0x31         '1'      11
2909      2            0x32         '2'      12
2910      3            0x33         '3'      13
2911      4            0x34         '4'      14
2912      5            0xE2  \342   U+2174   15-20
2913      6            0x85  \205    (cont)  15-20
2914      7            0xB4  \264    (cont)  15-20
2915      8            0xE2  \342   U+2175   21-26
2916      9            0x85  \205    (cont)  21-26
2917      10           0xB5  \265    (cont)  21-26
2918      11           0x37         '7'      27
2919      12           0x38         '8'      28
2920      13           0x39         '9'      29
2921      14           0x00                  30 (closing quote)
2922      -----------  ----  -----  -------  ---------------.  */
2923
2924   cpp_string dst_string;
2925   const enum cpp_ttype type = CPP_STRING;
2926   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2927                                       &dst_string, type);
2928   ASSERT_TRUE (result);
2929   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2930                 (const char *)dst_string.text);
2931   free (const_cast <unsigned char *> (dst_string.text));
2932
2933   /* Verify ranges of individual characters.  This no longer includes the
2934      opening quote, but does include the closing quote.
2935      '01234'.  */
2936   for (int i = 0; i <= 4; i++)
2937     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2938   /* U+2174.  */
2939   for (int i = 5; i <= 7; i++)
2940     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2941   /* U+2175.  */
2942   for (int i = 8; i <= 10; i++)
2943     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2944   /* '789' and nul terminator  */
2945   for (int i = 11; i <= 14; i++)
2946     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2947
2948   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2949 }
2950
2951 /* Lex a string literal containing UCN 8 characters.
2952    Verify the substring location data after running cpp_interpret_string
2953    on it.  */
2954
2955 static void
2956 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2957 {
2958   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2959      ....................000000000.111111.1111222222.2222333333333.344444
2960      ....................123456789.012345.6789012345.6789012345678.901234  */
2961   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2962   lexer_test test (case_, content, NULL);
2963
2964   /* Verify that we get the expected token back, with the correct
2965      location information.  */
2966   const cpp_token *tok = test.get_token ();
2967   ASSERT_EQ (tok->type, CPP_STRING);
2968   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2969                            "\"01234\\U00002174\\U00002175789\"");
2970
2971   /* Verify that cpp_interpret_string works.
2972      The UTF-8 encoding of the string is identical to that from
2973      the ucn4 testcase above; the only difference is the column
2974      locations.  */
2975   cpp_string dst_string;
2976   const enum cpp_ttype type = CPP_STRING;
2977   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2978                                       &dst_string, type);
2979   ASSERT_TRUE (result);
2980   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2981                 (const char *)dst_string.text);
2982   free (const_cast <unsigned char *> (dst_string.text));
2983
2984   /* Verify ranges of individual characters.  This no longer includes the
2985      opening quote, but does include the closing quote.
2986      '01234'.  */
2987   for (int i = 0; i <= 4; i++)
2988     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2989   /* U+2174.  */
2990   for (int i = 5; i <= 7; i++)
2991     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2992   /* U+2175.  */
2993   for (int i = 8; i <= 10; i++)
2994     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2995   /* '789' at columns 35-37  */
2996   for (int i = 11; i <= 13; i++)
2997     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2998   /* Closing quote/nul-terminator at column 38.  */
2999   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3000
3001   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3002 }
3003
/* Read the four bytes that PTR_BE_VALUE points at as a big-endian
   (most significant byte first) 32-bit quantity and return its value in
   host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t value = 0;
  /* Shift each byte in from the right, most significant byte first.  */
  for (int i = 0; i < 4; i++)
    value = (value << 8) | (uint32_t) bytes[i];
  return value;
}
3015
3016 /* Lex a wide string literal and verify that attempts to read substring
3017    location data from it fail gracefully.  */
3018
3019 static void
3020 test_lexer_string_locations_wide_string (const line_table_case &case_)
3021 {
3022   /* Digits 0-9.
3023      ....................000000000.11111111112.22222222233333
3024      ....................123456789.01234567890.12345678901234  */
3025   const char *content = "       L\"0123456789\" /* non-str */\n";
3026   lexer_test test (case_, content, NULL);
3027
3028   /* Verify that we get the expected token back, with the correct
3029      location information.  */
3030   const cpp_token *tok = test.get_token ();
3031   ASSERT_EQ (tok->type, CPP_WSTRING);
3032   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3033
3034   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
3035   cpp_string dst_string;
3036   const enum cpp_ttype type = CPP_WSTRING;
3037   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3038                                       &dst_string, type);
3039   ASSERT_TRUE (result);
3040   /* The cpp_reader defaults to big-endian with
3041      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3042      now be encoded as UTF-32BE.  */
3043   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3044   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3045   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3046   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3047   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3048   free (const_cast <unsigned char *> (dst_string.text));
3049
3050   /* We don't yet support generating substring location information
3051      for L"" strings.  */
3052   ASSERT_HAS_NO_SUBSTRING_RANGES
3053     (test, tok->src_loc, type,
3054      "execution character set != source character set");
3055 }
3056
/* Read the two bytes that PTR_BE_VALUE points at as a big-endian
   (most significant byte first) 16-bit quantity and return its value in
   host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high_byte = bytes[0];
  const unsigned int low_byte = bytes[1];
  return (uint16_t) ((high_byte << 8) | low_byte);
}
3065
3066 /* Lex a u"" string literal and verify that attempts to read substring
3067    location data from it fail gracefully.  */
3068
3069 static void
3070 test_lexer_string_locations_string16 (const line_table_case &case_)
3071 {
3072   /* Digits 0-9.
3073      ....................000000000.11111111112.22222222233333
3074      ....................123456789.01234567890.12345678901234  */
3075   const char *content = "       u\"0123456789\" /* non-str */\n";
3076   lexer_test test (case_, content, NULL);
3077
3078   /* Verify that we get the expected token back, with the correct
3079      location information.  */
3080   const cpp_token *tok = test.get_token ();
3081   ASSERT_EQ (tok->type, CPP_STRING16);
3082   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3083
3084   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3085   cpp_string dst_string;
3086   const enum cpp_ttype type = CPP_STRING16;
3087   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3088                                       &dst_string, type);
3089   ASSERT_TRUE (result);
3090
3091   /* The cpp_reader defaults to big-endian, so dst_string should
3092      now be encoded as UTF-16BE.  */
3093   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3094   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3095   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3096   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3097   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3098   free (const_cast <unsigned char *> (dst_string.text));
3099
3100   /* We don't yet support generating substring location information
3101      for L"" strings.  */
3102   ASSERT_HAS_NO_SUBSTRING_RANGES
3103     (test, tok->src_loc, type,
3104      "execution character set != source character set");
3105 }
3106
3107 /* Lex a U"" string literal and verify that attempts to read substring
3108    location data from it fail gracefully.  */
3109
3110 static void
3111 test_lexer_string_locations_string32 (const line_table_case &case_)
3112 {
3113   /* Digits 0-9.
3114      ....................000000000.11111111112.22222222233333
3115      ....................123456789.01234567890.12345678901234  */
3116   const char *content = "       U\"0123456789\" /* non-str */\n";
3117   lexer_test test (case_, content, NULL);
3118
3119   /* Verify that we get the expected token back, with the correct
3120      location information.  */
3121   const cpp_token *tok = test.get_token ();
3122   ASSERT_EQ (tok->type, CPP_STRING32);
3123   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3124
3125   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3126   cpp_string dst_string;
3127   const enum cpp_ttype type = CPP_STRING32;
3128   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3129                                       &dst_string, type);
3130   ASSERT_TRUE (result);
3131
3132   /* The cpp_reader defaults to big-endian, so dst_string should
3133      now be encoded as UTF-32BE.  */
3134   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3135   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3136   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3137   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3138   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3139   free (const_cast <unsigned char *> (dst_string.text));
3140
3141   /* We don't yet support generating substring location information
3142      for L"" strings.  */
3143   ASSERT_HAS_NO_SUBSTRING_RANGES
3144     (test, tok->src_loc, type,
3145      "execution character set != source character set");
3146 }
3147
3148 /* Lex a u8-string literal.
3149    Verify the substring location data after running cpp_interpret_string
3150    on it.  */
3151
3152 static void
3153 test_lexer_string_locations_u8 (const line_table_case &case_)
3154 {
3155   /* Digits 0-9.
3156      ....................000000000.11111111112.22222222233333
3157      ....................123456789.01234567890.12345678901234  */
3158   const char *content = "      u8\"0123456789\" /* non-str */\n";
3159   lexer_test test (case_, content, NULL);
3160
3161   /* Verify that we get the expected token back, with the correct
3162      location information.  */
3163   const cpp_token *tok = test.get_token ();
3164   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3165   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3166
3167   /* Verify that cpp_interpret_string works.  */
3168   cpp_string dst_string;
3169   const enum cpp_ttype type = CPP_STRING;
3170   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3171                                       &dst_string, type);
3172   ASSERT_TRUE (result);
3173   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3174   free (const_cast <unsigned char *> (dst_string.text));
3175
3176   /* Verify ranges of individual characters.  This no longer includes the
3177      opening quote, but does include the closing quote.  */
3178   for (int i = 0; i <= 10; i++)
3179     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3180 }
3181
3182 /* Lex a string literal containing UTF-8 source characters.
3183    Verify the substring location data after running cpp_interpret_string
3184    on it.  */
3185
3186 static void
3187 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3188 {
3189  /* This string literal is written out to the source file as UTF-8,
3190     and is of the form "before mojibake after", where "mojibake"
3191     is written as the following four unicode code points:
3192        U+6587 CJK UNIFIED IDEOGRAPH-6587
3193        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3194        U+5316 CJK UNIFIED IDEOGRAPH-5316
3195        U+3051 HIRAGANA LETTER KE.
3196      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3197      "before" and "after" are 1 byte per unicode character.
3198
3199      The numbering shown are "columns", which are *byte* numbers within
3200      the line, rather than unicode character numbers.
3201
3202      .................... 000000000.1111111.
3203      .................... 123456789.0123456.  */
3204   const char *content = ("        \"before "
3205                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3206                               UTF-8: 0xE6 0x96 0x87
3207                               C octal escaped UTF-8: \346\226\207
3208                             "column" numbers: 17-19.  */
3209                          "\346\226\207"
3210
3211                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3212                               UTF-8: 0xE5 0xAD 0x97
3213                               C octal escaped UTF-8: \345\255\227
3214                             "column" numbers: 20-22.  */
3215                          "\345\255\227"
3216
3217                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3218                               UTF-8: 0xE5 0x8C 0x96
3219                               C octal escaped UTF-8: \345\214\226
3220                             "column" numbers: 23-25.  */
3221                          "\345\214\226"
3222
3223                          /* U+3051 HIRAGANA LETTER KE
3224                               UTF-8: 0xE3 0x81 0x91
3225                               C octal escaped UTF-8: \343\201\221
3226                             "column" numbers: 26-28.  */
3227                          "\343\201\221"
3228
3229                          /* column numbers 29 onwards
3230                           2333333.33334444444444
3231                           9012345.67890123456789. */
3232                          " after\" /* non-str */\n");
3233   lexer_test test (case_, content, NULL);
3234
3235   /* Verify that we get the expected token back, with the correct
3236      location information.  */
3237   const cpp_token *tok = test.get_token ();
3238   ASSERT_EQ (tok->type, CPP_STRING);
3239   ASSERT_TOKEN_AS_TEXT_EQ
3240     (test.m_parser, tok,
3241      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3242
3243   /* Verify that cpp_interpret_string works.  */
3244   cpp_string dst_string;
3245   const enum cpp_ttype type = CPP_STRING;
3246   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3247                                       &dst_string, type);
3248   ASSERT_TRUE (result);
3249   ASSERT_STREQ
3250     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3251      (const char *)dst_string.text);
3252   free (const_cast <unsigned char *> (dst_string.text));
3253
3254   /* Verify ranges of individual characters.  This no longer includes the
3255      opening quote, but does include the closing quote.
3256      Assuming that both source and execution encodings are UTF-8, we have
3257      a run of 25 octets in each, plus the NUL terminator.  */
3258   for (int i = 0; i < 25; i++)
3259     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3260   /* NUL-terminator should use the closing quote at column 35.  */
3261   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3262
3263   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3264 }
3265
3266 /* Test of string literal concatenation.  */
3267
3268 static void
3269 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3270 {
3271   /* Digits 0-9.
3272      .....................000000000.111111.11112222222222
3273      .....................123456789.012345.67890123456789.  */
3274   const char *content = ("        \"01234\" /* non-str */\n"
3275                          "        \"56789\" /* non-str */\n");
3276   lexer_test test (case_, content, NULL);
3277
3278   location_t input_locs[2];
3279
3280   /* Verify that we get the expected tokens back.  */
3281   auto_vec <cpp_string> input_strings;
3282   const cpp_token *tok_a = test.get_token ();
3283   ASSERT_EQ (tok_a->type, CPP_STRING);
3284   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3285   input_strings.safe_push (tok_a->val.str);
3286   input_locs[0] = tok_a->src_loc;
3287
3288   const cpp_token *tok_b = test.get_token ();
3289   ASSERT_EQ (tok_b->type, CPP_STRING);
3290   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3291   input_strings.safe_push (tok_b->val.str);
3292   input_locs[1] = tok_b->src_loc;
3293
3294   /* Verify that cpp_interpret_string works.  */
3295   cpp_string dst_string;
3296   const enum cpp_ttype type = CPP_STRING;
3297   bool result = cpp_interpret_string (test.m_parser,
3298                                       input_strings.address (), 2,
3299                                       &dst_string, type);
3300   ASSERT_TRUE (result);
3301   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3302   free (const_cast <unsigned char *> (dst_string.text));
3303
3304   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3305   test.m_concats.record_string_concatenation (2, input_locs);
3306
3307   location_t initial_loc = input_locs[0];
3308
3309   /* "01234" on line 1.  */
3310   for (int i = 0; i <= 4; i++)
3311     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3312   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3313   for (int i = 5; i <= 10; i++)
3314     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3315
3316   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3317 }
3318
3319 /* Another test of string literal concatenation.  */
3320
3321 static void
3322 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3323 {
3324   /* Digits 0-9.
3325      .....................000000000.111.11111112222222
3326      .....................123456789.012.34567890123456.  */
3327   const char *content = ("        \"01\" /* non-str */\n"
3328                          "        \"23\" /* non-str */\n"
3329                          "        \"45\" /* non-str */\n"
3330                          "        \"67\" /* non-str */\n"
3331                          "        \"89\" /* non-str */\n");
3332   lexer_test test (case_, content, NULL);
3333
3334   auto_vec <cpp_string> input_strings;
3335   location_t input_locs[5];
3336
3337   /* Verify that we get the expected tokens back.  */
3338   for (int i = 0; i < 5; i++)
3339     {
3340       const cpp_token *tok = test.get_token ();
3341       ASSERT_EQ (tok->type, CPP_STRING);
3342       input_strings.safe_push (tok->val.str);
3343       input_locs[i] = tok->src_loc;
3344     }
3345
3346   /* Verify that cpp_interpret_string works.  */
3347   cpp_string dst_string;
3348   const enum cpp_ttype type = CPP_STRING;
3349   bool result = cpp_interpret_string (test.m_parser,
3350                                       input_strings.address (), 5,
3351                                       &dst_string, type);
3352   ASSERT_TRUE (result);
3353   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3354   free (const_cast <unsigned char *> (dst_string.text));
3355
3356   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3357   test.m_concats.record_string_concatenation (5, input_locs);
3358
3359   location_t initial_loc = input_locs[0];
3360
3361   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3362      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3363      and expect get_source_range_for_substring to fail.
3364      However, for a string concatenation test, we can have a case
3365      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3366      but subsequent strings can be after it.
3367      Attempting to detect this within assert_char_at_range
3368      would overcomplicate the logic for the common test cases, so
3369      we detect it here.  */
3370   if (should_have_column_data_p (input_locs[0])
3371       && !should_have_column_data_p (input_locs[4]))
3372     {
3373       /* Verify that get_source_range_for_substring gracefully rejects
3374          this case.  */
3375       source_range actual_range;
3376       const char *err
3377         = get_source_range_for_char (test.m_parser, &test.m_concats,
3378                                      initial_loc, type, 0, &actual_range);
3379       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3380       return;
3381     }
3382
3383   for (int i = 0; i < 5; i++)
3384     for (int j = 0; j < 2; j++)
3385       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3386                             i + 1, 10 + j, 10 + j);
3387
3388   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3389   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3390
3391   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3392 }
3393
3394 /* Another test of string literal concatenation, this time combined with
3395    various kinds of escaped characters.  */
3396
3397 static void
3398 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3399 {
3400   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3401      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3402   const char *content
3403     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3404        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3405     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3406   lexer_test test (case_, content, NULL);
3407
3408   auto_vec <cpp_string> input_strings;
3409   location_t input_locs[4];
3410
3411   /* Verify that we get the expected tokens back.  */
3412   for (int i = 0; i < 4; i++)
3413     {
3414       const cpp_token *tok = test.get_token ();
3415       ASSERT_EQ (tok->type, CPP_STRING);
3416       input_strings.safe_push (tok->val.str);
3417       input_locs[i] = tok->src_loc;
3418     }
3419
3420   /* Verify that cpp_interpret_string works.  */
3421   cpp_string dst_string;
3422   const enum cpp_ttype type = CPP_STRING;
3423   bool result = cpp_interpret_string (test.m_parser,
3424                                       input_strings.address (), 4,
3425                                       &dst_string, type);
3426   ASSERT_TRUE (result);
3427   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3428   free (const_cast <unsigned char *> (dst_string.text));
3429
3430   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3431   test.m_concats.record_string_concatenation (4, input_locs);
3432
3433   location_t initial_loc = input_locs[0];
3434
3435   for (int i = 0; i <= 4; i++)
3436     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3437   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3438   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3439   for (int i = 7; i <= 9; i++)
3440     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3441
3442   /* NUL-terminator should use the location of the final closing quote.  */
3443   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3444
3445   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3446 }
3447
3448 /* Test of string literal in a macro.  */
3449
3450 static void
3451 test_lexer_string_locations_macro (const line_table_case &case_)
3452 {
3453   /* Digits 0-9.
3454      .....................0000000001111111111.22222222223.
3455      .....................1234567890123456789.01234567890.  */
3456   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3457                          "  MACRO");
3458   lexer_test test (case_, content, NULL);
3459
3460   /* Verify that we get the expected tokens back.  */
3461   const cpp_token *tok = test.get_token ();
3462   ASSERT_EQ (tok->type, CPP_PADDING);
3463
3464   tok = test.get_token ();
3465   ASSERT_EQ (tok->type, CPP_STRING);
3466   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3467
3468   /* Verify ranges of individual characters.  We ought to
3469      see columns within the macro definition.  */
3470   for (int i = 0; i <= 10; i++)
3471     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3472                           i, 1, 20 + i, 20 + i);
3473
3474   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3475
3476   tok = test.get_token ();
3477   ASSERT_EQ (tok->type, CPP_PADDING);
3478 }
3479
3480 /* Test of stringification of a macro argument.  */
3481
3482 static void
3483 test_lexer_string_locations_stringified_macro_argument
3484   (const line_table_case &case_)
3485 {
3486   /* .....................000000000111111111122222222223.
3487      .....................123456789012345678901234567890.  */
3488   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3489                          "MACRO(foo)\n");
3490   lexer_test test (case_, content, NULL);
3491
3492   /* Verify that we get the expected token back.  */
3493   const cpp_token *tok = test.get_token ();
3494   ASSERT_EQ (tok->type, CPP_PADDING);
3495
3496   tok = test.get_token ();
3497   ASSERT_EQ (tok->type, CPP_STRING);
3498   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3499
3500   /* We don't support getting the location of a stringified macro
3501      argument.  Verify that it fails gracefully.  */
3502   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3503                                   "cpp_interpret_string_1 failed");
3504
3505   tok = test.get_token ();
3506   ASSERT_EQ (tok->type, CPP_PADDING);
3507
3508   tok = test.get_token ();
3509   ASSERT_EQ (tok->type, CPP_PADDING);
3510 }
3511
3512 /* Ensure that we are fail gracefully if something attempts to pass
3513    in a location that isn't a string literal token.  Seen on this code:
3514
3515      const char a[] = " %d ";
3516      __builtin_printf (a, 0.5);
3517                        ^
3518
3519    when c-format.cc erroneously used the indicated one-character
3520    location as the format string location, leading to a read past the
3521    end of a string buffer in cpp_interpret_string_1.  */
3522
3523 static void
3524 test_lexer_string_locations_non_string (const line_table_case &case_)
3525 {
3526   /* .....................000000000111111111122222222223.
3527      .....................123456789012345678901234567890.  */
3528   const char *content = ("         a\n");
3529   lexer_test test (case_, content, NULL);
3530
3531   /* Verify that we get the expected token back.  */
3532   const cpp_token *tok = test.get_token ();
3533   ASSERT_EQ (tok->type, CPP_NAME);
3534   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3535
3536   /* At this point, libcpp is attempting to interpret the name as a
3537      string literal, despite it not starting with a quote.  We don't detect
3538      that, but we should at least fail gracefully.  */
3539   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3540                                   "cpp_interpret_string_1 failed");
3541 }
3542
3543 /* Ensure that we can read substring information for a token which
3544    starts in one linemap and ends in another .  Adapted from
3545    gcc.dg/cpp/pr69985.c.  */
3546
3547 static void
3548 test_lexer_string_locations_long_line (const line_table_case &case_)
3549 {
3550   /* .....................000000.000111111111
3551      .....................123456.789012346789.  */
3552   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3553                          "     \"0123456789012345678901234567890123456789"
3554                          "0123456789012345678901234567890123456789"
3555                          "0123456789012345678901234567890123456789"
3556                          "0123456789\"\n");
3557
3558   lexer_test test (case_, content, NULL);
3559
3560   /* Verify that we get the expected token back.  */
3561   const cpp_token *tok = test.get_token ();
3562   ASSERT_EQ (tok->type, CPP_STRING);
3563
3564   if (!should_have_column_data_p (line_table->highest_location))
3565     return;
3566
3567   /* Verify ranges of individual characters.  */
3568   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3569   for (int i = 0; i < 131; i++)
3570     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3571                           i, 2, 7 + i, 7 + i);
3572 }
3573
3574 /* Test of locations within a raw string that doesn't contain a newline.  */
3575
3576 static void
3577 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3578 {
3579   /* .....................00.0000000111111111122.
3580      .....................12.3456789012345678901.  */
3581   const char *content = ("R\"foo(0123456789)foo\"\n");
3582   lexer_test test (case_, content, NULL);
3583
3584   /* Verify that we get the expected token back.  */
3585   const cpp_token *tok = test.get_token ();
3586   ASSERT_EQ (tok->type, CPP_STRING);
3587
3588   /* Verify that cpp_interpret_string works.  */
3589   cpp_string dst_string;
3590   const enum cpp_ttype type = CPP_STRING;
3591   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3592                                       &dst_string, type);
3593   ASSERT_TRUE (result);
3594   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3595   free (const_cast <unsigned char *> (dst_string.text));
3596
3597   if (!should_have_column_data_p (line_table->highest_location))
3598     return;
3599
3600   /* 0-9, plus the nil terminator.  */
3601   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3602   for (int i = 0; i < 11; i++)
3603     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3604                           i, 1, 7 + i, 7 + i);
3605 }
3606
3607 /* Test of locations within a raw string that contains a newline.  */
3608
3609 static void
3610 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3611 {
3612   /* .....................00.0000.
3613      .....................12.3456.  */
3614   const char *content = ("R\"foo(\n"
3615   /* .....................00000.
3616      .....................12345.  */
3617                          "hello\n"
3618                          "world\n"
3619   /* .....................00000.
3620      .....................12345.  */
3621                          ")foo\"\n");
3622   lexer_test test (case_, content, NULL);
3623
3624   /* Verify that we get the expected token back.  */
3625   const cpp_token *tok = test.get_token ();
3626   ASSERT_EQ (tok->type, CPP_STRING);
3627
3628   /* Verify that cpp_interpret_string works.  */
3629   cpp_string dst_string;
3630   const enum cpp_ttype type = CPP_STRING;
3631   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3632                                       &dst_string, type);
3633   ASSERT_TRUE (result);
3634   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3635   free (const_cast <unsigned char *> (dst_string.text));
3636
3637   if (!should_have_column_data_p (line_table->highest_location))
3638     return;
3639
3640   /* Currently we don't support locations within raw strings that
3641      contain newlines.  */
3642   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3643                                   "range endpoints are on different lines");
3644 }
3645
3646 /* Test of parsing an unterminated raw string.  */
3647
3648 static void
3649 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3650 {
3651   const char *content = "R\"ouch()ouCh\" /* etc */";
3652
3653   lexer_diagnostic_sink diagnostics;
3654   lexer_test test (case_, content, &diagnostics);
3655   test.m_implicitly_expect_EOF = false;
3656
3657   /* Attempt to parse the raw string.  */
3658   const cpp_token *tok = test.get_token ();
3659   ASSERT_EQ (tok->type, CPP_EOF);
3660
3661   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3662   /* We expect the message "unterminated raw string"
3663      in the "cpplib" translation domain.
3664      It's not clear that dgettext is available on all supported hosts,
3665      so this assertion is commented-out for now.
3666        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3667                      diagnostics.m_diagnostics[0]);
3668   */
3669 }
3670
3671 /* Test of lexing char constants.  */
3672
3673 static void
3674 test_lexer_char_constants (const line_table_case &case_)
3675 {
3676   /* Various char constants.
3677      .....................0000000001111111111.22222222223.
3678      .....................1234567890123456789.01234567890.  */
3679   const char *content = ("         'a'\n"
3680                          "        u'a'\n"
3681                          "        U'a'\n"
3682                          "        L'a'\n"
3683                          "         'abc'\n");
3684   lexer_test test (case_, content, NULL);
3685
3686   /* Verify that we get the expected tokens back.  */
3687   /* 'a'.  */
3688   const cpp_token *tok = test.get_token ();
3689   ASSERT_EQ (tok->type, CPP_CHAR);
3690   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3691
3692   unsigned int chars_seen;
3693   int unsignedp;
3694   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3695                                           &chars_seen, &unsignedp);
3696   ASSERT_EQ (cc, 'a');
3697   ASSERT_EQ (chars_seen, 1);
3698
3699   /* u'a'.  */
3700   tok = test.get_token ();
3701   ASSERT_EQ (tok->type, CPP_CHAR16);
3702   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3703
3704   /* U'a'.  */
3705   tok = test.get_token ();
3706   ASSERT_EQ (tok->type, CPP_CHAR32);
3707   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3708
3709   /* L'a'.  */
3710   tok = test.get_token ();
3711   ASSERT_EQ (tok->type, CPP_WCHAR);
3712   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3713
3714   /* 'abc' (c-char-sequence).  */
3715   tok = test.get_token ();
3716   ASSERT_EQ (tok->type, CPP_CHAR);
3717   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3718 }
3719 /* A table of interesting location_t values, giving one axis of our test
3720    matrix.  */
3721
3722 static const location_t boundary_locations[] = {
3723   /* Zero means "don't override the default values for a new line_table".  */
3724   0,
3725
3726   /* An arbitrary non-zero value that isn't close to one of
3727      the boundary values below.  */
3728   0x10000,
3729
3730   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3731   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3732   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3733   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3734   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3735   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3736
3737   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3738   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3739   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3740   LINE_MAP_MAX_LOCATION_WITH_COLS,
3741   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3742   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3743 };
3744
3745 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3746
3747 void
3748 for_each_line_table_case (void (*testcase) (const line_table_case &))
3749 {
3750   /* As noted above in the description of struct line_table_case,
3751      we want to explore a test matrix of interesting line_table
3752      situations, running various selftests for each case within the
3753      matrix.  */
3754
3755   /* Run all tests with:
3756      (a) line_table->default_range_bits == 0, and
3757      (b) line_table->default_range_bits == 5.  */
3758   int num_cases_tested = 0;
3759   for (int default_range_bits = 0; default_range_bits <= 5;
3760        default_range_bits += 5)
3761     {
3762       /* ...and use each of the "interesting" location values as
3763          the starting location within line_table.  */
3764       const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3765       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3766         {
3767           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3768
3769           testcase (c);
3770
3771           num_cases_tested++;
3772         }
3773     }
3774
3775   /* Verify that we fully covered the test matrix.  */
3776   ASSERT_EQ (num_cases_tested, 2 * 12);
3777 }
3778
3779 /* Verify that when presented with a consecutive pair of locations with
3780    a very large line offset, we don't attempt to consolidate them into
3781    a single ordinary linemap where the line offsets within the line map
3782    would lead to overflow (PR lto/88147).  */
3783
3784 static void
3785 test_line_offset_overflow ()
3786 {
3787   line_table_test ltt (line_table_case (5, 0));
3788
3789   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3790   linemap_line_start (line_table, 1, 100);
3791   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3792   assert_loceq ("foo.c", 2578, 0, loc_a);
3793
3794   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3795   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3796   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3797
3798   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3799   assert_loceq ("foo.c", 404198, 0, loc_b);
3800
3801   /* We should have started a new linemap, rather than attempting to store
3802      a very large line offset.  */
3803   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3804   ASSERT_NE (ordmap_a, ordmap_b);
3805 }
3806
3807 void test_cpp_utf8 ()
3808 {
3809   const int def_tabstop = 8;
3810   cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3811
3812   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3813   {
3814     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3815     ASSERT_EQ (8, w_bad);
3816     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3817     ASSERT_EQ (5, w_ctrl);
3818   }
3819
3820   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3821   {
3822     const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3823     ASSERT_EQ (1, w_pi);
3824     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3825     ASSERT_EQ (2, w_emoji);
3826     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3827                                                         policy);
3828     ASSERT_EQ (1, w_umlaut_precomposed);
3829     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3830                                                       policy);
3831     ASSERT_EQ (1, w_umlaut_combining);
3832     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3833     ASSERT_EQ (2, w_han);
3834     const int w_ascii = cpp_display_width ("GCC", 3, policy);
3835     ASSERT_EQ (3, w_ascii);
3836     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3837                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
3838                                            24, policy);
3839     ASSERT_EQ (18, w_mixed);
3840   }
3841
3842   /* Verify that display width properly expands tabs.  */
3843   {
3844     const char *tstr = "\tabc\td";
3845     ASSERT_EQ (6, cpp_display_width (tstr, 6,
3846                                      cpp_char_column_policy (1, cpp_wcwidth)));
3847     ASSERT_EQ (10, cpp_display_width (tstr, 6,
3848                                       cpp_char_column_policy (3, cpp_wcwidth)));
3849     ASSERT_EQ (17, cpp_display_width (tstr, 6,
3850                                       cpp_char_column_policy (8, cpp_wcwidth)));
3851     ASSERT_EQ (1,
3852                cpp_display_column_to_byte_column
3853                  (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
3854   }
3855
3856   /* Verify that cpp_byte_column_to_display_column can go past the end,
3857      and similar edge cases.  */
3858   {
3859     const char *str
3860       /* Display columns.
3861          111111112345  */
3862       = "\xcf\x80 abc";
3863       /* 111122223456
3864          Byte columns.  */
3865
3866     ASSERT_EQ (5, cpp_display_width (str, 6, policy));
3867     ASSERT_EQ (105,
3868                cpp_byte_column_to_display_column (str, 6, 106, policy));
3869     ASSERT_EQ (10000,
3870                cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
3871     ASSERT_EQ (0,
3872                cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
3873   }
3874
3875   /* Verify that cpp_display_column_to_byte_column can go past the end,
3876      and similar edge cases, and check invertibility.  */
3877   {
3878     const char *str
3879       /* Display columns.
3880          000000000000000000000000000000000000011
3881          111111112222222234444444455555555678901  */
3882       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3883       /* 000000000000000000000000000000000111111
3884          111122223333444456666777788889999012345
3885          Byte columns.  */
3886     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
3887     ASSERT_EQ (15,
3888                cpp_display_column_to_byte_column (str, 15, 11, policy));
3889     ASSERT_EQ (115,
3890                cpp_display_column_to_byte_column (str, 15, 111, policy));
3891     ASSERT_EQ (10000,
3892                cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
3893     ASSERT_EQ (0,
3894                cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
3895
3896     /* Verify that we do not interrupt a UTF-8 sequence.  */
3897     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
3898
3899     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3900       {
3901         const int disp_col
3902           = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
3903         const int byte_col2
3904           = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
3905
3906         /* If we ask for the display column in the middle of a UTF-8
3907            sequence, it will return the length of the partial sequence,
3908            matching the behavior of GCC before display column support.
3909            Otherwise check the round trip was successful.  */
3910         if (byte_col < 4)
3911           ASSERT_EQ (byte_col, disp_col);
3912         else if (byte_col >= 6 && byte_col < 9)
3913           ASSERT_EQ (3 + (byte_col - 5), disp_col);
3914         else
3915           ASSERT_EQ (byte_col2, byte_col);
3916       }
3917   }
3918
3919 }
3920
3921 /* Run all of the selftests within this file.  */
3922
3923 void
3924 input_cc_tests ()
3925 {
3926   test_linenum_comparisons ();
3927   test_should_have_column_data_p ();
3928   test_unknown_location ();
3929   test_builtins ();
3930   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3931
3932   for_each_line_table_case (test_accessing_ordinary_linemaps);
3933   for_each_line_table_case (test_lexer);
3934   for_each_line_table_case (test_lexer_string_locations_simple);
3935   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3936   for_each_line_table_case (test_lexer_string_locations_hex);
3937   for_each_line_table_case (test_lexer_string_locations_oct);
3938   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3939   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3940   for_each_line_table_case (test_lexer_string_locations_ucn4);
3941   for_each_line_table_case (test_lexer_string_locations_ucn8);
3942   for_each_line_table_case (test_lexer_string_locations_wide_string);
3943   for_each_line_table_case (test_lexer_string_locations_string16);
3944   for_each_line_table_case (test_lexer_string_locations_string32);
3945   for_each_line_table_case (test_lexer_string_locations_u8);
3946   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3947   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3948   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3949   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3950   for_each_line_table_case (test_lexer_string_locations_macro);
3951   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3952   for_each_line_table_case (test_lexer_string_locations_non_string);
3953   for_each_line_table_case (test_lexer_string_locations_long_line);
3954   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3955   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3956   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3957   for_each_line_table_case (test_lexer_char_constants);
3958
3959   test_reading_source_line ();
3960
3961   test_line_offset_overflow ();
3962
3963   test_cpp_utf8 ();
3964 }
3965
3966 } // namespace selftest
3967
3968 #endif /* CHECKING_P */