gcc/input.cc

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2022 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* Input charset configuration.  */
  33 static const char *default_charset_callback (const char *)
  34 {
  35   return nullptr;
  36 }
  37
  38 void
  39 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  40                                       bool should_skip_bom)
  41 {
  42   in_context.ccb = (ccb ? ccb : default_charset_callback);
  43   in_context.should_skip_bom = should_skip_bom;
  44 }
  45
  46 /* This is a cache used by get_next_line to store the content of a
  47    file to be searched for file lines.  */
  48 class file_cache_slot
  49 {
  50 public:
  51   file_cache_slot ();
  52   ~file_cache_slot ();
  53
  54   bool read_line_num (size_t line_num,
  55                       char ** line, ssize_t *line_len);
  56
  57   /* Accessors.  */
  58   const char *get_file_path () const { return m_file_path; }
  59   unsigned get_use_count () const { return m_use_count; }
  60   bool missing_trailing_newline_p () const
  61   {
  62     return m_missing_trailing_newline;
  63   }
  64
  65   void inc_use_count () { m_use_count++; }
  66
  67   bool create (const file_cache::input_context &in_context,
  68                const char *file_path, FILE *fp, unsigned highest_use_count);
  69   void evict ();
  70
  71  private:
  72   /* These are information used to store a line boundary.  */
  73   class line_info
  74   {
  75   public:
  76     /* The line number.  It starts from 1.  */
  77     size_t line_num;
  78
  79     /* The position (byte count) of the beginning of the line,
  80        relative to the file data pointer.  This starts at zero.  */
  81     size_t start_pos;
  82
  83     /* The position (byte count) of the last byte of the line.  This
  84        normally points to the '\n' character, or to one byte after the
  85        last byte of the file, if the file doesn't contain a '\n'
  86        character.  */
  87     size_t end_pos;
  88
  89     line_info (size_t l, size_t s, size_t e)
  90       : line_num (l), start_pos (s), end_pos (e)
  91     {}
  92
  93     line_info ()
  94       :line_num (0), start_pos (0), end_pos (0)
  95     {}
  96   };
  97
  98   bool needs_read_p () const;
  99   bool needs_grow_p () const;
 100   void maybe_grow ();
 101   bool read_data ();
 102   bool maybe_read_data ();
 103   bool get_next_line (char **line, ssize_t *line_len);
 104   bool read_next_line (char ** line, ssize_t *line_len);
 105   bool goto_next_line ();
 106
 107   static const size_t buffer_size = 4 * 1024;
 108   static const size_t line_record_size = 100;
 109
 110   /* The number of time this file has been accessed.  This is used
 111      to designate which file cache to evict from the cache
 112      array.  */
 113   unsigned m_use_count;
 114
 115   /* The file_path is the key for identifying a particular file in
 116      the cache.
 117      For libcpp-using code, the underlying buffer for this field is
 118      owned by the corresponding _cpp_file within the cpp_reader.  */
 119   const char *m_file_path;
 120
 121   FILE *m_fp;
 122
 123   /* This points to the content of the file that we've read so
 124      far.  */
 125   char *m_data;
 126
 127   /* The allocated buffer to be freed may start a little earlier than DATA,
 128      e.g. if a UTF8 BOM was skipped at the beginning.  */
 129   int m_alloc_offset;
 130
 131   /*  The size of the DATA array above.*/
 132   size_t m_size;
 133
 134   /* The number of bytes read from the underlying file so far.  This
 135      must be less (or equal) than SIZE above.  */
 136   size_t m_nb_read;
 137
 138   /* The index of the beginning of the current line.  */
 139   size_t m_line_start_idx;
 140
 141   /* The number of the previous line read.  This starts at 1.  Zero
 142      means we've read no line so far.  */
 143   size_t m_line_num;
 144
 145   /* This is the total number of lines of the current file.  At the
 146      moment, we try to get this information from the line map
 147      subsystem.  Note that this is just a hint.  When using the C++
 148      front-end, this hint is correct because the input file is then
 149      completely tokenized before parsing starts; so the line map knows
 150      the number of lines before compilation really starts.  For e.g,
 151      the C front-end, it can happen that we start emitting diagnostics
 152      before the line map has seen the end of the file.  */
 153   size_t m_total_lines;
 154
 155   /* Could this file be missing a trailing newline on its final line?
 156      Initially true (to cope with empty files), set to true/false
 157      as each line is read.  */
 158   bool m_missing_trailing_newline;
 159
 160   /* This is a record of the beginning and end of the lines we've seen
 161      while reading the file.  This is useful to avoid walking the data
 162      from the beginning when we are asked to read a line that is
 163      before LINE_START_IDX above.  Note that the maximum size of this
 164      record is line_record_size, so that the memory consumption
 165      doesn't explode.  We thus scale total_lines down to
 166      line_record_size.  */
 167   vec<line_info, va_heap> m_line_record;
 168
 169   void offset_buffer (int offset)
 170   {
 171     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
 172                 : (size_t) offset <= m_size);
 173     gcc_assert (m_data);
 174     m_alloc_offset += offset;
 175     m_data += offset;
 176     m_size -= offset;
 177   }
 178
 179 };
 180
/* Current position in real source file.  It starts out as
   UNKNOWN_LOCATION.  */

location_t input_location = UNKNOWN_LOCATION;

/* The line table that the location-expansion and line-counting
   helpers in this file consult.  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;
 193
 194 /* Expand the source location LOC into a human readable location.  If
 195    LOC resolves to a builtin location, the file name of the readable
 196    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 197    TRUE and LOC is virtual, then it is resolved to the expansion
 198    point of the involved macro.  Otherwise, it is resolved to the
 199    spelling location of the token.
 200
 201    When resolving to the spelling location of the token, if the
 202    resulting location is for a built-in location (that is, it has no
 203    associated line/column) in the context of a macro expansion, the
 204    returned location is the first one (while unwinding the macro
 205    location towards its expansion point) that is in real source
 206    code.
 207
 208    ASPECT controls which part of the location to use.  */
 209
 210 static expanded_location
 211 expand_location_1 (location_t loc,
 212                    bool expansion_point_p,
 213                    enum location_aspect aspect)
 214 {
 215   expanded_location xloc;
 216   const line_map_ordinary *map;
 217   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 218   tree block = NULL;
 219
 220   if (IS_ADHOC_LOC (loc))
 221     {
 222       block = LOCATION_BLOCK (loc);
 223       loc = LOCATION_LOCUS (loc);
 224     }
 225
 226   memset (&xloc, 0, sizeof (xloc));
 227
 228   if (loc >= RESERVED_LOCATION_COUNT)
 229     {
 230       if (!expansion_point_p)
 231         {
 232           /* We want to resolve LOC to its spelling location.
 233
 234              But if that spelling location is a reserved location that
 235              appears in the context of a macro expansion (like for a
 236              location for a built-in token), let's consider the first
 237              location (toward the expansion point) that is not reserved;
 238              that is, the first location that is in real source code.  */
 239           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 240                                                           loc, NULL);
 241           lrk = LRK_SPELLING_LOCATION;
 242         }
 243       loc = linemap_resolve_location (line_table, loc, lrk, &map);
 244
 245       /* loc is now either in an ordinary map, or is a reserved location.
 246          If it is a compound location, the caret is in a spelling location,
 247          but the start/finish might still be a virtual location.
 248          Depending of what the caller asked for, we may need to recurse
 249          one level in order to resolve any virtual locations in the
 250          end-points.  */
 251       switch (aspect)
 252         {
 253         default:
 254           gcc_unreachable ();
 255           /* Fall through.  */
 256         case LOCATION_ASPECT_CARET:
 257           break;
 258         case LOCATION_ASPECT_START:
 259           {
 260             location_t start = get_start (loc);
 261             if (start != loc)
 262               return expand_location_1 (start, expansion_point_p, aspect);
 263           }
 264           break;
 265         case LOCATION_ASPECT_FINISH:
 266           {
 267             location_t finish = get_finish (loc);
 268             if (finish != loc)
 269               return expand_location_1 (finish, expansion_point_p, aspect);
 270           }
 271           break;
 272         }
 273       xloc = linemap_expand_location (line_table, map, loc);
 274     }
 275
 276   xloc.data = block;
 277   if (loc <= BUILTINS_LOCATION)
 278     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 279
 280   return xloc;
 281 }
 282
 283 /* Initialize the set of cache used for files accessed by caret
 284    diagnostic.  */
 285
 286 static void
 287 diagnostic_file_cache_init (void)
 288 {
 289   gcc_assert (global_dc);
 290   if (global_dc->m_file_cache == NULL)
 291     global_dc->m_file_cache = new file_cache ();
 292 }
 293
 294 /* Free the resources used by the set of cache used for files accessed
 295    by caret diagnostic.  */
 296
 297 void
 298 diagnostic_file_cache_fini (void)
 299 {
 300   if (global_dc->m_file_cache)
 301     {
 302       delete global_dc->m_file_cache;
 303       global_dc->m_file_cache = NULL;
 304     }
 305 }
 306
 307 /* Return the total lines number that have been read so far by the
 308    line map (in the preprocessor) so far.  For languages like C++ that
 309    entirely preprocess the input file before starting to parse, this
 310    equals the actual number of lines of the file.  */
 311
 312 static size_t
 313 total_lines_num (const char *file_path)
 314 {
 315   size_t r = 0;
 316   location_t l = 0;
 317   if (linemap_get_file_highest_location (line_table, file_path, &l))
 318     {
 319       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 320       expanded_location xloc = expand_location (l);
 321       r = xloc.line;
 322     }
 323   return r;
 324 }
 325
 326 /* Lookup the cache used for the content of a given file accessed by
 327    caret diagnostic.  Return the found cached file, or NULL if no
 328    cached file was found.  */
 329
 330 file_cache_slot *
 331 file_cache::lookup_file (const char *file_path)
 332 {
 333   gcc_assert (file_path);
 334
 335   /* This will contain the found cached file.  */
 336   file_cache_slot *r = NULL;
 337   for (unsigned i = 0; i < num_file_slots; ++i)
 338     {
 339       file_cache_slot *c = &m_file_slots[i];
 340       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 341         {
 342           c->inc_use_count ();
 343           r = c;
 344         }
 345     }
 346
 347   if (r)
 348     r->inc_use_count ();
 349
 350   return r;
 351 }
 352
 353 /* Purge any mention of FILENAME from the cache of files used for
 354    printing source code.  For use in selftests when working
 355    with tempfiles.  */
 356
 357 void
 358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 359 {
 360   gcc_assert (file_path);
 361
 362   if (!global_dc->m_file_cache)
 363     return;
 364
 365   global_dc->m_file_cache->forcibly_evict_file (file_path);
 366 }
 367
 368 void
 369 file_cache::forcibly_evict_file (const char *file_path)
 370 {
 371   gcc_assert (file_path);
 372
 373   file_cache_slot *r = lookup_file (file_path);
 374   if (!r)
 375     /* Not found.  */
 376     return;
 377
 378   r->evict ();
 379 }
 380
 381 void
 382 file_cache_slot::evict ()
 383 {
 384   m_file_path = NULL;
 385   if (m_fp)
 386     fclose (m_fp);
 387   m_fp = NULL;
 388   m_nb_read = 0;
 389   m_line_start_idx = 0;
 390   m_line_num = 0;
 391   m_line_record.truncate (0);
 392   m_use_count = 0;
 393   m_total_lines = 0;
 394   m_missing_trailing_newline = true;
 395 }
 396
 397 /* Return the file cache that has been less used, recently, or the
 398    first empty one.  If HIGHEST_USE_COUNT is non-null,
 399    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 400    in the cache table.  */
 401
 402 file_cache_slot*
 403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 404 {
 405   diagnostic_file_cache_init ();
 406
 407   file_cache_slot *to_evict = &m_file_slots[0];
 408   unsigned huc = to_evict->get_use_count ();
 409   for (unsigned i = 1; i < num_file_slots; ++i)
 410     {
 411       file_cache_slot *c = &m_file_slots[i];
 412       bool c_is_empty = (c->get_file_path () == NULL);
 413
 414       if (c->get_use_count () < to_evict->get_use_count ()
 415           || (to_evict->get_file_path () && c_is_empty))
 416         /* We evict C because it's either an entry with a lower use
 417            count or one that is empty.  */
 418         to_evict = c;
 419
 420       if (huc < c->get_use_count ())
 421         huc = c->get_use_count ();
 422
 423       if (c_is_empty)
 424         /* We've reached the end of the cache; subsequent elements are
 425            all empty.  */
 426         break;
 427     }
 428
 429   if (highest_use_count)
 430     *highest_use_count = huc;
 431
 432   return to_evict;
 433 }
 434
 435 /* Create the cache used for the content of a given file to be
 436    accessed by caret diagnostic.  This cache is added to an array of
 437    cache and can be retrieved by lookup_file_in_cache_tab.  This
 438    function returns the created cache.  Note that only the last
 439    num_file_slots files are cached.  */
 440
 441 file_cache_slot*
 442 file_cache::add_file (const char *file_path)
 443 {
 444
 445   FILE *fp = fopen (file_path, "r");
 446   if (fp == NULL)
 447     return NULL;
 448
 449   unsigned highest_use_count = 0;
 450   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 451   if (!r->create (in_context, file_path, fp, highest_use_count))
 452     return NULL;
 453   return r;
 454 }
 455
 456 /* Populate this slot for use on FILE_PATH and FP, dropping any
 457    existing cached content within it.  */
 458
 459 bool
 460 file_cache_slot::create (const file_cache::input_context &in_context,
 461                          const char *file_path, FILE *fp,
 462                          unsigned highest_use_count)
 463 {
 464   m_file_path = file_path;
 465   if (m_fp)
 466     fclose (m_fp);
 467   m_fp = fp;
 468   if (m_alloc_offset)
 469     offset_buffer (-m_alloc_offset);
 470   m_nb_read = 0;
 471   m_line_start_idx = 0;
 472   m_line_num = 0;
 473   m_line_record.truncate (0);
 474   /* Ensure that this cache entry doesn't get evicted next time
 475      add_file_to_cache_tab is called.  */
 476   m_use_count = ++highest_use_count;
 477   m_total_lines = total_lines_num (file_path);
 478   m_missing_trailing_newline = true;
 479
 480
 481   /* Check the input configuration to determine if we need to do any
 482      transformations, such as charset conversion or BOM skipping.  */
 483   if (const char *input_charset = in_context.ccb (file_path))
 484     {
 485       /* Need a full-blown conversion of the input charset.  */
 486       fclose (m_fp);
 487       m_fp = NULL;
 488       const cpp_converted_source cs
 489         = cpp_get_converted_source (file_path, input_charset);
 490       if (!cs.data)
 491         return false;
 492       if (m_data)
 493         XDELETEVEC (m_data);
 494       m_data = cs.data;
 495       m_nb_read = m_size = cs.len;
 496       m_alloc_offset = cs.data - cs.to_free;
 497     }
 498   else if (in_context.should_skip_bom)
 499     {
 500       if (read_data ())
 501         {
 502           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 503           offset_buffer (offset);
 504           m_nb_read -= offset;
 505         }
 506     }
 507
 508   return true;
 509 }
 510
 511 /* file_cache's ctor.  */
 512
 513 file_cache::file_cache ()
 514 : m_file_slots (new file_cache_slot[num_file_slots])
 515 {
 516   initialize_input_context (nullptr, false);
 517 }
 518
 519 /* file_cache's dtor.  */
 520
 521 file_cache::~file_cache ()
 522 {
 523   delete[] m_file_slots;
 524 }
 525
 526 /* Lookup the cache used for the content of a given file accessed by
 527    caret diagnostic.  If no cached file was found, create a new cache
 528    for this file, add it to the array of cached file and return
 529    it.  */
 530
 531 file_cache_slot*
 532 file_cache::lookup_or_add_file (const char *file_path)
 533 {
 534   file_cache_slot *r = lookup_file (file_path);
 535   if (r == NULL)
 536     r = add_file (file_path);
 537   return r;
 538 }
 539
 540 /* Default constructor for a cache of file used by caret
 541    diagnostic.  */
 542
 543 file_cache_slot::file_cache_slot ()
 544 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
 545   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
 546   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 547 {
 548   m_line_record.create (0);
 549 }
 550
 551 /* Destructor for a cache of file used by caret diagnostic.  */
 552
 553 file_cache_slot::~file_cache_slot ()
 554 {
 555   if (m_fp)
 556     {
 557       fclose (m_fp);
 558       m_fp = NULL;
 559     }
 560   if (m_data)
 561     {
 562       offset_buffer (-m_alloc_offset);
 563       XDELETEVEC (m_data);
 564       m_data = 0;
 565     }
 566   m_line_record.release ();
 567 }
 568
 569 /* Returns TRUE iff the cache would need to be filled with data coming
 570    from the file.  That is, either the cache is empty or full or the
 571    current line is empty.  Note that if the cache is full, it would
 572    need to be extended and filled again.  */
 573
 574 bool
 575 file_cache_slot::needs_read_p () const
 576 {
 577   return m_fp && (m_nb_read == 0
 578           || m_nb_read == m_size
 579           || (m_line_start_idx >= m_nb_read - 1));
 580 }
 581
 582 /*  Return TRUE iff the cache is full and thus needs to be
 583     extended.  */
 584
 585 bool
 586 file_cache_slot::needs_grow_p () const
 587 {
 588   return m_nb_read == m_size;
 589 }
 590
 591 /* Grow the cache if it needs to be extended.  */
 592
 593 void
 594 file_cache_slot::maybe_grow ()
 595 {
 596   if (!needs_grow_p ())
 597     return;
 598
 599   if (!m_data)
 600     {
 601       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 602       m_size = buffer_size;
 603       m_data = XNEWVEC (char, m_size);
 604     }
 605   else
 606     {
 607       const int offset = m_alloc_offset;
 608       offset_buffer (-offset);
 609       m_size *= 2;
 610       m_data = XRESIZEVEC (char, m_data, m_size);
 611       offset_buffer (offset);
 612     }
 613 }
 614
 615 /*  Read more data into the cache.  Extends the cache if need be.
 616     Returns TRUE iff new data could be read.  */
 617
 618 bool
 619 file_cache_slot::read_data ()
 620 {
 621   if (feof (m_fp) || ferror (m_fp))
 622     return false;
 623
 624   maybe_grow ();
 625
 626   char * from = m_data + m_nb_read;
 627   size_t to_read = m_size - m_nb_read;
 628   size_t nb_read = fread (from, 1, to_read, m_fp);
 629
 630   if (ferror (m_fp))
 631     return false;
 632
 633   m_nb_read += nb_read;
 634   return !!nb_read;
 635 }
 636
 637 /* Read new data iff the cache needs to be filled with more data
 638    coming from the file FP.  Return TRUE iff the cache was filled with
 639    mode data.  */
 640
 641 bool
 642 file_cache_slot::maybe_read_data ()
 643 {
 644   if (!needs_read_p ())
 645     return false;
 646   return read_data ();
 647 }
 648
 649 /* Helper function for file_cache_slot::get_next_line (), to find the end of
 650    the next line.  Returns with the memchr convention, i.e. nullptr if a line
 651    terminator was not found.  We need to determine line endings in the same
 652    manner that libcpp does: any of \n, \r\n, or \r is a line ending.  */
 653
 654 static char *
 655 find_end_of_line (char *s, size_t len)
 656 {
 657   for (const auto end = s + len; s != end; ++s)
 658     {
 659       if (*s == '\n')
 660         return s;
 661       if (*s == '\r')
 662         {
 663           const auto next = s + 1;
 664           if (next == end)
 665             {
 666               /* Don't find the line ending if \r is the very last character
 667                  in the buffer; we do not know if it's the end of the file or
 668                  just the end of what has been read so far, and we wouldn't
 669                  want to break in the middle of what's actually a \r\n
 670                  sequence.  Instead, we will handle the case of a file ending
 671                  in a \r later.  */
 672               break;
 673             }
 674           return (*next == '\n' ? next : s);
 675         }
 676     }
 677   return nullptr;
 678 }
 679
 680 /* Read a new line from file FP, using C as a cache for the data
 681    coming from the file.  Upon successful completion, *LINE is set to
 682    the beginning of the line found.  *LINE points directly in the
 683    line cache and is only valid until the next call of get_next_line.
 684    *LINE_LEN is set to the length of the line.  Note that the line
 685    does not contain any terminal delimiter.  This function returns
 686    true if some data was read or process from the cache, false
 687    otherwise.  Note that subsequent calls to get_next_line might
 688    make the content of *LINE invalid.  */
 689
 690 bool
 691 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
 692 {
 693   /* Fill the cache with data to process.  */
 694   maybe_read_data ();
 695
 696   size_t remaining_size = m_nb_read - m_line_start_idx;
 697   if (remaining_size == 0)
 698     /* There is no more data to process.  */
 699     return false;
 700
 701   char *line_start = m_data + m_line_start_idx;
 702
 703   char *next_line_start = NULL;
 704   size_t len = 0;
 705   char *line_end = find_end_of_line (line_start, remaining_size);
 706   if (line_end == NULL)
 707     {
 708       /* We haven't found an end-of-line delimiter in the cache.
 709          Fill the cache with more data from the file and look again.  */
 710       while (maybe_read_data ())
 711         {
 712           line_start = m_data + m_line_start_idx;
 713           remaining_size = m_nb_read - m_line_start_idx;
 714           line_end = find_end_of_line (line_start, remaining_size);
 715           if (line_end != NULL)
 716             {
 717               next_line_start = line_end + 1;
 718               break;
 719             }
 720         }
 721       if (line_end == NULL)
 722         {
 723           /* We've loaded all the file into the cache and still no
 724              terminator.  Let's say the line ends up at one byte past the
 725              end of the file.  This is to stay consistent with the case
 726              of when the line ends up with a terminator and line_end points to
 727              that.  That consistency is useful below in the len calculation.
 728
 729              If the file ends in a \r, we didn't identify it as a line
 730              terminator above, so do that now instead.  */
 731           line_end = m_data + m_nb_read;
 732           if (m_nb_read && line_end[-1] == '\r')
 733             {
 734               --line_end;
 735               m_missing_trailing_newline = false;
 736             }
 737           else
 738             m_missing_trailing_newline = true;
 739         }
 740       else
 741         m_missing_trailing_newline = false;
 742     }
 743   else
 744     {
 745       next_line_start = line_end + 1;
 746       m_missing_trailing_newline = false;
 747     }
 748
 749   if (m_fp && ferror (m_fp))
 750     return false;
 751
 752   /* At this point, we've found the end of the of line.  It either points to
 753      the line terminator or to one byte after the last byte of the file.  */
 754   gcc_assert (line_end != NULL);
 755
 756   len = line_end - line_start;
 757
 758   if (m_line_start_idx < m_nb_read)
 759     *line = line_start;
 760
 761   ++m_line_num;
 762
 763   /* Before we update our line record, make sure the hint about the
 764      total number of lines of the file is correct.  If it's not, then
 765      we give up recording line boundaries from now on.  */
 766   bool update_line_record = true;
 767   if (m_line_num > m_total_lines)
 768     update_line_record = false;
 769
 770     /* Now update our line record so that re-reading lines from the
 771      before m_line_start_idx is faster.  */
 772   if (update_line_record
 773       && m_line_record.length () < line_record_size)
 774     {
 775       /* If the file lines fits in the line record, we just record all
 776          its lines ...*/
 777       if (m_total_lines <= line_record_size
 778           && m_line_num > m_line_record.length ())
 779         m_line_record.safe_push
 780           (file_cache_slot::line_info (m_line_num,
 781                                        m_line_start_idx,
 782                                        line_end - m_data));
 783       else if (m_total_lines > line_record_size)
 784         {
 785           /* ... otherwise, we just scale total_lines down to
 786              (line_record_size lines.  */
 787           size_t n = (m_line_num * line_record_size) / m_total_lines;
 788           if (m_line_record.length () == 0
 789               || n >= m_line_record.length ())
 790             m_line_record.safe_push
 791               (file_cache_slot::line_info (m_line_num,
 792                                            m_line_start_idx,
 793                                            line_end - m_data));
 794         }
 795     }
 796
 797   /* Update m_line_start_idx so that it points to the next line to be
 798      read.  */
 799   if (next_line_start)
 800     m_line_start_idx = next_line_start - m_data;
 801   else
 802     /* We didn't find any terminal '\n'.  Let's consider that the end
 803        of line is the end of the data in the cache.  The next
 804        invocation of get_next_line will either read more data from the
 805        underlying file or return false early because we've reached the
 806        end of the file.  */
 807     m_line_start_idx = m_nb_read;
 808
 809   *line_len = len;
 810
 811   return true;
 812 }
 813
 814 /* Consume the next bytes coming from the cache (or from its
 815    underlying file if there are remaining unread bytes in the file)
 816    until we reach the next end-of-line (or end-of-file).  There is no
 817    copying from the cache involved.  Return TRUE upon successful
 818    completion.  */
 819
 820 bool
 821 file_cache_slot::goto_next_line ()
 822 {
 823   char *l;
 824   ssize_t len;
 825
 826   return get_next_line (&l, &len);
 827 }
 828
/* Read line number LINE_NUM (which must be greater than zero) from the
   file cached in this slot.
   If the line was read successfully, *LINE points to the beginning
   of the line in the file cache and *LINE_LEN is the length of the
   line.  *LINE is not nul-terminated, but may contain zero bytes.
   *LINE is only valid until the next call of read_line_num.
   This function returns true if a line was read, false otherwise.  */

bool
file_cache_slot::read_line_num (size_t line_num,
                       char ** line, ssize_t *line_len)
{
  gcc_assert (line_num > 0);

  if (line_num <= m_line_num)
    {
      /* We've been asked to read lines that are before m_line_num.
         So lets use our line record (if it's not empty) to try to
         avoid re-reading the file from the beginning again.  */

      if (m_line_record.is_empty ())
        {
          /* Nothing recorded: restart from the first line.  */
          m_line_start_idx = 0;
          m_line_num = 0;
        }
      else
        {
          /* The recorded line to resume from, if a usable one is
             found.  */
          file_cache_slot::line_info *i = NULL;
          if (m_total_lines <= line_record_size)
            {
              /* In languages where the input file is not totally
                 preprocessed up front, the m_total_lines hint
                 can be smaller than the number of lines of the
                 file.  In that case, only the first
                 m_total_lines have been recorded.

                 Otherwise, the first m_total_lines we've read have
                 their start/end recorded here.  */
              i = (line_num <= m_total_lines)
                ? &m_line_record[line_num - 1]
                : &m_line_record[m_total_lines - 1];
              gcc_assert (i->line_num <= line_num);
            }
          else
            {
              /*  So the file had more lines than our line record
                  size.  Thus the number of lines we've recorded has
                  been scaled down to line_record_size.  Let's
                  pick the start/end of the recorded line that is
                  closest to line_num.  */
              size_t n = (line_num <= m_total_lines)
                ? line_num * line_record_size / m_total_lines
                : m_line_record.length () - 1;
              if (n < m_line_record.length ())
                {
                  i = &m_line_record[n];
                  gcc_assert (i->line_num <= line_num);
                }
            }

          if (i && i->line_num == line_num)
            {
              /* We have the start/end of the line.  */
              *line = m_data + i->start_pos;
              *line_len = i->end_pos - i->start_pos;
              return true;
            }

          if (i)
            {
              /* Resume reading at the recorded line, which is at or
                 before the requested one.  */
              m_line_start_idx = i->start_pos;
              m_line_num = i->line_num - 1;
            }
          else
            {
              /* No usable record entry: restart from the first
                 line.  */
              m_line_start_idx = 0;
              m_line_num = 0;
            }
        }
    }

  /*  Let's walk from line m_line_num up to line_num - 1, without
      copying any line.  */
  while (m_line_num < line_num - 1)
    if (!goto_next_line ())
      return false;

  /* The line we want is the next one.  Let's read it and hand it back
     to the caller.  */
  return get_next_line (line, line_len);
}
 919
 920 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 921    The line is not nul-terminated.  The returned pointer is only
 922    valid until the next call of location_get_source_line.
 923    Note that the line can contain several null characters,
 924    so the returned value's length has the actual length of the line.
 925    If the function fails, a NULL char_span is returned.  */
 926
 927 char_span
 928 location_get_source_line (const char *file_path, int line)
 929 {
 930   char *buffer = NULL;
 931   ssize_t len;
 932
 933   if (line == 0)
 934     return char_span (NULL, 0);
 935
 936   if (file_path == NULL)
 937     return char_span (NULL, 0);
 938
 939   diagnostic_file_cache_init ();
 940
 941   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 942   if (c == NULL)
 943     return char_span (NULL, 0);
 944
 945   bool read = c->read_line_num (line, &buffer, &len);
 946   if (!read)
 947     return char_span (NULL, 0);
 948
 949   return char_span (buffer, len);
 950 }
 951
 952 /* Determine if FILE_PATH missing a trailing newline on its final line.
 953    Only valid to call once all of the file has been loaded, by
 954    requesting a line number beyond the end of the file.  */
 955
 956 bool
 957 location_missing_trailing_newline (const char *file_path)
 958 {
 959   diagnostic_file_cache_init ();
 960
 961   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 962   if (c == NULL)
 963     return false;
 964
 965   return c->missing_trailing_newline_p ();
 966 }
 967
 968 /* Test if the location originates from the spelling location of a
 969    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 970    virtual) location of a built-in token that appears in the expansion
 971    list of a macro.  Please note that this function also works on
 972    tokens that result from built-in tokens.  For instance, the
 973    function would return true if passed a token "4" that is the result
 974    of the expansion of the built-in __LINE__ macro.  */
 975 bool
 976 is_location_from_builtin_token (location_t loc)
 977 {
 978   const line_map_ordinary *map = NULL;
 979   loc = linemap_resolve_location (line_table, loc,
 980                                   LRK_SPELLING_LOCATION, &map);
 981   return loc == BUILTINS_LOCATION;
 982 }
 983
 984 /* Expand the source location LOC into a human readable location.  If
 985    LOC is virtual, it resolves to the expansion point of the involved
 986    macro.  If LOC resolves to a builtin location, the file name of the
 987    readable location is set to the string "<built-in>".  */
 988
 989 expanded_location
 990 expand_location (location_t loc)
 991 {
 992   return expand_location_1 (loc, /*expansion_point_p=*/true,
 993                             LOCATION_ASPECT_CARET);
 994 }
 995
 996 /* Expand the source location LOC into a human readable location.  If
 997    LOC is virtual, it resolves to the expansion location of the
 998    relevant macro.  If LOC resolves to a builtin location, the file
 999    name of the readable location is set to the string
1000    "<built-in>".  */
1001
1002 expanded_location
1003 expand_location_to_spelling_point (location_t loc,
1004                                    enum location_aspect aspect)
1005 {
1006   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1007 }
1008
1009 /* The rich_location class within libcpp requires a way to expand
1010    location_t instances, and relies on the client code
1011    providing a symbol named
1012      linemap_client_expand_location_to_spelling_point
1013    to do this.
1014
1015    This is the implementation for libcommon.a (all host binaries),
1016    which simply calls into expand_location_1.  */
1017
1018 expanded_location
1019 linemap_client_expand_location_to_spelling_point (location_t loc,
1020                                                   enum location_aspect aspect)
1021 {
1022   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1023 }
1024
1025
1026 /* If LOCATION is in a system header and if it is a virtual location
1027    for a token coming from the expansion of a macro, unwind it to
1028    the location of the expansion point of the macro.  If the expansion
1029    point is also in a system header return the original LOCATION.
1030    Otherwise, return the location of the expansion point.
1031
1032    This is used for instance when we want to emit diagnostics about a
1033    token that may be located in a macro that is itself defined in a
1034    system header, for example, for the NULL macro.  In such a case, if
1035    LOCATION were passed directly to diagnostic functions such as
1036    warning_at, the diagnostic would be suppressed (unless
1037    -Wsystem-headers).  */
1038
1039 location_t
1040 expansion_point_location_if_in_system_header (location_t location)
1041 {
1042   if (!in_system_header_at (location))
1043     return location;
1044
1045   location_t xloc = linemap_resolve_location (line_table, location,
1046                                               LRK_MACRO_EXPANSION_POINT,
1047                                               NULL);
1048   return in_system_header_at (xloc) ? location : xloc;
1049 }
1050
1051 /* If LOCATION is a virtual location for a token coming from the expansion
1052    of a macro, unwind to the location of the expansion point of the macro.  */
1053
1054 location_t
1055 expansion_point_location (location_t location)
1056 {
1057   return linemap_resolve_location (line_table, location,
1058                                    LRK_MACRO_EXPANSION_POINT, NULL);
1059 }
1060
1061 /* Construct a location with caret at CARET, ranging from START to
1062    finish e.g.
1063
1064                  11111111112
1065         12345678901234567890
1066      522
1067      523   return foo + bar;
1068                   ~~~~^~~~~
1069      524
1070
1071    The location's caret is at the "+", line 523 column 15, but starts
1072    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1073    of "bar" at column 19.  */
1074
1075 location_t
1076 make_location (location_t caret, location_t start, location_t finish)
1077 {
1078   location_t pure_loc = get_pure_location (caret);
1079   source_range src_range;
1080   src_range.m_start = get_start (start);
1081   src_range.m_finish = get_finish (finish);
1082   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1083                                                    pure_loc,
1084                                                    src_range,
1085                                                    NULL,
1086                                                    0);
1087   return combined_loc;
1088 }
1089
1090 /* Same as above, but taking a source range rather than two locations.  */
1091
1092 location_t
1093 make_location (location_t caret, source_range src_range)
1094 {
1095   location_t pure_loc = get_pure_location (caret);
1096   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL, 0);
1097 }
1098
1099 /* An expanded_location stores the column in byte units.  This function
1100    converts that column to display units.  That requires reading the associated
1101    source line in order to calculate the display width.  If that cannot be done
1102    for any reason, then returns the byte column as a fallback.  */
1103 int
1104 location_compute_display_column (expanded_location exploc,
1105                                  const cpp_char_column_policy &policy)
1106 {
1107   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1108     return exploc.column;
1109   char_span line = location_get_source_line (exploc.file, exploc.line);
1110   /* If line is NULL, this function returns exploc.column which is the
1111      desired fallback.  */
1112   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1113                                             exploc.column, policy);
1114 }
1115
1116 /* Dump statistics to stderr about the memory usage of the line_table
1117    set of line maps.  This also displays some statistics about macro
1118    expansion.  */
1119
1120 void
1121 dump_line_table_statistics (void)
1122 {
1123   struct linemap_stats s;
1124   long total_used_map_size,
1125     macro_maps_size,
1126     total_allocated_map_size;
1127
1128   memset (&s, 0, sizeof (s));
1129
1130   linemap_get_statistics (line_table, &s);
1131
1132   macro_maps_size = s.macro_maps_used_size
1133     + s.macro_maps_locations_size;
1134
1135   total_allocated_map_size = s.ordinary_maps_allocated_size
1136     + s.macro_maps_allocated_size
1137     + s.macro_maps_locations_size;
1138
1139   total_used_map_size = s.ordinary_maps_used_size
1140     + s.macro_maps_used_size
1141     + s.macro_maps_locations_size;
1142
1143   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1144            s.num_expanded_macros);
1145   if (s.num_expanded_macros != 0)
1146     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1147              s.num_macro_tokens / s.num_expanded_macros);
1148   fprintf (stderr,
1149            "\nLine Table allocations during the "
1150            "compilation process\n");
1151   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1152            SIZE_AMOUNT (s.num_ordinary_maps_used));
1153   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1154            SIZE_AMOUNT (s.ordinary_maps_used_size));
1155   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1156            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1157   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1158            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1159   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1160            SIZE_AMOUNT (s.num_macro_maps_used));
1161   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1162            SIZE_AMOUNT (s.macro_maps_used_size));
1163   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1164            SIZE_AMOUNT (s.macro_maps_locations_size));
1165   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1166            SIZE_AMOUNT (macro_maps_size));
1167   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1168            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1169   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1170            SIZE_AMOUNT (total_allocated_map_size));
1171   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1172            SIZE_AMOUNT (total_used_map_size));
1173   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1174            SIZE_AMOUNT (s.adhoc_table_size));
1175   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1176            SIZE_AMOUNT (s.adhoc_table_entries_used));
1177   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1178            SIZE_AMOUNT (line_table->num_optimized_ranges));
1179   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1180            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1181
1182   fprintf (stderr, "\n");
1183 }
1184
1185 /* Get location one beyond the final location in ordinary map IDX.  */
1186
1187 static location_t
1188 get_end_location (class line_maps *set, unsigned int idx)
1189 {
1190   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1191     return set->highest_location;
1192
1193   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1194   return MAP_START_LOCATION (next_map);
1195 }
1196
/* Helper function for write_digit_row.
   Write the least significant decimal digit of DIGIT to STREAM as a
   single character.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1204
1205 /* Helper function for dump_location_info.
1206    Write a row of numbers to STREAM, numbering a source line,
1207    giving the units, tens, hundreds etc of the column number.  */
1208
1209 static void
1210 write_digit_row (FILE *stream, int indent,
1211                  const line_map_ordinary *map,
1212                  location_t loc, int max_col, int divisor)
1213 {
1214   fprintf (stream, "%*c", indent, ' ');
1215   fprintf (stream, "|");
1216   for (int column = 1; column < max_col; column++)
1217     {
1218       location_t column_loc = loc + (column << map->m_range_bits);
1219       write_digit (stream, column_loc / divisor);
1220     }
1221   fprintf (stream, "\n");
1222 }
1223
1224 /* Write a half-closed (START) / half-open (END) interval of
1225    location_t to STREAM.  */
1226
1227 static void
1228 dump_location_range (FILE *stream,
1229                      location_t start, location_t end)
1230 {
1231   fprintf (stream,
1232            "  location_t interval: %u <= loc < %u\n",
1233            start, end);
1234 }
1235
1236 /* Write a labelled description of a half-closed (START) / half-open (END)
1237    interval of location_t to STREAM.  */
1238
1239 static void
1240 dump_labelled_location_range (FILE *stream,
1241                               const char *name,
1242                               location_t start, location_t end)
1243 {
1244   fprintf (stream, "%s\n", name);
1245   dump_location_range (stream, start, end);
1246   fprintf (stream, "\n");
1247 }
1248
1249 /* Write a visualization of the locations in the line_table to STREAM.  */
1250
1251 void
1252 dump_location_info (FILE *stream)
1253 {
1254   /* Visualize the reserved locations.  */
1255   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1256                                 0, RESERVED_LOCATION_COUNT);
1257
1258   /* Visualize the ordinary line_map instances, rendering the sources. */
1259   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1260     {
1261       location_t end_location = get_end_location (line_table, idx);
1262       /* half-closed: doesn't include this one. */
1263
1264       const line_map_ordinary *map
1265         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1266       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1267       dump_location_range (stream,
1268                            MAP_START_LOCATION (map), end_location);
1269       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1270       fprintf (stream, "  starting at line: %i\n",
1271                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1272       fprintf (stream, "  column and range bits: %i\n",
1273                map->m_column_and_range_bits);
1274       fprintf (stream, "  column bits: %i\n",
1275                map->m_column_and_range_bits - map->m_range_bits);
1276       fprintf (stream, "  range bits: %i\n",
1277                map->m_range_bits);
1278       const char * reason;
1279       switch (map->reason) {
1280       case LC_ENTER:
1281         reason = "LC_ENTER";
1282         break;
1283       case LC_LEAVE:
1284         reason = "LC_LEAVE";
1285         break;
1286       case LC_RENAME:
1287         reason = "LC_RENAME";
1288         break;
1289       case LC_RENAME_VERBATIM:
1290         reason = "LC_RENAME_VERBATIM";
1291         break;
1292       case LC_ENTER_MACRO:
1293         reason = "LC_RENAME_MACRO";
1294         break;
1295       default:
1296         reason = "Unknown";
1297       }
1298       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1299
1300       const line_map_ordinary *includer_map
1301         = linemap_included_from_linemap (line_table, map);
1302       fprintf (stream, "  included from location: %d",
1303                linemap_included_from (map));
1304       if (includer_map) {
1305         fprintf (stream, " (in ordinary map %d)",
1306                  int (includer_map - line_table->info_ordinary.maps));
1307       }
1308       fprintf (stream, "\n");
1309
1310       /* Render the span of source lines that this "map" covers.  */
1311       for (location_t loc = MAP_START_LOCATION (map);
1312            loc < end_location;
1313            loc += (1 << map->m_range_bits) )
1314         {
1315           gcc_assert (pure_location_p (line_table, loc) );
1316
1317           expanded_location exploc
1318             = linemap_expand_location (line_table, map, loc);
1319
1320           if (exploc.column == 0)
1321             {
1322               /* Beginning of a new source line: draw the line.  */
1323
1324               char_span line_text = location_get_source_line (exploc.file,
1325                                                               exploc.line);
1326               if (!line_text)
1327                 break;
1328               fprintf (stream,
1329                        "%s:%3i|loc:%5i|%.*s\n",
1330                        exploc.file, exploc.line,
1331                        loc,
1332                        (int)line_text.length (), line_text.get_buffer ());
1333
1334               /* "loc" is at column 0, which means "the whole line".
1335                  Render the locations *within* the line, by underlining
1336                  it, showing the location_t numeric values
1337                  at each column.  */
1338               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1339               if (max_col > line_text.length ())
1340                 max_col = line_text.length () + 1;
1341
1342               int len_lnum = num_digits (exploc.line);
1343               if (len_lnum < 3)
1344                 len_lnum = 3;
1345               int len_loc = num_digits (loc);
1346               if (len_loc < 5)
1347                 len_loc = 5;
1348
1349               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1350
1351               /* Thousands.  */
1352               if (end_location > 999)
1353                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1354
1355               /* Hundreds.  */
1356               if (end_location > 99)
1357                 write_digit_row (stream, indent, map, loc, max_col, 100);
1358
1359               /* Tens.  */
1360               write_digit_row (stream, indent, map, loc, max_col, 10);
1361
1362               /* Units.  */
1363               write_digit_row (stream, indent, map, loc, max_col, 1);
1364             }
1365         }
1366       fprintf (stream, "\n");
1367     }
1368
1369   /* Visualize unallocated values.  */
1370   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1371                                 line_table->highest_location,
1372                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1373
1374   /* Visualize the macro line_map instances, rendering the sources. */
1375   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1376     {
1377       /* Each macro map that is allocated owns location_t values
1378          that are *lower* that the one before them.
1379          Hence it's meaningful to view them either in order of ascending
1380          source locations, or in order of ascending macro map index.  */
1381       const bool ascending_location_ts = true;
1382       unsigned int idx = (ascending_location_ts
1383                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1384                           : i);
1385       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1386       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1387                idx,
1388                linemap_map_get_macro_name (map),
1389                MACRO_MAP_NUM_MACRO_TOKENS (map));
1390       dump_location_range (stream,
1391                            map->start_location,
1392                            (map->start_location
1393                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1394       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1395               "expansion point is location %i",
1396               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1397       fprintf (stream, "  map->start_location: %u\n",
1398                map->start_location);
1399
1400       fprintf (stream, "  macro_locations:\n");
1401       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1402         {
1403           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1404           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1405
1406           /* linemap_add_macro_token encodes token numbers in an expansion
1407              by putting them after MAP_START_LOCATION. */
1408
1409           /* I'm typically seeing 4 uninitialized entries at the end of
1410              0xafafafaf.
1411              This appears to be due to macro.cc:replace_args
1412              adding 2 extra args for padding tokens; presumably there may
1413              be a leading and/or trailing padding token injected,
1414              each for 2 more location slots.
1415              This would explain there being up to 4 location_ts slots
1416              that may be uninitialized.  */
1417
1418           fprintf (stream, "    %u: %u, %u\n",
1419                    i,
1420                    x,
1421                    y);
1422           if (x == y)
1423             {
1424               if (x < MAP_START_LOCATION (map))
1425                 inform (x, "token %u has %<x-location == y-location == %u%>",
1426                         i, x);
1427               else
1428                 fprintf (stream,
1429                          "x-location == y-location == %u encodes token # %u\n",
1430                          x, x - MAP_START_LOCATION (map));
1431                 }
1432           else
1433             {
1434               inform (x, "token %u has %<x-location == %u%>", i, x);
1435               inform (x, "token %u has %<y-location == %u%>", i, y);
1436             }
1437         }
1438       fprintf (stream, "\n");
1439     }
1440
1441   /* It appears that MAX_LOCATION_T itself is never assigned to a
1442      macro map, presumably due to an off-by-one error somewhere
1443      between the logic in linemap_enter_macro and
1444      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1445   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1446                                 MAX_LOCATION_T,
1447                                 MAX_LOCATION_T + 1);
1448
1449   /* Visualize ad-hoc values.  */
1450   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1451                                 MAX_LOCATION_T + 1, UINT_MAX);
1452 }
1453
1454 /* string_concat's constructor.  */
1455
1456 string_concat::string_concat (int num, location_t *locs)
1457   : m_num (num)
1458 {
1459   m_locs = ggc_vec_alloc <location_t> (num);
1460   for (int i = 0; i < num; i++)
1461     m_locs[i] = locs[i];
1462 }
1463
1464 /* string_concat_db's constructor.  */
1465
1466 string_concat_db::string_concat_db ()
1467 {
1468   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1469 }
1470
1471 /* Record that a string concatenation occurred, covering NUM
1472    string literal tokens.  LOCS is an array of size NUM, containing the
1473    locations of the tokens.  A copy of LOCS is taken.  */
1474
1475 void
1476 string_concat_db::record_string_concatenation (int num, location_t *locs)
1477 {
1478   gcc_assert (num > 1);
1479   gcc_assert (locs);
1480
1481   location_t key_loc = get_key_loc (locs[0]);
1482   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1483      any data now recorded under key 'key_loc' would be overwritten by a
1484      subsequent call with the same key 'key_loc'.  */
1485   if (RESERVED_LOCATION_P (key_loc))
1486     return;
1487
1488   string_concat *concat
1489     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1490   m_table->put (key_loc, concat);
1491 }
1492
1493 /* Determine if LOC was the location of the initial token of a
1494    concatenation of string literal tokens.
1495    If so, *OUT_NUM is written to with the number of tokens, and
1496    *OUT_LOCS with the location of an array of locations of the
1497    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1498    storage owned by the string_concat_db.
1499    Otherwise, return false.  */
1500
1501 bool
1502 string_concat_db::get_string_concatenation (location_t loc,
1503                                             int *out_num,
1504                                             location_t **out_locs)
1505 {
1506   gcc_assert (out_num);
1507   gcc_assert (out_locs);
1508
1509   location_t key_loc = get_key_loc (loc);
1510   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1511      discussion in 'string_concat_db::record_string_concatenation'.  */
1512   if (RESERVED_LOCATION_P (key_loc))
1513     return false;
1514
1515   string_concat **concat = m_table->get (key_loc);
1516   if (!concat)
1517     return false;
1518
1519   *out_num = (*concat)->m_num;
1520   *out_locs =(*concat)->m_locs;
1521   return true;
1522 }
1523
1524 /* Internal function.  Canonicalize LOC into a form suitable for
1525    use as a key within the database, stripping away macro expansion,
1526    ad-hoc information, and range information, using the location of
1527    the start of LOC within an ordinary linemap.  */
1528
1529 location_t
1530 string_concat_db::get_key_loc (location_t loc)
1531 {
1532   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1533                                   NULL);
1534
1535   loc = get_range_from_loc (line_table, loc).m_start;
1536
1537   return loc;
1538 }
1539
1540 /* Helper class for use within get_substring_ranges_for_loc.
1541    An vec of cpp_string with responsibility for releasing all of the
1542    str->text for each str in the vector.  */
1543
1544 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1545 {
1546  public:
1547   auto_cpp_string_vec (int alloc)
1548     : auto_vec <cpp_string> (alloc) {}
1549
1550   ~auto_cpp_string_vec ()
1551   {
1552     /* Clean up the copies within this vec.  */
1553     int i;
1554     cpp_string *str;
1555     FOR_EACH_VEC_ELT (*this, i, str)
1556       free (const_cast <unsigned char *> (str->text));
1557   }
1558 };
1559
1560 /* Attempt to populate RANGES with source location information on the
1561    individual characters within the string literal found at STRLOC.
1562    If CONCATS is non-NULL, then any string literals that the token at
1563    STRLOC  was concatenated with are also added to RANGES.
1564
1565    Return NULL if successful, or an error message if any errors occurred (in
1566    which case RANGES may be only partially populated and should not
1567    be used).
1568
1569    This is implemented by re-parsing the relevant source line(s).  */
1570
1571 static const char *
1572 get_substring_ranges_for_loc (cpp_reader *pfile,
1573                               string_concat_db *concats,
1574                               location_t strloc,
1575                               enum cpp_ttype type,
1576                               cpp_substring_ranges &ranges)
1577 {
1578   gcc_assert (pfile);
1579
1580   if (strloc == UNKNOWN_LOCATION)
1581     return "unknown location";
1582
1583   /* Reparsing the strings requires accurate location information.
1584      If -ftrack-macro-expansion has been overridden from its default
1585      of 2, then we might have a location of a macro expansion point,
1586      rather than the location of the literal itself.
1587      Avoid this by requiring that we have full macro expansion tracking
1588      for substring locations to be available.  */
1589   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1590     return "track_macro_expansion != 2";
1591
1592   /* If #line or # 44 "file"-style directives are present, then there's
1593      no guarantee that the line numbers we have can be used to locate
1594      the strings.  For example, we might have a .i file with # directives
1595      pointing back to lines within a .c file, but the .c file might
1596      have been edited since the .i file was created.
1597      In such a case, the safest course is to disable on-demand substring
1598      locations.  */
1599   if (line_table->seen_line_directive)
1600     return "seen line directive";
1601
1602   /* If string concatenation has occurred at STRLOC, get the locations
1603      of all of the literal tokens making up the compound string.
1604      Otherwise, just use STRLOC.  */
1605   int num_locs = 1;
1606   location_t *strlocs = &strloc;
1607   if (concats)
1608     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1609
1610   auto_cpp_string_vec strs (num_locs);
1611   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1612   for (int i = 0; i < num_locs; i++)
1613     {
1614       /* Get range of strloc.  We will use it to locate the start and finish
1615          of the literal token within the line.  */
1616       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1617
1618       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1619         {
1620           /* If the string token was within a macro expansion, then we can
1621              cope with it for the simple case where we have a single token.
1622              Otherwise, bail out.  */
1623           if (src_range.m_start != src_range.m_finish)
1624             return "macro expansion";
1625         }
1626       else
1627         {
1628           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1629             /* If so, we can't reliably determine where the token started within
1630                its line.  */
1631             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1632
1633           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1634             /* If so, we can't reliably determine where the token finished
1635                within its line.  */
1636             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1637         }
1638
1639       expanded_location start
1640         = expand_location_to_spelling_point (src_range.m_start,
1641                                              LOCATION_ASPECT_START);
1642       expanded_location finish
1643         = expand_location_to_spelling_point (src_range.m_finish,
1644                                              LOCATION_ASPECT_FINISH);
1645       if (start.file != finish.file)
1646         return "range endpoints are in different files";
1647       if (start.line != finish.line)
1648         return "range endpoints are on different lines";
1649       if (start.column > finish.column)
1650         return "range endpoints are reversed";
1651
1652       char_span line = location_get_source_line (start.file, start.line);
1653       if (!line)
1654         return "unable to read source line";
1655
1656       /* Determine the location of the literal (including quotes
1657          and leading prefix chars, such as the 'u' in a u""
1658          token).  */
1659       size_t literal_length = finish.column - start.column + 1;
1660
1661       /* Ensure that we don't crash if we got the wrong location.  */
1662       if (start.column < 1)
1663         return "zero start column";
1664       if (line.length () < (start.column - 1 + literal_length))
1665         return "line is not wide enough";
1666
1667       char_span literal = line.subspan (start.column - 1, literal_length);
1668
1669       cpp_string from;
1670       from.len = literal_length;
1671       /* Make a copy of the literal, to avoid having to rely on
1672          the lifetime of the copy of the line within the cache.
1673          This will be released by the auto_cpp_string_vec dtor.  */
1674       from.text = (unsigned char *)literal.xstrdup ();
1675       strs.safe_push (from);
1676
1677       /* For very long lines, a new linemap could have started
1678          halfway through the token.
1679          Ensure that the loc_reader uses the linemap of the
1680          *end* of the token for its start location.  */
1681       const line_map_ordinary *start_ord_map;
1682       linemap_resolve_location (line_table, src_range.m_start,
1683                                 LRK_SPELLING_LOCATION, &start_ord_map);
1684       const line_map_ordinary *final_ord_map;
1685       linemap_resolve_location (line_table, src_range.m_finish,
1686                                 LRK_SPELLING_LOCATION, &final_ord_map);
1687       if (start_ord_map == NULL || final_ord_map == NULL)
1688         return "failed to get ordinary maps";
1689       /* Bulletproofing.  We ought to only have different ordinary maps
1690          for start vs finish due to line-length jumps.  */
1691       if (start_ord_map != final_ord_map
1692           && start_ord_map->to_file != final_ord_map->to_file)
1693         return "start and finish are spelled in different ordinary maps";
1694       /* The file from linemap_resolve_location ought to match that from
1695          expand_location_to_spelling_point.  */
1696       if (start_ord_map->to_file != start.file)
1697         return "mismatching file after resolving linemap";
1698
1699       location_t start_loc
1700         = linemap_position_for_line_and_column (line_table, final_ord_map,
1701                                                 start.line, start.column);
1702
1703       cpp_string_location_reader loc_reader (start_loc, line_table);
1704       loc_readers.safe_push (loc_reader);
1705     }
1706
1707   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1708   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1709                                                  loc_readers.address (),
1710                                                  num_locs, &ranges, type);
1711   if (err)
1712     return err;
1713
1714   /* Success: "ranges" should now contain information on the string.  */
1715   return NULL;
1716 }
1717
1718 /* Attempt to populate *OUT_LOC with source location information on the
1719    given characters within the string literal found at STRLOC.
1720    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1721    character set.
1722
1723    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1724    and string literal "012345\n789"
1725    *OUT_LOC is written to with:
1726      "012345\n789"
1727          ~^~~~~
1728
1729    If CONCATS is non-NULL, then any string literals that the token at
1730    STRLOC was concatenated with are also considered.
1731
1732    This is implemented by re-parsing the relevant source line(s).
1733
1734    Return NULL if successful, or an error message if any errors occurred.
1735    Error messages are intended for GCC developers (to help debugging) rather
1736    than for end-users.  */
1737
1738 const char *
1739 get_location_within_string (cpp_reader *pfile,
1740                             string_concat_db *concats,
1741                             location_t strloc,
1742                             enum cpp_ttype type,
1743                             int caret_idx, int start_idx, int end_idx,
1744                             location_t *out_loc)
1745 {
1746   gcc_checking_assert (caret_idx >= 0);
1747   gcc_checking_assert (start_idx >= 0);
1748   gcc_checking_assert (end_idx >= 0);
1749   gcc_assert (out_loc);
1750
1751   cpp_substring_ranges ranges;
1752   const char *err
1753     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1754   if (err)
1755     return err;
1756
1757   if (caret_idx >= ranges.get_num_ranges ())
1758     return "caret_idx out of range";
1759   if (start_idx >= ranges.get_num_ranges ())
1760     return "start_idx out of range";
1761   if (end_idx >= ranges.get_num_ranges ())
1762     return "end_idx out of range";
1763
1764   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1765                             ranges.get_range (start_idx).m_start,
1766                             ranges.get_range (end_idx).m_finish);
1767   return NULL;
1768 }
1769
1770 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1771
1772 location_t
1773 location_with_discriminator (location_t locus, int discriminator)
1774 {
1775   tree block = LOCATION_BLOCK (locus);
1776   source_range src_range = get_range_from_loc (line_table, locus);
1777   locus = get_pure_location (locus);
1778
1779   if (locus == UNKNOWN_LOCATION)
1780     return locus;
1781
1782   return COMBINE_LOCATION_DATA (line_table, locus, src_range, block, discriminator);
1783 }
1784
1785 /* Return TRUE if LOCUS represents a location with a discriminator.  */
1786
1787 bool
1788 has_discriminator (location_t locus)
1789 {
1790   return get_discriminator_from_loc (locus) != 0;
1791 }
1792
1793 /* Return the discriminator for LOCUS.  */
1794
1795 int
1796 get_discriminator_from_loc (location_t locus)
1797 {
1798   return get_discriminator_from_loc (line_table, locus);
1799 }
1800
1801 #if CHECKING_P
1802
1803 namespace selftest {
1804
1805 /* Selftests of location handling.  */
1806
1807 /* Attempt to populate *OUT_RANGE with source location information on the
1808    given character within the string literal found at STRLOC.
1809    CHAR_IDX refers to an offset within the execution character set.
1810    If CONCATS is non-NULL, then any string literals that the token at
1811    STRLOC was concatenated with are also considered.
1812
1813    This is implemented by re-parsing the relevant source line(s).
1814
1815    Return NULL if successful, or an error message if any errors occurred.
1816    Error messages are intended for GCC developers (to help debugging) rather
1817    than for end-users.  */
1818
1819 static const char *
1820 get_source_range_for_char (cpp_reader *pfile,
1821                            string_concat_db *concats,
1822                            location_t strloc,
1823                            enum cpp_ttype type,
1824                            int char_idx,
1825                            source_range *out_range)
1826 {
1827   gcc_checking_assert (char_idx >= 0);
1828   gcc_assert (out_range);
1829
1830   cpp_substring_ranges ranges;
1831   const char *err
1832     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1833   if (err)
1834     return err;
1835
1836   if (char_idx >= ranges.get_num_ranges ())
1837     return "char_idx out of range";
1838
1839   *out_range = ranges.get_range (char_idx);
1840   return NULL;
1841 }
1842
1843 /* As get_source_range_for_char, but write to *OUT the number
1844    of ranges that are available.  */
1845
1846 static const char *
1847 get_num_source_ranges_for_substring (cpp_reader *pfile,
1848                                      string_concat_db *concats,
1849                                      location_t strloc,
1850                                      enum cpp_ttype type,
1851                                      int *out)
1852 {
1853   gcc_assert (out);
1854
1855   cpp_substring_ranges ranges;
1856   const char *err
1857     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1858
1859   if (err)
1860     return err;
1861
1862   *out = ranges.get_num_ranges ();
1863   return NULL;
1864 }
1865
1866 /* Selftests of location handling.  */
1867
1868 /* Verify that compare() on linenum_type handles comparisons over the full
1869    range of the type.  */
1870
1871 static void
1872 test_linenum_comparisons ()
1873 {
1874   linenum_type min_line (0);
1875   linenum_type max_line (0xffffffff);
1876   ASSERT_EQ (0, compare (min_line, min_line));
1877   ASSERT_EQ (0, compare (max_line, max_line));
1878
1879   ASSERT_GT (compare (max_line, min_line), 0);
1880   ASSERT_LT (compare (min_line, max_line), 0);
1881 }
1882
1883 /* Helper function for verifying location data: when location_t
1884    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1885    as having column 0.  */
1886
1887 static bool
1888 should_have_column_data_p (location_t loc)
1889 {
1890   if (IS_ADHOC_LOC (loc))
1891     loc = get_location_from_adhoc_loc (line_table, loc);
1892   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1893     return false;
1894   return true;
1895 }
1896
1897 /* Selftest for should_have_column_data_p.  */
1898
1899 static void
1900 test_should_have_column_data_p ()
1901 {
1902   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1903   ASSERT_TRUE
1904     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1905   ASSERT_FALSE
1906     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1907 }
1908
1909 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1910    on LOC.  */
1911
1912 static void
1913 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1914               location_t loc)
1915 {
1916   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1917   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1918   /* If location_t values are sufficiently high, then column numbers
1919      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1920      When close to the threshold, column numbers *may* be present: if
1921      the final linemap before the threshold contains a line that straddles
1922      the threshold, locations in that line have column information.  */
1923   if (should_have_column_data_p (loc))
1924     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1925 }
1926
/* Many selftests construct a line table and one or more line maps
   within it.

   To maximize coverage these tests are run in a variety of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
     and others use zero
   - the fallback modes within line-map.cc: there are various threshold
     values for location_t beyond which line-map.cc changes behavior
     (disabling of the range-packing optimization, disabling of
     column-tracking).  These can be exercised by starting the line_table
     at interesting values at or near these thresholds.

   An instance of this class describes one such case within the test
   matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
  : m_default_range_bits (default_range_bits),
    m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If nonzero, the value at which the line_table's highest_location and
     highest_line start out.  */
  int m_base_location;
};
1954
1955 /* Constructor.  Store the old value of line_table, and create a new
1956    one, using sane defaults.  */
1957
1958 line_table_test::line_table_test ()
1959 {
1960   gcc_assert (saved_line_table == NULL);
1961   saved_line_table = line_table;
1962   line_table = ggc_alloc<line_maps> ();
1963   linemap_init (line_table, BUILTINS_LOCATION);
1964   gcc_assert (saved_line_table->reallocator);
1965   line_table->reallocator = saved_line_table->reallocator;
1966   gcc_assert (saved_line_table->round_alloc_size);
1967   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1968   line_table->default_range_bits = 0;
1969 }
1970
1971 /* Constructor.  Store the old value of line_table, and create a new
1972    one, using the sitation described in CASE_.  */
1973
1974 line_table_test::line_table_test (const line_table_case &case_)
1975 {
1976   gcc_assert (saved_line_table == NULL);
1977   saved_line_table = line_table;
1978   line_table = ggc_alloc<line_maps> ();
1979   linemap_init (line_table, BUILTINS_LOCATION);
1980   gcc_assert (saved_line_table->reallocator);
1981   line_table->reallocator = saved_line_table->reallocator;
1982   gcc_assert (saved_line_table->round_alloc_size);
1983   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1984   line_table->default_range_bits = case_.m_default_range_bits;
1985   if (case_.m_base_location)
1986     {
1987       line_table->highest_location = case_.m_base_location;
1988       line_table->highest_line = case_.m_base_location;
1989     }
1990 }
1991
1992 /* Destructor.  Restore the old value of line_table.  */
1993
1994 line_table_test::~line_table_test ()
1995 {
1996   gcc_assert (saved_line_table != NULL);
1997   line_table = saved_line_table;
1998   saved_line_table = NULL;
1999 }
2000
2001 /* Verify basic operation of ordinary linemaps.  */
2002
2003 static void
2004 test_accessing_ordinary_linemaps (const line_table_case &case_)
2005 {
2006   line_table_test ltt (case_);
2007
2008   /* Build a simple linemap describing some locations. */
2009   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
2010
2011   linemap_line_start (line_table, 1, 100);
2012   location_t loc_a = linemap_position_for_column (line_table, 1);
2013   location_t loc_b = linemap_position_for_column (line_table, 23);
2014
2015   linemap_line_start (line_table, 2, 100);
2016   location_t loc_c = linemap_position_for_column (line_table, 1);
2017   location_t loc_d = linemap_position_for_column (line_table, 17);
2018
2019   /* Example of a very long line.  */
2020   linemap_line_start (line_table, 3, 2000);
2021   location_t loc_e = linemap_position_for_column (line_table, 700);
2022
2023   /* Transitioning back to a short line.  */
2024   linemap_line_start (line_table, 4, 0);
2025   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
2026
2027   if (should_have_column_data_p (loc_back_to_short))
2028     {
2029       /* Verify that we switched to short lines in the linemap.  */
2030       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
2031       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2032     }
2033
2034   /* Example of a line that will eventually be seen to be longer
2035      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2036      below that.  */
2037   linemap_line_start (line_table, 5, 2000);
2038
2039   location_t loc_start_of_very_long_line
2040     = linemap_position_for_column (line_table, 2000);
2041   location_t loc_too_wide
2042     = linemap_position_for_column (line_table, 4097);
2043   location_t loc_too_wide_2
2044     = linemap_position_for_column (line_table, 4098);
2045
2046   /* ...and back to a sane line length.  */
2047   linemap_line_start (line_table, 6, 100);
2048   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2049
2050   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2051
2052   /* Multiple files.  */
2053   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
2054   linemap_line_start (line_table, 1, 200);
2055   location_t loc_f = linemap_position_for_column (line_table, 150);
2056   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
2057
2058   /* Verify that we can recover the location info.  */
2059   assert_loceq ("foo.c", 1, 1, loc_a);
2060   assert_loceq ("foo.c", 1, 23, loc_b);
2061   assert_loceq ("foo.c", 2, 1, loc_c);
2062   assert_loceq ("foo.c", 2, 17, loc_d);
2063   assert_loceq ("foo.c", 3, 700, loc_e);
2064   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
2065
2066   /* In the very wide line, the initial location should be fully tracked.  */
2067   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
2068   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2069      be disabled.  */
2070   assert_loceq ("foo.c", 5, 0, loc_too_wide);
2071   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2072   /*...and column-tracking should be re-enabled for subsequent lines.  */
2073   assert_loceq ("foo.c", 6, 10, loc_sane_again);
2074
2075   assert_loceq ("bar.c", 1, 150, loc_f);
2076
2077   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2078   ASSERT_TRUE (pure_location_p (line_table, loc_a));
2079
2080   /* Verify using make_location to build a range, and extracting data
2081      back from it.  */
2082   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2083   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2084   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2085   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2086   ASSERT_EQ (loc_b, src_range.m_start);
2087   ASSERT_EQ (loc_d, src_range.m_finish);
2088 }
2089
2090 /* Verify various properties of UNKNOWN_LOCATION.  */
2091
2092 static void
2093 test_unknown_location ()
2094 {
2095   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2096   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2097   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2098 }
2099
2100 /* Verify various properties of BUILTINS_LOCATION.  */
2101
2102 static void
2103 test_builtins ()
2104 {
2105   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
2106   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2107 }
2108
2109 /* Regression test for make_location.
2110    Ensure that we use pure locations for the start/finish of the range,
2111    rather than storing a packed or ad-hoc range as the start/finish.  */
2112
2113 static void
2114 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2115 {
2116   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2117      with C++ frontend.
2118      ....................0000000001111111111222.
2119      ....................1234567890123456789012.  */
2120   const char *content = "     r += !aaa == bbb;\n";
2121   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2122   line_table_test ltt (case_);
2123   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2124
2125   const location_t c11 = linemap_position_for_column (line_table, 11);
2126   const location_t c12 = linemap_position_for_column (line_table, 12);
2127   const location_t c13 = linemap_position_for_column (line_table, 13);
2128   const location_t c14 = linemap_position_for_column (line_table, 14);
2129   const location_t c21 = linemap_position_for_column (line_table, 21);
2130
2131   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2132     return;
2133
2134   /* Use column 13 for the caret location, arbitrarily, to verify that we
2135      handle start != caret.  */
2136   const location_t aaa = make_location (c13, c12, c14);
2137   ASSERT_EQ (c13, get_pure_location (aaa));
2138   ASSERT_EQ (c12, get_start (aaa));
2139   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2140   ASSERT_EQ (c14, get_finish (aaa));
2141   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2142
2143   /* Make a location using a location with a range as the start-point.  */
2144   const location_t not_aaa = make_location (c11, aaa, c14);
2145   ASSERT_EQ (c11, get_pure_location (not_aaa));
2146   /* It should use the start location of the range, not store the range
2147      itself.  */
2148   ASSERT_EQ (c12, get_start (not_aaa));
2149   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2150   ASSERT_EQ (c14, get_finish (not_aaa));
2151   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2152
2153   /* Similarly, make a location with a range as the end-point.  */
2154   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2155   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2156   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2157   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2158   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2159   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2160   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2161   /* It should use the finish location of the range, not store the range
2162      itself.  */
2163   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2164   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2165   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2166   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2167   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2168 }
2169
2170 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
2171
2172 static void
2173 test_reading_source_line ()
2174 {
2175   /* Create a tempfile and write some text to it.  */
2176   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2177                         "01234567890123456789\n"
2178                         "This is the test text\n"
2179                         "This is the 3rd line");
2180
2181   /* Read back a specific line from the tempfile.  */
2182   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2183   ASSERT_TRUE (source_line);
2184   ASSERT_TRUE (source_line.get_buffer () != NULL);
2185   ASSERT_EQ (20, source_line.length ());
2186   ASSERT_TRUE (!strncmp ("This is the 3rd line",
2187                          source_line.get_buffer (), source_line.length ()));
2188
2189   source_line = location_get_source_line (tmp.get_filename (), 2);
2190   ASSERT_TRUE (source_line);
2191   ASSERT_TRUE (source_line.get_buffer () != NULL);
2192   ASSERT_EQ (21, source_line.length ());
2193   ASSERT_TRUE (!strncmp ("This is the test text",
2194                          source_line.get_buffer (), source_line.length ()));
2195
2196   source_line = location_get_source_line (tmp.get_filename (), 4);
2197   ASSERT_FALSE (source_line);
2198   ASSERT_TRUE (source_line.get_buffer () == NULL);
2199 }
2200
2201 /* Tests of lexing.  */
2202
/* Assert that the spelling of token TOK, as given by cpp_token_as_text
   on PARSER, is the string EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT) \
  SELFTEST_BEGIN_STMT \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK)); \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt); \
  SELFTEST_END_STMT
2211
2212 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2213    and ranges from EXP_START_COL to EXP_FINISH_COL.
2214    Use LOC as the effective location of the selftest.  */
2215
2216 static void
2217 assert_token_loc_eq (const location &loc,
2218                      const cpp_token *tok,
2219                      const char *exp_filename, int exp_linenum,
2220                      int exp_start_col, int exp_finish_col)
2221 {
2222   location_t tok_loc = tok->src_loc;
2223   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2224   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2225
2226   /* If location_t values are sufficiently high, then column numbers
2227      will be unavailable.  */
2228   if (!should_have_column_data_p (tok_loc))
2229     return;
2230
2231   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2232   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2233   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2234   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2235 }
2236
/* Check TOK->src_loc via assert_token_loc_eq, with SELFTEST_LOCATION
   (i.e. the point of use) as the effective location of the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,     \
                            EXP_START_COL, EXP_FINISH_COL)      \
  assert_token_loc_eq (SELFTEST_LOCATION,                       \
                       (TOK),                                   \
                       (EXP_FILENAME),                          \
                       (EXP_LINENUM),                           \
                       (EXP_START_COL),                         \
                       (EXP_FINISH_COL))
2244
2245 /* Test of lexing a file using libcpp, verifying tokens and their
2246    location information.  */
2247
2248 static void
2249 test_lexer (const line_table_case &case_)
2250 {
2251   /* Create a tempfile and write some text to it.  */
2252   const char *content =
2253     /*00000000011111111112222222222333333.3333444444444.455555555556
2254       12345678901234567890123456789012345.6789012345678.901234567890.  */
2255     ("test_name /* c-style comment */\n"
2256      "                                  \"test literal\"\n"
2257      " // test c++-style comment\n"
2258      "   42\n");
2259   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2260
2261   line_table_test ltt (case_);
2262
2263   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2264
2265   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2266   ASSERT_NE (fname, NULL);
2267
2268   /* Verify that we get the expected tokens back, with the correct
2269      location information.  */
2270
2271   location_t loc;
2272   const cpp_token *tok;
2273   tok = cpp_get_token_with_location (parser, &loc);
2274   ASSERT_NE (tok, NULL);
2275   ASSERT_EQ (tok->type, CPP_NAME);
2276   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2277   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2278
2279   tok = cpp_get_token_with_location (parser, &loc);
2280   ASSERT_NE (tok, NULL);
2281   ASSERT_EQ (tok->type, CPP_STRING);
2282   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2283   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2284
2285   tok = cpp_get_token_with_location (parser, &loc);
2286   ASSERT_NE (tok, NULL);
2287   ASSERT_EQ (tok->type, CPP_NUMBER);
2288   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2289   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2290
2291   tok = cpp_get_token_with_location (parser, &loc);
2292   ASSERT_NE (tok, NULL);
2293   ASSERT_EQ (tok->type, CPP_EOF);
2294
2295   cpp_finish (parser, NULL);
2296   cpp_destroy (parser);
2297 }
2298
2299 /* Forward decls.  */
2300
2301 class lexer_test;
2302 class lexer_test_options;
2303
2304 /* A class for specifying options of a lexer_test.
2305    The "apply" vfunc is called during the lexer_test constructor.  */
2306
2307 class lexer_test_options
2308 {
2309  public:
2310   virtual void apply (lexer_test &) = 0;
2311 };
2312
2313 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2314    in its dtor.
2315
2316    This is needed by struct lexer_test to ensure that the cleanup of the
2317    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2318
2319 class cpp_reader_ptr
2320 {
2321  public:
2322   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2323
2324   ~cpp_reader_ptr ()
2325   {
2326     cpp_finish (m_ptr, NULL);
2327     cpp_destroy (m_ptr);
2328   }
2329
2330   operator cpp_reader * () const { return m_ptr; }
2331
2332  private:
2333   cpp_reader *m_ptr;
2334 };
2335
2336 /* A struct for writing lexer tests.  */
2337
2338 class lexer_test
2339 {
2340 public:
2341   lexer_test (const line_table_case &case_, const char *content,
2342               lexer_test_options *options);
2343   ~lexer_test ();
2344
2345   const cpp_token *get_token ();
2346
2347   /* The ordering of these fields matters.
2348      The line_table_test must be first, since the cpp_reader_ptr
2349      uses it.
2350      The cpp_reader must be cleaned up *after* the temp_source_file
2351      since the filenames in input.cc's input cache are owned by the
2352      cpp_reader; in particular, when ~temp_source_file evicts the
2353      filename the filenames must still be alive.  */
2354   line_table_test m_ltt;
2355   cpp_reader_ptr m_parser;
2356   temp_source_file m_tempfile;
2357   string_concat_db m_concats;
2358   bool m_implicitly_expect_EOF;
2359 };
2360
2361 /* Use an EBCDIC encoding for the execution charset, specifically
2362    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2363
2364    This exercises iconv integration within libcpp.
2365    Not every build of iconv supports the given charset,
2366    so we need to flag this error and handle it gracefully.  */
2367
2368 class ebcdic_execution_charset : public lexer_test_options
2369 {
2370  public:
2371   ebcdic_execution_charset () : m_num_iconv_errors (0)
2372     {
2373       gcc_assert (s_singleton == NULL);
2374       s_singleton = this;
2375     }
2376   ~ebcdic_execution_charset ()
2377     {
2378       gcc_assert (s_singleton == this);
2379       s_singleton = NULL;
2380     }
2381
2382   void apply (lexer_test &test) final override
2383   {
2384     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2385     cpp_opts->narrow_charset = "IBM1047";
2386
2387     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2388     callbacks->diagnostic = on_diagnostic;
2389   }
2390
2391   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2392                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2393                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2394                              rich_location *richloc ATTRIBUTE_UNUSED,
2395                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2396     ATTRIBUTE_FPTR_PRINTF(5,0)
2397   {
2398     gcc_assert (s_singleton);
2399     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2400     const char *msg = "conversion from %s to %s not supported by iconv";
2401 #ifdef ENABLE_NLS
2402     msg = dgettext ("cpplib", msg);
2403 #endif
2404     /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2405        when the local iconv build doesn't support the conversion.  */
2406     if (strcmp (msgid, msg) == 0)
2407       {
2408         s_singleton->m_num_iconv_errors++;
2409         return true;
2410       }
2411
2412     /* Otherwise, we have an unexpected error.  */
2413     abort ();
2414   }
2415
2416   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2417
2418  private:
2419   static ebcdic_execution_charset *s_singleton;
2420   int m_num_iconv_errors;
2421 };
2422
2423 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2424
2425 /* A lexer_test_options subclass that records a list of diagnostic
2426    messages emitted by the lexer.  */
2427
2428 class lexer_diagnostic_sink : public lexer_test_options
2429 {
2430  public:
2431   lexer_diagnostic_sink ()
2432   {
2433     gcc_assert (s_singleton == NULL);
2434     s_singleton = this;
2435   }
2436   ~lexer_diagnostic_sink ()
2437   {
2438     gcc_assert (s_singleton == this);
2439     s_singleton = NULL;
2440
2441     int i;
2442     char *str;
2443     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2444       free (str);
2445   }
2446
2447   void apply (lexer_test &test) final override
2448   {
2449     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2450     callbacks->diagnostic = on_diagnostic;
2451   }
2452
2453   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2454                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2455                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2456                              rich_location *richloc ATTRIBUTE_UNUSED,
2457                              const char *msgid, va_list *ap)
2458     ATTRIBUTE_FPTR_PRINTF(5,0)
2459   {
2460     char *msg = xvasprintf (msgid, *ap);
2461     s_singleton->m_diagnostics.safe_push (msg);
2462     return true;
2463   }
2464
2465   auto_vec<char *> m_diagnostics;
2466
2467  private:
2468   static lexer_diagnostic_sink *s_singleton;
2469 };
2470
2471 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2472
2473 /* Constructor.  Override line_table with a new instance based on CASE_,
2474    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2475    start parsing the tempfile.  If OPTIONS is non-NULL, its apply hook is
        run on the new lexer_test after the members have been initialized but
        before cpp_init_iconv and cpp_read_main_file are called.  */
2476
2477 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2478                         lexer_test_options *options)
2479 : m_ltt (case_),
2480   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2481   /* Create a tempfile and write the text to it.  */
2482   m_tempfile (SELFTEST_LOCATION, ".c", content),
2483   m_concats (),
2484   m_implicitly_expect_EOF (true)
2485 {
       /* Let OPTIONS adjust this test (e.g. install callbacks on m_parser)
          before the reader is used.  Presumably this has to precede
          cpp_init_iconv so that charset settings take effect -- TODO
          confirm.  */
2486   if (options)
2487     options->apply (*this);
2488
2489   cpp_init_iconv (m_parser);
2490
2491   /* Parse the file.  */
2492   const char *fname = cpp_read_main_file (m_parser,
2493                                           m_tempfile.get_filename ());
2494   ASSERT_NE (fname, NULL);
2495 }
2496
2497 /* Destructor.  By default, verify that the next token in m_parser is EOF.
        The check is controlled by m_implicitly_expect_EOF, which the
        constructor initializes to true.  */
2498
2499 lexer_test::~lexer_test ()
2500 {
2501   location_t loc;
2502   const cpp_token *tok;
2503
2504   if (m_implicitly_expect_EOF)
2505     {
           /* Fetch one more token; it must exist and be CPP_EOF.  */
2506       tok = cpp_get_token_with_location (m_parser, &loc);
2507       ASSERT_NE (tok, NULL);
2508       ASSERT_EQ (tok->type, CPP_EOF);
2509     }
2510 }
2511
2512 /* Get the next token from m_parser, asserting that it is non-NULL.  The
        location written by cpp_get_token_with_location goes into a local
        and is discarded.  */
2513
2514 const cpp_token *
2515 lexer_test::get_token ()
2516 {
2517   location_t loc;
2518   const cpp_token *tok;
2519
2520   tok = cpp_get_token_with_location (m_parser, &loc);
2521   ASSERT_NE (tok, NULL);
2522   return tok;
2523 }
2524
2525 /* Verify that locations within string literals are correctly handled.  */
2526
2527 /* Verify get_source_range_for_char for token(s) at STRLOC,
2528    using the string concatenation database for TEST.
2529
2530    Assert that the character at index IDX is on EXPECTED_LINE,
2531    and that it begins at column EXPECTED_START_COL and ends at
2532    EXPECTED_FINISH_COL (unless the locations are beyond
2533    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2534    columns).  TYPE is the string token type passed through to
        get_source_range_for_char, and LOC is the location handed to the
        ASSERT_*_AT macros for reporting failures.  If
        should_have_column_data_p (STRLOC) is false, the lookup is instead
        expected to fail with a "range starts after
        LINE_MAP_MAX_LOCATION_WITH_COLS" error and nothing else is checked.  */
2535
2536 static void
2537 assert_char_at_range (const location &loc,
2538                       lexer_test& test,
2539                       location_t strloc, enum cpp_ttype type, int idx,
2540                       int expected_line, int expected_start_col,
2541                       int expected_finish_col)
2542 {
2543   cpp_reader *pfile = test.m_parser;
2544   string_concat_db *concats = &test.m_concats;
2545
2546   source_range actual_range = source_range();
2547   const char *err
2548     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2549                                  &actual_range);
       /* With column data for STRLOC the lookup must succeed (ERR is NULL);
          without it, the specific error below is required.  */
2550   if (should_have_column_data_p (strloc))
2551     ASSERT_EQ_AT (loc, NULL, err);
2552   else
2553     {
2554       ASSERT_STREQ_AT (loc,
2555                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2556                        err);
2557       return;
2558     }
2559
       /* Both ends of the range must be on EXPECTED_LINE.  */
2560   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2561   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2562   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2563   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2564
       /* Columns are only compared for the ends of the range that have
          column data.  */
2565   if (should_have_column_data_p (actual_range.m_start))
2566     {
2567       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2568       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2569     }
2570   if (should_have_column_data_p (actual_range.m_finish))
2571     {
2572       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2573       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2574     }
2575 }
2576
2577 /* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
2578    the effective location of any errors.  The remaining arguments are
        forwarded, each parenthesized, in the order assert_char_at_range
        takes them.  */
2579
2580 #define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
2581                              EXPECTED_START_COL, EXPECTED_FINISH_COL)   \
2582   assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
2583                         (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
2584                         (EXPECTED_FINISH_COL))
2585
2586 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2587    using the string concatenation database for TEST.
2588
2589    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  TYPE
        is the string token type passed through to the lookup, and LOC is
        the location handed to the ASSERT_*_AT macros for reporting
        failures.  If should_have_column_data_p (STRLOC) is false, the
        lookup is instead expected to fail with a "range starts after
        LINE_MAP_MAX_LOCATION_WITH_COLS" error and the count is not
        checked.  */
2590
2591 static void
2592 assert_num_substring_ranges (const location &loc,
2593                              lexer_test& test,
2594                              location_t strloc,
2595                              enum cpp_ttype type,
2596                              int expected_num_ranges)
2597 {
2598   cpp_reader *pfile = test.m_parser;
2599   string_concat_db *concats = &test.m_concats;
2600
       /* Start from -1 so that a count that was never written cannot match
          a valid expectation.  */
2601   int actual_num_ranges = -1;
2602   const char *err
2603     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2604                                            &actual_num_ranges);
2605   if (should_have_column_data_p (strloc))
2606     ASSERT_EQ_AT (loc, NULL, err);
2607   else
2608     {
2609       ASSERT_STREQ_AT (loc,
2610                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2611                        err);
2612       return;
2613     }
2614   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2615 }
2616
2617 /* Macro for calling assert_num_substring_ranges, supplying
2618    SELFTEST_LOCATION for the effective location of any errors.  The
        remaining arguments are forwarded, each parenthesized, in the order
        assert_num_substring_ranges takes them.  */
2619
2620 #define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
2621                                     EXPECTED_NUM_RANGES)                \
2622   assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
2623                                (TYPE), (EXPECTED_NUM_RANGES))
2624
2625
2626 /* Verify that get_substring_ranges_for_loc for token(s) at STRLOC
2627    returns an error (using the string concatenation database for TEST).
        The error must equal EXPECTED_ERR, unless
        should_have_column_data_p (STRLOC) is false, in which case it must
        be "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS".  TYPE is
        the string token type passed through to the lookup, and LOC is the
        location handed to ASSERT_STREQ_AT for reporting failures.  */
2628
2629 static void
2630 assert_has_no_substring_ranges (const location &loc,
2631                                 lexer_test& test,
2632                                 location_t strloc,
2633                                 enum cpp_ttype type,
2634                                 const char *expected_err)
2635 {
2636   cpp_reader *pfile = test.m_parser;
2637   string_concat_db *concats = &test.m_concats;
       /* RANGES only receives the lookup's output; it is not inspected.  */
2638   cpp_substring_ranges ranges;
2639   const char *actual_err
2640     = get_substring_ranges_for_loc (pfile, concats, strloc,
2641                                     type, ranges);
2642   if (should_have_column_data_p (strloc))
2643     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2644   else
2645     ASSERT_STREQ_AT (loc,
2646                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2647                      actual_err);
2648 }
2649
     /* Macro for calling assert_has_no_substring_ranges, supplying
        SELFTEST_LOCATION for the effective location of any errors.  ERR is
        the expected error message.  */
2650 #define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
2651     assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
2652                                     (STRLOC), (TYPE), (ERR))
2653
2654 /* Lex a simple string literal.  Verify the substring location data, before
2655    and after running cpp_interpret_string on it.  */
2656
2657 static void
2658 test_lexer_string_locations_simple (const line_table_case &case_)
2659 {
2660   /* Digits 0-9 (with 0 at column 10), the simple way.
2661      ....................000000000.11111111112.2222222223333333333
2662      ....................123456789.01234567890.1234567890123456789
2663      We add a trailing comment to ensure that we correctly locate
2664      the end of the string literal token.  */
2665   const char *content = "        \"0123456789\" /* not a string */\n";
2666   lexer_test test (case_, content, NULL);
2667
2668   /* Verify that we get the expected token back, with the correct
2669      location information.  */
2670   const cpp_token *tok = test.get_token ();
2671   ASSERT_EQ (tok->type, CPP_STRING);
2672   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2673   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2674
2675   /* At this point in lexing, the quote characters are treated as part of
2676      the string (they are stripped off by cpp_interpret_string), so the
          length is the ten digits plus the two quotes.  */
2677
2678   ASSERT_EQ (tok->val.str.len, 12);
2679
2680   /* Verify that cpp_interpret_string works.  */
2681   cpp_string dst_string;
2682   const enum cpp_ttype type = CPP_STRING;
2683   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2684                                       &dst_string, type);
2685   ASSERT_TRUE (result);
2686   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2687   free (const_cast <unsigned char *> (dst_string.text));
2688
2689   /* Verify ranges of individual characters.  This no longer includes the
2690      opening quote, but does include the closing quote.  Indices 0-9 are
          the digits at columns 10-19; index 10 is the closing quote at
          column 20.  */
2691   for (int i = 0; i <= 10; i++)
2692     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2693                           10 + i, 10 + i);
2694
       /* Ten digits plus the closing quote.  */
2695   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2696 }
2697
2698 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2699    encoding.  The test is skipped when HAVE_ICONV is zero, and stops after
        the token checks if the iconv conversion reported errors.  */
2700
2701 static void
2702 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2703 {
2704   /* EBCDIC support requires iconv.  */
2705   if (!HAVE_ICONV)
2706     return;
2707
2708   /* Digits 0-9 (with 0 at column 10), the simple way.
2709      ....................000000000.11111111112.2222222223333333333
2710      ....................123456789.01234567890.1234567890123456789
2711      We add a trailing comment to ensure that we correctly locate
2712      the end of the string literal token.  */
2713   const char *content = "        \"0123456789\" /* not a string */\n";
2714   ebcdic_execution_charset use_ebcdic;
2715   lexer_test test (case_, content, &use_ebcdic);
2716
2717   /* Verify that we get the expected token back, with the correct
2718      location information.  */
2719   const cpp_token *tok = test.get_token ();
2720   ASSERT_EQ (tok->type, CPP_STRING);
2721   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2722   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2723
2724   /* At this point in lexing, the quote characters are treated as part of
2725      the string (they are stripped off by cpp_interpret_string), so the
          length is the ten digits plus the two quotes.  */
2726
2727   ASSERT_EQ (tok->val.str.len, 12);
2728
2729   /* The remainder of the test requires an iconv implementation that
2730      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2731   if (use_ebcdic.iconv_errors_occurred_p ())
2732     return;
2733
2734   /* Verify that cpp_interpret_string works.  */
2735   cpp_string dst_string;
2736   const enum cpp_ttype type = CPP_STRING;
2737   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2738                                       &dst_string, type);
2739   ASSERT_TRUE (result);
2740   /* We should now have EBCDIC-encoded text, specifically
2741      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2742      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2743   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2744                 (const char *)dst_string.text);
2745   free (const_cast <unsigned char *> (dst_string.text));
2746
2747   /* Verify that we don't attempt to record substring location information
2748      for such cases.  */
2749   ASSERT_HAS_NO_SUBSTRING_RANGES
2750     (test, tok->src_loc, type,
2751      "execution character set != source character set");
2752 }
2753
2754 /* Lex a string literal containing a hex-escaped character.
2755    Verify the substring location data, before and after running
2756    cpp_interpret_string on it.  */
2757
2758 static void
2759 test_lexer_string_locations_hex (const line_table_case &case_)
2760 {
2761   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2762      and with a space in place of digit 6, to terminate the escaped
2763      hex code.
2764      ....................000000000.111111.11112222.
2765      ....................123456789.012345.67890123.  */
2766   const char *content = "        \"01234\\x35 789\"\n";
2767   lexer_test test (case_, content, NULL);
2768
2769   /* Verify that we get the expected token back, with the correct
2770      location information.  */
2771   const cpp_token *tok = test.get_token ();
2772   ASSERT_EQ (tok->type, CPP_STRING);
2773   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2774   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2775
2776   /* At this point in lexing, the quote characters are treated as part of
2777      the string (they are stripped off by cpp_interpret_string).  The
          spelling is 15 bytes: two quotes, "01234", the four bytes of the
          escape, the space and "789".  */
2778   ASSERT_EQ (tok->val.str.len, 15);
2779
2780   /* Verify that cpp_interpret_string works.  */
2781   cpp_string dst_string;
2782   const enum cpp_ttype type = CPP_STRING;
2783   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2784                                       &dst_string, type);
2785   ASSERT_TRUE (result);
2786   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2787   free (const_cast <unsigned char *> (dst_string.text));
2788
2789   /* Verify ranges of individual characters.  This no longer includes the
2790      opening quote, but does include the closing quote.  */
2791   for (int i = 0; i <= 4; i++)
2792     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
       /* Index 5 is the hex escape, which occupies columns 15-18.  */
2793   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
       /* The space, "789" and the closing quote: columns 19-23.  */
2794   for (int i = 6; i <= 10; i++)
2795     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2796
2797   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2798 }
2799
2800 /* Lex a string literal containing an octal-escaped character.
2801    Verify the substring location data after running cpp_interpret_string
2802    on it.  */
2803
2804 static void
2805 test_lexer_string_locations_oct (const line_table_case &case_)
2806 {
2807   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2808      and with a space in place of digit 6, to terminate the escaped
2809      octal code.
2810      ....................000000000.111111.11112222.2222223333333333444
2811      ....................123456789.012345.67890123.4567890123456789012  */
2812   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2813   lexer_test test (case_, content, NULL);
2814
2815   /* Verify that we get the expected token back, with the correct
2816      location information.  */
2817   const cpp_token *tok = test.get_token ();
2818   ASSERT_EQ (tok->type, CPP_STRING);
2819   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2820
2821   /* Verify that cpp_interpret_string works.  */
2822   cpp_string dst_string;
2823   const enum cpp_ttype type = CPP_STRING;
2824   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2825                                       &dst_string, type);
2826   ASSERT_TRUE (result);
2827   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2828   free (const_cast <unsigned char *> (dst_string.text));
2829
2830   /* Verify ranges of individual characters.  This no longer includes the
2831      opening quote, but does include the closing quote.  */
2832   for (int i = 0; i < 5; i++)
2833     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
       /* Index 5 is the octal escape, which occupies columns 15-18.  */
2834   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
       /* The space, "789" and the closing quote: columns 19-23.  */
2835   for (int i = 6; i <= 10; i++)
2836     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2837
2838   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2839 }
2840
2841 /* Test of string literal containing letter escapes.  Verify that each
        two-column escape maps to a single character whose range covers
        both columns.  */
2842
2843 static void
2844 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2845 {
2846   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2847      .....................000000000.1.11111.1.1.11222.22222223333333
2848      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2849   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2850   lexer_test test (case_, content, NULL);
2851
2852   /* Verify that we get the expected tokens back.  */
2853   const cpp_token *tok = test.get_token ();
2854   ASSERT_EQ (tok->type, CPP_STRING);
2855   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2856
2857   /* Verify ranges of individual characters. */
2858   /* "\t".  */
2859   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2860                         0, 1, 10, 11);
2861   /* "foo". */
2862   for (int i = 1; i <= 3; i++)
2863     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2864                           i, 1, 11 + i, 11 + i);
2865   /* "\\" and "\n".  */
2866   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2867                         4, 1, 15, 16);
2868   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2869                         5, 1, 17, 18);
2870
2871   /* "bar" and closing quote for nul-terminator.  */
2872   for (int i = 6; i <= 9; i++)
2873     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2874                           i, 1, 13 + i, 13 + i);
2875
       /* Nine characters plus the closing quote.  */
2876   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2877 }
2878
2879 /* Another test of a string literal containing a letter escape.
2880    Based on string seen in
2881      printf ("%-%\n");
2882    in gcc.dg/format/c90-printf-1.c.  */
2883
2884 static void
2885 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2886 {
2887   /* .....................000000000.1111.11.1111.22222222223.
2888      .....................123456789.0123.45.6789.01234567890.  */
2889   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2890   lexer_test test (case_, content, NULL);
2891
2892   /* Verify that we get the expected tokens back.  */
2893   const cpp_token *tok = test.get_token ();
2894   ASSERT_EQ (tok->type, CPP_STRING);
2895   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2896
2897   /* Verify ranges of individual characters. */
2898   /* "%-%".  */
2899   for (int i = 0; i < 3; i++)
2900     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2901                           i, 1, 10 + i, 10 + i);
2902   /* "\n".  */
2903   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2904                         3, 1, 13, 14);
2905
2906   /* Closing quote for nul-terminator.  */
2907   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2908                         4, 1, 15, 15);
2909
       /* Four characters plus the closing quote.  */
2910   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2911 }
2912
2913 /* Lex a string literal containing UCN 4 characters.
2914    Verify the substring location data after running cpp_interpret_string
2915    on it.  */
2916
2917 static void
2918 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2919 {
2920   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2921      as UCN 4.
2922      ....................000000000.111111.111122.222222223.33333333344444
2923      ....................123456789.012345.678901.234567890.12345678901234  */
2924   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2925   lexer_test test (case_, content, NULL);
2926
2927   /* Verify that we get the expected token back, with the correct
2928      location information.  */
2929   const cpp_token *tok = test.get_token ();
2930   ASSERT_EQ (tok->type, CPP_STRING);
2931   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2932
2933   /* Verify that cpp_interpret_string works.
2934      The string should be encoded in the execution character
2935      set.  Assuming that is UTF-8, we should have the following:
2936      -----------  ----  -----  -------  ----------------
2937      Byte offset  Byte  Octal  Unicode  Source Column(s)
2938      -----------  ----  -----  -------  ----------------
2939      0            0x30         '0'      10
2940      1            0x31         '1'      11
2941      2            0x32         '2'      12
2942      3            0x33         '3'      13
2943      4            0x34         '4'      14
2944      5            0xE2  \342   U+2174   15-20
2945      6            0x85  \205    (cont)  15-20
2946      7            0xB4  \264    (cont)  15-20
2947      8            0xE2  \342   U+2175   21-26
2948      9            0x85  \205    (cont)  21-26
2949      10           0xB5  \265    (cont)  21-26
2950      11           0x37         '7'      27
2951      12           0x38         '8'      28
2952      13           0x39         '9'      29
2953      14           0x00                  30 (closing quote)
2954      -----------  ----  -----  -------  ---------------.  */
2955
2956   cpp_string dst_string;
2957   const enum cpp_ttype type = CPP_STRING;
2958   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2959                                       &dst_string, type);
2960   ASSERT_TRUE (result);
2961   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2962                 (const char *)dst_string.text);
2963   free (const_cast <unsigned char *> (dst_string.text));
2964
2965   /* Verify ranges of individual characters.  This no longer includes the
2966      opening quote, but does include the closing quote.
2967      '01234'.  */
2968   for (int i = 0; i <= 4; i++)
2969     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2970   /* U+2174: each of its three bytes maps to the whole escape.  */
2971   for (int i = 5; i <= 7; i++)
2972     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2973   /* U+2175: likewise.  */
2974   for (int i = 8; i <= 10; i++)
2975     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2976   /* '789' (columns 27-29) and nul terminator (closing quote, column 30).  */
2977   for (int i = 11; i <= 14; i++)
2978     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2979
       /* One range per byte of the interpreted string, terminator included.  */
2980   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2981 }
2982
2983 /* Lex a string literal containing UCN 8 characters.
2984    Verify the substring location data after running cpp_interpret_string
2985    on it.  */
2986
2987 static void
2988 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2989 {
2990   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2991      ....................000000000.111111.1111222222.2222333333333.344444
2992      ....................123456789.012345.6789012345.6789012345678.901234  */
2993   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2994   lexer_test test (case_, content, NULL);
2995
2996   /* Verify that we get the expected token back, with the correct
2997      location information.  */
2998   const cpp_token *tok = test.get_token ();
2999   ASSERT_EQ (tok->type, CPP_STRING);
3000   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3001                            "\"01234\\U00002174\\U00002175789\"");
3002
3003   /* Verify that cpp_interpret_string works.
3004      The UTF-8 encoding of the string is identical to that from
3005      the ucn4 testcase above; the only difference is the column
3006      locations.  */
3007   cpp_string dst_string;
3008   const enum cpp_ttype type = CPP_STRING;
3009   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3010                                       &dst_string, type);
3011   ASSERT_TRUE (result);
3012   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3013                 (const char *)dst_string.text);
3014   free (const_cast <unsigned char *> (dst_string.text));
3015
3016   /* Verify ranges of individual characters.  This no longer includes the
3017      opening quote, but does include the closing quote.
3018      '01234'.  */
3019   for (int i = 0; i <= 4; i++)
3020     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3021   /* U+2174: each of its three bytes maps to the ten-column escape.  */
3022   for (int i = 5; i <= 7; i++)
3023     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3024   /* U+2175: likewise.  */
3025   for (int i = 8; i <= 10; i++)
3026     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3027   /* '789' at columns 35-37  */
3028   for (int i = 11; i <= 13; i++)
3029     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3030   /* Closing quote/nul-terminator at column 38.  */
3031   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3032
       /* One range per byte of the interpreted string, terminator included.  */
3033   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3034 }
3035
3036 /* Fetch a big-endian 32-bit value and convert to host endianness.
        PTR_BE_VALUE is read one byte at a time, most significant byte
        first, so the result does not depend on the byte order of the
        host.  */
3037
3038 static uint32_t
3039 uint32_from_big_endian (const uint32_t *ptr_be_value)
3040 {
3041   const unsigned char *buf = (const unsigned char *)ptr_be_value;
       /* Each byte is widened to uint32_t before shifting.  */
3042   return (((uint32_t) buf[0] << 24)
3043           | ((uint32_t) buf[1] << 16)
3044           | ((uint32_t) buf[2] << 8)
3045           | (uint32_t) buf[3]);
3046 }
3047
3048 /* Lex a wide string literal and verify that attempts to read substring
3049    location data from it fail gracefully, i.e. with the error message
        checked at the end of the test.  */
3050
3051 static void
3052 test_lexer_string_locations_wide_string (const line_table_case &case_)
3053 {
3054   /* Digits 0-9.
3055      ....................000000000.11111111112.22222222233333
3056      ....................123456789.01234567890.12345678901234  */
3057   const char *content = "       L\"0123456789\" /* non-str */\n";
3058   lexer_test test (case_, content, NULL);
3059
3060   /* Verify that we get the expected token back, with the correct
3061      location information.  */
3062   const cpp_token *tok = test.get_token ();
3063   ASSERT_EQ (tok->type, CPP_WSTRING);
3064   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3065
3066   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
3067   cpp_string dst_string;
3068   const enum cpp_ttype type = CPP_WSTRING;
3069   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3070                                       &dst_string, type);
3071   ASSERT_TRUE (result);
3072   /* The cpp_reader defaults to big-endian with
3073      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3074      now be encoded as UTF-32BE.  Spot-check the first, middle and last
          digits and the terminator at index 10.  */
3075   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3076   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3077   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3078   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3079   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3080   free (const_cast <unsigned char *> (dst_string.text));
3081
3082   /* We don't yet support generating substring location information
3083      for L"" strings.  */
3084   ASSERT_HAS_NO_SUBSTRING_RANGES
3085     (test, tok->src_loc, type,
3086      "execution character set != source character set");
3087 }
3088
3089 /* Fetch a big-endian 16-bit value and convert to host endianness.
        PTR_BE_VALUE is read as two bytes, most significant byte first, so
        the result does not depend on the byte order of the host.  */
3090
3091 static uint16_t
3092 uint16_from_big_endian (const uint16_t *ptr_be_value)
3093 {
3094   const unsigned char *buf = (const unsigned char *)ptr_be_value;
3095   return ((uint16_t) buf[0] << 8) | (uint16_t) buf[1];
3096 }
3097
3098 /* Lex a u"" string literal and verify that attempts to read substring
3099    location data from it fail gracefully, i.e. with the error message
        checked at the end of the test.  */
3100
3101 static void
3102 test_lexer_string_locations_string16 (const line_table_case &case_)
3103 {
3104   /* Digits 0-9.
3105      ....................000000000.11111111112.22222222233333
3106      ....................123456789.01234567890.12345678901234  */
3107   const char *content = "       u\"0123456789\" /* non-str */\n";
3108   lexer_test test (case_, content, NULL);
3109
3110   /* Verify that we get the expected token back, with the correct
3111      location information.  */
3112   const cpp_token *tok = test.get_token ();
3113   ASSERT_EQ (tok->type, CPP_STRING16);
3114   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3115
3116   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3117   cpp_string dst_string;
3118   const enum cpp_ttype type = CPP_STRING16;
3119   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3120                                       &dst_string, type);
3121   ASSERT_TRUE (result);
3122
3123   /* The cpp_reader defaults to big-endian, so dst_string should
3124      now be encoded as UTF-16BE.  Spot-check the first, middle and last
          digits and the terminator at index 10.  */
3125   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3126   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3127   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3128   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3129   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3130   free (const_cast <unsigned char *> (dst_string.text));
3131
3132   /* We don't yet support generating substring location information
3133      for u"" strings.  */
3134   ASSERT_HAS_NO_SUBSTRING_RANGES
3135     (test, tok->src_loc, type,
3136      "execution character set != source character set");
3137 }
3138
3139 /* Lex a U"" string literal and verify that attempts to read substring
3140    location data from it fail gracefully, i.e. with the error message
        checked at the end of the test.  */
3141
3142 static void
3143 test_lexer_string_locations_string32 (const line_table_case &case_)
3144 {
3145   /* Digits 0-9.
3146      ....................000000000.11111111112.22222222233333
3147      ....................123456789.01234567890.12345678901234  */
3148   const char *content = "       U\"0123456789\" /* non-str */\n";
3149   lexer_test test (case_, content, NULL);
3150
3151   /* Verify that we get the expected token back, with the correct
3152      location information.  */
3153   const cpp_token *tok = test.get_token ();
3154   ASSERT_EQ (tok->type, CPP_STRING32);
3155   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3156
3157   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3158   cpp_string dst_string;
3159   const enum cpp_ttype type = CPP_STRING32;
3160   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3161                                       &dst_string, type);
3162   ASSERT_TRUE (result);
3163
3164   /* The cpp_reader defaults to big-endian, so dst_string should
3165      now be encoded as UTF-32BE.  Spot-check the first, middle and last
          digits and the terminator at index 10.  */
3166   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3167   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3168   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3169   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3170   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3171   free (const_cast <unsigned char *> (dst_string.text));
3172
3173   /* We don't yet support generating substring location information
3174      for U"" strings.  */
3175   ASSERT_HAS_NO_SUBSTRING_RANGES
3176     (test, tok->src_loc, type,
3177      "execution character set != source character set");
3178 }
3179
3180 /* Lex a u8-string literal.
3181    Verify the substring location data after running cpp_interpret_string
3182    on it.  */
3183
3184 static void
3185 test_lexer_string_locations_u8 (const line_table_case &case_)
3186 {
3187   /* Digits 0-9.
3188      ....................000000000.11111111112.22222222233333
3189      ....................123456789.01234567890.12345678901234  */
3190   const char *content = "      u8\"0123456789\" /* non-str */\n";
3191   lexer_test test (case_, content, NULL);
3192
3193   /* Verify that we get the expected token back, with the correct
3194      location information.  */
3195   const cpp_token *tok = test.get_token ();
3196   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3197   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3198
3199   /* Verify that cpp_interpret_string works.
          NOTE(review): the token is CPP_UTF8STRING but TYPE below is
          CPP_STRING; confirm that interpreting it as CPP_STRING is
          intentional.  */
3200   cpp_string dst_string;
3201   const enum cpp_ttype type = CPP_STRING;
3202   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3203                                       &dst_string, type);
3204   ASSERT_TRUE (result);
3205   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3206   free (const_cast <unsigned char *> (dst_string.text));
3207
3208   /* Verify ranges of individual characters.  This no longer includes the
3209      opening quote, but does include the closing quote.  Indices 0-9 are
          the digits at columns 10-19; index 10 is the closing quote at
          column 20.  */
3210   for (int i = 0; i <= 10; i++)
3211     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3212 }
3213
3214 /* Lex a string literal containing UTF-8 source characters.
3215    Verify the substring location data after running cpp_interpret_string
3216    on it.  */
3217
3218 static void
3219 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3220 {
3221  /* This string literal is written out to the source file as UTF-8,
3222     and is of the form "before mojibake after", where "mojibake"
3223     is written as the following four unicode code points:
3224        U+6587 CJK UNIFIED IDEOGRAPH-6587
3225        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3226        U+5316 CJK UNIFIED IDEOGRAPH-5316
3227        U+3051 HIRAGANA LETTER KE.
3228      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3229      "before" and "after" are 1 byte per unicode character.
3230
3231      The numbering shown are "columns", which are *byte* numbers within
3232      the line, rather than unicode character numbers.
3233
3234      .................... 000000000.1111111.
3235      .................... 123456789.0123456.  */
3236   const char *content = ("        \"before "
3237                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3238                               UTF-8: 0xE6 0x96 0x87
3239                               C octal escaped UTF-8: \346\226\207
3240                             "column" numbers: 17-19.  */
3241                          "\346\226\207"
3242
3243                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3244                               UTF-8: 0xE5 0xAD 0x97
3245                               C octal escaped UTF-8: \345\255\227
3246                             "column" numbers: 20-22.  */
3247                          "\345\255\227"
3248
3249                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3250                               UTF-8: 0xE5 0x8C 0x96
3251                               C octal escaped UTF-8: \345\214\226
3252                             "column" numbers: 23-25.  */
3253                          "\345\214\226"
3254
3255                          /* U+3051 HIRAGANA LETTER KE
3256                               UTF-8: 0xE3 0x81 0x91
3257                               C octal escaped UTF-8: \343\201\221
3258                             "column" numbers: 26-28.  */
3259                          "\343\201\221"
3260
3261                          /* column numbers 29 onwards
3262                           2333333.33334444444444
3263                           9012345.67890123456789. */
3264                          " after\" /* non-str */\n");
3265   lexer_test test (case_, content, NULL);
3266
3267   /* Verify that we get the expected token back, with the correct
3268      location information.  */
3269   const cpp_token *tok = test.get_token ();
3270   ASSERT_EQ (tok->type, CPP_STRING);
3271   ASSERT_TOKEN_AS_TEXT_EQ
3272     (test.m_parser, tok,
3273      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3274
3275   /* Verify that cpp_interpret_string works.  */
3276   cpp_string dst_string;
3277   const enum cpp_ttype type = CPP_STRING;
3278   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3279                                       &dst_string, type);
3280   ASSERT_TRUE (result);
3281   ASSERT_STREQ
3282     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3283      (const char *)dst_string.text);
3284   free (const_cast <unsigned char *> (dst_string.text));
3285
3286   /* Verify ranges of individual characters.  This no longer includes the
3287      opening quote, but does include the closing quote.
3288      Assuming that both source and execution encodings are UTF-8, we have
3289      a run of 25 octets in each, plus the NUL terminator.  */
3290   for (int i = 0; i < 25; i++)
3291     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3292   /* NUL-terminator should use the closing quote at column 35.  */
3293   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3294
3295   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3296 }
3297
3298 /* Test of string literal concatenation.  */
3299
3300 static void
3301 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3302 {
3303   /* Digits 0-9.
3304      .....................000000000.111111.11112222222222
3305      .....................123456789.012345.67890123456789.  */
3306   const char *content = ("        \"01234\" /* non-str */\n"
3307                          "        \"56789\" /* non-str */\n");
3308   lexer_test test (case_, content, NULL);
3309
3310   location_t input_locs[2];
3311
3312   /* Verify that we get the expected tokens back.  */
3313   auto_vec <cpp_string> input_strings;
3314   const cpp_token *tok_a = test.get_token ();
3315   ASSERT_EQ (tok_a->type, CPP_STRING);
3316   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3317   input_strings.safe_push (tok_a->val.str);
3318   input_locs[0] = tok_a->src_loc;
3319
3320   const cpp_token *tok_b = test.get_token ();
3321   ASSERT_EQ (tok_b->type, CPP_STRING);
3322   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3323   input_strings.safe_push (tok_b->val.str);
3324   input_locs[1] = tok_b->src_loc;
3325
3326   /* Verify that cpp_interpret_string works.  */
3327   cpp_string dst_string;
3328   const enum cpp_ttype type = CPP_STRING;
3329   bool result = cpp_interpret_string (test.m_parser,
3330                                       input_strings.address (), 2,
3331                                       &dst_string, type);
3332   ASSERT_TRUE (result);
3333   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3334   free (const_cast <unsigned char *> (dst_string.text));
3335
3336   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3337   test.m_concats.record_string_concatenation (2, input_locs);
3338
3339   location_t initial_loc = input_locs[0];
3340
3341   /* "01234" on line 1.  */
3342   for (int i = 0; i <= 4; i++)
3343     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3344   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3345   for (int i = 5; i <= 10; i++)
3346     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3347
3348   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3349 }
3350
3351 /* Another test of string literal concatenation.  */
3352
3353 static void
3354 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3355 {
3356   /* Digits 0-9.
3357      .....................000000000.111.11111112222222
3358      .....................123456789.012.34567890123456.  */
3359   const char *content = ("        \"01\" /* non-str */\n"
3360                          "        \"23\" /* non-str */\n"
3361                          "        \"45\" /* non-str */\n"
3362                          "        \"67\" /* non-str */\n"
3363                          "        \"89\" /* non-str */\n");
3364   lexer_test test (case_, content, NULL);
3365
3366   auto_vec <cpp_string> input_strings;
3367   location_t input_locs[5];
3368
3369   /* Verify that we get the expected tokens back.  */
3370   for (int i = 0; i < 5; i++)
3371     {
3372       const cpp_token *tok = test.get_token ();
3373       ASSERT_EQ (tok->type, CPP_STRING);
3374       input_strings.safe_push (tok->val.str);
3375       input_locs[i] = tok->src_loc;
3376     }
3377
3378   /* Verify that cpp_interpret_string works.  */
3379   cpp_string dst_string;
3380   const enum cpp_ttype type = CPP_STRING;
3381   bool result = cpp_interpret_string (test.m_parser,
3382                                       input_strings.address (), 5,
3383                                       &dst_string, type);
3384   ASSERT_TRUE (result);
3385   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3386   free (const_cast <unsigned char *> (dst_string.text));
3387
3388   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3389   test.m_concats.record_string_concatenation (5, input_locs);
3390
3391   location_t initial_loc = input_locs[0];
3392
3393   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3394      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3395      and expect get_source_range_for_substring to fail.
3396      However, for a string concatenation test, we can have a case
3397      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3398      but subsequent strings can be after it.
3399      Attempting to detect this within assert_char_at_range
3400      would overcomplicate the logic for the common test cases, so
3401      we detect it here.  */
3402   if (should_have_column_data_p (input_locs[0])
3403       && !should_have_column_data_p (input_locs[4]))
3404     {
3405       /* Verify that get_source_range_for_substring gracefully rejects
3406          this case.  */
3407       source_range actual_range;
3408       const char *err
3409         = get_source_range_for_char (test.m_parser, &test.m_concats,
3410                                      initial_loc, type, 0, &actual_range);
3411       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3412       return;
3413     }
3414
3415   for (int i = 0; i < 5; i++)
3416     for (int j = 0; j < 2; j++)
3417       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3418                             i + 1, 10 + j, 10 + j);
3419
3420   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3421   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3422
3423   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3424 }
3425
3426 /* Another test of string literal concatenation, this time combined with
3427    various kinds of escaped characters.  */
3428
3429 static void
3430 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3431 {
3432   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3433      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3434   const char *content
3435     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3436        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3437     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3438   lexer_test test (case_, content, NULL);
3439
3440   auto_vec <cpp_string> input_strings;
3441   location_t input_locs[4];
3442
3443   /* Verify that we get the expected tokens back.  */
3444   for (int i = 0; i < 4; i++)
3445     {
3446       const cpp_token *tok = test.get_token ();
3447       ASSERT_EQ (tok->type, CPP_STRING);
3448       input_strings.safe_push (tok->val.str);
3449       input_locs[i] = tok->src_loc;
3450     }
3451
3452   /* Verify that cpp_interpret_string works.  */
3453   cpp_string dst_string;
3454   const enum cpp_ttype type = CPP_STRING;
3455   bool result = cpp_interpret_string (test.m_parser,
3456                                       input_strings.address (), 4,
3457                                       &dst_string, type);
3458   ASSERT_TRUE (result);
3459   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3460   free (const_cast <unsigned char *> (dst_string.text));
3461
3462   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3463   test.m_concats.record_string_concatenation (4, input_locs);
3464
3465   location_t initial_loc = input_locs[0];
3466
3467   for (int i = 0; i <= 4; i++)
3468     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3469   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3470   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3471   for (int i = 7; i <= 9; i++)
3472     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3473
3474   /* NUL-terminator should use the location of the final closing quote.  */
3475   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3476
3477   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3478 }
3479
3480 /* Test of string literal in a macro.  */
3481
3482 static void
3483 test_lexer_string_locations_macro (const line_table_case &case_)
3484 {
3485   /* Digits 0-9.
3486      .....................0000000001111111111.22222222223.
3487      .....................1234567890123456789.01234567890.  */
3488   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3489                          "  MACRO");
3490   lexer_test test (case_, content, NULL);
3491
3492   /* Verify that we get the expected tokens back.  */
3493   const cpp_token *tok = test.get_token ();
3494   ASSERT_EQ (tok->type, CPP_PADDING);
3495
3496   tok = test.get_token ();
3497   ASSERT_EQ (tok->type, CPP_STRING);
3498   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3499
3500   /* Verify ranges of individual characters.  We ought to
3501      see columns within the macro definition.  */
3502   for (int i = 0; i <= 10; i++)
3503     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3504                           i, 1, 20 + i, 20 + i);
3505
3506   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3507
3508   tok = test.get_token ();
3509   ASSERT_EQ (tok->type, CPP_PADDING);
3510 }
3511
3512 /* Test of stringification of a macro argument.  */
3513
3514 static void
3515 test_lexer_string_locations_stringified_macro_argument
3516   (const line_table_case &case_)
3517 {
3518   /* .....................000000000111111111122222222223.
3519      .....................123456789012345678901234567890.  */
3520   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3521                          "MACRO(foo)\n");
3522   lexer_test test (case_, content, NULL);
3523
3524   /* Verify that we get the expected token back.  */
3525   const cpp_token *tok = test.get_token ();
3526   ASSERT_EQ (tok->type, CPP_PADDING);
3527
3528   tok = test.get_token ();
3529   ASSERT_EQ (tok->type, CPP_STRING);
3530   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3531
3532   /* We don't support getting the location of a stringified macro
3533      argument.  Verify that it fails gracefully.  */
3534   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3535                                   "cpp_interpret_string_1 failed");
3536
3537   tok = test.get_token ();
3538   ASSERT_EQ (tok->type, CPP_PADDING);
3539
3540   tok = test.get_token ();
3541   ASSERT_EQ (tok->type, CPP_PADDING);
3542 }
3543
3544 /* Ensure that we are fail gracefully if something attempts to pass
3545    in a location that isn't a string literal token.  Seen on this code:
3546
3547      const char a[] = " %d ";
3548      __builtin_printf (a, 0.5);
3549                        ^
3550
3551    when c-format.cc erroneously used the indicated one-character
3552    location as the format string location, leading to a read past the
3553    end of a string buffer in cpp_interpret_string_1.  */
3554
3555 static void
3556 test_lexer_string_locations_non_string (const line_table_case &case_)
3557 {
3558   /* .....................000000000111111111122222222223.
3559      .....................123456789012345678901234567890.  */
3560   const char *content = ("         a\n");
3561   lexer_test test (case_, content, NULL);
3562
3563   /* Verify that we get the expected token back.  */
3564   const cpp_token *tok = test.get_token ();
3565   ASSERT_EQ (tok->type, CPP_NAME);
3566   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3567
3568   /* At this point, libcpp is attempting to interpret the name as a
3569      string literal, despite it not starting with a quote.  We don't detect
3570      that, but we should at least fail gracefully.  */
3571   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3572                                   "cpp_interpret_string_1 failed");
3573 }
3574
3575 /* Ensure that we can read substring information for a token which
3576    starts in one linemap and ends in another .  Adapted from
3577    gcc.dg/cpp/pr69985.c.  */
3578
3579 static void
3580 test_lexer_string_locations_long_line (const line_table_case &case_)
3581 {
3582   /* .....................000000.000111111111
3583      .....................123456.789012346789.  */
3584   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3585                          "     \"0123456789012345678901234567890123456789"
3586                          "0123456789012345678901234567890123456789"
3587                          "0123456789012345678901234567890123456789"
3588                          "0123456789\"\n");
3589
3590   lexer_test test (case_, content, NULL);
3591
3592   /* Verify that we get the expected token back.  */
3593   const cpp_token *tok = test.get_token ();
3594   ASSERT_EQ (tok->type, CPP_STRING);
3595
3596   if (!should_have_column_data_p (line_table->highest_location))
3597     return;
3598
3599   /* Verify ranges of individual characters.  */
3600   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3601   for (int i = 0; i < 131; i++)
3602     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3603                           i, 2, 7 + i, 7 + i);
3604 }
3605
3606 /* Test of locations within a raw string that doesn't contain a newline.  */
3607
3608 static void
3609 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3610 {
3611   /* .....................00.0000000111111111122.
3612      .....................12.3456789012345678901.  */
3613   const char *content = ("R\"foo(0123456789)foo\"\n");
3614   lexer_test test (case_, content, NULL);
3615
3616   /* Verify that we get the expected token back.  */
3617   const cpp_token *tok = test.get_token ();
3618   ASSERT_EQ (tok->type, CPP_STRING);
3619
3620   /* Verify that cpp_interpret_string works.  */
3621   cpp_string dst_string;
3622   const enum cpp_ttype type = CPP_STRING;
3623   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3624                                       &dst_string, type);
3625   ASSERT_TRUE (result);
3626   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3627   free (const_cast <unsigned char *> (dst_string.text));
3628
3629   if (!should_have_column_data_p (line_table->highest_location))
3630     return;
3631
3632   /* 0-9, plus the nil terminator.  */
3633   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3634   for (int i = 0; i < 11; i++)
3635     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3636                           i, 1, 7 + i, 7 + i);
3637 }
3638
3639 /* Test of locations within a raw string that contains a newline.  */
3640
3641 static void
3642 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3643 {
3644   /* .....................00.0000.
3645      .....................12.3456.  */
3646   const char *content = ("R\"foo(\n"
3647   /* .....................00000.
3648      .....................12345.  */
3649                          "hello\n"
3650                          "world\n"
3651   /* .....................00000.
3652      .....................12345.  */
3653                          ")foo\"\n");
3654   lexer_test test (case_, content, NULL);
3655
3656   /* Verify that we get the expected token back.  */
3657   const cpp_token *tok = test.get_token ();
3658   ASSERT_EQ (tok->type, CPP_STRING);
3659
3660   /* Verify that cpp_interpret_string works.  */
3661   cpp_string dst_string;
3662   const enum cpp_ttype type = CPP_STRING;
3663   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3664                                       &dst_string, type);
3665   ASSERT_TRUE (result);
3666   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3667   free (const_cast <unsigned char *> (dst_string.text));
3668
3669   if (!should_have_column_data_p (line_table->highest_location))
3670     return;
3671
3672   /* Currently we don't support locations within raw strings that
3673      contain newlines.  */
3674   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3675                                   "range endpoints are on different lines");
3676 }
3677
3678 /* Test of parsing an unterminated raw string.  */
3679
3680 static void
3681 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3682 {
3683   const char *content = "R\"ouch()ouCh\" /* etc */";
3684
3685   lexer_diagnostic_sink diagnostics;
3686   lexer_test test (case_, content, &diagnostics);
3687   test.m_implicitly_expect_EOF = false;
3688
3689   /* Attempt to parse the raw string.  */
3690   const cpp_token *tok = test.get_token ();
3691   ASSERT_EQ (tok->type, CPP_EOF);
3692
3693   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3694   /* We expect the message "unterminated raw string"
3695      in the "cpplib" translation domain.
3696      It's not clear that dgettext is available on all supported hosts,
3697      so this assertion is commented-out for now.
3698        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3699                      diagnostics.m_diagnostics[0]);
3700   */
3701 }
3702
3703 /* Test of lexing char constants.  */
3704
3705 static void
3706 test_lexer_char_constants (const line_table_case &case_)
3707 {
3708   /* Various char constants.
3709      .....................0000000001111111111.22222222223.
3710      .....................1234567890123456789.01234567890.  */
3711   const char *content = ("         'a'\n"
3712                          "        u'a'\n"
3713                          "        U'a'\n"
3714                          "        L'a'\n"
3715                          "         'abc'\n");
3716   lexer_test test (case_, content, NULL);
3717
3718   /* Verify that we get the expected tokens back.  */
3719   /* 'a'.  */
3720   const cpp_token *tok = test.get_token ();
3721   ASSERT_EQ (tok->type, CPP_CHAR);
3722   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3723
3724   unsigned int chars_seen;
3725   int unsignedp;
3726   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3727                                           &chars_seen, &unsignedp);
3728   ASSERT_EQ (cc, 'a');
3729   ASSERT_EQ (chars_seen, 1);
3730
3731   /* u'a'.  */
3732   tok = test.get_token ();
3733   ASSERT_EQ (tok->type, CPP_CHAR16);
3734   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3735
3736   /* U'a'.  */
3737   tok = test.get_token ();
3738   ASSERT_EQ (tok->type, CPP_CHAR32);
3739   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3740
3741   /* L'a'.  */
3742   tok = test.get_token ();
3743   ASSERT_EQ (tok->type, CPP_WCHAR);
3744   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3745
3746   /* 'abc' (c-char-sequence).  */
3747   tok = test.get_token ();
3748   ASSERT_EQ (tok->type, CPP_CHAR);
3749   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3750 }
3751 /* A table of interesting location_t values, giving one axis of our test
3752    matrix.  */
3753
3754 static const location_t boundary_locations[] = {
3755   /* Zero means "don't override the default values for a new line_table".  */
3756   0,
3757
3758   /* An arbitrary non-zero value that isn't close to one of
3759      the boundary values below.  */
3760   0x10000,
3761
3762   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3763   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3764   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3765   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3766   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3767   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3768
3769   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3770   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3771   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3772   LINE_MAP_MAX_LOCATION_WITH_COLS,
3773   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3774   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3775 };
3776
3777 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3778
3779 void
3780 for_each_line_table_case (void (*testcase) (const line_table_case &))
3781 {
3782   /* As noted above in the description of struct line_table_case,
3783      we want to explore a test matrix of interesting line_table
3784      situations, running various selftests for each case within the
3785      matrix.  */
3786
3787   /* Run all tests with:
3788      (a) line_table->default_range_bits == 0, and
3789      (b) line_table->default_range_bits == 5.  */
3790   int num_cases_tested = 0;
3791   for (int default_range_bits = 0; default_range_bits <= 5;
3792        default_range_bits += 5)
3793     {
3794       /* ...and use each of the "interesting" location values as
3795          the starting location within line_table.  */
3796       const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3797       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3798         {
3799           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3800
3801           testcase (c);
3802
3803           num_cases_tested++;
3804         }
3805     }
3806
3807   /* Verify that we fully covered the test matrix.  */
3808   ASSERT_EQ (num_cases_tested, 2 * 12);
3809 }
3810
3811 /* Verify that when presented with a consecutive pair of locations with
3812    a very large line offset, we don't attempt to consolidate them into
3813    a single ordinary linemap where the line offsets within the line map
3814    would lead to overflow (PR lto/88147).  */
3815
3816 static void
3817 test_line_offset_overflow ()
3818 {
3819   line_table_test ltt (line_table_case (5, 0));
3820
3821   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3822   linemap_line_start (line_table, 1, 100);
3823   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3824   assert_loceq ("foo.c", 2578, 0, loc_a);
3825
3826   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3827   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3828   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3829
3830   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3831   assert_loceq ("foo.c", 404198, 0, loc_b);
3832
3833   /* We should have started a new linemap, rather than attempting to store
3834      a very large line offset.  */
3835   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3836   ASSERT_NE (ordmap_a, ordmap_b);
3837 }
3838
3839 void test_cpp_utf8 ()
3840 {
3841   const int def_tabstop = 8;
3842   cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3843
3844   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3845   {
3846     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3847     ASSERT_EQ (8, w_bad);
3848     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3849     ASSERT_EQ (5, w_ctrl);
3850   }
3851
3852   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3853   {
3854     const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3855     ASSERT_EQ (1, w_pi);
3856     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3857     ASSERT_EQ (2, w_emoji);
3858     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3859                                                         policy);
3860     ASSERT_EQ (1, w_umlaut_precomposed);
3861     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3862                                                       policy);
3863     ASSERT_EQ (1, w_umlaut_combining);
3864     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3865     ASSERT_EQ (2, w_han);
3866     const int w_ascii = cpp_display_width ("GCC", 3, policy);
3867     ASSERT_EQ (3, w_ascii);
3868     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3869                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
3870                                            24, policy);
3871     ASSERT_EQ (18, w_mixed);
3872   }
3873
3874   /* Verify that display width properly expands tabs.  */
3875   {
3876     const char *tstr = "\tabc\td";
3877     ASSERT_EQ (6, cpp_display_width (tstr, 6,
3878                                      cpp_char_column_policy (1, cpp_wcwidth)));
3879     ASSERT_EQ (10, cpp_display_width (tstr, 6,
3880                                       cpp_char_column_policy (3, cpp_wcwidth)));
3881     ASSERT_EQ (17, cpp_display_width (tstr, 6,
3882                                       cpp_char_column_policy (8, cpp_wcwidth)));
3883     ASSERT_EQ (1,
3884                cpp_display_column_to_byte_column
3885                  (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
3886   }
3887
3888   /* Verify that cpp_byte_column_to_display_column can go past the end,
3889      and similar edge cases.  */
3890   {
3891     const char *str
3892       /* Display columns.
3893          111111112345  */
3894       = "\xcf\x80 abc";
3895       /* 111122223456
3896          Byte columns.  */
3897
3898     ASSERT_EQ (5, cpp_display_width (str, 6, policy));
3899     ASSERT_EQ (105,
3900                cpp_byte_column_to_display_column (str, 6, 106, policy));
3901     ASSERT_EQ (10000,
3902                cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
3903     ASSERT_EQ (0,
3904                cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
3905   }
3906
3907   /* Verify that cpp_display_column_to_byte_column can go past the end,
3908      and similar edge cases, and check invertibility.  */
3909   {
3910     const char *str
3911       /* Display columns.
3912          000000000000000000000000000000000000011
3913          111111112222222234444444455555555678901  */
3914       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3915       /* 000000000000000000000000000000000111111
3916          111122223333444456666777788889999012345
3917          Byte columns.  */
3918     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
3919     ASSERT_EQ (15,
3920                cpp_display_column_to_byte_column (str, 15, 11, policy));
3921     ASSERT_EQ (115,
3922                cpp_display_column_to_byte_column (str, 15, 111, policy));
3923     ASSERT_EQ (10000,
3924                cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
3925     ASSERT_EQ (0,
3926                cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
3927
3928     /* Verify that we do not interrupt a UTF-8 sequence.  */
3929     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
3930
3931     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3932       {
3933         const int disp_col
3934           = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
3935         const int byte_col2
3936           = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
3937
3938         /* If we ask for the display column in the middle of a UTF-8
3939            sequence, it will return the length of the partial sequence,
3940            matching the behavior of GCC before display column support.
3941            Otherwise check the round trip was successful.  */
3942         if (byte_col < 4)
3943           ASSERT_EQ (byte_col, disp_col);
3944         else if (byte_col >= 6 && byte_col < 9)
3945           ASSERT_EQ (3 + (byte_col - 5), disp_col);
3946         else
3947           ASSERT_EQ (byte_col2, byte_col);
3948       }
3949   }
3950
3951 }
3952
3953 /* Run all of the selftests within this file.  */
3954
3955 void
3956 input_cc_tests ()
3957 {
3958   test_linenum_comparisons ();
3959   test_should_have_column_data_p ();
3960   test_unknown_location ();
3961   test_builtins ();
3962   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3963
3964   for_each_line_table_case (test_accessing_ordinary_linemaps);
3965   for_each_line_table_case (test_lexer);
3966   for_each_line_table_case (test_lexer_string_locations_simple);
3967   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3968   for_each_line_table_case (test_lexer_string_locations_hex);
3969   for_each_line_table_case (test_lexer_string_locations_oct);
3970   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3971   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3972   for_each_line_table_case (test_lexer_string_locations_ucn4);
3973   for_each_line_table_case (test_lexer_string_locations_ucn8);
3974   for_each_line_table_case (test_lexer_string_locations_wide_string);
3975   for_each_line_table_case (test_lexer_string_locations_string16);
3976   for_each_line_table_case (test_lexer_string_locations_string32);
3977   for_each_line_table_case (test_lexer_string_locations_u8);
3978   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3979   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3980   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3981   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3982   for_each_line_table_case (test_lexer_string_locations_macro);
3983   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3984   for_each_line_table_case (test_lexer_string_locations_non_string);
3985   for_each_line_table_case (test_lexer_string_locations_long_line);
3986   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3987   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3988   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3989   for_each_line_table_case (test_lexer_char_constants);
3990
3991   test_reading_source_line ();
3992
3993   test_line_offset_overflow ();
3994
3995   test_cpp_utf8 ();
3996 }
3997
3998 } // namespace selftest
3999
4000 #endif /* CHECKING_P */