gcc/input.cc

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2022 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 const char *
  33 special_fname_builtin ()
  34 {
  35   return _("<built-in>");
  36 }
  37
  38 /* Input charset configuration.  */
  39 static const char *default_charset_callback (const char *)
  40 {
  41   return nullptr;
  42 }
  43
  44 void
  45 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  46                                       bool should_skip_bom)
  47 {
  48   in_context.ccb = (ccb ? ccb : default_charset_callback);
  49   in_context.should_skip_bom = should_skip_bom;
  50 }
  51
  52 /* This is a cache used by get_next_line to store the content of a
  53    file to be searched for file lines.  */
  54 class file_cache_slot
  55 {
  56 public:
  57   file_cache_slot ();
  58   ~file_cache_slot ();
  59
  60   bool read_line_num (size_t line_num,
  61                       char ** line, ssize_t *line_len);
  62
  63   /* Accessors.  */
  64   const char *get_file_path () const { return m_file_path; }
  65   unsigned get_use_count () const { return m_use_count; }
  66   bool missing_trailing_newline_p () const
  67   {
  68     return m_missing_trailing_newline;
  69   }
  70
  71   void inc_use_count () { m_use_count++; }
  72
  73   bool create (const file_cache::input_context &in_context,
  74                const char *file_path, FILE *fp, unsigned highest_use_count);
  75   void evict ();
  76
  77  private:
  78   /* These are information used to store a line boundary.  */
  79   class line_info
  80   {
  81   public:
  82     /* The line number.  It starts from 1.  */
  83     size_t line_num;
  84
  85     /* The position (byte count) of the beginning of the line,
  86        relative to the file data pointer.  This starts at zero.  */
  87     size_t start_pos;
  88
  89     /* The position (byte count) of the last byte of the line.  This
  90        normally points to the '\n' character, or to one byte after the
  91        last byte of the file, if the file doesn't contain a '\n'
  92        character.  */
  93     size_t end_pos;
  94
  95     line_info (size_t l, size_t s, size_t e)
  96       : line_num (l), start_pos (s), end_pos (e)
  97     {}
  98
  99     line_info ()
 100       :line_num (0), start_pos (0), end_pos (0)
 101     {}
 102   };
 103
 104   bool needs_read_p () const;
 105   bool needs_grow_p () const;
 106   void maybe_grow ();
 107   bool read_data ();
 108   bool maybe_read_data ();
 109   bool get_next_line (char **line, ssize_t *line_len);
 110   bool read_next_line (char ** line, ssize_t *line_len);
 111   bool goto_next_line ();
 112
 113   static const size_t buffer_size = 4 * 1024;
 114   static const size_t line_record_size = 100;
 115
 116   /* The number of time this file has been accessed.  This is used
 117      to designate which file cache to evict from the cache
 118      array.  */
 119   unsigned m_use_count;
 120
 121   /* The file_path is the key for identifying a particular file in
 122      the cache.
 123      For libcpp-using code, the underlying buffer for this field is
 124      owned by the corresponding _cpp_file within the cpp_reader.  */
 125   const char *m_file_path;
 126
 127   FILE *m_fp;
 128
 129   /* This points to the content of the file that we've read so
 130      far.  */
 131   char *m_data;
 132
 133   /* The allocated buffer to be freed may start a little earlier than DATA,
 134      e.g. if a UTF8 BOM was skipped at the beginning.  */
 135   int m_alloc_offset;
 136
 137   /*  The size of the DATA array above.*/
 138   size_t m_size;
 139
 140   /* The number of bytes read from the underlying file so far.  This
 141      must be less (or equal) than SIZE above.  */
 142   size_t m_nb_read;
 143
 144   /* The index of the beginning of the current line.  */
 145   size_t m_line_start_idx;
 146
 147   /* The number of the previous line read.  This starts at 1.  Zero
 148      means we've read no line so far.  */
 149   size_t m_line_num;
 150
 151   /* This is the total number of lines of the current file.  At the
 152      moment, we try to get this information from the line map
 153      subsystem.  Note that this is just a hint.  When using the C++
 154      front-end, this hint is correct because the input file is then
 155      completely tokenized before parsing starts; so the line map knows
 156      the number of lines before compilation really starts.  For e.g,
 157      the C front-end, it can happen that we start emitting diagnostics
 158      before the line map has seen the end of the file.  */
 159   size_t m_total_lines;
 160
 161   /* Could this file be missing a trailing newline on its final line?
 162      Initially true (to cope with empty files), set to true/false
 163      as each line is read.  */
 164   bool m_missing_trailing_newline;
 165
 166   /* This is a record of the beginning and end of the lines we've seen
 167      while reading the file.  This is useful to avoid walking the data
 168      from the beginning when we are asked to read a line that is
 169      before LINE_START_IDX above.  Note that the maximum size of this
 170      record is line_record_size, so that the memory consumption
 171      doesn't explode.  We thus scale total_lines down to
 172      line_record_size.  */
 173   vec<line_info, va_heap> m_line_record;
 174
 175   void offset_buffer (int offset)
 176   {
 177     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
 178                 : (size_t) offset <= m_size);
 179     gcc_assert (m_data);
 180     m_alloc_offset += offset;
 181     m_data += offset;
 182     m_size -= offset;
 183   }
 184
 185 };
 186
/* Current position in real source file.  It starts out as
   UNKNOWN_LOCATION.  */

location_t input_location = UNKNOWN_LOCATION;

/* The line map table.  The helpers in this file pass it to the linemap_*
   routines when expanding locations and when asking for the highest
   location recorded for a file.  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;
 199
 200 /* Expand the source location LOC into a human readable location.  If
 201    LOC resolves to a builtin location, the file name of the readable
 202    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 203    TRUE and LOC is virtual, then it is resolved to the expansion
 204    point of the involved macro.  Otherwise, it is resolved to the
 205    spelling location of the token.
 206
 207    When resolving to the spelling location of the token, if the
 208    resulting location is for a built-in location (that is, it has no
 209    associated line/column) in the context of a macro expansion, the
 210    returned location is the first one (while unwinding the macro
 211    location towards its expansion point) that is in real source
 212    code.
 213
 214    ASPECT controls which part of the location to use.  */
 215
 216 static expanded_location
 217 expand_location_1 (location_t loc,
 218                    bool expansion_point_p,
 219                    enum location_aspect aspect)
 220 {
 221   expanded_location xloc;
 222   const line_map_ordinary *map;
 223   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 224   tree block = NULL;
 225
 226   if (IS_ADHOC_LOC (loc))
 227     {
 228       block = LOCATION_BLOCK (loc);
 229       loc = LOCATION_LOCUS (loc);
 230     }
 231
 232   memset (&xloc, 0, sizeof (xloc));
 233
 234   if (loc >= RESERVED_LOCATION_COUNT)
 235     {
 236       if (!expansion_point_p)
 237         {
 238           /* We want to resolve LOC to its spelling location.
 239
 240              But if that spelling location is a reserved location that
 241              appears in the context of a macro expansion (like for a
 242              location for a built-in token), let's consider the first
 243              location (toward the expansion point) that is not reserved;
 244              that is, the first location that is in real source code.  */
 245           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 246                                                           loc, NULL);
 247           lrk = LRK_SPELLING_LOCATION;
 248         }
 249       loc = linemap_resolve_location (line_table, loc, lrk, &map);
 250
 251       /* loc is now either in an ordinary map, or is a reserved location.
 252          If it is a compound location, the caret is in a spelling location,
 253          but the start/finish might still be a virtual location.
 254          Depending of what the caller asked for, we may need to recurse
 255          one level in order to resolve any virtual locations in the
 256          end-points.  */
 257       switch (aspect)
 258         {
 259         default:
 260           gcc_unreachable ();
 261           /* Fall through.  */
 262         case LOCATION_ASPECT_CARET:
 263           break;
 264         case LOCATION_ASPECT_START:
 265           {
 266             location_t start = get_start (loc);
 267             if (start != loc)
 268               return expand_location_1 (start, expansion_point_p, aspect);
 269           }
 270           break;
 271         case LOCATION_ASPECT_FINISH:
 272           {
 273             location_t finish = get_finish (loc);
 274             if (finish != loc)
 275               return expand_location_1 (finish, expansion_point_p, aspect);
 276           }
 277           break;
 278         }
 279       xloc = linemap_expand_location (line_table, map, loc);
 280     }
 281
 282   xloc.data = block;
 283   if (loc <= BUILTINS_LOCATION)
 284     xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
 285
 286   return xloc;
 287 }
 288
 289 /* Initialize the set of cache used for files accessed by caret
 290    diagnostic.  */
 291
 292 static void
 293 diagnostic_file_cache_init (void)
 294 {
 295   gcc_assert (global_dc);
 296   if (global_dc->m_file_cache == NULL)
 297     global_dc->m_file_cache = new file_cache ();
 298 }
 299
 300 /* Free the resources used by the set of cache used for files accessed
 301    by caret diagnostic.  */
 302
 303 void
 304 diagnostic_file_cache_fini (void)
 305 {
 306   if (global_dc->m_file_cache)
 307     {
 308       delete global_dc->m_file_cache;
 309       global_dc->m_file_cache = NULL;
 310     }
 311 }
 312
 313 /* Return the total lines number that have been read so far by the
 314    line map (in the preprocessor) so far.  For languages like C++ that
 315    entirely preprocess the input file before starting to parse, this
 316    equals the actual number of lines of the file.  */
 317
 318 static size_t
 319 total_lines_num (const char *file_path)
 320 {
 321   size_t r = 0;
 322   location_t l = 0;
 323   if (linemap_get_file_highest_location (line_table, file_path, &l))
 324     {
 325       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 326       expanded_location xloc = expand_location (l);
 327       r = xloc.line;
 328     }
 329   return r;
 330 }
 331
 332 /* Lookup the cache used for the content of a given file accessed by
 333    caret diagnostic.  Return the found cached file, or NULL if no
 334    cached file was found.  */
 335
 336 file_cache_slot *
 337 file_cache::lookup_file (const char *file_path)
 338 {
 339   gcc_assert (file_path);
 340
 341   /* This will contain the found cached file.  */
 342   file_cache_slot *r = NULL;
 343   for (unsigned i = 0; i < num_file_slots; ++i)
 344     {
 345       file_cache_slot *c = &m_file_slots[i];
 346       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 347         {
 348           c->inc_use_count ();
 349           r = c;
 350         }
 351     }
 352
 353   if (r)
 354     r->inc_use_count ();
 355
 356   return r;
 357 }
 358
 359 /* Purge any mention of FILENAME from the cache of files used for
 360    printing source code.  For use in selftests when working
 361    with tempfiles.  */
 362
 363 void
 364 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 365 {
 366   gcc_assert (file_path);
 367
 368   if (!global_dc->m_file_cache)
 369     return;
 370
 371   global_dc->m_file_cache->forcibly_evict_file (file_path);
 372 }
 373
 374 void
 375 file_cache::forcibly_evict_file (const char *file_path)
 376 {
 377   gcc_assert (file_path);
 378
 379   file_cache_slot *r = lookup_file (file_path);
 380   if (!r)
 381     /* Not found.  */
 382     return;
 383
 384   r->evict ();
 385 }
 386
 387 void
 388 file_cache_slot::evict ()
 389 {
 390   m_file_path = NULL;
 391   if (m_fp)
 392     fclose (m_fp);
 393   m_fp = NULL;
 394   m_nb_read = 0;
 395   m_line_start_idx = 0;
 396   m_line_num = 0;
 397   m_line_record.truncate (0);
 398   m_use_count = 0;
 399   m_total_lines = 0;
 400   m_missing_trailing_newline = true;
 401 }
 402
 403 /* Return the file cache that has been less used, recently, or the
 404    first empty one.  If HIGHEST_USE_COUNT is non-null,
 405    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 406    in the cache table.  */
 407
 408 file_cache_slot*
 409 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 410 {
 411   diagnostic_file_cache_init ();
 412
 413   file_cache_slot *to_evict = &m_file_slots[0];
 414   unsigned huc = to_evict->get_use_count ();
 415   for (unsigned i = 1; i < num_file_slots; ++i)
 416     {
 417       file_cache_slot *c = &m_file_slots[i];
 418       bool c_is_empty = (c->get_file_path () == NULL);
 419
 420       if (c->get_use_count () < to_evict->get_use_count ()
 421           || (to_evict->get_file_path () && c_is_empty))
 422         /* We evict C because it's either an entry with a lower use
 423            count or one that is empty.  */
 424         to_evict = c;
 425
 426       if (huc < c->get_use_count ())
 427         huc = c->get_use_count ();
 428
 429       if (c_is_empty)
 430         /* We've reached the end of the cache; subsequent elements are
 431            all empty.  */
 432         break;
 433     }
 434
 435   if (highest_use_count)
 436     *highest_use_count = huc;
 437
 438   return to_evict;
 439 }
 440
 441 /* Create the cache used for the content of a given file to be
 442    accessed by caret diagnostic.  This cache is added to an array of
 443    cache and can be retrieved by lookup_file_in_cache_tab.  This
 444    function returns the created cache.  Note that only the last
 445    num_file_slots files are cached.  */
 446
 447 file_cache_slot*
 448 file_cache::add_file (const char *file_path)
 449 {
 450
 451   FILE *fp = fopen (file_path, "r");
 452   if (fp == NULL)
 453     return NULL;
 454
 455   unsigned highest_use_count = 0;
 456   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 457   if (!r->create (in_context, file_path, fp, highest_use_count))
 458     return NULL;
 459   return r;
 460 }
 461
 462 /* Populate this slot for use on FILE_PATH and FP, dropping any
 463    existing cached content within it.  */
 464
 465 bool
 466 file_cache_slot::create (const file_cache::input_context &in_context,
 467                          const char *file_path, FILE *fp,
 468                          unsigned highest_use_count)
 469 {
 470   m_file_path = file_path;
 471   if (m_fp)
 472     fclose (m_fp);
 473   m_fp = fp;
 474   if (m_alloc_offset)
 475     offset_buffer (-m_alloc_offset);
 476   m_nb_read = 0;
 477   m_line_start_idx = 0;
 478   m_line_num = 0;
 479   m_line_record.truncate (0);
 480   /* Ensure that this cache entry doesn't get evicted next time
 481      add_file_to_cache_tab is called.  */
 482   m_use_count = ++highest_use_count;
 483   m_total_lines = total_lines_num (file_path);
 484   m_missing_trailing_newline = true;
 485
 486
 487   /* Check the input configuration to determine if we need to do any
 488      transformations, such as charset conversion or BOM skipping.  */
 489   if (const char *input_charset = in_context.ccb (file_path))
 490     {
 491       /* Need a full-blown conversion of the input charset.  */
 492       fclose (m_fp);
 493       m_fp = NULL;
 494       const cpp_converted_source cs
 495         = cpp_get_converted_source (file_path, input_charset);
 496       if (!cs.data)
 497         return false;
 498       if (m_data)
 499         XDELETEVEC (m_data);
 500       m_data = cs.data;
 501       m_nb_read = m_size = cs.len;
 502       m_alloc_offset = cs.data - cs.to_free;
 503     }
 504   else if (in_context.should_skip_bom)
 505     {
 506       if (read_data ())
 507         {
 508           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 509           offset_buffer (offset);
 510           m_nb_read -= offset;
 511         }
 512     }
 513
 514   return true;
 515 }
 516
 517 /* file_cache's ctor.  */
 518
 519 file_cache::file_cache ()
 520 : m_file_slots (new file_cache_slot[num_file_slots])
 521 {
 522   initialize_input_context (nullptr, false);
 523 }
 524
 525 /* file_cache's dtor.  */
 526
 527 file_cache::~file_cache ()
 528 {
 529   delete[] m_file_slots;
 530 }
 531
 532 /* Lookup the cache used for the content of a given file accessed by
 533    caret diagnostic.  If no cached file was found, create a new cache
 534    for this file, add it to the array of cached file and return
 535    it.  */
 536
 537 file_cache_slot*
 538 file_cache::lookup_or_add_file (const char *file_path)
 539 {
 540   file_cache_slot *r = lookup_file (file_path);
 541   if (r == NULL)
 542     r = add_file (file_path);
 543   return r;
 544 }
 545
 546 /* Default constructor for a cache of file used by caret
 547    diagnostic.  */
 548
 549 file_cache_slot::file_cache_slot ()
 550 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
 551   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
 552   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 553 {
 554   m_line_record.create (0);
 555 }
 556
 557 /* Destructor for a cache of file used by caret diagnostic.  */
 558
 559 file_cache_slot::~file_cache_slot ()
 560 {
 561   if (m_fp)
 562     {
 563       fclose (m_fp);
 564       m_fp = NULL;
 565     }
 566   if (m_data)
 567     {
 568       offset_buffer (-m_alloc_offset);
 569       XDELETEVEC (m_data);
 570       m_data = 0;
 571     }
 572   m_line_record.release ();
 573 }
 574
 575 /* Returns TRUE iff the cache would need to be filled with data coming
 576    from the file.  That is, either the cache is empty or full or the
 577    current line is empty.  Note that if the cache is full, it would
 578    need to be extended and filled again.  */
 579
 580 bool
 581 file_cache_slot::needs_read_p () const
 582 {
 583   return m_fp && (m_nb_read == 0
 584           || m_nb_read == m_size
 585           || (m_line_start_idx >= m_nb_read - 1));
 586 }
 587
 588 /*  Return TRUE iff the cache is full and thus needs to be
 589     extended.  */
 590
 591 bool
 592 file_cache_slot::needs_grow_p () const
 593 {
 594   return m_nb_read == m_size;
 595 }
 596
 597 /* Grow the cache if it needs to be extended.  */
 598
 599 void
 600 file_cache_slot::maybe_grow ()
 601 {
 602   if (!needs_grow_p ())
 603     return;
 604
 605   if (!m_data)
 606     {
 607       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 608       m_size = buffer_size;
 609       m_data = XNEWVEC (char, m_size);
 610     }
 611   else
 612     {
 613       const int offset = m_alloc_offset;
 614       offset_buffer (-offset);
 615       m_size *= 2;
 616       m_data = XRESIZEVEC (char, m_data, m_size);
 617       offset_buffer (offset);
 618     }
 619 }
 620
 621 /*  Read more data into the cache.  Extends the cache if need be.
 622     Returns TRUE iff new data could be read.  */
 623
 624 bool
 625 file_cache_slot::read_data ()
 626 {
 627   if (feof (m_fp) || ferror (m_fp))
 628     return false;
 629
 630   maybe_grow ();
 631
 632   char * from = m_data + m_nb_read;
 633   size_t to_read = m_size - m_nb_read;
 634   size_t nb_read = fread (from, 1, to_read, m_fp);
 635
 636   if (ferror (m_fp))
 637     return false;
 638
 639   m_nb_read += nb_read;
 640   return !!nb_read;
 641 }
 642
 643 /* Read new data iff the cache needs to be filled with more data
 644    coming from the file FP.  Return TRUE iff the cache was filled with
 645    mode data.  */
 646
 647 bool
 648 file_cache_slot::maybe_read_data ()
 649 {
 650   if (!needs_read_p ())
 651     return false;
 652   return read_data ();
 653 }
 654
 655 /* Helper function for file_cache_slot::get_next_line (), to find the end of
 656    the next line.  Returns with the memchr convention, i.e. nullptr if a line
 657    terminator was not found.  We need to determine line endings in the same
 658    manner that libcpp does: any of \n, \r\n, or \r is a line ending.  */
 659
 660 static char *
 661 find_end_of_line (char *s, size_t len)
 662 {
 663   for (const auto end = s + len; s != end; ++s)
 664     {
 665       if (*s == '\n')
 666         return s;
 667       if (*s == '\r')
 668         {
 669           const auto next = s + 1;
 670           if (next == end)
 671             {
 672               /* Don't find the line ending if \r is the very last character
 673                  in the buffer; we do not know if it's the end of the file or
 674                  just the end of what has been read so far, and we wouldn't
 675                  want to break in the middle of what's actually a \r\n
 676                  sequence.  Instead, we will handle the case of a file ending
 677                  in a \r later.  */
 678               break;
 679             }
 680           return (*next == '\n' ? next : s);
 681         }
 682     }
 683   return nullptr;
 684 }
 685
 686 /* Read a new line from file FP, using C as a cache for the data
 687    coming from the file.  Upon successful completion, *LINE is set to
 688    the beginning of the line found.  *LINE points directly in the
 689    line cache and is only valid until the next call of get_next_line.
 690    *LINE_LEN is set to the length of the line.  Note that the line
 691    does not contain any terminal delimiter.  This function returns
 692    true if some data was read or process from the cache, false
 693    otherwise.  Note that subsequent calls to get_next_line might
 694    make the content of *LINE invalid.  */
 695
 696 bool
 697 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
 698 {
 699   /* Fill the cache with data to process.  */
 700   maybe_read_data ();
 701
 702   size_t remaining_size = m_nb_read - m_line_start_idx;
 703   if (remaining_size == 0)
 704     /* There is no more data to process.  */
 705     return false;
 706
 707   char *line_start = m_data + m_line_start_idx;
 708
 709   char *next_line_start = NULL;
 710   size_t len = 0;
 711   char *line_end = find_end_of_line (line_start, remaining_size);
 712   if (line_end == NULL)
 713     {
 714       /* We haven't found an end-of-line delimiter in the cache.
 715          Fill the cache with more data from the file and look again.  */
 716       while (maybe_read_data ())
 717         {
 718           line_start = m_data + m_line_start_idx;
 719           remaining_size = m_nb_read - m_line_start_idx;
 720           line_end = find_end_of_line (line_start, remaining_size);
 721           if (line_end != NULL)
 722             {
 723               next_line_start = line_end + 1;
 724               break;
 725             }
 726         }
 727       if (line_end == NULL)
 728         {
 729           /* We've loaded all the file into the cache and still no
 730              terminator.  Let's say the line ends up at one byte past the
 731              end of the file.  This is to stay consistent with the case
 732              of when the line ends up with a terminator and line_end points to
 733              that.  That consistency is useful below in the len calculation.
 734
 735              If the file ends in a \r, we didn't identify it as a line
 736              terminator above, so do that now instead.  */
 737           line_end = m_data + m_nb_read;
 738           if (m_nb_read && line_end[-1] == '\r')
 739             {
 740               --line_end;
 741               m_missing_trailing_newline = false;
 742             }
 743           else
 744             m_missing_trailing_newline = true;
 745         }
 746       else
 747         m_missing_trailing_newline = false;
 748     }
 749   else
 750     {
 751       next_line_start = line_end + 1;
 752       m_missing_trailing_newline = false;
 753     }
 754
 755   if (m_fp && ferror (m_fp))
 756     return false;
 757
 758   /* At this point, we've found the end of the of line.  It either points to
 759      the line terminator or to one byte after the last byte of the file.  */
 760   gcc_assert (line_end != NULL);
 761
 762   len = line_end - line_start;
 763
 764   if (m_line_start_idx < m_nb_read)
 765     *line = line_start;
 766
 767   ++m_line_num;
 768
 769   /* Before we update our line record, make sure the hint about the
 770      total number of lines of the file is correct.  If it's not, then
 771      we give up recording line boundaries from now on.  */
 772   bool update_line_record = true;
 773   if (m_line_num > m_total_lines)
 774     update_line_record = false;
 775
 776     /* Now update our line record so that re-reading lines from the
 777      before m_line_start_idx is faster.  */
 778   if (update_line_record
 779       && m_line_record.length () < line_record_size)
 780     {
 781       /* If the file lines fits in the line record, we just record all
 782          its lines ...*/
 783       if (m_total_lines <= line_record_size
 784           && m_line_num > m_line_record.length ())
 785         m_line_record.safe_push
 786           (file_cache_slot::line_info (m_line_num,
 787                                        m_line_start_idx,
 788                                        line_end - m_data));
 789       else if (m_total_lines > line_record_size)
 790         {
 791           /* ... otherwise, we just scale total_lines down to
 792              (line_record_size lines.  */
 793           size_t n = (m_line_num * line_record_size) / m_total_lines;
 794           if (m_line_record.length () == 0
 795               || n >= m_line_record.length ())
 796             m_line_record.safe_push
 797               (file_cache_slot::line_info (m_line_num,
 798                                            m_line_start_idx,
 799                                            line_end - m_data));
 800         }
 801     }
 802
 803   /* Update m_line_start_idx so that it points to the next line to be
 804      read.  */
 805   if (next_line_start)
 806     m_line_start_idx = next_line_start - m_data;
 807   else
 808     /* We didn't find any terminal '\n'.  Let's consider that the end
 809        of line is the end of the data in the cache.  The next
 810        invocation of get_next_line will either read more data from the
 811        underlying file or return false early because we've reached the
 812        end of the file.  */
 813     m_line_start_idx = m_nb_read;
 814
 815   *line_len = len;
 816
 817   return true;
 818 }
 819
 820 /* Consume the next bytes coming from the cache (or from its
 821    underlying file if there are remaining unread bytes in the file)
 822    until we reach the next end-of-line (or end-of-file).  There is no
 823    copying from the cache involved.  Return TRUE upon successful
 824    completion.  */
 825
 826 bool
 827 file_cache_slot::goto_next_line ()
 828 {
 829   char *l;
 830   ssize_t len;
 831
 832   return get_next_line (&l, &len);
 833 }
 834
 835 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 836    If the line was read successfully, *LINE points to the beginning
 837    of the line in the file cache and *LINE_LEN is the length of the
 838    line.  *LINE is not nul-terminated, but may contain zero bytes.
 839    *LINE is only valid until the next call of read_line_num.
 840    This function returns bool if a line was read.  */
 841
 842 bool
 843 file_cache_slot::read_line_num (size_t line_num,
 844                        char ** line, ssize_t *line_len)
 845 {
 846   gcc_assert (line_num > 0);
 847
 848   if (line_num <= m_line_num)
 849     {
 850       /* We've been asked to read lines that are before m_line_num.
 851          So lets use our line record (if it's not empty) to try to
 852          avoid re-reading the file from the beginning again.  */
 853
 854       if (m_line_record.is_empty ())
 855         {
 856           m_line_start_idx = 0;
 857           m_line_num = 0;
 858         }
 859       else
 860         {
 861           file_cache_slot::line_info *i = NULL;
 862           if (m_total_lines <= line_record_size)
 863             {
 864               /* In languages where the input file is not totally
 865                  preprocessed up front, the m_total_lines hint
 866                  can be smaller than the number of lines of the
 867                  file.  In that case, only the first
 868                  m_total_lines have been recorded.
 869
 870                  Otherwise, the first m_total_lines we've read have
 871                  their start/end recorded here.  */
 872               i = (line_num <= m_total_lines)
 873                 ? &m_line_record[line_num - 1]
 874                 : &m_line_record[m_total_lines - 1];
 875               gcc_assert (i->line_num <= line_num);
 876             }
 877           else
 878             {
 879               /*  So the file had more lines than our line record
 880                   size.  Thus the number of lines we've recorded has
 881                   been scaled down to line_record_size.  Let's
 882                   pick the start/end of the recorded line that is
 883                   closest to line_num.  */
 884               size_t n = (line_num <= m_total_lines)
 885                 ? line_num * line_record_size / m_total_lines
 886                 : m_line_record.length () - 1;
 887               if (n < m_line_record.length ())
 888                 {
 889                   i = &m_line_record[n];
 890                   gcc_assert (i->line_num <= line_num);
 891                 }
 892             }
 893
 894           if (i && i->line_num == line_num)
 895             {
 896               /* We have the start/end of the line.  */
 897               *line = m_data + i->start_pos;
 898               *line_len = i->end_pos - i->start_pos;
 899               return true;
 900             }
 901
 902           if (i)
 903             {
 904               m_line_start_idx = i->start_pos;
 905               m_line_num = i->line_num - 1;
 906             }
 907           else
 908             {
 909               m_line_start_idx = 0;
 910               m_line_num = 0;
 911             }
 912         }
 913     }
 914
 915   /*  Let's walk from line m_line_num up to line_num - 1, without
 916       copying any line.  */
 917   while (m_line_num < line_num - 1)
 918     if (!goto_next_line ())
 919       return false;
 920
 921   /* The line we want is the next one.  Let's read and copy it back to
 922      the caller.  */
 923   return get_next_line (line, line_len);
 924 }
 925
 926 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 927    The line is not nul-terminated.  The returned pointer is only
 928    valid until the next call of location_get_source_line.
 929    Note that the line can contain several null characters,
 930    so the returned value's length has the actual length of the line.
 931    If the function fails, a NULL char_span is returned.  */
 932
 933 char_span
 934 location_get_source_line (const char *file_path, int line)
 935 {
 936   char *buffer = NULL;
 937   ssize_t len;
 938
 939   if (line == 0)
 940     return char_span (NULL, 0);
 941
 942   if (file_path == NULL)
 943     return char_span (NULL, 0);
 944
 945   diagnostic_file_cache_init ();
 946
 947   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 948   if (c == NULL)
 949     return char_span (NULL, 0);
 950
 951   bool read = c->read_line_num (line, &buffer, &len);
 952   if (!read)
 953     return char_span (NULL, 0);
 954
 955   return char_span (buffer, len);
 956 }
 957
 958 /* Return a NUL-terminated copy of the source text between two locations, or
 959    NULL if the arguments are invalid.  The caller is responsible for freeing
 960    the return value.  */
 961
 962 char *
 963 get_source_text_between (location_t start, location_t end)
 964 {
 965   expanded_location expstart =
 966     expand_location_to_spelling_point (start, LOCATION_ASPECT_START);
 967   expanded_location expend =
 968     expand_location_to_spelling_point (end, LOCATION_ASPECT_FINISH);
 969
 970   /* If the locations are in different files or the end comes before the
 971      start, give up and return nothing.  */
 972   if (!expstart.file || !expend.file)
 973     return NULL;
 974   if (strcmp (expstart.file, expend.file) != 0)
 975     return NULL;
 976   if (expstart.line > expend.line)
 977     return NULL;
 978   if (expstart.line == expend.line
 979       && expstart.column > expend.column)
 980     return NULL;
 981   /* These aren't real column numbers, give up.  */
 982   if (expstart.column == 0 || expend.column == 0)
 983     return NULL;
 984
 985   /* For a single line we need to trim both edges.  */
 986   if (expstart.line == expend.line)
 987     {
 988       char_span line = location_get_source_line (expstart.file, expstart.line);
 989       if (line.length () < 1)
 990         return NULL;
 991       int s = expstart.column - 1;
 992       int len = expend.column - s;
 993       if (line.length () < (size_t)expend.column)
 994         return NULL;
 995       return line.subspan (s, len).xstrdup ();
 996     }
 997
 998   struct obstack buf_obstack;
 999   obstack_init (&buf_obstack);
1000
1001   /* Loop through all lines in the range and append each to buf; may trim
1002      parts of the start and end lines off depending on column values.  */
1003   for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1004     {
1005       char_span line = location_get_source_line (expstart.file, lnum);
1006       if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1007         continue;
1008
1009       /* For the first line in the range, only start at expstart.column */
1010       if (lnum == expstart.line)
1011         {
1012           unsigned off = expstart.column - 1;
1013           if (line.length () < off)
1014             return NULL;
1015           line = line.subspan (off, line.length() - off);
1016         }
1017       /* For the last line, don't go past expend.column */
1018       else if (lnum == expend.line)
1019         {
1020           if (line.length () < (size_t)expend.column)
1021             return NULL;
1022           line = line.subspan (0, expend.column);
1023         }
1024
1025       /* Combine spaces at the beginning of later lines.  */
1026       if (lnum > expstart.line)
1027         {
1028           unsigned off;
1029           for (off = 0; off < line.length(); ++off)
1030             if (line[off] != ' ' && line[off] != '\t')
1031               break;
1032           if (off > 0)
1033             {
1034               obstack_1grow (&buf_obstack, ' ');
1035               line = line.subspan (off, line.length() - off);
1036             }
1037         }
1038
1039       /* This does not include any trailing newlines.  */
1040       obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1041     }
1042
1043   /* NUL-terminate and finish the buf obstack.  */
1044   obstack_1grow (&buf_obstack, 0);
1045   const char *buf = (const char *) obstack_finish (&buf_obstack);
1046
1047   return xstrdup (buf);
1048 }
1049
1050 /* Determine if FILE_PATH missing a trailing newline on its final line.
1051    Only valid to call once all of the file has been loaded, by
1052    requesting a line number beyond the end of the file.  */
1053
1054 bool
1055 location_missing_trailing_newline (const char *file_path)
1056 {
1057   diagnostic_file_cache_init ();
1058
1059   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
1060   if (c == NULL)
1061     return false;
1062
1063   return c->missing_trailing_newline_p ();
1064 }
1065
1066 /* Test if the location originates from the spelling location of a
1067    builtin-tokens.  That is, return TRUE if LOC is a (possibly
1068    virtual) location of a built-in token that appears in the expansion
1069    list of a macro.  Please note that this function also works on
1070    tokens that result from built-in tokens.  For instance, the
1071    function would return true if passed a token "4" that is the result
1072    of the expansion of the built-in __LINE__ macro.  */
1073 bool
1074 is_location_from_builtin_token (location_t loc)
1075 {
1076   const line_map_ordinary *map = NULL;
1077   loc = linemap_resolve_location (line_table, loc,
1078                                   LRK_SPELLING_LOCATION, &map);
1079   return loc == BUILTINS_LOCATION;
1080 }
1081
1082 /* Expand the source location LOC into a human readable location.  If
1083    LOC is virtual, it resolves to the expansion point of the involved
1084    macro.  If LOC resolves to a builtin location, the file name of the
1085    readable location is set to the string "<built-in>".  */
1086
1087 expanded_location
1088 expand_location (location_t loc)
1089 {
1090   return expand_location_1 (loc, /*expansion_point_p=*/true,
1091                             LOCATION_ASPECT_CARET);
1092 }
1093
1094 /* Expand the source location LOC into a human readable location.  If
1095    LOC is virtual, it resolves to the expansion location of the
1096    relevant macro.  If LOC resolves to a builtin location, the file
1097    name of the readable location is set to the string
1098    "<built-in>".  */
1099
1100 expanded_location
1101 expand_location_to_spelling_point (location_t loc,
1102                                    enum location_aspect aspect)
1103 {
1104   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1105 }
1106
1107 /* The rich_location class within libcpp requires a way to expand
1108    location_t instances, and relies on the client code
1109    providing a symbol named
1110      linemap_client_expand_location_to_spelling_point
1111    to do this.
1112
1113    This is the implementation for libcommon.a (all host binaries),
1114    which simply calls into expand_location_1.  */
1115
1116 expanded_location
1117 linemap_client_expand_location_to_spelling_point (location_t loc,
1118                                                   enum location_aspect aspect)
1119 {
1120   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
1121 }
1122
1123
1124 /* If LOCATION is in a system header and if it is a virtual location
1125    for a token coming from the expansion of a macro, unwind it to
1126    the location of the expansion point of the macro.  If the expansion
1127    point is also in a system header return the original LOCATION.
1128    Otherwise, return the location of the expansion point.
1129
1130    This is used for instance when we want to emit diagnostics about a
1131    token that may be located in a macro that is itself defined in a
1132    system header, for example, for the NULL macro.  In such a case, if
1133    LOCATION were passed directly to diagnostic functions such as
1134    warning_at, the diagnostic would be suppressed (unless
1135    -Wsystem-headers).  */
1136
1137 location_t
1138 expansion_point_location_if_in_system_header (location_t location)
1139 {
1140   if (!in_system_header_at (location))
1141     return location;
1142
1143   location_t xloc = linemap_resolve_location (line_table, location,
1144                                               LRK_MACRO_EXPANSION_POINT,
1145                                               NULL);
1146   return in_system_header_at (xloc) ? location : xloc;
1147 }
1148
1149 /* If LOCATION is a virtual location for a token coming from the expansion
1150    of a macro, unwind to the location of the expansion point of the macro.  */
1151
1152 location_t
1153 expansion_point_location (location_t location)
1154 {
1155   return linemap_resolve_location (line_table, location,
1156                                    LRK_MACRO_EXPANSION_POINT, NULL);
1157 }
1158
1159 /* Construct a location with caret at CARET, ranging from START to
1160    finish e.g.
1161
1162                  11111111112
1163         12345678901234567890
1164      522
1165      523   return foo + bar;
1166                   ~~~~^~~~~
1167      524
1168
1169    The location's caret is at the "+", line 523 column 15, but starts
1170    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1171    of "bar" at column 19.  */
1172
1173 location_t
1174 make_location (location_t caret, location_t start, location_t finish)
1175 {
1176   location_t pure_loc = get_pure_location (caret);
1177   source_range src_range;
1178   src_range.m_start = get_start (start);
1179   src_range.m_finish = get_finish (finish);
1180   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1181                                                    pure_loc,
1182                                                    src_range,
1183                                                    NULL,
1184                                                    0);
1185   return combined_loc;
1186 }
1187
1188 /* Same as above, but taking a source range rather than two locations.  */
1189
1190 location_t
1191 make_location (location_t caret, source_range src_range)
1192 {
1193   location_t pure_loc = get_pure_location (caret);
1194   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL, 0);
1195 }
1196
1197 /* An expanded_location stores the column in byte units.  This function
1198    converts that column to display units.  That requires reading the associated
1199    source line in order to calculate the display width.  If that cannot be done
1200    for any reason, then returns the byte column as a fallback.  */
1201 int
1202 location_compute_display_column (expanded_location exploc,
1203                                  const cpp_char_column_policy &policy)
1204 {
1205   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1206     return exploc.column;
1207   char_span line = location_get_source_line (exploc.file, exploc.line);
1208   /* If line is NULL, this function returns exploc.column which is the
1209      desired fallback.  */
1210   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1211                                             exploc.column, policy);
1212 }
1213
1214 /* Dump statistics to stderr about the memory usage of the line_table
1215    set of line maps.  This also displays some statistics about macro
1216    expansion.  */
1217
1218 void
1219 dump_line_table_statistics (void)
1220 {
1221   struct linemap_stats s;
1222   long total_used_map_size,
1223     macro_maps_size,
1224     total_allocated_map_size;
1225
1226   memset (&s, 0, sizeof (s));
1227
1228   linemap_get_statistics (line_table, &s);
1229
1230   macro_maps_size = s.macro_maps_used_size
1231     + s.macro_maps_locations_size;
1232
1233   total_allocated_map_size = s.ordinary_maps_allocated_size
1234     + s.macro_maps_allocated_size
1235     + s.macro_maps_locations_size;
1236
1237   total_used_map_size = s.ordinary_maps_used_size
1238     + s.macro_maps_used_size
1239     + s.macro_maps_locations_size;
1240
1241   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1242            s.num_expanded_macros);
1243   if (s.num_expanded_macros != 0)
1244     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1245              s.num_macro_tokens / s.num_expanded_macros);
1246   fprintf (stderr,
1247            "\nLine Table allocations during the "
1248            "compilation process\n");
1249   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1250            SIZE_AMOUNT (s.num_ordinary_maps_used));
1251   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1252            SIZE_AMOUNT (s.ordinary_maps_used_size));
1253   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1254            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1255   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1256            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1257   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1258            SIZE_AMOUNT (s.num_macro_maps_used));
1259   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1260            SIZE_AMOUNT (s.macro_maps_used_size));
1261   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1262            SIZE_AMOUNT (s.macro_maps_locations_size));
1263   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1264            SIZE_AMOUNT (macro_maps_size));
1265   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1266            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1267   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1268            SIZE_AMOUNT (total_allocated_map_size));
1269   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1270            SIZE_AMOUNT (total_used_map_size));
1271   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1272            SIZE_AMOUNT (s.adhoc_table_size));
1273   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1274            SIZE_AMOUNT (s.adhoc_table_entries_used));
1275   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1276            SIZE_AMOUNT (line_table->num_optimized_ranges));
1277   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1278            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1279
1280   fprintf (stderr, "\n");
1281 }
1282
1283 /* Get location one beyond the final location in ordinary map IDX.  */
1284
1285 static location_t
1286 get_end_location (class line_maps *set, unsigned int idx)
1287 {
1288   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1289     return set->highest_location;
1290
1291   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1292   return MAP_START_LOCATION (next_map);
1293 }
1294
/* Helper function for write_digit_row.
   Write the decimal digit for the units of DIGIT (i.e. DIGIT modulo 10)
   to STREAM.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  const int ch = '0' + units;
  fputc (ch, stream);
}
1302
1303 /* Helper function for dump_location_info.
1304    Write a row of numbers to STREAM, numbering a source line,
1305    giving the units, tens, hundreds etc of the column number.  */
1306
1307 static void
1308 write_digit_row (FILE *stream, int indent,
1309                  const line_map_ordinary *map,
1310                  location_t loc, int max_col, int divisor)
1311 {
1312   fprintf (stream, "%*c", indent, ' ');
1313   fprintf (stream, "|");
1314   for (int column = 1; column < max_col; column++)
1315     {
1316       location_t column_loc = loc + (column << map->m_range_bits);
1317       write_digit (stream, column_loc / divisor);
1318     }
1319   fprintf (stream, "\n");
1320 }
1321
1322 /* Write a half-closed (START) / half-open (END) interval of
1323    location_t to STREAM.  */
1324
1325 static void
1326 dump_location_range (FILE *stream,
1327                      location_t start, location_t end)
1328 {
1329   fprintf (stream,
1330            "  location_t interval: %u <= loc < %u\n",
1331            start, end);
1332 }
1333
1334 /* Write a labelled description of a half-closed (START) / half-open (END)
1335    interval of location_t to STREAM.  */
1336
1337 static void
1338 dump_labelled_location_range (FILE *stream,
1339                               const char *name,
1340                               location_t start, location_t end)
1341 {
1342   fprintf (stream, "%s\n", name);
1343   dump_location_range (stream, start, end);
1344   fprintf (stream, "\n");
1345 }
1346
1347 /* Write a visualization of the locations in the line_table to STREAM.  */
1348
1349 void
1350 dump_location_info (FILE *stream)
1351 {
1352   /* Visualize the reserved locations.  */
1353   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1354                                 0, RESERVED_LOCATION_COUNT);
1355
1356   /* Visualize the ordinary line_map instances, rendering the sources. */
1357   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1358     {
1359       location_t end_location = get_end_location (line_table, idx);
1360       /* half-closed: doesn't include this one. */
1361
1362       const line_map_ordinary *map
1363         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1364       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1365       dump_location_range (stream,
1366                            MAP_START_LOCATION (map), end_location);
1367       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1368       fprintf (stream, "  starting at line: %i\n",
1369                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1370       fprintf (stream, "  column and range bits: %i\n",
1371                map->m_column_and_range_bits);
1372       fprintf (stream, "  column bits: %i\n",
1373                map->m_column_and_range_bits - map->m_range_bits);
1374       fprintf (stream, "  range bits: %i\n",
1375                map->m_range_bits);
1376       const char * reason;
1377       switch (map->reason) {
1378       case LC_ENTER:
1379         reason = "LC_ENTER";
1380         break;
1381       case LC_LEAVE:
1382         reason = "LC_LEAVE";
1383         break;
1384       case LC_RENAME:
1385         reason = "LC_RENAME";
1386         break;
1387       case LC_RENAME_VERBATIM:
1388         reason = "LC_RENAME_VERBATIM";
1389         break;
1390       case LC_ENTER_MACRO:
1391         reason = "LC_RENAME_MACRO";
1392         break;
1393       default:
1394         reason = "Unknown";
1395       }
1396       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1397
1398       const line_map_ordinary *includer_map
1399         = linemap_included_from_linemap (line_table, map);
1400       fprintf (stream, "  included from location: %d",
1401                linemap_included_from (map));
1402       if (includer_map) {
1403         fprintf (stream, " (in ordinary map %d)",
1404                  int (includer_map - line_table->info_ordinary.maps));
1405       }
1406       fprintf (stream, "\n");
1407
1408       /* Render the span of source lines that this "map" covers.  */
1409       for (location_t loc = MAP_START_LOCATION (map);
1410            loc < end_location;
1411            loc += (1 << map->m_range_bits) )
1412         {
1413           gcc_assert (pure_location_p (line_table, loc) );
1414
1415           expanded_location exploc
1416             = linemap_expand_location (line_table, map, loc);
1417
1418           if (exploc.column == 0)
1419             {
1420               /* Beginning of a new source line: draw the line.  */
1421
1422               char_span line_text = location_get_source_line (exploc.file,
1423                                                               exploc.line);
1424               if (!line_text)
1425                 break;
1426               fprintf (stream,
1427                        "%s:%3i|loc:%5i|%.*s\n",
1428                        exploc.file, exploc.line,
1429                        loc,
1430                        (int)line_text.length (), line_text.get_buffer ());
1431
1432               /* "loc" is at column 0, which means "the whole line".
1433                  Render the locations *within* the line, by underlining
1434                  it, showing the location_t numeric values
1435                  at each column.  */
1436               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1437               if (max_col > line_text.length ())
1438                 max_col = line_text.length () + 1;
1439
1440               int len_lnum = num_digits (exploc.line);
1441               if (len_lnum < 3)
1442                 len_lnum = 3;
1443               int len_loc = num_digits (loc);
1444               if (len_loc < 5)
1445                 len_loc = 5;
1446
1447               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1448
1449               /* Thousands.  */
1450               if (end_location > 999)
1451                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1452
1453               /* Hundreds.  */
1454               if (end_location > 99)
1455                 write_digit_row (stream, indent, map, loc, max_col, 100);
1456
1457               /* Tens.  */
1458               write_digit_row (stream, indent, map, loc, max_col, 10);
1459
1460               /* Units.  */
1461               write_digit_row (stream, indent, map, loc, max_col, 1);
1462             }
1463         }
1464       fprintf (stream, "\n");
1465     }
1466
1467   /* Visualize unallocated values.  */
1468   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1469                                 line_table->highest_location,
1470                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1471
1472   /* Visualize the macro line_map instances, rendering the sources. */
1473   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1474     {
1475       /* Each macro map that is allocated owns location_t values
1476          that are *lower* that the one before them.
1477          Hence it's meaningful to view them either in order of ascending
1478          source locations, or in order of ascending macro map index.  */
1479       const bool ascending_location_ts = true;
1480       unsigned int idx = (ascending_location_ts
1481                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1482                           : i);
1483       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1484       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1485                idx,
1486                linemap_map_get_macro_name (map),
1487                MACRO_MAP_NUM_MACRO_TOKENS (map));
1488       dump_location_range (stream,
1489                            map->start_location,
1490                            (map->start_location
1491                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1492       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1493               "expansion point is location %i",
1494               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1495       fprintf (stream, "  map->start_location: %u\n",
1496                map->start_location);
1497
1498       fprintf (stream, "  macro_locations:\n");
1499       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1500         {
1501           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1502           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1503
1504           /* linemap_add_macro_token encodes token numbers in an expansion
1505              by putting them after MAP_START_LOCATION. */
1506
1507           /* I'm typically seeing 4 uninitialized entries at the end of
1508              0xafafafaf.
1509              This appears to be due to macro.cc:replace_args
1510              adding 2 extra args for padding tokens; presumably there may
1511              be a leading and/or trailing padding token injected,
1512              each for 2 more location slots.
1513              This would explain there being up to 4 location_ts slots
1514              that may be uninitialized.  */
1515
1516           fprintf (stream, "    %u: %u, %u\n",
1517                    i,
1518                    x,
1519                    y);
1520           if (x == y)
1521             {
1522               if (x < MAP_START_LOCATION (map))
1523                 inform (x, "token %u has %<x-location == y-location == %u%>",
1524                         i, x);
1525               else
1526                 fprintf (stream,
1527                          "x-location == y-location == %u encodes token # %u\n",
1528                          x, x - MAP_START_LOCATION (map));
1529                 }
1530           else
1531             {
1532               inform (x, "token %u has %<x-location == %u%>", i, x);
1533               inform (x, "token %u has %<y-location == %u%>", i, y);
1534             }
1535         }
1536       fprintf (stream, "\n");
1537     }
1538
1539   /* It appears that MAX_LOCATION_T itself is never assigned to a
1540      macro map, presumably due to an off-by-one error somewhere
1541      between the logic in linemap_enter_macro and
1542      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1543   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1544                                 MAX_LOCATION_T,
1545                                 MAX_LOCATION_T + 1);
1546
1547   /* Visualize ad-hoc values.  */
1548   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1549                                 MAX_LOCATION_T + 1, UINT_MAX);
1550 }
1551
1552 /* string_concat's constructor.  */
1553
1554 string_concat::string_concat (int num, location_t *locs)
1555   : m_num (num)
1556 {
1557   m_locs = ggc_vec_alloc <location_t> (num);
1558   for (int i = 0; i < num; i++)
1559     m_locs[i] = locs[i];
1560 }
1561
1562 /* string_concat_db's constructor.  */
1563
1564 string_concat_db::string_concat_db ()
1565 {
1566   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1567 }
1568
1569 /* Record that a string concatenation occurred, covering NUM
1570    string literal tokens.  LOCS is an array of size NUM, containing the
1571    locations of the tokens.  A copy of LOCS is taken.  */
1572
1573 void
1574 string_concat_db::record_string_concatenation (int num, location_t *locs)
1575 {
1576   gcc_assert (num > 1);
1577   gcc_assert (locs);
1578
1579   location_t key_loc = get_key_loc (locs[0]);
1580   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1581      any data now recorded under key 'key_loc' would be overwritten by a
1582      subsequent call with the same key 'key_loc'.  */
1583   if (RESERVED_LOCATION_P (key_loc))
1584     return;
1585
1586   string_concat *concat
1587     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1588   m_table->put (key_loc, concat);
1589 }
1590
1591 /* Determine if LOC was the location of the initial token of a
1592    concatenation of string literal tokens.
1593    If so, *OUT_NUM is written to with the number of tokens, and
1594    *OUT_LOCS with the location of an array of locations of the
1595    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1596    storage owned by the string_concat_db.
1597    Otherwise, return false.  */
1598
1599 bool
1600 string_concat_db::get_string_concatenation (location_t loc,
1601                                             int *out_num,
1602                                             location_t **out_locs)
1603 {
1604   gcc_assert (out_num);
1605   gcc_assert (out_locs);
1606
1607   location_t key_loc = get_key_loc (loc);
1608   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1609      discussion in 'string_concat_db::record_string_concatenation'.  */
1610   if (RESERVED_LOCATION_P (key_loc))
1611     return false;
1612
1613   string_concat **concat = m_table->get (key_loc);
1614   if (!concat)
1615     return false;
1616
1617   *out_num = (*concat)->m_num;
1618   *out_locs =(*concat)->m_locs;
1619   return true;
1620 }
1621
1622 /* Internal function.  Canonicalize LOC into a form suitable for
1623    use as a key within the database, stripping away macro expansion,
1624    ad-hoc information, and range information, using the location of
1625    the start of LOC within an ordinary linemap.  */
1626
1627 location_t
1628 string_concat_db::get_key_loc (location_t loc)
1629 {
1630   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1631                                   NULL);
1632
1633   loc = get_range_from_loc (line_table, loc).m_start;
1634
1635   return loc;
1636 }
1637
1638 /* Helper class for use within get_substring_ranges_for_loc.
1639    An vec of cpp_string with responsibility for releasing all of the
1640    str->text for each str in the vector.  */
1641
1642 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1643 {
1644  public:
1645   auto_cpp_string_vec (int alloc)
1646     : auto_vec <cpp_string> (alloc) {}
1647
1648   ~auto_cpp_string_vec ()
1649   {
1650     /* Clean up the copies within this vec.  */
1651     int i;
1652     cpp_string *str;
1653     FOR_EACH_VEC_ELT (*this, i, str)
1654       free (const_cast <unsigned char *> (str->text));
1655   }
1656 };
1657
/* Attempt to populate RANGES with source location information on the
   individual characters within the string literal found at STRLOC.
   If CONCATS is non-NULL, then any string literals that the token at
   STRLOC was concatenated with are also added to RANGES.

   PFILE must be non-NULL: its options are consulted, and it is handed
   (together with TYPE) to cpp_interpret_string_ranges.

   Return NULL if successful, or an error message if any errors occurred (in
   which case RANGES may be only partially populated and should not
   be used).  Failures detected here are reported as string literals; a
   failure from cpp_interpret_string_ranges is passed through unchanged.

   This is implemented by re-parsing the relevant source line(s).  */

static const char *
get_substring_ranges_for_loc (cpp_reader *pfile,
                              string_concat_db *concats,
                              location_t strloc,
                              enum cpp_ttype type,
                              cpp_substring_ranges &ranges)
{
  gcc_assert (pfile);

  if (strloc == UNKNOWN_LOCATION)
    return "unknown location";

  /* Reparsing the strings requires accurate location information.
     If -ftrack-macro-expansion has been overridden from its default
     of 2, then we might have a location of a macro expansion point,
     rather than the location of the literal itself.
     Avoid this by requiring that we have full macro expansion tracking
     for substring locations to be available.  */
  if (cpp_get_options (pfile)->track_macro_expansion != 2)
    return "track_macro_expansion != 2";

  /* If #line or # 44 "file"-style directives are present, then there's
     no guarantee that the line numbers we have can be used to locate
     the strings.  For example, we might have a .i file with # directives
     pointing back to lines within a .c file, but the .c file might
     have been edited since the .i file was created.
     In such a case, the safest course is to disable on-demand substring
     locations.  */
  if (line_table->seen_line_directive)
    return "seen line directive";

  /* If string concatenation has occurred at STRLOC, get the locations
     of all of the literal tokens making up the compound string.
     Otherwise, just use STRLOC.  */
  int num_locs = 1;
  location_t *strlocs = &strloc;
  if (concats)
    concats->get_string_concatenation (strloc, &num_locs, &strlocs);

  /* One copied literal and one location reader is pushed per token; both
     vectors are handed to cpp_interpret_string_ranges below.  */
  auto_cpp_string_vec strs (num_locs);
  auto_vec <cpp_string_location_reader> loc_readers (num_locs);
  for (int i = 0; i < num_locs; i++)
    {
      /* Get range of strloc.  We will use it to locate the start and finish
         of the literal token within the line.  */
      source_range src_range = get_range_from_loc (line_table, strlocs[i]);

      if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
        {
          /* If the string token was within a macro expansion, then we can
             cope with it for the simple case where we have a single token.
             Otherwise, bail out.  */
          if (src_range.m_start != src_range.m_finish)
            return "macro expansion";
        }
      else
        {
          if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
            /* If so, we can't reliably determine where the token started within
               its line.  */
            return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";

          if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
            /* If so, we can't reliably determine where the token finished
               within its line.  */
            return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
        }

      /* Resolve both endpoints to file/line/column, and insist that they
         describe a forward span within a single line of a single file.  */
      expanded_location start
        = expand_location_to_spelling_point (src_range.m_start,
                                             LOCATION_ASPECT_START);
      expanded_location finish
        = expand_location_to_spelling_point (src_range.m_finish,
                                             LOCATION_ASPECT_FINISH);
      if (start.file != finish.file)
        return "range endpoints are in different files";
      if (start.line != finish.line)
        return "range endpoints are on different lines";
      if (start.column > finish.column)
        return "range endpoints are reversed";

      char_span line = location_get_source_line (start.file, start.line);
      if (!line)
        return "unable to read source line";

      /* Determine the location of the literal (including quotes
         and leading prefix chars, such as the 'u' in a u""
         token).  START.column <= FINISH.column was established above,
         so the length is at least 1.  */
      size_t literal_length = finish.column - start.column + 1;

      /* Ensure that we don't crash if we got the wrong location.  */
      if (start.column < 1)
        return "zero start column";
      if (line.length () < (start.column - 1 + literal_length))
        return "line is not wide enough";

      /* Columns are 1-based, whereas the subspan offset is 0-based.  */
      char_span literal = line.subspan (start.column - 1, literal_length);

      cpp_string from;
      from.len = literal_length;
      /* Make a copy of the literal, to avoid having to rely on
         the lifetime of the copy of the line within the cache.
         This will be released by the auto_cpp_string_vec dtor.  */
      from.text = (unsigned char *)literal.xstrdup ();
      strs.safe_push (from);

      /* For very long lines, a new linemap could have started
         halfway through the token.
         Ensure that the loc_reader uses the linemap of the
         *end* of the token for its start location.  */
      const line_map_ordinary *start_ord_map;
      linemap_resolve_location (line_table, src_range.m_start,
                                LRK_SPELLING_LOCATION, &start_ord_map);
      const line_map_ordinary *final_ord_map;
      linemap_resolve_location (line_table, src_range.m_finish,
                                LRK_SPELLING_LOCATION, &final_ord_map);
      if (start_ord_map == NULL || final_ord_map == NULL)
        return "failed to get ordinary maps";
      /* Bulletproofing.  We ought to only have different ordinary maps
         for start vs finish due to line-length jumps.  */
      if (start_ord_map != final_ord_map
          && start_ord_map->to_file != final_ord_map->to_file)
        return "start and finish are spelled in different ordinary maps";
      /* The file from linemap_resolve_location ought to match that from
         expand_location_to_spelling_point.  */
      if (start_ord_map->to_file != start.file)
        return "mismatching file after resolving linemap";

      /* Express the token's start line/column within FINAL_ORD_MAP, per the
         note above about very long lines.  */
      location_t start_loc
        = linemap_position_for_line_and_column (line_table, final_ord_map,
                                                start.line, start.column);

      cpp_string_location_reader loc_reader (start_loc, line_table);
      loc_readers.safe_push (loc_reader);
    }

  /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
  const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
                                                 loc_readers.address (),
                                                 num_locs, &ranges, type);
  if (err)
    return err;

  /* Success: "ranges" should now contain information on the string.  */
  return NULL;
}
1815
1816 /* Attempt to populate *OUT_LOC with source location information on the
1817    given characters within the string literal found at STRLOC.
1818    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1819    character set.
1820
1821    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1822    and string literal "012345\n789"
1823    *OUT_LOC is written to with:
1824      "012345\n789"
1825          ~^~~~~
1826
1827    If CONCATS is non-NULL, then any string literals that the token at
1828    STRLOC was concatenated with are also considered.
1829
1830    This is implemented by re-parsing the relevant source line(s).
1831
1832    Return NULL if successful, or an error message if any errors occurred.
1833    Error messages are intended for GCC developers (to help debugging) rather
1834    than for end-users.  */
1835
1836 const char *
1837 get_location_within_string (cpp_reader *pfile,
1838                             string_concat_db *concats,
1839                             location_t strloc,
1840                             enum cpp_ttype type,
1841                             int caret_idx, int start_idx, int end_idx,
1842                             location_t *out_loc)
1843 {
1844   gcc_checking_assert (caret_idx >= 0);
1845   gcc_checking_assert (start_idx >= 0);
1846   gcc_checking_assert (end_idx >= 0);
1847   gcc_assert (out_loc);
1848
1849   cpp_substring_ranges ranges;
1850   const char *err
1851     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1852   if (err)
1853     return err;
1854
1855   if (caret_idx >= ranges.get_num_ranges ())
1856     return "caret_idx out of range";
1857   if (start_idx >= ranges.get_num_ranges ())
1858     return "start_idx out of range";
1859   if (end_idx >= ranges.get_num_ranges ())
1860     return "end_idx out of range";
1861
1862   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1863                             ranges.get_range (start_idx).m_start,
1864                             ranges.get_range (end_idx).m_finish);
1865   return NULL;
1866 }
1867
1868 /* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1869
1870 location_t
1871 location_with_discriminator (location_t locus, int discriminator)
1872 {
1873   tree block = LOCATION_BLOCK (locus);
1874   source_range src_range = get_range_from_loc (line_table, locus);
1875   locus = get_pure_location (locus);
1876
1877   if (locus == UNKNOWN_LOCATION)
1878     return locus;
1879
1880   return COMBINE_LOCATION_DATA (line_table, locus, src_range, block, discriminator);
1881 }
1882
1883 /* Return TRUE if LOCUS represents a location with a discriminator.  */
1884
1885 bool
1886 has_discriminator (location_t locus)
1887 {
1888   return get_discriminator_from_loc (locus) != 0;
1889 }
1890
1891 /* Return the discriminator for LOCUS.  */
1892
1893 int
1894 get_discriminator_from_loc (location_t locus)
1895 {
1896   return get_discriminator_from_loc (line_table, locus);
1897 }
1898
1899 #if CHECKING_P
1900
1901 namespace selftest {
1902
1903 /* Selftests of location handling.  */
1904
1905 /* Attempt to populate *OUT_RANGE with source location information on the
1906    given character within the string literal found at STRLOC.
1907    CHAR_IDX refers to an offset within the execution character set.
1908    If CONCATS is non-NULL, then any string literals that the token at
1909    STRLOC was concatenated with are also considered.
1910
1911    This is implemented by re-parsing the relevant source line(s).
1912
1913    Return NULL if successful, or an error message if any errors occurred.
1914    Error messages are intended for GCC developers (to help debugging) rather
1915    than for end-users.  */
1916
1917 static const char *
1918 get_source_range_for_char (cpp_reader *pfile,
1919                            string_concat_db *concats,
1920                            location_t strloc,
1921                            enum cpp_ttype type,
1922                            int char_idx,
1923                            source_range *out_range)
1924 {
1925   gcc_checking_assert (char_idx >= 0);
1926   gcc_assert (out_range);
1927
1928   cpp_substring_ranges ranges;
1929   const char *err
1930     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1931   if (err)
1932     return err;
1933
1934   if (char_idx >= ranges.get_num_ranges ())
1935     return "char_idx out of range";
1936
1937   *out_range = ranges.get_range (char_idx);
1938   return NULL;
1939 }
1940
1941 /* As get_source_range_for_char, but write to *OUT the number
1942    of ranges that are available.  */
1943
1944 static const char *
1945 get_num_source_ranges_for_substring (cpp_reader *pfile,
1946                                      string_concat_db *concats,
1947                                      location_t strloc,
1948                                      enum cpp_ttype type,
1949                                      int *out)
1950 {
1951   gcc_assert (out);
1952
1953   cpp_substring_ranges ranges;
1954   const char *err
1955     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1956
1957   if (err)
1958     return err;
1959
1960   *out = ranges.get_num_ranges ();
1961   return NULL;
1962 }
1963
1964 /* Selftests of location handling.  */
1965
1966 /* Verify that compare() on linenum_type handles comparisons over the full
1967    range of the type.  */
1968
1969 static void
1970 test_linenum_comparisons ()
1971 {
1972   linenum_type min_line (0);
1973   linenum_type max_line (0xffffffff);
1974   ASSERT_EQ (0, compare (min_line, min_line));
1975   ASSERT_EQ (0, compare (max_line, max_line));
1976
1977   ASSERT_GT (compare (max_line, min_line), 0);
1978   ASSERT_LT (compare (min_line, max_line), 0);
1979 }
1980
1981 /* Helper function for verifying location data: when location_t
1982    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1983    as having column 0.  */
1984
1985 static bool
1986 should_have_column_data_p (location_t loc)
1987 {
1988   if (IS_ADHOC_LOC (loc))
1989     loc = get_location_from_adhoc_loc (line_table, loc);
1990   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1991     return false;
1992   return true;
1993 }
1994
/* Selftest for should_have_column_data_p, probing either side of the
   LINE_MAP_MAX_LOCATION_WITH_COLS threshold.  */

static void
test_should_have_column_data_p ()
{
  ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
  /* The threshold value itself still counts as having column data: the
     helper only rejects locations strictly greater than it.  */
  ASSERT_TRUE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
  ASSERT_FALSE
    (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
}
2006
2007 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2008    on LOC.  */
2009
2010 static void
2011 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2012               location_t loc)
2013 {
2014   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2015   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2016   /* If location_t values are sufficiently high, then column numbers
2017      will be unavailable and LOCATION_COLUMN (loc) will be 0.
2018      When close to the threshold, column numbers *may* be present: if
2019      the final linemap before the threshold contains a line that straddles
2020      the threshold, locations in that line have column information.  */
2021   if (should_have_column_data_p (loc))
2022     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
2023 }
2024
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.cc: there are various threshold
   values for location_t beyond which line-map.cc changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following class describes a particular case within our test
   matrix.  */

class line_table_case
{
public:
  /* Value to install as line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If nonzero, the location value at which the line_table should start.  */
  int m_base_location;

  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }
};
2052
2053 /* Constructor.  Store the old value of line_table, and create a new
2054    one, using sane defaults.  */
2055
2056 line_table_test::line_table_test ()
2057 {
2058   gcc_assert (saved_line_table == NULL);
2059   saved_line_table = line_table;
2060   line_table = ggc_alloc<line_maps> ();
2061   linemap_init (line_table, BUILTINS_LOCATION);
2062   gcc_assert (saved_line_table->reallocator);
2063   line_table->reallocator = saved_line_table->reallocator;
2064   gcc_assert (saved_line_table->round_alloc_size);
2065   line_table->round_alloc_size = saved_line_table->round_alloc_size;
2066   line_table->default_range_bits = 0;
2067 }
2068
2069 /* Constructor.  Store the old value of line_table, and create a new
2070    one, using the sitation described in CASE_.  */
2071
2072 line_table_test::line_table_test (const line_table_case &case_)
2073 {
2074   gcc_assert (saved_line_table == NULL);
2075   saved_line_table = line_table;
2076   line_table = ggc_alloc<line_maps> ();
2077   linemap_init (line_table, BUILTINS_LOCATION);
2078   gcc_assert (saved_line_table->reallocator);
2079   line_table->reallocator = saved_line_table->reallocator;
2080   gcc_assert (saved_line_table->round_alloc_size);
2081   line_table->round_alloc_size = saved_line_table->round_alloc_size;
2082   line_table->default_range_bits = case_.m_default_range_bits;
2083   if (case_.m_base_location)
2084     {
2085       line_table->highest_location = case_.m_base_location;
2086       line_table->highest_line = case_.m_base_location;
2087     }
2088 }
2089
2090 /* Destructor.  Restore the old value of line_table.  */
2091
2092 line_table_test::~line_table_test ()
2093 {
2094   gcc_assert (saved_line_table != NULL);
2095   line_table = saved_line_table;
2096   saved_line_table = NULL;
2097 }
2098
/* Verify basic operation of ordinary linemaps: build locations for a
   range of line widths across two files within the line table described
   by CASE_, then check that file/line/column can be recovered from them.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations. */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  /* Transitioning back to a short line.  */
  linemap_line_start (line_table, 4, 0);
  location_t loc_back_to_short = linemap_position_for_column (line_table, 100);

  if (should_have_column_data_p (loc_back_to_short))
    {
      /* Verify that we switched to short lines in the linemap: the bits
         left for the column, once the range bits are discounted, should
         number 7.  */
      line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
      ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
    }

  /* Example of a line that will eventually be seen to be longer
     than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
     below that.  */
  linemap_line_start (line_table, 5, 2000);

  location_t loc_start_of_very_long_line
    = linemap_position_for_column (line_table, 2000);
  location_t loc_too_wide
    = linemap_position_for_column (line_table, 4097);
  location_t loc_too_wide_2
    = linemap_position_for_column (line_table, 4098);

  /* ...and back to a sane line length.  */
  linemap_line_start (line_table, 6, 100);
  location_t loc_sane_again = linemap_position_for_column (line_table, 10);

  /* Leave foo.c.  */
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("foo.c", 4, 100, loc_back_to_short);

  /* In the very wide line, the initial location should be fully tracked.  */
  assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
  /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
     be disabled.  */
  assert_loceq ("foo.c", 5, 0, loc_too_wide);
  assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
  /*...and column-tracking should be re-enabled for subsequent lines.  */
  assert_loceq ("foo.c", 6, 10, loc_sane_again);

  assert_loceq ("bar.c", 1, 150, loc_f);

  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it.  The caret is LOC_C; the range runs from LOC_B to
     LOC_D.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
2187
/* Verify various properties of UNKNOWN_LOCATION: it should report no
   file, and zero for both line and column.  */

static void
test_unknown_location ()
{
  ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
}
2197
/* Verify various properties of BUILTINS_LOCATION: it should be reported
   under the special built-in filename at line 0, column 0, and be
   recognized as coming from a builtin token.  */

static void
test_builtins ()
{
  assert_loceq (special_fname_builtin (), 0, 0, BUILTINS_LOCATION);
  ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
}
2206
/* Regression test for make_location.
   Ensure that we use pure locations for the start/finish of the range,
   rather than storing a packed or ad-hoc range as the start/finish.
   CASE_ describes the line table to run the test within.  */

static void
test_make_location_nonpure_range_endpoints (const line_table_case &case_)
{
  /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
     with C++ frontend.
     ....................0000000001111111111222.
     ....................1234567890123456789012.  */
  const char *content = "     r += !aaa == bbb;\n";
  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
  line_table_test ltt (case_);
  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);

  /* Pure locations for the columns of interest within the line above.  */
  const location_t c11 = linemap_position_for_column (line_table, 11);
  const location_t c12 = linemap_position_for_column (line_table, 12);
  const location_t c13 = linemap_position_for_column (line_table, 13);
  const location_t c14 = linemap_position_for_column (line_table, 14);
  const location_t c21 = linemap_position_for_column (line_table, 21);

  /* The checks below compare column-specific locations, so skip the test
     when the line table has started beyond the point at which columns
     are tracked.  */
  if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
    return;

  /* Use column 13 for the caret location, arbitrarily, to verify that we
     handle start != caret.  */
  const location_t aaa = make_location (c13, c12, c14);
  ASSERT_EQ (c13, get_pure_location (aaa));
  ASSERT_EQ (c12, get_start (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
  ASSERT_EQ (c14, get_finish (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));

  /* Make a location using a location with a range as the start-point.  */
  const location_t not_aaa = make_location (c11, aaa, c14);
  ASSERT_EQ (c11, get_pure_location (not_aaa));
  /* It should use the start location of the range, not store the range
     itself.  */
  ASSERT_EQ (c12, get_start (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
  ASSERT_EQ (c14, get_finish (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));

  /* Similarly, make a location with a range as the end-point.  First
     build the range that will serve as that end-point...  */
  const location_t aaa_eq_bbb = make_location (c12, c12, c21);
  ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
  /* ...then pass it as the finish argument.  */
  const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
  /* It should use the finish location of the range, not store the range
     itself.  */
  ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
}
2267
2268 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
2269
2270 static void
2271 test_reading_source_line ()
2272 {
2273   /* Create a tempfile and write some text to it.  */
2274   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2275                         "01234567890123456789\n"
2276                         "This is the test text\n"
2277                         "This is the 3rd line");
2278
2279   /* Read back a specific line from the tempfile.  */
2280   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2281   ASSERT_TRUE (source_line);
2282   ASSERT_TRUE (source_line.get_buffer () != NULL);
2283   ASSERT_EQ (20, source_line.length ());
2284   ASSERT_TRUE (!strncmp ("This is the 3rd line",
2285                          source_line.get_buffer (), source_line.length ()));
2286
2287   source_line = location_get_source_line (tmp.get_filename (), 2);
2288   ASSERT_TRUE (source_line);
2289   ASSERT_TRUE (source_line.get_buffer () != NULL);
2290   ASSERT_EQ (21, source_line.length ());
2291   ASSERT_TRUE (!strncmp ("This is the test text",
2292                          source_line.get_buffer (), source_line.length ()));
2293
2294   source_line = location_get_source_line (tmp.get_filename (), 4);
2295   ASSERT_FALSE (source_line);
2296   ASSERT_TRUE (source_line.get_buffer () == NULL);
2297 }
2298
2299 /* Tests of lexing.  */
2300
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.  The text is compared as a C string (the
   unsigned char * result is cast to const char * for ASSERT_STREQ).  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
2309
2310 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2311    and ranges from EXP_START_COL to EXP_FINISH_COL.
2312    Use LOC as the effective location of the selftest.  */
2313
2314 static void
2315 assert_token_loc_eq (const location &loc,
2316                      const cpp_token *tok,
2317                      const char *exp_filename, int exp_linenum,
2318                      int exp_start_col, int exp_finish_col)
2319 {
2320   location_t tok_loc = tok->src_loc;
2321   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2322   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2323
2324   /* If location_t values are sufficiently high, then column numbers
2325      will be unavailable.  */
2326   if (!should_have_column_data_p (tok_loc))
2327     return;
2328
2329   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2330   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2331   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2332   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2333 }
2334
/* Use assert_token_loc_eq to verify the TOK->src_loc, using
   SELFTEST_LOCATION as the effective location of the selftest, so that
   failures are attributed to the line on which this macro is used.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
                            EXP_START_COL, EXP_FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
                       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2342
/* Test of lexing a file using libcpp, verifying tokens and their
   location information.  CASE_ describes the line table to lex into.  */

static void
test_lexer (const line_table_case &case_)
{
  /* Create a tempfile and write some text to it.  */
  const char *content =
    /*00000000011111111112222222222333333.3333444444444.455555555556
      12345678901234567890123456789012345.6789012345678.901234567890.  */
    ("test_name /* c-style comment */\n"
     "                                  \"test literal\"\n"
     " // test c++-style comment\n"
     "   42\n");
  temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);

  line_table_test ltt (case_);

  cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);

  const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
  ASSERT_NE (fname, NULL);

  /* Verify that we get the expected tokens back, with the correct
     location information.  */

  location_t loc;
  const cpp_token *tok;
  /* Line 1: the identifier, in columns 1-9.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NAME);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);

  /* The C-style comment yields no token; next is the string literal on
     line 2, whose range includes both quotes.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);

  /* The C++-style comment on line 3 yields no token either; next is the
     number on line 4.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NUMBER);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);

  /* Nothing further remains in the file.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_EOF);

  cpp_finish (parser, NULL);
  cpp_destroy (parser);
}
2396
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.

   This is a polymorphic base class, so it has a virtual destructor:
   that keeps destruction through a lexer_test_options * well-defined
   should an instance ever be owned via the base type.  */

class lexer_test_options
{
 public:
  virtual ~lexer_test_options () {}

  /* Configure TEST; implemented by each concrete set of options.  */
  virtual void apply (lexer_test &) = 0;
};
2410
2411 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2412    in its dtor.
2413
2414    This is needed by struct lexer_test to ensure that the cleanup of the
2415    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2416
2417 class cpp_reader_ptr
2418 {
2419  public:
2420   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2421
2422   ~cpp_reader_ptr ()
2423   {
2424     cpp_finish (m_ptr, NULL);
2425     cpp_destroy (m_ptr);
2426   }
2427
2428   operator cpp_reader * () const { return m_ptr; }
2429
2430  private:
2431   cpp_reader *m_ptr;
2432 };
2433
/* A class for writing lexer tests.  It bundles a line table, a
   cpp_reader, a temporary source file and a string-concatenation
   database.  */

class lexer_test
{
public:
  /* CASE_ describes the line table to use and CONTENT is the source text
     to lex.  OPTIONS is a lexer_test_options whose "apply" vfunc is
     called during construction.
     NOTE(review): presumably OPTIONS may be NULL -- confirm against the
     constructor definition.  */
  lexer_test (const line_table_case &case_, const char *content,
              lexer_test_options *options);
  ~lexer_test ();

  /* NOTE(review): presumably returns the next token from m_parser --
     confirm against the definition.  */
  const cpp_token *get_token ();

  /* The ordering of these fields matters.
     The line_table_test must be first, since the cpp_reader_ptr
     uses it.
     The cpp_reader must be cleaned up *after* the temp_source_file
     since the filenames in input.cc's input cache are owned by the
     cpp_reader; in particular, when ~temp_source_file evicts the
     filename the filenames must still be alive.
     (Members are destroyed in reverse order of declaration, so
     m_tempfile goes before m_parser, which goes before m_ltt.)  */
  line_table_test m_ltt;
  cpp_reader_ptr m_parser;
  temp_source_file m_tempfile;
  string_concat_db m_concats;
  /* NOTE(review): presumably controls whether the destructor checks that
     the next token is CPP_EOF -- confirm against ~lexer_test.  */
  bool m_implicitly_expect_EOF;
};
2458
/* Use an EBCDIC encoding for the execution charset, specifically
   IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").

   This exercises iconv integration within libcpp.
   Not every build of iconv supports the given charset,
   so we need to flag this error and handle it gracefully.

   At most one instance may exist at a time: the static diagnostic
   callback reaches the live instance through s_singleton.  */

class ebcdic_execution_charset : public lexer_test_options
{
 public:
  /* Register this instance as the singleton; none may already exist.  */
  ebcdic_execution_charset () : m_num_iconv_errors (0)
    {
      gcc_assert (s_singleton == NULL);
      s_singleton = this;
    }
  /* Unregister this instance.  */
  ~ebcdic_execution_charset ()
    {
      gcc_assert (s_singleton == this);
      s_singleton = NULL;
    }

  /* Select IBM1047 as TEST's narrow execution charset and route the
     reader's diagnostics to on_diagnostic.  */
  void apply (lexer_test &test) final override
  {
    cpp_options *cpp_opts = cpp_get_options (test.m_parser);
    cpp_opts->narrow_charset = "IBM1047";

    cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
    callbacks->diagnostic = on_diagnostic;
  }

  /* Diagnostic callback.  If MSGID is the "conversion ... not supported
     by iconv" message, count it against the singleton and return true;
     any other diagnostic is unexpected and aborts.  */
  static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
                             enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
                             enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
                             rich_location *richloc ATTRIBUTE_UNUSED,
                             const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
    ATTRIBUTE_FPTR_PRINTF(5,0)
  {
    gcc_assert (s_singleton);
    /* Avoid exgettext from picking this up, it is translated in libcpp.  */
    const char *msg = "conversion from %s to %s not supported by iconv";
#ifdef ENABLE_NLS
    /* MSGID arrives already translated, so compare against the
       translation from libcpp's message domain.  */
    msg = dgettext ("cpplib", msg);
#endif
    /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
       when the local iconv build doesn't support the conversion.  */
    if (strcmp (msgid, msg) == 0)
      {
        s_singleton->m_num_iconv_errors++;
        return true;
      }

    /* Otherwise, we have an unexpected error.  */
    abort ();
  }

  /* Return true if at least one unsupported-conversion error was
     recorded by on_diagnostic.  */
  bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }

 private:
  /* The live instance, or NULL if there is none.  */
  static ebcdic_execution_charset *s_singleton;
  /* Number of unsupported-conversion errors seen so far.  */
  int m_num_iconv_errors;
};

ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2522
2523 /* A lexer_test_options subclass that records a list of diagnostic
2524    messages emitted by the lexer.  */
2525
2526 class lexer_diagnostic_sink : public lexer_test_options
2527 {
2528  public:
2529   lexer_diagnostic_sink ()
2530   {
2531     gcc_assert (s_singleton == NULL);
2532     s_singleton = this;
2533   }
2534   ~lexer_diagnostic_sink ()
2535   {
2536     gcc_assert (s_singleton == this);
2537     s_singleton = NULL;
2538
2539     int i;
2540     char *str;
2541     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2542       free (str);
2543   }
2544
2545   void apply (lexer_test &test) final override
2546   {
2547     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2548     callbacks->diagnostic = on_diagnostic;
2549   }
2550
2551   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2552                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2553                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2554                              rich_location *richloc ATTRIBUTE_UNUSED,
2555                              const char *msgid, va_list *ap)
2556     ATTRIBUTE_FPTR_PRINTF(5,0)
2557   {
2558     char *msg = xvasprintf (msgid, *ap);
2559     s_singleton->m_diagnostics.safe_push (msg);
2560     return true;
2561   }
2562
2563   auto_vec<char *> m_diagnostics;
2564
2565  private:
2566   static lexer_diagnostic_sink *s_singleton;
2567 };
2568
2569 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2570
2571 /* Constructor.  Override line_table with a new instance based on CASE_,
2572    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2573    start parsing the tempfile.  */
2574
2575 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2576                         lexer_test_options *options)
2577 : m_ltt (case_),
2578   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2579   /* Create a tempfile and write the text to it.  */
2580   m_tempfile (SELFTEST_LOCATION, ".c", content),
2581   m_concats (),
2582   m_implicitly_expect_EOF (true)
2583 {
2584   if (options)
2585     options->apply (*this);
2586
2587   cpp_init_iconv (m_parser);
2588
2589   /* Parse the file.  */
2590   const char *fname = cpp_read_main_file (m_parser,
2591                                           m_tempfile.get_filename ());
2592   ASSERT_NE (fname, NULL);
2593 }
2594
2595 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2596
2597 lexer_test::~lexer_test ()
2598 {
2599   location_t loc;
2600   const cpp_token *tok;
2601
2602   if (m_implicitly_expect_EOF)
2603     {
2604       tok = cpp_get_token_with_location (m_parser, &loc);
2605       ASSERT_NE (tok, NULL);
2606       ASSERT_EQ (tok->type, CPP_EOF);
2607     }
2608 }
2609
2610 /* Get the next token from m_parser.  */
2611
2612 const cpp_token *
2613 lexer_test::get_token ()
2614 {
2615   location_t loc;
2616   const cpp_token *tok;
2617
2618   tok = cpp_get_token_with_location (m_parser, &loc);
2619   ASSERT_NE (tok, NULL);
2620   return tok;
2621 }
2622
2623 /* Verify that locations within string literals are correctly handled.  */
2624
2625 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2626    using the string concatenation database for TEST.
2627
2628    Assert that the character at index IDX is on EXPECTED_LINE,
2629    and that it begins at column EXPECTED_START_COL and ends at
2630    EXPECTED_FINISH_COL (unless the locations are beyond
2631    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2632    columns).  */
2633
2634 static void
2635 assert_char_at_range (const location &loc,
2636                       lexer_test& test,
2637                       location_t strloc, enum cpp_ttype type, int idx,
2638                       int expected_line, int expected_start_col,
2639                       int expected_finish_col)
2640 {
2641   cpp_reader *pfile = test.m_parser;
2642   string_concat_db *concats = &test.m_concats;
2643
2644   source_range actual_range = source_range();
2645   const char *err
2646     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2647                                  &actual_range);
2648   if (should_have_column_data_p (strloc))
2649     ASSERT_EQ_AT (loc, NULL, err);
2650   else
2651     {
2652       ASSERT_STREQ_AT (loc,
2653                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2654                        err);
2655       return;
2656     }
2657
2658   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2659   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2660   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2661   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2662
2663   if (should_have_column_data_p (actual_range.m_start))
2664     {
2665       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2666       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2667     }
2668   if (should_have_column_data_p (actual_range.m_finish))
2669     {
2670       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2671       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2672     }
2673 }
2674
/* Convenience wrapper around assert_char_at_range which passes
   SELFTEST_LOCATION as the location at which to report any errors.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,                \
                             LINE_NUM, START_COL, FINISH_COL)              \
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC),         \
                        (TYPE), (IDX), (LINE_NUM), (START_COL),            \
                        (FINISH_COL))
2683
2684 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2685    using the string concatenation database for TEST.
2686
2687    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2688
2689 static void
2690 assert_num_substring_ranges (const location &loc,
2691                              lexer_test& test,
2692                              location_t strloc,
2693                              enum cpp_ttype type,
2694                              int expected_num_ranges)
2695 {
2696   cpp_reader *pfile = test.m_parser;
2697   string_concat_db *concats = &test.m_concats;
2698
2699   int actual_num_ranges = -1;
2700   const char *err
2701     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2702                                            &actual_num_ranges);
2703   if (should_have_column_data_p (strloc))
2704     ASSERT_EQ_AT (loc, NULL, err);
2705   else
2706     {
2707       ASSERT_STREQ_AT (loc,
2708                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2709                        err);
2710       return;
2711     }
2712   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2713 }
2714
/* Convenience wrapper around assert_num_substring_ranges which passes
   SELFTEST_LOCATION as the location at which to report any errors.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, NUM_RANGES)  \
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST),            \
                               (STRLOC), (TYPE), (NUM_RANGES))
2722
2723
2724 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2725    returns an error (using the string concatenation database for TEST).  */
2726
2727 static void
2728 assert_has_no_substring_ranges (const location &loc,
2729                                 lexer_test& test,
2730                                 location_t strloc,
2731                                 enum cpp_ttype type,
2732                                 const char *expected_err)
2733 {
2734   cpp_reader *pfile = test.m_parser;
2735   string_concat_db *concats = &test.m_concats;
2736   cpp_substring_ranges ranges;
2737   const char *actual_err
2738     = get_substring_ranges_for_loc (pfile, concats, strloc,
2739                                     type, ranges);
2740   if (should_have_column_data_p (strloc))
2741     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2742   else
2743     ASSERT_STREQ_AT (loc,
2744                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2745                      actual_err);
2746 }
2747
/* Convenience wrapper around assert_has_no_substring_ranges which
   passes SELFTEST_LOCATION as the location at which to report any
   errors.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,         \
                                       EXPECTED_ERR)                     \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST),     \
                                    (STRLOC), (TYPE), (EXPECTED_ERR))
2751
2752 /* Lex a simple string literal.  Verify the substring location data, before
2753    and after running cpp_interpret_string on it.  */
2754
2755 static void
2756 test_lexer_string_locations_simple (const line_table_case &case_)
2757 {
2758   /* Digits 0-9 (with 0 at column 10), the simple way.
2759      ....................000000000.11111111112.2222222223333333333
2760      ....................123456789.01234567890.1234567890123456789
2761      We add a trailing comment to ensure that we correctly locate
2762      the end of the string literal token.  */
2763   const char *content = "        \"0123456789\" /* not a string */\n";
2764   lexer_test test (case_, content, NULL);
2765
2766   /* Verify that we get the expected token back, with the correct
2767      location information.  */
2768   const cpp_token *tok = test.get_token ();
2769   ASSERT_EQ (tok->type, CPP_STRING);
2770   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2771   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2772
2773   /* At this point in lexing, the quote characters are treated as part of
2774      the string (they are stripped off by cpp_interpret_string).  */
2775
2776   ASSERT_EQ (tok->val.str.len, 12);
2777
2778   /* Verify that cpp_interpret_string works.  */
2779   cpp_string dst_string;
2780   const enum cpp_ttype type = CPP_STRING;
2781   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2782                                       &dst_string, type);
2783   ASSERT_TRUE (result);
2784   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2785   free (const_cast <unsigned char *> (dst_string.text));
2786
2787   /* Verify ranges of individual characters.  This no longer includes the
2788      opening quote, but does include the closing quote.  */
2789   for (int i = 0; i <= 10; i++)
2790     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2791                           10 + i, 10 + i);
2792
2793   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2794 }
2795
2796 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2797    encoding.  */
2798
2799 static void
2800 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2801 {
2802   /* EBCDIC support requires iconv.  */
2803   if (!HAVE_ICONV)
2804     return;
2805
2806   /* Digits 0-9 (with 0 at column 10), the simple way.
2807      ....................000000000.11111111112.2222222223333333333
2808      ....................123456789.01234567890.1234567890123456789
2809      We add a trailing comment to ensure that we correctly locate
2810      the end of the string literal token.  */
2811   const char *content = "        \"0123456789\" /* not a string */\n";
2812   ebcdic_execution_charset use_ebcdic;
2813   lexer_test test (case_, content, &use_ebcdic);
2814
2815   /* Verify that we get the expected token back, with the correct
2816      location information.  */
2817   const cpp_token *tok = test.get_token ();
2818   ASSERT_EQ (tok->type, CPP_STRING);
2819   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2820   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2821
2822   /* At this point in lexing, the quote characters are treated as part of
2823      the string (they are stripped off by cpp_interpret_string).  */
2824
2825   ASSERT_EQ (tok->val.str.len, 12);
2826
2827   /* The remainder of the test requires an iconv implementation that
2828      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2829   if (use_ebcdic.iconv_errors_occurred_p ())
2830     return;
2831
2832   /* Verify that cpp_interpret_string works.  */
2833   cpp_string dst_string;
2834   const enum cpp_ttype type = CPP_STRING;
2835   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2836                                       &dst_string, type);
2837   ASSERT_TRUE (result);
2838   /* We should now have EBCDIC-encoded text, specifically
2839      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2840      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2841   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2842                 (const char *)dst_string.text);
2843   free (const_cast <unsigned char *> (dst_string.text));
2844
2845   /* Verify that we don't attempt to record substring location information
2846      for such cases.  */
2847   ASSERT_HAS_NO_SUBSTRING_RANGES
2848     (test, tok->src_loc, type,
2849      "execution character set != source character set");
2850 }
2851
2852 /* Lex a string literal containing a hex-escaped character.
2853    Verify the substring location data, before and after running
2854    cpp_interpret_string on it.  */
2855
2856 static void
2857 test_lexer_string_locations_hex (const line_table_case &case_)
2858 {
2859   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2860      and with a space in place of digit 6, to terminate the escaped
2861      hex code.
2862      ....................000000000.111111.11112222.
2863      ....................123456789.012345.67890123.  */
2864   const char *content = "        \"01234\\x35 789\"\n";
2865   lexer_test test (case_, content, NULL);
2866
2867   /* Verify that we get the expected token back, with the correct
2868      location information.  */
2869   const cpp_token *tok = test.get_token ();
2870   ASSERT_EQ (tok->type, CPP_STRING);
2871   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2872   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2873
2874   /* At this point in lexing, the quote characters are treated as part of
2875      the string (they are stripped off by cpp_interpret_string).  */
2876   ASSERT_EQ (tok->val.str.len, 15);
2877
2878   /* Verify that cpp_interpret_string works.  */
2879   cpp_string dst_string;
2880   const enum cpp_ttype type = CPP_STRING;
2881   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2882                                       &dst_string, type);
2883   ASSERT_TRUE (result);
2884   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2885   free (const_cast <unsigned char *> (dst_string.text));
2886
2887   /* Verify ranges of individual characters.  This no longer includes the
2888      opening quote, but does include the closing quote.  */
2889   for (int i = 0; i <= 4; i++)
2890     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2891   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2892   for (int i = 6; i <= 10; i++)
2893     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2894
2895   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2896 }
2897
2898 /* Lex a string literal containing an octal-escaped character.
2899    Verify the substring location data after running cpp_interpret_string
2900    on it.  */
2901
2902 static void
2903 test_lexer_string_locations_oct (const line_table_case &case_)
2904 {
2905   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2906      and with a space in place of digit 6, to terminate the escaped
2907      octal code.
2908      ....................000000000.111111.11112222.2222223333333333444
2909      ....................123456789.012345.67890123.4567890123456789012  */
2910   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2911   lexer_test test (case_, content, NULL);
2912
2913   /* Verify that we get the expected token back, with the correct
2914      location information.  */
2915   const cpp_token *tok = test.get_token ();
2916   ASSERT_EQ (tok->type, CPP_STRING);
2917   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2918
2919   /* Verify that cpp_interpret_string works.  */
2920   cpp_string dst_string;
2921   const enum cpp_ttype type = CPP_STRING;
2922   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2923                                       &dst_string, type);
2924   ASSERT_TRUE (result);
2925   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2926   free (const_cast <unsigned char *> (dst_string.text));
2927
2928   /* Verify ranges of individual characters.  This no longer includes the
2929      opening quote, but does include the closing quote.  */
2930   for (int i = 0; i < 5; i++)
2931     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2932   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2933   for (int i = 6; i <= 10; i++)
2934     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2935
2936   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2937 }
2938
2939 /* Test of string literal containing letter escapes.  */
2940
2941 static void
2942 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2943 {
2944   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2945      .....................000000000.1.11111.1.1.11222.22222223333333
2946      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2947   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2948   lexer_test test (case_, content, NULL);
2949
2950   /* Verify that we get the expected tokens back.  */
2951   const cpp_token *tok = test.get_token ();
2952   ASSERT_EQ (tok->type, CPP_STRING);
2953   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2954
2955   /* Verify ranges of individual characters. */
2956   /* "\t".  */
2957   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2958                         0, 1, 10, 11);
2959   /* "foo". */
2960   for (int i = 1; i <= 3; i++)
2961     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2962                           i, 1, 11 + i, 11 + i);
2963   /* "\\" and "\n".  */
2964   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2965                         4, 1, 15, 16);
2966   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2967                         5, 1, 17, 18);
2968
2969   /* "bar" and closing quote for nul-terminator.  */
2970   for (int i = 6; i <= 9; i++)
2971     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2972                           i, 1, 13 + i, 13 + i);
2973
2974   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2975 }
2976
2977 /* Another test of a string literal containing a letter escape.
2978    Based on string seen in
2979      printf ("%-%\n");
2980    in gcc.dg/format/c90-printf-1.c.  */
2981
2982 static void
2983 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2984 {
2985   /* .....................000000000.1111.11.1111.22222222223.
2986      .....................123456789.0123.45.6789.01234567890.  */
2987   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2988   lexer_test test (case_, content, NULL);
2989
2990   /* Verify that we get the expected tokens back.  */
2991   const cpp_token *tok = test.get_token ();
2992   ASSERT_EQ (tok->type, CPP_STRING);
2993   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2994
2995   /* Verify ranges of individual characters. */
2996   /* "%-%".  */
2997   for (int i = 0; i < 3; i++)
2998     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2999                           i, 1, 10 + i, 10 + i);
3000   /* "\n".  */
3001   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3002                         3, 1, 13, 14);
3003
3004   /* Closing quote for nul-terminator.  */
3005   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3006                         4, 1, 15, 15);
3007
3008   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3009 }
3010
3011 /* Lex a string literal containing UCN 4 characters.
3012    Verify the substring location data after running cpp_interpret_string
3013    on it.  */
3014
3015 static void
3016 test_lexer_string_locations_ucn4 (const line_table_case &case_)
3017 {
3018   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3019      as UCN 4.
3020      ....................000000000.111111.111122.222222223.33333333344444
3021      ....................123456789.012345.678901.234567890.12345678901234  */
3022   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
3023   lexer_test test (case_, content, NULL);
3024
3025   /* Verify that we get the expected token back, with the correct
3026      location information.  */
3027   const cpp_token *tok = test.get_token ();
3028   ASSERT_EQ (tok->type, CPP_STRING);
3029   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3030
3031   /* Verify that cpp_interpret_string works.
3032      The string should be encoded in the execution character
3033      set.  Assuming that is UTF-8, we should have the following:
3034      -----------  ----  -----  -------  ----------------
3035      Byte offset  Byte  Octal  Unicode  Source Column(s)
3036      -----------  ----  -----  -------  ----------------
3037      0            0x30         '0'      10
3038      1            0x31         '1'      11
3039      2            0x32         '2'      12
3040      3            0x33         '3'      13
3041      4            0x34         '4'      14
3042      5            0xE2  \342   U+2174   15-20
3043      6            0x85  \205    (cont)  15-20
3044      7            0xB4  \264    (cont)  15-20
3045      8            0xE2  \342   U+2175   21-26
3046      9            0x85  \205    (cont)  21-26
3047      10           0xB5  \265    (cont)  21-26
3048      11           0x37         '7'      27
3049      12           0x38         '8'      28
3050      13           0x39         '9'      29
3051      14           0x00                  30 (closing quote)
3052      -----------  ----  -----  -------  ---------------.  */
3053
3054   cpp_string dst_string;
3055   const enum cpp_ttype type = CPP_STRING;
3056   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3057                                       &dst_string, type);
3058   ASSERT_TRUE (result);
3059   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3060                 (const char *)dst_string.text);
3061   free (const_cast <unsigned char *> (dst_string.text));
3062
3063   /* Verify ranges of individual characters.  This no longer includes the
3064      opening quote, but does include the closing quote.
3065      '01234'.  */
3066   for (int i = 0; i <= 4; i++)
3067     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3068   /* U+2174.  */
3069   for (int i = 5; i <= 7; i++)
3070     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3071   /* U+2175.  */
3072   for (int i = 8; i <= 10; i++)
3073     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3074   /* '789' and nul terminator  */
3075   for (int i = 11; i <= 14; i++)
3076     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
3077
3078   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3079 }
3080
3081 /* Lex a string literal containing UCN 8 characters.
3082    Verify the substring location data after running cpp_interpret_string
3083    on it.  */
3084
3085 static void
3086 test_lexer_string_locations_ucn8 (const line_table_case &case_)
3087 {
3088   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3089      ....................000000000.111111.1111222222.2222333333333.344444
3090      ....................123456789.012345.6789012345.6789012345678.901234  */
3091   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
3092   lexer_test test (case_, content, NULL);
3093
3094   /* Verify that we get the expected token back, with the correct
3095      location information.  */
3096   const cpp_token *tok = test.get_token ();
3097   ASSERT_EQ (tok->type, CPP_STRING);
3098   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3099                            "\"01234\\U00002174\\U00002175789\"");
3100
3101   /* Verify that cpp_interpret_string works.
3102      The UTF-8 encoding of the string is identical to that from
3103      the ucn4 testcase above; the only difference is the column
3104      locations.  */
3105   cpp_string dst_string;
3106   const enum cpp_ttype type = CPP_STRING;
3107   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3108                                       &dst_string, type);
3109   ASSERT_TRUE (result);
3110   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3111                 (const char *)dst_string.text);
3112   free (const_cast <unsigned char *> (dst_string.text));
3113
3114   /* Verify ranges of individual characters.  This no longer includes the
3115      opening quote, but does include the closing quote.
3116      '01234'.  */
3117   for (int i = 0; i <= 4; i++)
3118     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3119   /* U+2174.  */
3120   for (int i = 5; i <= 7; i++)
3121     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3122   /* U+2175.  */
3123   for (int i = 8; i <= 10; i++)
3124     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3125   /* '789' at columns 35-37  */
3126   for (int i = 11; i <= 13; i++)
3127     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3128   /* Closing quote/nul-terminator at column 38.  */
3129   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3130
3131   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3132 }
3133
/* Decode the 32-bit big-endian value stored at PTR_BE_VALUE and return
   it in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  /* Read the value bytewise, most significant byte first, so that the
     result does not depend on the endianness of the host.  */
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  uint32_t value = 0;
  for (int i = 0; i < 4; i++)
    value = (value << 8) | bytes[i];
  return value;
}
3145
3146 /* Lex a wide string literal and verify that attempts to read substring
3147    location data from it fail gracefully.  */
3148
3149 static void
3150 test_lexer_string_locations_wide_string (const line_table_case &case_)
3151 {
3152   /* Digits 0-9.
3153      ....................000000000.11111111112.22222222233333
3154      ....................123456789.01234567890.12345678901234  */
3155   const char *content = "       L\"0123456789\" /* non-str */\n";
3156   lexer_test test (case_, content, NULL);
3157
3158   /* Verify that we get the expected token back, with the correct
3159      location information.  */
3160   const cpp_token *tok = test.get_token ();
3161   ASSERT_EQ (tok->type, CPP_WSTRING);
3162   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3163
3164   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
3165   cpp_string dst_string;
3166   const enum cpp_ttype type = CPP_WSTRING;
3167   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3168                                       &dst_string, type);
3169   ASSERT_TRUE (result);
3170   /* The cpp_reader defaults to big-endian with
3171      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3172      now be encoded as UTF-32BE.  */
3173   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3174   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3175   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3176   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3177   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3178   free (const_cast <unsigned char *> (dst_string.text));
3179
3180   /* We don't yet support generating substring location information
3181      for L"" strings.  */
3182   ASSERT_HAS_NO_SUBSTRING_RANGES
3183     (test, tok->src_loc, type,
3184      "execution character set != source character set");
3185 }
3186
/* Decode the 16-bit big-endian value stored at PTR_BE_VALUE and return
   it in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  /* Read the two bytes individually, most significant first, so that
     the result does not depend on the endianness of the host.  */
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return static_cast <uint16_t> ((high << 8) | low);
}
3195
3196 /* Lex a u"" string literal, check that cpp_interpret_string converts it
3197    to UTF-16BE, and verify that substring location queries fail gracefully.  */
3198
3199 static void
3200 test_lexer_string_locations_string16 (const line_table_case &case_)
3201 {
3202   /* Digits 0-9.
3203      ....................000000000.11111111112.22222222233333
3204      ....................123456789.01234567890.12345678901234  */
3205   const char *content = "       u\"0123456789\" /* non-str */\n";
3206   lexer_test test (case_, content, NULL);
3207
3208   /* Verify that we get the expected token back, with the correct
3209      location information.  */
3210   const cpp_token *tok = test.get_token ();
3211   ASSERT_EQ (tok->type, CPP_STRING16);
3212   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3213
3214   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3215   cpp_string dst_string;
3216   const enum cpp_ttype type = CPP_STRING16;
3217   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3218                                       &dst_string, type);
3219   ASSERT_TRUE (result);
3220
3221   /* The cpp_reader defaults to big-endian, so dst_string should now be
3222      UTF-16BE: spot-check digits 0, 5 and 9 and the zero terminator.  */
3223   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3224   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3225   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3226   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3227   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3228   free (const_cast <unsigned char *> (dst_string.text));
3229
3230   /* We don't yet support generating substring location information
3231      for u"" strings.  */
3232   ASSERT_HAS_NO_SUBSTRING_RANGES
3233     (test, tok->src_loc, type,
3234      "execution character set != source character set");
3235 }
3236
3237 /* Lex a U"" string literal, check that cpp_interpret_string converts it
3238    to UTF-32BE, and verify that substring location queries fail gracefully.  */
3239
3240 static void
3241 test_lexer_string_locations_string32 (const line_table_case &case_)
3242 {
3243   /* Digits 0-9.
3244      ....................000000000.11111111112.22222222233333
3245      ....................123456789.01234567890.12345678901234  */
3246   const char *content = "       U\"0123456789\" /* non-str */\n";
3247   lexer_test test (case_, content, NULL);
3248
3249   /* Verify that we get the expected token back, with the correct
3250      location information.  */
3251   const cpp_token *tok = test.get_token ();
3252   ASSERT_EQ (tok->type, CPP_STRING32);
3253   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3254
3255   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3256   cpp_string dst_string;
3257   const enum cpp_ttype type = CPP_STRING32;
3258   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3259                                       &dst_string, type);
3260   ASSERT_TRUE (result);
3261
3262   /* The cpp_reader defaults to big-endian, so dst_string should now be
3263      UTF-32BE: spot-check digits 0, 5 and 9 and the zero terminator.  */
3264   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3265   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3266   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3267   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3268   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3269   free (const_cast <unsigned char *> (dst_string.text));
3270
3271   /* We don't yet support generating substring location information
3272      for U"" strings.  */
3273   ASSERT_HAS_NO_SUBSTRING_RANGES
3274     (test, tok->src_loc, type,
3275      "execution character set != source character set");
3276 }
3277
3278 /* Lex a u8"" string literal.
3279    Verify that cpp_interpret_string works on it, then check the
3280    per-character substring location data.  */
3281
3282 static void
3283 test_lexer_string_locations_u8 (const line_table_case &case_)
3284 {
3285   /* Digits 0-9.
3286      ....................000000000.11111111112.22222222233333
3287      ....................123456789.01234567890.12345678901234  */
3288   const char *content = "      u8\"0123456789\" /* non-str */\n";
3289   lexer_test test (case_, content, NULL);
3290
3291   /* Verify that we get the expected token back, with the correct
3292      location information.  */
3293   const cpp_token *tok = test.get_token ();
3294   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3295   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3296
3297   /* Verify that cpp_interpret_string works (interpreting as CPP_STRING).  */
3298   cpp_string dst_string;
3299   const enum cpp_ttype type = CPP_STRING;
3300   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3301                                       &dst_string, type);
3302   ASSERT_TRUE (result);
3303   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3304   free (const_cast <unsigned char *> (dst_string.text));
3305
3306   /* Verify ranges of individual characters: indices 0-9 are the digits at
3307      columns 10-19, and index 10 is the closing quote at column 20.  */
3308   for (int i = 0; i <= 10; i++)
3309     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3310 }
3311
3312 /* Lex a string literal containing multi-byte UTF-8 source characters.
3313    Verify the per-byte substring location data after running
3314    cpp_interpret_string on it.  */
3315
3316 static void
3317 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3318 {
3319   /* This string literal is written out to the source file as UTF-8,
3320      and is of the form "before mojibake after", where "mojibake"
3321      is written as the following four unicode code points:
3322         U+6587 CJK UNIFIED IDEOGRAPH-6587
3323         U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3324         U+5316 CJK UNIFIED IDEOGRAPH-5316
3325         U+3051 HIRAGANA LETTER KE.
3326      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3327      "before" and "after" are 1 byte per unicode character.
3328
3329      The numbers shown are "columns", which are *byte* numbers within
3330      the line, rather than unicode character numbers.
3331
3332      .................... 000000000.1111111.
3333      .................... 123456789.0123456.  */
3334   const char *content = ("        \"before "
3335                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3336                               UTF-8: 0xE6 0x96 0x87
3337                               C octal escaped UTF-8: \346\226\207
3338                             "column" numbers: 17-19.  */
3339                          "\346\226\207"
3340
3341                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3342                               UTF-8: 0xE5 0xAD 0x97
3343                               C octal escaped UTF-8: \345\255\227
3344                             "column" numbers: 20-22.  */
3345                          "\345\255\227"
3346
3347                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3348                               UTF-8: 0xE5 0x8C 0x96
3349                               C octal escaped UTF-8: \345\214\226
3350                             "column" numbers: 23-25.  */
3351                          "\345\214\226"
3352
3353                          /* U+3051 HIRAGANA LETTER KE
3354                               UTF-8: 0xE3 0x81 0x91
3355                               C octal escaped UTF-8: \343\201\221
3356                             "column" numbers: 26-28.  */
3357                          "\343\201\221"
3358
3359                          /* column numbers 29 onwards
3360                           2333333.33334444444444
3361                           9012345.67890123456789. */
3362                          " after\" /* non-str */\n");
3363   lexer_test test (case_, content, NULL);
3364
3365   /* Verify that we get the expected token back, with the correct
3366      location information.  */
3367   const cpp_token *tok = test.get_token ();
3368   ASSERT_EQ (tok->type, CPP_STRING);
3369   ASSERT_TOKEN_AS_TEXT_EQ
3370     (test.m_parser, tok,
3371      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3372
3373   /* Verify that cpp_interpret_string leaves the UTF-8 bytes unchanged.  */
3374   cpp_string dst_string;
3375   const enum cpp_ttype type = CPP_STRING;
3376   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3377                                       &dst_string, type);
3378   ASSERT_TRUE (result);
3379   ASSERT_STREQ
3380     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3381      (const char *)dst_string.text);
3382   free (const_cast <unsigned char *> (dst_string.text));
3383
3384   /* Verify ranges of individual characters.  This no longer includes the
3385      opening quote, but does include the closing quote.
3386      Assuming that both source and execution encodings are UTF-8, we have
3387      a run of 25 octets in each (7 + 4 * 3 + 6), plus the NUL terminator.  */
3388   for (int i = 0; i < 25; i++)
3389     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3390   /* NUL-terminator should use the closing quote at column 35.  */
3391   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3392
3393   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3394 }
3395
3396 /* Test of string literal concatenation: two literals on adjacent lines.  */
3397
3398 static void
3399 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3400 {
3401   /* Digits 0-9.
3402      .....................000000000.111111.11112222222222
3403      .....................123456789.012345.67890123456789.  */
3404   const char *content = ("        \"01234\" /* non-str */\n"
3405                          "        \"56789\" /* non-str */\n");
3406   lexer_test test (case_, content, NULL);
3407
3408   location_t input_locs[2];
3409
3410   /* Verify that we get the expected tokens back, saving each token's
      string and location.  */
3411   auto_vec <cpp_string> input_strings;
3412   const cpp_token *tok_a = test.get_token ();
3413   ASSERT_EQ (tok_a->type, CPP_STRING);
3414   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3415   input_strings.safe_push (tok_a->val.str);
3416   input_locs[0] = tok_a->src_loc;
3417
3418   const cpp_token *tok_b = test.get_token ();
3419   ASSERT_EQ (tok_b->type, CPP_STRING);
3420   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3421   input_strings.safe_push (tok_b->val.str);
3422   input_locs[1] = tok_b->src_loc;
3423
3424   /* Verify that cpp_interpret_string joins both literals into one.  */
3425   cpp_string dst_string;
3426   const enum cpp_ttype type = CPP_STRING;
3427   bool result = cpp_interpret_string (test.m_parser,
3428                                       input_strings.address (), 2,
3429                                       &dst_string, type);
3430   ASSERT_TRUE (result);
3431   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3432   free (const_cast <unsigned char *> (dst_string.text));
3433
3434   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3435   test.m_concats.record_string_concatenation (2, input_locs);
3436
3437   location_t initial_loc = input_locs[0];
3438
3439   /* "01234" on line 1.  */
3440   for (int i = 0; i <= 4; i++)
3441     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3442   /* "56789" in line 2, plus its closing quote for the NUL terminator.  */
3443   for (int i = 5; i <= 10; i++)
3444     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3445
3446   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3447 }
3448
3449 /* Another concatenation test: five two-digit literals, one per line.  */
3450
3451 static void
3452 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3453 {
3454   /* Digits 0-9.
3455      .....................000000000.111.11111112222222
3456      .....................123456789.012.34567890123456.  */
3457   const char *content = ("        \"01\" /* non-str */\n"
3458                          "        \"23\" /* non-str */\n"
3459                          "        \"45\" /* non-str */\n"
3460                          "        \"67\" /* non-str */\n"
3461                          "        \"89\" /* non-str */\n");
3462   lexer_test test (case_, content, NULL);
3463
3464   auto_vec <cpp_string> input_strings;
3465   location_t input_locs[5];
3466
3467   /* Lex the five string tokens, saving each one's string and location.  */
3468   for (int i = 0; i < 5; i++)
3469     {
3470       const cpp_token *tok = test.get_token ();
3471       ASSERT_EQ (tok->type, CPP_STRING);
3472       input_strings.safe_push (tok->val.str);
3473       input_locs[i] = tok->src_loc;
3474     }
3475
3476   /* Verify that cpp_interpret_string joins all five literals into one.  */
3477   cpp_string dst_string;
3478   const enum cpp_ttype type = CPP_STRING;
3479   bool result = cpp_interpret_string (test.m_parser,
3480                                       input_strings.address (), 5,
3481                                       &dst_string, type);
3482   ASSERT_TRUE (result);
3483   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3484   free (const_cast <unsigned char *> (dst_string.text));
3485
3486   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3487   test.m_concats.record_string_concatenation (5, input_locs);
3488
3489   location_t initial_loc = input_locs[0];
3490
3491   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3492      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3493      and expect get_source_range_for_substring to fail.
3494      However, for a string concatenation test, we can have a case
3495      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3496      but subsequent strings can be after it.
3497      Attempting to detect this within assert_char_at_range
3498      would overcomplicate the logic for the common test cases, so
3499      we detect it here.  */
3500   if (should_have_column_data_p (input_locs[0])
3501       && !should_have_column_data_p (input_locs[4]))
3502     {
3503       /* Verify that get_source_range_for_substring gracefully rejects
3504          this case.  */
3505       source_range actual_range;
3506       const char *err
3507         = get_source_range_for_char (test.m_parser, &test.m_concats,
3508                                      initial_loc, type, 0, &actual_range);
3509       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3510       return;
3511     }
3512
3513   for (int i = 0; i < 5; i++)
3514     for (int j = 0; j < 2; j++)
3515       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3516                             i + 1, 10 + j, 10 + j);
3517
3518   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3519   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3520
3521   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3522 }
3523
3524 /* Another test of string literal concatenation, this time combined with
3525    various kinds of escaped characters, all on a single source line.  */
3526
3527 static void
3528 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3529 {
3530   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3531      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3532   const char *content
3533     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3534        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3535     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3536   lexer_test test (case_, content, NULL);
3537
3538   auto_vec <cpp_string> input_strings;
3539   location_t input_locs[4];
3540
3541   /* Lex the four string tokens, saving each one's string and location.  */
3542   for (int i = 0; i < 4; i++)
3543     {
3544       const cpp_token *tok = test.get_token ();
3545       ASSERT_EQ (tok->type, CPP_STRING);
3546       input_strings.safe_push (tok->val.str);
3547       input_locs[i] = tok->src_loc;
3548     }
3549
3550   /* Verify that cpp_interpret_string decodes the escapes and joins them.  */
3551   cpp_string dst_string;
3552   const enum cpp_ttype type = CPP_STRING;
3553   bool result = cpp_interpret_string (test.m_parser,
3554                                       input_strings.address (), 4,
3555                                       &dst_string, type);
3556   ASSERT_TRUE (result);
3557   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3558   free (const_cast <unsigned char *> (dst_string.text));
3559
3560   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3561   test.m_concats.record_string_concatenation (4, input_locs);
3562
3563   location_t initial_loc = input_locs[0];
3564   /* Digits 0-4 are columns 10-14; each escape spans four columns.  */
3565   for (int i = 0; i <= 4; i++)
3566     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3567   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3568   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3569   for (int i = 7; i <= 9; i++)
3570     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3571
3572   /* NUL-terminator should use the location of the final closing quote.  */
3573   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3574
3575   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3576 }
3577
3578 /* Test of a string literal that is the body of an object-like macro.  */
3579
3580 static void
3581 test_lexer_string_locations_macro (const line_table_case &case_)
3582 {
3583   /* Digits 0-9.
3584      .....................0000000001111111111.22222222223.
3585      .....................1234567890123456789.01234567890.  */
3586   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3587                          "  MACRO");
3588   lexer_test test (case_, content, NULL);
3589
3590   /* Verify that we get the expected tokens back: padding, then the string.  */
3591   const cpp_token *tok = test.get_token ();
3592   ASSERT_EQ (tok->type, CPP_PADDING);
3593
3594   tok = test.get_token ();
3595   ASSERT_EQ (tok->type, CPP_STRING);
3596   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3597
3598   /* Verify ranges of individual characters.  We ought to see columns
3599      within the macro definition on line 1 (digits start at column 20).  */
3600   for (int i = 0; i <= 10; i++)
3601     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3602                           i, 1, 20 + i, 20 + i);
3603
3604   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3605
3606   tok = test.get_token ();
3607   ASSERT_EQ (tok->type, CPP_PADDING);
3608 }
3609
3610 /* Test of stringification of a macro argument (no substring ranges).  */
3611
3612 static void
3613 test_lexer_string_locations_stringified_macro_argument
3614   (const line_table_case &case_)
3615 {
3616   /* .....................000000000111111111122222222223.
3617      .....................123456789012345678901234567890.  */
3618   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3619                          "MACRO(foo)\n");
3620   lexer_test test (case_, content, NULL);
3621
3622   /* Verify that we get the expected tokens back: padding, then "foo".  */
3623   const cpp_token *tok = test.get_token ();
3624   ASSERT_EQ (tok->type, CPP_PADDING);
3625
3626   tok = test.get_token ();
3627   ASSERT_EQ (tok->type, CPP_STRING);
3628   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3629
3630   /* We don't support getting the location of a stringified macro
3631      argument.  Verify that it fails gracefully.  */
3632   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3633                                   "cpp_interpret_string_1 failed");
3634   /* Two further padding tokens are expected after the string.  */
3635   tok = test.get_token ();
3636   ASSERT_EQ (tok->type, CPP_PADDING);
3637
3638   tok = test.get_token ();
3639   ASSERT_EQ (tok->type, CPP_PADDING);
3640 }
3641
3642 /* Ensure that we fail gracefully if something attempts to pass
3643    in a location that isn't a string literal token.  Seen on this code:
3644
3645      const char a[] = " %d ";
3646      __builtin_printf (a, 0.5);
3647                        ^
3648
3649    when c-format.cc erroneously used the indicated one-character
3650    location as the format string location, leading to a read past the
3651    end of a string buffer in cpp_interpret_string_1.  */
3652
3653 static void
3654 test_lexer_string_locations_non_string (const line_table_case &case_)
3655 {
3656   /* .....................000000000111111111122222222223.
3657      .....................123456789012345678901234567890.  */
3658   const char *content = ("         a\n");
3659   lexer_test test (case_, content, NULL);
3660
3661   /* Verify that we get the expected token back: the identifier "a".  */
3662   const cpp_token *tok = test.get_token ();
3663   ASSERT_EQ (tok->type, CPP_NAME);
3664   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3665
3666   /* At this point, libcpp is attempting to interpret the name as a
3667      string literal, despite it not starting with a quote.  We don't detect
3668      that, but we should at least fail gracefully.  */
3669   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3670                                   "cpp_interpret_string_1 failed");
3671 }
3672
3673 /* Ensure that we can read substring information for a token which
3674    starts in one linemap and ends in another.  Adapted from
3675    gcc.dg/cpp/pr69985.c.  */
3676
3677 static void
3678 test_lexer_string_locations_long_line (const line_table_case &case_)
3679 {
3680   /* .....................000000.000111111111
3681      .....................123456.789012346789.  */
3682   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3683                          "     \"0123456789012345678901234567890123456789"
3684                          "0123456789012345678901234567890123456789"
3685                          "0123456789012345678901234567890123456789"
3686                          "0123456789\"\n");
3687
3688   lexer_test test (case_, content, NULL);
3689
3690   /* Verify that we get the expected token back.  */
3691   const cpp_token *tok = test.get_token ();
3692   ASSERT_EQ (tok->type, CPP_STRING);
3693   /* Skip the range checks unless should_have_column_data_p holds.  */
3694   if (!should_have_column_data_p (line_table->highest_location))
3695     return;
3696
3697   /* Verify ranges of individual characters: 130 digits plus the NUL.  */
3698   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3699   for (int i = 0; i < 131; i++)
3700     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3701                           i, 2, 7 + i, 7 + i);
3702 }
3703
3704 /* Test of locations within a raw string that doesn't contain a newline.  */
3705
3706 static void
3707 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3708 {
3709   /* .....................00.0000000111111111122.
3710      .....................12.3456789012345678901.  */
3711   const char *content = ("R\"foo(0123456789)foo\"\n");
3712   lexer_test test (case_, content, NULL);
3713
3714   /* Verify that we get the expected token back.  */
3715   const cpp_token *tok = test.get_token ();
3716   ASSERT_EQ (tok->type, CPP_STRING);
3717
3718   /* Verify that cpp_interpret_string strips the R"foo( ... )foo" wrapper.  */
3719   cpp_string dst_string;
3720   const enum cpp_ttype type = CPP_STRING;
3721   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3722                                       &dst_string, type);
3723   ASSERT_TRUE (result);
3724   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3725   free (const_cast <unsigned char *> (dst_string.text));
3726   /* Skip the range checks unless should_have_column_data_p holds.  */
3727   if (!should_have_column_data_p (line_table->highest_location))
3728     return;
3729
3730   /* 0-9, plus the NUL terminator; the first digit is at column 7.  */
3731   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3732   for (int i = 0; i < 11; i++)
3733     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3734                           i, 1, 7 + i, 7 + i);
3735 }
3736
3737 /* Test of locations within a raw string that contains a newline.  */
3738
3739 static void
3740 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3741 {
3742   /* .....................00.0000.
3743      .....................12.3456.  */
3744   const char *content = ("R\"foo(\n"
3745   /* .....................00000.
3746      .....................12345.  */
3747                          "hello\n"
3748                          "world\n"
3749   /* .....................00000.
3750      .....................12345.  */
3751                          ")foo\"\n");
3752   lexer_test test (case_, content, NULL);
3753
3754   /* Verify that we get the expected token back.  */
3755   const cpp_token *tok = test.get_token ();
3756   ASSERT_EQ (tok->type, CPP_STRING);
3757
3758   /* Verify that cpp_interpret_string keeps the embedded newlines.  */
3759   cpp_string dst_string;
3760   const enum cpp_ttype type = CPP_STRING;
3761   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3762                                       &dst_string, type);
3763   ASSERT_TRUE (result);
3764   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3765   free (const_cast <unsigned char *> (dst_string.text));
3766   /* Skip the final check unless should_have_column_data_p holds.  */
3767   if (!should_have_column_data_p (line_table->highest_location))
3768     return;
3769
3770   /* Currently we don't support locations within raw strings that
3771      contain newlines.  */
3772   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3773                                   "range endpoints are on different lines");
3774 }
3775
3776 /* Test of parsing a raw string whose closing delimiter never matches.  */
3777
3778 static void
3779 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3780 {
3781   const char *content = "R\"ouch()ouCh\" /* etc */";
3782
3783   lexer_diagnostic_sink diagnostics;
3784   lexer_test test (case_, content, &diagnostics);
3785   test.m_implicitly_expect_EOF = false;
3786
3787   /* Attempt to parse it: "ouCh" does not close the "ouch" delimiter.  */
3788   const cpp_token *tok = test.get_token ();
3789   ASSERT_EQ (tok->type, CPP_EOF);
3790
3791   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3792   /* We expect the message "unterminated raw string"
3793      in the "cpplib" translation domain.
3794      It's not clear that dgettext is available on all supported hosts,
3795      so this assertion is commented-out for now.
3796        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3797                      diagnostics.m_diagnostics[0]);
3798   */
3799 }
3800
3801 /* Test of lexing plain, u, U, L and multi-character char constants.  */
3802
3803 static void
3804 test_lexer_char_constants (const line_table_case &case_)
3805 {
3806   /* Various char constants.
3807      .....................0000000001111111111.22222222223.
3808      .....................1234567890123456789.01234567890.  */
3809   const char *content = ("         'a'\n"
3810                          "        u'a'\n"
3811                          "        U'a'\n"
3812                          "        L'a'\n"
3813                          "         'abc'\n");
3814   lexer_test test (case_, content, NULL);
3815
3816   /* Verify that we get the expected tokens back.  */
3817   /* 'a': also check its interpreted value and character count.  */
3818   const cpp_token *tok = test.get_token ();
3819   ASSERT_EQ (tok->type, CPP_CHAR);
3820   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3821
3822   unsigned int chars_seen;
3823   int unsignedp;
3824   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3825                                           &chars_seen, &unsignedp);
3826   ASSERT_EQ (cc, 'a');
3827   ASSERT_EQ (chars_seen, 1);
3828
3829   /* u'a'.  */
3830   tok = test.get_token ();
3831   ASSERT_EQ (tok->type, CPP_CHAR16);
3832   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3833
3834   /* U'a'.  */
3835   tok = test.get_token ();
3836   ASSERT_EQ (tok->type, CPP_CHAR32);
3837   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3838
3839   /* L'a'.  */
3840   tok = test.get_token ();
3841   ASSERT_EQ (tok->type, CPP_WCHAR);
3842   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3843
3844   /* 'abc' (c-char-sequence); still lexed as a single CPP_CHAR token.  */
3845   tok = test.get_token ();
3846   ASSERT_EQ (tok->type, CPP_CHAR);
3847   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3848 }
3849 /* A table of interesting location_t values, giving one axis of our test
3850    matrix.  for_each_line_table_case hard-codes its length (12).  */
3851
3852 static const location_t boundary_locations[] = {
3853   /* Zero means "don't override the default values for a new line_table".  */
3854   0,
3855
3856   /* An arbitrary non-zero value that isn't close to one of
3857      the boundary values below.  */
3858   0x10000,
3859
3860   /* Values just below, at and just above
      LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3861   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3862   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3863   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3864   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3865   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3866
3867   /* Values just below, at and just above LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3868   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3869   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3870   LINE_MAP_MAX_LOCATION_WITH_COLS,
3871   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3872   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3873 };
3874
3875 /* Invoke TESTCASE once for every cell of our line_table test matrix.  */
3876
3877 void
3878 for_each_line_table_case (void (*testcase) (const line_table_case &))
3879 {
3880   /* The matrix has two axes: the value given to
3881      line_table->default_range_bits (0 or 5), and which entry of
3882      boundary_locations is used as the starting location within
3883      line_table.  Every combination is handed to TESTCASE.  */
3884   const int num_start_locs = ARRAY_SIZE (boundary_locations);
3885   const int range_bits_step = 5;
3886   int num_cases_tested = 0;
3887
3888   /* Outer axis: default_range_bits of 0, then of 5.  */
3889   for (int range_bits = 0; range_bits <= range_bits_step;
3890        range_bits += range_bits_step)
3891     {
3892       /* Inner axis: each "interesting" starting location in turn.  */
3893       for (int start_idx = 0; start_idx < num_start_locs; ++start_idx)
3894         {
3895           const location_t start_loc = boundary_locations[start_idx];
3896           line_table_case matrix_cell (range_bits, start_loc);
3897
3898           testcase (matrix_cell);
3899
3900           num_cases_tested += 1;
3901         }
3902     }
3903
3904   /* Verify that we fully covered the test matrix: two settings of
3905      default_range_bits times the twelve boundary locations.  */
3906   ASSERT_EQ (num_cases_tested, 2 * 12);
3907 }
3908
3909 /* Verify that when presented with a consecutive pair of locations with
3910    a very large line offset, we don't attempt to consolidate them into
3911    a single ordinary linemap where the line offsets within the line map
3912    would lead to overflow (PR lto/88147).  */
3913
3914 static void
3915 test_line_offset_overflow ()
3916 {
3917   line_table_test ltt (line_table_case (5, 0));
3918
3919   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3920   linemap_line_start (line_table, 1, 100);
3921   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3922   assert_loceq ("foo.c", 2578, 0, loc_a);
3923   /* Remember the map in use for loc_a, and check its bit allocation.  */
3924   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3925   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3926   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3927   /* Now jump forward by roughly 400,000 lines within the same file.  */
3928   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3929   assert_loceq ("foo.c", 404198, 0, loc_b);
3930
3931   /* We should have started a new linemap, rather than attempting to store
3932      a very large line offset.  */
3933   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3934   ASSERT_NE (ordmap_a, ordmap_b);
3935 }
3936
3937 void test_cpp_utf8 ()
3938 {
3939   const int def_tabstop = 8;
3940   cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3941
3942   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3943   {
3944     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3945     ASSERT_EQ (8, w_bad);
3946     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3947     ASSERT_EQ (5, w_ctrl);
3948   }
3949
3950   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3951   {
3952     const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3953     ASSERT_EQ (1, w_pi);
3954     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3955     ASSERT_EQ (2, w_emoji);
3956     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3957                                                         policy);
3958     ASSERT_EQ (1, w_umlaut_precomposed);
3959     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3960                                                       policy);
3961     ASSERT_EQ (1, w_umlaut_combining);
3962     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3963     ASSERT_EQ (2, w_han);
3964     const int w_ascii = cpp_display_width ("GCC", 3, policy);
3965     ASSERT_EQ (3, w_ascii);
3966     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3967                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
3968                                            24, policy);
3969     ASSERT_EQ (18, w_mixed);
3970   }
3971
3972   /* Verify that display width properly expands tabs.  */
3973   {
3974     const char *tstr = "\tabc\td";
3975     ASSERT_EQ (6, cpp_display_width (tstr, 6,
3976                                      cpp_char_column_policy (1, cpp_wcwidth)));
3977     ASSERT_EQ (10, cpp_display_width (tstr, 6,
3978                                       cpp_char_column_policy (3, cpp_wcwidth)));
3979     ASSERT_EQ (17, cpp_display_width (tstr, 6,
3980                                       cpp_char_column_policy (8, cpp_wcwidth)));
3981     ASSERT_EQ (1,
3982                cpp_display_column_to_byte_column
3983                  (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
3984   }
3985
3986   /* Verify that cpp_byte_column_to_display_column can go past the end,
3987      and similar edge cases.  */
3988   {
3989     const char *str
3990       /* Display columns.
3991          111111112345  */
3992       = "\xcf\x80 abc";
3993       /* 111122223456
3994          Byte columns.  */
3995
3996     ASSERT_EQ (5, cpp_display_width (str, 6, policy));
3997     ASSERT_EQ (105,
3998                cpp_byte_column_to_display_column (str, 6, 106, policy));
3999     ASSERT_EQ (10000,
4000                cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4001     ASSERT_EQ (0,
4002                cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4003   }
4004
4005   /* Verify that cpp_display_column_to_byte_column can go past the end,
4006      and similar edge cases, and check invertibility.  */
4007   {
4008     const char *str
4009       /* Display columns.
4010          000000000000000000000000000000000000011
4011          111111112222222234444444455555555678901  */
4012       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4013       /* 000000000000000000000000000000000111111
4014          111122223333444456666777788889999012345
4015          Byte columns.  */
4016     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4017     ASSERT_EQ (15,
4018                cpp_display_column_to_byte_column (str, 15, 11, policy));
4019     ASSERT_EQ (115,
4020                cpp_display_column_to_byte_column (str, 15, 111, policy));
4021     ASSERT_EQ (10000,
4022                cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4023     ASSERT_EQ (0,
4024                cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4025
4026     /* Verify that we do not interrupt a UTF-8 sequence.  */
4027     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4028
4029     for (int byte_col = 1; byte_col <= 15; ++byte_col)
4030       {
4031         const int disp_col
4032           = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
4033         const int byte_col2
4034           = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
4035
4036         /* If we ask for the display column in the middle of a UTF-8
4037            sequence, it will return the length of the partial sequence,
4038            matching the behavior of GCC before display column support.
4039            Otherwise check the round trip was successful.  */
4040         if (byte_col < 4)
4041           ASSERT_EQ (byte_col, disp_col);
4042         else if (byte_col >= 6 && byte_col < 9)
4043           ASSERT_EQ (3 + (byte_col - 5), disp_col);
4044         else
4045           ASSERT_EQ (byte_col2, byte_col);
4046       }
4047   }
4048
4049 }
4050
4051 /* Run all of the selftests within this file.  */
4052
4053 void
4054 input_cc_tests ()
4055 {
4056   test_linenum_comparisons ();
4057   test_should_have_column_data_p ();
4058   test_unknown_location ();
4059   test_builtins ();
4060   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
4061
4062   for_each_line_table_case (test_accessing_ordinary_linemaps);
4063   for_each_line_table_case (test_lexer);
4064   for_each_line_table_case (test_lexer_string_locations_simple);
4065   for_each_line_table_case (test_lexer_string_locations_ebcdic);
4066   for_each_line_table_case (test_lexer_string_locations_hex);
4067   for_each_line_table_case (test_lexer_string_locations_oct);
4068   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
4069   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
4070   for_each_line_table_case (test_lexer_string_locations_ucn4);
4071   for_each_line_table_case (test_lexer_string_locations_ucn8);
4072   for_each_line_table_case (test_lexer_string_locations_wide_string);
4073   for_each_line_table_case (test_lexer_string_locations_string16);
4074   for_each_line_table_case (test_lexer_string_locations_string32);
4075   for_each_line_table_case (test_lexer_string_locations_u8);
4076   for_each_line_table_case (test_lexer_string_locations_utf8_source);
4077   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
4078   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
4079   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
4080   for_each_line_table_case (test_lexer_string_locations_macro);
4081   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
4082   for_each_line_table_case (test_lexer_string_locations_non_string);
4083   for_each_line_table_case (test_lexer_string_locations_long_line);
4084   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
4085   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
4086   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
4087   for_each_line_table_case (test_lexer_char_constants);
4088
4089   test_reading_source_line ();
4090
4091   test_line_offset_overflow ();
4092
4093   test_cpp_utf8 ();
4094 }
4095
4096 } // namespace selftest
4097
4098 #endif /* CHECKING_P */