gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2021 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* Input charset configuration.  */
  33 static const char *default_charset_callback (const char *)
  34 {
  35   return nullptr;
  36 }
  37
  38 void
  39 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  40                                       bool should_skip_bom)
  41 {
  42   in_context.ccb = (ccb ? ccb : default_charset_callback);
  43   in_context.should_skip_bom = should_skip_bom;
  44 }
  45
  46 /* This is a cache used by get_next_line to store the content of a
  47    file to be searched for file lines.  */
  48 class file_cache_slot
  49 {
  50 public:
  51   file_cache_slot ();
  52   ~file_cache_slot ();
  53
  54   bool read_line_num (size_t line_num,
  55                       char ** line, ssize_t *line_len);
  56
  57   /* Accessors.  */
  58   const char *get_file_path () const { return m_file_path; }
  59   unsigned get_use_count () const { return m_use_count; }
  60   bool missing_trailing_newline_p () const
  61   {
  62     return m_missing_trailing_newline;
  63   }
  64
  65   void inc_use_count () { m_use_count++; }
  66
  67   bool create (const file_cache::input_context &in_context,
  68                const char *file_path, FILE *fp, unsigned highest_use_count);
  69   void evict ();
  70
  71  private:
  72   /* These are information used to store a line boundary.  */
  73   class line_info
  74   {
  75   public:
  76     /* The line number.  It starts from 1.  */
  77     size_t line_num;
  78
  79     /* The position (byte count) of the beginning of the line,
  80        relative to the file data pointer.  This starts at zero.  */
  81     size_t start_pos;
  82
  83     /* The position (byte count) of the last byte of the line.  This
  84        normally points to the '\n' character, or to one byte after the
  85        last byte of the file, if the file doesn't contain a '\n'
  86        character.  */
  87     size_t end_pos;
  88
  89     line_info (size_t l, size_t s, size_t e)
  90       : line_num (l), start_pos (s), end_pos (e)
  91     {}
  92
  93     line_info ()
  94       :line_num (0), start_pos (0), end_pos (0)
  95     {}
  96   };
  97
  98   bool needs_read_p () const;
  99   bool needs_grow_p () const;
 100   void maybe_grow ();
 101   bool read_data ();
 102   bool maybe_read_data ();
 103   bool get_next_line (char **line, ssize_t *line_len);
 104   bool read_next_line (char ** line, ssize_t *line_len);
 105   bool goto_next_line ();
 106
 107   static const size_t buffer_size = 4 * 1024;
 108   static const size_t line_record_size = 100;
 109
 110   /* The number of time this file has been accessed.  This is used
 111      to designate which file cache to evict from the cache
 112      array.  */
 113   unsigned m_use_count;
 114
 115   /* The file_path is the key for identifying a particular file in
 116      the cache.
 117      For libcpp-using code, the underlying buffer for this field is
 118      owned by the corresponding _cpp_file within the cpp_reader.  */
 119   const char *m_file_path;
 120
 121   FILE *m_fp;
 122
 123   /* This points to the content of the file that we've read so
 124      far.  */
 125   char *m_data;
 126
 127   /* The allocated buffer to be freed may start a little earlier than DATA,
 128      e.g. if a UTF8 BOM was skipped at the beginning.  */
 129   int m_alloc_offset;
 130
 131   /*  The size of the DATA array above.*/
 132   size_t m_size;
 133
 134   /* The number of bytes read from the underlying file so far.  This
 135      must be less (or equal) than SIZE above.  */
 136   size_t m_nb_read;
 137
 138   /* The index of the beginning of the current line.  */
 139   size_t m_line_start_idx;
 140
 141   /* The number of the previous line read.  This starts at 1.  Zero
 142      means we've read no line so far.  */
 143   size_t m_line_num;
 144
 145   /* This is the total number of lines of the current file.  At the
 146      moment, we try to get this information from the line map
 147      subsystem.  Note that this is just a hint.  When using the C++
 148      front-end, this hint is correct because the input file is then
 149      completely tokenized before parsing starts; so the line map knows
 150      the number of lines before compilation really starts.  For e.g,
 151      the C front-end, it can happen that we start emitting diagnostics
 152      before the line map has seen the end of the file.  */
 153   size_t m_total_lines;
 154
 155   /* Could this file be missing a trailing newline on its final line?
 156      Initially true (to cope with empty files), set to true/false
 157      as each line is read.  */
 158   bool m_missing_trailing_newline;
 159
 160   /* This is a record of the beginning and end of the lines we've seen
 161      while reading the file.  This is useful to avoid walking the data
 162      from the beginning when we are asked to read a line that is
 163      before LINE_START_IDX above.  Note that the maximum size of this
 164      record is line_record_size, so that the memory consumption
 165      doesn't explode.  We thus scale total_lines down to
 166      line_record_size.  */
 167   vec<line_info, va_heap> m_line_record;
 168
 169   void offset_buffer (int offset)
 170   {
 171     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
 172                 : (size_t) offset <= m_size);
 173     gcc_assert (m_data);
 174     m_alloc_offset += offset;
 175     m_data += offset;
 176     m_size -= offset;
 177   }
 178
 179 };
 180
 181 /* Current position in real source file.  */
 182
 183 location_t input_location = UNKNOWN_LOCATION;
 184
 185 class line_maps *line_table;
 186
 187 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 188    This needs to be a global so that it can be a GC root, and thus
 189    prevent the stashed copy from being garbage-collected if the GC runs
 190    during a line_table_test.  */
 191
 192 class line_maps *saved_line_table;
 193
 194 /* Expand the source location LOC into a human readable location.  If
 195    LOC resolves to a builtin location, the file name of the readable
 196    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 197    TRUE and LOC is virtual, then it is resolved to the expansion
 198    point of the involved macro.  Otherwise, it is resolved to the
 199    spelling location of the token.
 200
 201    When resolving to the spelling location of the token, if the
 202    resulting location is for a built-in location (that is, it has no
 203    associated line/column) in the context of a macro expansion, the
 204    returned location is the first one (while unwinding the macro
 205    location towards its expansion point) that is in real source
 206    code.
 207
 208    ASPECT controls which part of the location to use.  */
 209
 210 static expanded_location
 211 expand_location_1 (location_t loc,
 212                    bool expansion_point_p,
 213                    enum location_aspect aspect)
 214 {
 215   expanded_location xloc;
 216   const line_map_ordinary *map;
 217   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 218   tree block = NULL;
 219
 220   if (IS_ADHOC_LOC (loc))
 221     {
 222       block = LOCATION_BLOCK (loc);
 223       loc = LOCATION_LOCUS (loc);
 224     }
 225
 226   memset (&xloc, 0, sizeof (xloc));
 227
 228   if (loc >= RESERVED_LOCATION_COUNT)
 229     {
 230       if (!expansion_point_p)
 231         {
 232           /* We want to resolve LOC to its spelling location.
 233
 234              But if that spelling location is a reserved location that
 235              appears in the context of a macro expansion (like for a
 236              location for a built-in token), let's consider the first
 237              location (toward the expansion point) that is not reserved;
 238              that is, the first location that is in real source code.  */
 239           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 240                                                           loc, NULL);
 241           lrk = LRK_SPELLING_LOCATION;
 242         }
 243       loc = linemap_resolve_location (line_table, loc, lrk, &map);
 244
 245       /* loc is now either in an ordinary map, or is a reserved location.
 246          If it is a compound location, the caret is in a spelling location,
 247          but the start/finish might still be a virtual location.
 248          Depending of what the caller asked for, we may need to recurse
 249          one level in order to resolve any virtual locations in the
 250          end-points.  */
 251       switch (aspect)
 252         {
 253         default:
 254           gcc_unreachable ();
 255           /* Fall through.  */
 256         case LOCATION_ASPECT_CARET:
 257           break;
 258         case LOCATION_ASPECT_START:
 259           {
 260             location_t start = get_start (loc);
 261             if (start != loc)
 262               return expand_location_1 (start, expansion_point_p, aspect);
 263           }
 264           break;
 265         case LOCATION_ASPECT_FINISH:
 266           {
 267             location_t finish = get_finish (loc);
 268             if (finish != loc)
 269               return expand_location_1 (finish, expansion_point_p, aspect);
 270           }
 271           break;
 272         }
 273       xloc = linemap_expand_location (line_table, map, loc);
 274     }
 275
 276   xloc.data = block;
 277   if (loc <= BUILTINS_LOCATION)
 278     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 279
 280   return xloc;
 281 }
 282
 283 /* Initialize the set of cache used for files accessed by caret
 284    diagnostic.  */
 285
 286 static void
 287 diagnostic_file_cache_init (void)
 288 {
 289   gcc_assert (global_dc);
 290   if (global_dc->m_file_cache == NULL)
 291     global_dc->m_file_cache = new file_cache ();
 292 }
 293
 294 /* Free the resources used by the set of cache used for files accessed
 295    by caret diagnostic.  */
 296
 297 void
 298 diagnostic_file_cache_fini (void)
 299 {
 300   if (global_dc->m_file_cache)
 301     {
 302       delete global_dc->m_file_cache;
 303       global_dc->m_file_cache = NULL;
 304     }
 305 }
 306
 307 /* Return the total lines number that have been read so far by the
 308    line map (in the preprocessor) so far.  For languages like C++ that
 309    entirely preprocess the input file before starting to parse, this
 310    equals the actual number of lines of the file.  */
 311
 312 static size_t
 313 total_lines_num (const char *file_path)
 314 {
 315   size_t r = 0;
 316   location_t l = 0;
 317   if (linemap_get_file_highest_location (line_table, file_path, &l))
 318     {
 319       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 320       expanded_location xloc = expand_location (l);
 321       r = xloc.line;
 322     }
 323   return r;
 324 }
 325
 326 /* Lookup the cache used for the content of a given file accessed by
 327    caret diagnostic.  Return the found cached file, or NULL if no
 328    cached file was found.  */
 329
 330 file_cache_slot *
 331 file_cache::lookup_file (const char *file_path)
 332 {
 333   gcc_assert (file_path);
 334
 335   /* This will contain the found cached file.  */
 336   file_cache_slot *r = NULL;
 337   for (unsigned i = 0; i < num_file_slots; ++i)
 338     {
 339       file_cache_slot *c = &m_file_slots[i];
 340       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 341         {
 342           c->inc_use_count ();
 343           r = c;
 344         }
 345     }
 346
 347   if (r)
 348     r->inc_use_count ();
 349
 350   return r;
 351 }
 352
 353 /* Purge any mention of FILENAME from the cache of files used for
 354    printing source code.  For use in selftests when working
 355    with tempfiles.  */
 356
 357 void
 358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 359 {
 360   gcc_assert (file_path);
 361
 362   if (!global_dc->m_file_cache)
 363     return;
 364
 365   global_dc->m_file_cache->forcibly_evict_file (file_path);
 366 }
 367
 368 void
 369 file_cache::forcibly_evict_file (const char *file_path)
 370 {
 371   gcc_assert (file_path);
 372
 373   file_cache_slot *r = lookup_file (file_path);
 374   if (!r)
 375     /* Not found.  */
 376     return;
 377
 378   r->evict ();
 379 }
 380
 381 void
 382 file_cache_slot::evict ()
 383 {
 384   m_file_path = NULL;
 385   if (m_fp)
 386     fclose (m_fp);
 387   m_fp = NULL;
 388   m_nb_read = 0;
 389   m_line_start_idx = 0;
 390   m_line_num = 0;
 391   m_line_record.truncate (0);
 392   m_use_count = 0;
 393   m_total_lines = 0;
 394   m_missing_trailing_newline = true;
 395 }
 396
 397 /* Return the file cache that has been less used, recently, or the
 398    first empty one.  If HIGHEST_USE_COUNT is non-null,
 399    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 400    in the cache table.  */
 401
 402 file_cache_slot*
 403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 404 {
 405   diagnostic_file_cache_init ();
 406
 407   file_cache_slot *to_evict = &m_file_slots[0];
 408   unsigned huc = to_evict->get_use_count ();
 409   for (unsigned i = 1; i < num_file_slots; ++i)
 410     {
 411       file_cache_slot *c = &m_file_slots[i];
 412       bool c_is_empty = (c->get_file_path () == NULL);
 413
 414       if (c->get_use_count () < to_evict->get_use_count ()
 415           || (to_evict->get_file_path () && c_is_empty))
 416         /* We evict C because it's either an entry with a lower use
 417            count or one that is empty.  */
 418         to_evict = c;
 419
 420       if (huc < c->get_use_count ())
 421         huc = c->get_use_count ();
 422
 423       if (c_is_empty)
 424         /* We've reached the end of the cache; subsequent elements are
 425            all empty.  */
 426         break;
 427     }
 428
 429   if (highest_use_count)
 430     *highest_use_count = huc;
 431
 432   return to_evict;
 433 }
 434
 435 /* Create the cache used for the content of a given file to be
 436    accessed by caret diagnostic.  This cache is added to an array of
 437    cache and can be retrieved by lookup_file_in_cache_tab.  This
 438    function returns the created cache.  Note that only the last
 439    num_file_slots files are cached.  */
 440
 441 file_cache_slot*
 442 file_cache::add_file (const char *file_path)
 443 {
 444
 445   FILE *fp = fopen (file_path, "r");
 446   if (fp == NULL)
 447     return NULL;
 448
 449   unsigned highest_use_count = 0;
 450   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 451   if (!r->create (in_context, file_path, fp, highest_use_count))
 452     return NULL;
 453   return r;
 454 }
 455
 456 /* Populate this slot for use on FILE_PATH and FP, dropping any
 457    existing cached content within it.  */
 458
 459 bool
 460 file_cache_slot::create (const file_cache::input_context &in_context,
 461                          const char *file_path, FILE *fp,
 462                          unsigned highest_use_count)
 463 {
 464   m_file_path = file_path;
 465   if (m_fp)
 466     fclose (m_fp);
 467   m_fp = fp;
 468   if (m_alloc_offset)
 469     offset_buffer (-m_alloc_offset);
 470   m_nb_read = 0;
 471   m_line_start_idx = 0;
 472   m_line_num = 0;
 473   m_line_record.truncate (0);
 474   /* Ensure that this cache entry doesn't get evicted next time
 475      add_file_to_cache_tab is called.  */
 476   m_use_count = ++highest_use_count;
 477   m_total_lines = total_lines_num (file_path);
 478   m_missing_trailing_newline = true;
 479
 480
 481   /* Check the input configuration to determine if we need to do any
 482      transformations, such as charset conversion or BOM skipping.  */
 483   if (const char *input_charset = in_context.ccb (file_path))
 484     {
 485       /* Need a full-blown conversion of the input charset.  */
 486       fclose (m_fp);
 487       m_fp = NULL;
 488       const cpp_converted_source cs
 489         = cpp_get_converted_source (file_path, input_charset);
 490       if (!cs.data)
 491         return false;
 492       if (m_data)
 493         XDELETEVEC (m_data);
 494       m_data = cs.data;
 495       m_nb_read = m_size = cs.len;
 496       m_alloc_offset = cs.data - cs.to_free;
 497     }
 498   else if (in_context.should_skip_bom)
 499     {
 500       if (read_data ())
 501         {
 502           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 503           offset_buffer (offset);
 504           m_nb_read -= offset;
 505         }
 506     }
 507
 508   return true;
 509 }
 510
 511 /* file_cache's ctor.  */
 512
 513 file_cache::file_cache ()
 514 : m_file_slots (new file_cache_slot[num_file_slots])
 515 {
 516   initialize_input_context (nullptr, false);
 517 }
 518
 519 /* file_cache's dtor.  */
 520
 521 file_cache::~file_cache ()
 522 {
 523   delete[] m_file_slots;
 524 }
 525
 526 /* Lookup the cache used for the content of a given file accessed by
 527    caret diagnostic.  If no cached file was found, create a new cache
 528    for this file, add it to the array of cached file and return
 529    it.  */
 530
 531 file_cache_slot*
 532 file_cache::lookup_or_add_file (const char *file_path)
 533 {
 534   file_cache_slot *r = lookup_file (file_path);
 535   if (r == NULL)
 536     r = add_file (file_path);
 537   return r;
 538 }
 539
 540 /* Default constructor for a cache of file used by caret
 541    diagnostic.  */
 542
 543 file_cache_slot::file_cache_slot ()
 544 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
 545   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
 546   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 547 {
 548   m_line_record.create (0);
 549 }
 550
 551 /* Destructor for a cache of file used by caret diagnostic.  */
 552
 553 file_cache_slot::~file_cache_slot ()
 554 {
 555   if (m_fp)
 556     {
 557       fclose (m_fp);
 558       m_fp = NULL;
 559     }
 560   if (m_data)
 561     {
 562       offset_buffer (-m_alloc_offset);
 563       XDELETEVEC (m_data);
 564       m_data = 0;
 565     }
 566   m_line_record.release ();
 567 }
 568
 569 /* Returns TRUE iff the cache would need to be filled with data coming
 570    from the file.  That is, either the cache is empty or full or the
 571    current line is empty.  Note that if the cache is full, it would
 572    need to be extended and filled again.  */
 573
 574 bool
 575 file_cache_slot::needs_read_p () const
 576 {
 577   return m_fp && (m_nb_read == 0
 578           || m_nb_read == m_size
 579           || (m_line_start_idx >= m_nb_read - 1));
 580 }
 581
 582 /*  Return TRUE iff the cache is full and thus needs to be
 583     extended.  */
 584
 585 bool
 586 file_cache_slot::needs_grow_p () const
 587 {
 588   return m_nb_read == m_size;
 589 }
 590
 591 /* Grow the cache if it needs to be extended.  */
 592
 593 void
 594 file_cache_slot::maybe_grow ()
 595 {
 596   if (!needs_grow_p ())
 597     return;
 598
 599   if (!m_data)
 600     {
 601       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 602       m_size = buffer_size;
 603       m_data = XNEWVEC (char, m_size);
 604     }
 605   else
 606     {
 607       const int offset = m_alloc_offset;
 608       offset_buffer (-offset);
 609       m_size *= 2;
 610       m_data = XRESIZEVEC (char, m_data, m_size);
 611       offset_buffer (offset);
 612     }
 613 }
 614
 615 /*  Read more data into the cache.  Extends the cache if need be.
 616     Returns TRUE iff new data could be read.  */
 617
 618 bool
 619 file_cache_slot::read_data ()
 620 {
 621   if (feof (m_fp) || ferror (m_fp))
 622     return false;
 623
 624   maybe_grow ();
 625
 626   char * from = m_data + m_nb_read;
 627   size_t to_read = m_size - m_nb_read;
 628   size_t nb_read = fread (from, 1, to_read, m_fp);
 629
 630   if (ferror (m_fp))
 631     return false;
 632
 633   m_nb_read += nb_read;
 634   return !!nb_read;
 635 }
 636
 637 /* Read new data iff the cache needs to be filled with more data
 638    coming from the file FP.  Return TRUE iff the cache was filled with
 639    mode data.  */
 640
 641 bool
 642 file_cache_slot::maybe_read_data ()
 643 {
 644   if (!needs_read_p ())
 645     return false;
 646   return read_data ();
 647 }
 648
 649 /* Read a new line from file FP, using C as a cache for the data
 650    coming from the file.  Upon successful completion, *LINE is set to
 651    the beginning of the line found.  *LINE points directly in the
 652    line cache and is only valid until the next call of get_next_line.
 653    *LINE_LEN is set to the length of the line.  Note that the line
 654    does not contain any terminal delimiter.  This function returns
 655    true if some data was read or process from the cache, false
 656    otherwise.  Note that subsequent calls to get_next_line might
 657    make the content of *LINE invalid.  */
 658
 659 bool
 660 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
 661 {
 662   /* Fill the cache with data to process.  */
 663   maybe_read_data ();
 664
 665   size_t remaining_size = m_nb_read - m_line_start_idx;
 666   if (remaining_size == 0)
 667     /* There is no more data to process.  */
 668     return false;
 669
 670   char *line_start = m_data + m_line_start_idx;
 671
 672   char *next_line_start = NULL;
 673   size_t len = 0;
 674   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 675   if (line_end == NULL)
 676     {
 677       /* We haven't found the end-of-line delimiter in the cache.
 678          Fill the cache with more data from the file and look for the
 679          '\n'.  */
 680       while (maybe_read_data ())
 681         {
 682           line_start = m_data + m_line_start_idx;
 683           remaining_size = m_nb_read - m_line_start_idx;
 684           line_end = (char *) memchr (line_start, '\n', remaining_size);
 685           if (line_end != NULL)
 686             {
 687               next_line_start = line_end + 1;
 688               break;
 689             }
 690         }
 691       if (line_end == NULL)
 692         {
 693           /* We've loadded all the file into the cache and still no
 694              '\n'.  Let's say the line ends up at one byte passed the
 695              end of the file.  This is to stay consistent with the case
 696              of when the line ends up with a '\n' and line_end points to
 697              that terminal '\n'.  That consistency is useful below in
 698              the len calculation.  */
 699           line_end = m_data + m_nb_read ;
 700           m_missing_trailing_newline = true;
 701         }
 702       else
 703         m_missing_trailing_newline = false;
 704     }
 705   else
 706     {
 707       next_line_start = line_end + 1;
 708       m_missing_trailing_newline = false;
 709     }
 710
 711   if (m_fp && ferror (m_fp))
 712     return false;
 713
 714   /* At this point, we've found the end of the of line.  It either
 715      points to the '\n' or to one byte after the last byte of the
 716      file.  */
 717   gcc_assert (line_end != NULL);
 718
 719   len = line_end - line_start;
 720
 721   if (m_line_start_idx < m_nb_read)
 722     *line = line_start;
 723
 724   ++m_line_num;
 725
 726   /* Before we update our line record, make sure the hint about the
 727      total number of lines of the file is correct.  If it's not, then
 728      we give up recording line boundaries from now on.  */
 729   bool update_line_record = true;
 730   if (m_line_num > m_total_lines)
 731     update_line_record = false;
 732
 733     /* Now update our line record so that re-reading lines from the
 734      before m_line_start_idx is faster.  */
 735   if (update_line_record
 736       && m_line_record.length () < line_record_size)
 737     {
 738       /* If the file lines fits in the line record, we just record all
 739          its lines ...*/
 740       if (m_total_lines <= line_record_size
 741           && m_line_num > m_line_record.length ())
 742         m_line_record.safe_push
 743           (file_cache_slot::line_info (m_line_num,
 744                                        m_line_start_idx,
 745                                        line_end - m_data));
 746       else if (m_total_lines > line_record_size)
 747         {
 748           /* ... otherwise, we just scale total_lines down to
 749              (line_record_size lines.  */
 750           size_t n = (m_line_num * line_record_size) / m_total_lines;
 751           if (m_line_record.length () == 0
 752               || n >= m_line_record.length ())
 753             m_line_record.safe_push
 754               (file_cache_slot::line_info (m_line_num,
 755                                            m_line_start_idx,
 756                                            line_end - m_data));
 757         }
 758     }
 759
 760   /* Update m_line_start_idx so that it points to the next line to be
 761      read.  */
 762   if (next_line_start)
 763     m_line_start_idx = next_line_start - m_data;
 764   else
 765     /* We didn't find any terminal '\n'.  Let's consider that the end
 766        of line is the end of the data in the cache.  The next
 767        invocation of get_next_line will either read more data from the
 768        underlying file or return false early because we've reached the
 769        end of the file.  */
 770     m_line_start_idx = m_nb_read;
 771
 772   *line_len = len;
 773
 774   return true;
 775 }
 776
 777 /* Consume the next bytes coming from the cache (or from its
 778    underlying file if there are remaining unread bytes in the file)
 779    until we reach the next end-of-line (or end-of-file).  There is no
 780    copying from the cache involved.  Return TRUE upon successful
 781    completion.  */
 782
 783 bool
 784 file_cache_slot::goto_next_line ()
 785 {
 786   char *l;
 787   ssize_t len;
 788
 789   return get_next_line (&l, &len);
 790 }
 791
 792 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 793    If the line was read successfully, *LINE points to the beginning
 794    of the line in the file cache and *LINE_LEN is the length of the
 795    line.  *LINE is not nul-terminated, but may contain zero bytes.
 796    *LINE is only valid until the next call of read_line_num.
 797    This function returns bool if a line was read.  */
 798
 799 bool
 800 file_cache_slot::read_line_num (size_t line_num,
 801                        char ** line, ssize_t *line_len)
 802 {
 803   gcc_assert (line_num > 0);
 804
 805   if (line_num <= m_line_num)
 806     {
 807       /* We've been asked to read lines that are before m_line_num.
 808          So lets use our line record (if it's not empty) to try to
 809          avoid re-reading the file from the beginning again.  */
 810
 811       if (m_line_record.is_empty ())
 812         {
 813           m_line_start_idx = 0;
 814           m_line_num = 0;
 815         }
 816       else
 817         {
 818           file_cache_slot::line_info *i = NULL;
 819           if (m_total_lines <= line_record_size)
 820             {
 821               /* In languages where the input file is not totally
 822                  preprocessed up front, the m_total_lines hint
 823                  can be smaller than the number of lines of the
 824                  file.  In that case, only the first
 825                  m_total_lines have been recorded.
 826
 827                  Otherwise, the first m_total_lines we've read have
 828                  their start/end recorded here.  */
 829               i = (line_num <= m_total_lines)
 830                 ? &m_line_record[line_num - 1]
 831                 : &m_line_record[m_total_lines - 1];
 832               gcc_assert (i->line_num <= line_num);
 833             }
 834           else
 835             {
 836               /*  So the file had more lines than our line record
 837                   size.  Thus the number of lines we've recorded has
 838                   been scaled down to line_record_size.  Let's
 839                   pick the start/end of the recorded line that is
 840                   closest to line_num.  */
 841               size_t n = (line_num <= m_total_lines)
 842                 ? line_num * line_record_size / m_total_lines
 843                 : m_line_record.length () - 1;
 844               if (n < m_line_record.length ())
 845                 {
 846                   i = &m_line_record[n];
 847                   gcc_assert (i->line_num <= line_num);
 848                 }
 849             }
 850
 851           if (i && i->line_num == line_num)
 852             {
 853               /* We have the start/end of the line.  */
 854               *line = m_data + i->start_pos;
 855               *line_len = i->end_pos - i->start_pos;
 856               return true;
 857             }
 858
 859           if (i)
 860             {
 861               m_line_start_idx = i->start_pos;
 862               m_line_num = i->line_num - 1;
 863             }
 864           else
 865             {
 866               m_line_start_idx = 0;
 867               m_line_num = 0;
 868             }
 869         }
 870     }
 871
 872   /*  Let's walk from line m_line_num up to line_num - 1, without
 873       copying any line.  */
 874   while (m_line_num < line_num - 1)
 875     if (!goto_next_line ())
 876       return false;
 877
 878   /* The line we want is the next one.  Let's read and copy it back to
 879      the caller.  */
 880   return get_next_line (line, line_len);
 881 }
 882
 883 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 884    The line is not nul-terminated.  The returned pointer is only
 885    valid until the next call of location_get_source_line.
 886    Note that the line can contain several null characters,
 887    so the returned value's length has the actual length of the line.
 888    If the function fails, a NULL char_span is returned.  */
 889
 890 char_span
 891 location_get_source_line (const char *file_path, int line)
 892 {
 893   char *buffer = NULL;
 894   ssize_t len;
 895
 896   if (line == 0)
 897     return char_span (NULL, 0);
 898
 899   if (file_path == NULL)
 900     return char_span (NULL, 0);
 901
 902   diagnostic_file_cache_init ();
 903
 904   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 905   if (c == NULL)
 906     return char_span (NULL, 0);
 907
 908   bool read = c->read_line_num (line, &buffer, &len);
 909   if (!read)
 910     return char_span (NULL, 0);
 911
 912   return char_span (buffer, len);
 913 }
 914
 915 /* Determine if FILE_PATH missing a trailing newline on its final line.
 916    Only valid to call once all of the file has been loaded, by
 917    requesting a line number beyond the end of the file.  */
 918
 919 bool
 920 location_missing_trailing_newline (const char *file_path)
 921 {
 922   diagnostic_file_cache_init ();
 923
 924   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 925   if (c == NULL)
 926     return false;
 927
 928   return c->missing_trailing_newline_p ();
 929 }
 930
 931 /* Test if the location originates from the spelling location of a
 932    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 933    virtual) location of a built-in token that appears in the expansion
 934    list of a macro.  Please note that this function also works on
 935    tokens that result from built-in tokens.  For instance, the
 936    function would return true if passed a token "4" that is the result
 937    of the expansion of the built-in __LINE__ macro.  */
 938 bool
 939 is_location_from_builtin_token (location_t loc)
 940 {
 941   const line_map_ordinary *map = NULL;
 942   loc = linemap_resolve_location (line_table, loc,
 943                                   LRK_SPELLING_LOCATION, &map);
 944   return loc == BUILTINS_LOCATION;
 945 }
 946
 947 /* Expand the source location LOC into a human readable location.  If
 948    LOC is virtual, it resolves to the expansion point of the involved
 949    macro.  If LOC resolves to a builtin location, the file name of the
 950    readable location is set to the string "<built-in>".  */
 951
 952 expanded_location
 953 expand_location (location_t loc)
 954 {
 955   return expand_location_1 (loc, /*expansion_point_p=*/true,
 956                             LOCATION_ASPECT_CARET);
 957 }
 958
 959 /* Expand the source location LOC into a human readable location.  If
 960    LOC is virtual, it resolves to the expansion location of the
 961    relevant macro.  If LOC resolves to a builtin location, the file
 962    name of the readable location is set to the string
 963    "<built-in>".  */
 964
 965 expanded_location
 966 expand_location_to_spelling_point (location_t loc,
 967                                    enum location_aspect aspect)
 968 {
 969   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
 970 }
 971
 972 /* The rich_location class within libcpp requires a way to expand
 973    location_t instances, and relies on the client code
 974    providing a symbol named
 975      linemap_client_expand_location_to_spelling_point
 976    to do this.
 977
 978    This is the implementation for libcommon.a (all host binaries),
 979    which simply calls into expand_location_1.  */
 980
 981 expanded_location
 982 linemap_client_expand_location_to_spelling_point (location_t loc,
 983                                                   enum location_aspect aspect)
 984 {
 985   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
 986 }
 987
 988
 989 /* If LOCATION is in a system header and if it is a virtual location for
 990    a token coming from the expansion of a macro, unwind it to the
 991    location of the expansion point of the macro.  Otherwise, just return
 992    LOCATION.
 993
 994    This is used for instance when we want to emit diagnostics about a
 995    token that may be located in a macro that is itself defined in a
 996    system header, for example, for the NULL macro.  In such a case, if
 997    LOCATION were passed directly to diagnostic functions such as
 998    warning_at, the diagnostic would be suppressed (unless
 999    -Wsystem-headers).  */
1000
1001 location_t
1002 expansion_point_location_if_in_system_header (location_t location)
1003 {
1004   if (in_system_header_at (location))
1005     location = linemap_resolve_location (line_table, location,
1006                                          LRK_MACRO_EXPANSION_POINT,
1007                                          NULL);
1008   return location;
1009 }
1010
1011 /* If LOCATION is a virtual location for a token coming from the expansion
1012    of a macro, unwind to the location of the expansion point of the macro.  */
1013
1014 location_t
1015 expansion_point_location (location_t location)
1016 {
1017   return linemap_resolve_location (line_table, location,
1018                                    LRK_MACRO_EXPANSION_POINT, NULL);
1019 }
1020
1021 /* Construct a location with caret at CARET, ranging from START to
1022    finish e.g.
1023
1024                  11111111112
1025         12345678901234567890
1026      522
1027      523   return foo + bar;
1028                   ~~~~^~~~~
1029      524
1030
1031    The location's caret is at the "+", line 523 column 15, but starts
1032    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1033    of "bar" at column 19.  */
1034
1035 location_t
1036 make_location (location_t caret, location_t start, location_t finish)
1037 {
1038   location_t pure_loc = get_pure_location (caret);
1039   source_range src_range;
1040   src_range.m_start = get_start (start);
1041   src_range.m_finish = get_finish (finish);
1042   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1043                                                    pure_loc,
1044                                                    src_range,
1045                                                    NULL);
1046   return combined_loc;
1047 }
1048
1049 /* Same as above, but taking a source range rather than two locations.  */
1050
1051 location_t
1052 make_location (location_t caret, source_range src_range)
1053 {
1054   location_t pure_loc = get_pure_location (caret);
1055   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
1056 }
1057
1058 /* An expanded_location stores the column in byte units.  This function
1059    converts that column to display units.  That requires reading the associated
1060    source line in order to calculate the display width.  If that cannot be done
1061    for any reason, then returns the byte column as a fallback.  */
1062 int
1063 location_compute_display_column (expanded_location exploc, int tabstop)
1064 {
1065   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1066     return exploc.column;
1067   char_span line = location_get_source_line (exploc.file, exploc.line);
1068   /* If line is NULL, this function returns exploc.column which is the
1069      desired fallback.  */
1070   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1071                                             exploc.column, tabstop);
1072 }
1073
1074 /* Dump statistics to stderr about the memory usage of the line_table
1075    set of line maps.  This also displays some statistics about macro
1076    expansion.  */
1077
1078 void
1079 dump_line_table_statistics (void)
1080 {
1081   struct linemap_stats s;
1082   long total_used_map_size,
1083     macro_maps_size,
1084     total_allocated_map_size;
1085
1086   memset (&s, 0, sizeof (s));
1087
1088   linemap_get_statistics (line_table, &s);
1089
1090   macro_maps_size = s.macro_maps_used_size
1091     + s.macro_maps_locations_size;
1092
1093   total_allocated_map_size = s.ordinary_maps_allocated_size
1094     + s.macro_maps_allocated_size
1095     + s.macro_maps_locations_size;
1096
1097   total_used_map_size = s.ordinary_maps_used_size
1098     + s.macro_maps_used_size
1099     + s.macro_maps_locations_size;
1100
1101   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1102            s.num_expanded_macros);
1103   if (s.num_expanded_macros != 0)
1104     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1105              s.num_macro_tokens / s.num_expanded_macros);
1106   fprintf (stderr,
1107            "\nLine Table allocations during the "
1108            "compilation process\n");
1109   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1110            SIZE_AMOUNT (s.num_ordinary_maps_used));
1111   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1112            SIZE_AMOUNT (s.ordinary_maps_used_size));
1113   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1114            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1115   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1116            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1117   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1118            SIZE_AMOUNT (s.num_macro_maps_used));
1119   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1120            SIZE_AMOUNT (s.macro_maps_used_size));
1121   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1122            SIZE_AMOUNT (s.macro_maps_locations_size));
1123   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1124            SIZE_AMOUNT (macro_maps_size));
1125   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1126            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1127   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1128            SIZE_AMOUNT (total_allocated_map_size));
1129   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1130            SIZE_AMOUNT (total_used_map_size));
1131   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1132            SIZE_AMOUNT (s.adhoc_table_size));
1133   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1134            SIZE_AMOUNT (s.adhoc_table_entries_used));
1135   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1136            SIZE_AMOUNT (line_table->num_optimized_ranges));
1137   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1138            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1139
1140   fprintf (stderr, "\n");
1141 }
1142
1143 /* Get location one beyond the final location in ordinary map IDX.  */
1144
1145 static location_t
1146 get_end_location (class line_maps *set, unsigned int idx)
1147 {
1148   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1149     return set->highest_location;
1150
1151   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1152   return MAP_START_LOCATION (next_map);
1153 }
1154
/* Helper function for write_digit_row.
   Write the units digit of DIGIT (i.e. DIGIT modulo 10) to STREAM as
   a single character.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1162
1163 /* Helper function for dump_location_info.
1164    Write a row of numbers to STREAM, numbering a source line,
1165    giving the units, tens, hundreds etc of the column number.  */
1166
1167 static void
1168 write_digit_row (FILE *stream, int indent,
1169                  const line_map_ordinary *map,
1170                  location_t loc, int max_col, int divisor)
1171 {
1172   fprintf (stream, "%*c", indent, ' ');
1173   fprintf (stream, "|");
1174   for (int column = 1; column < max_col; column++)
1175     {
1176       location_t column_loc = loc + (column << map->m_range_bits);
1177       write_digit (stream, column_loc / divisor);
1178     }
1179   fprintf (stream, "\n");
1180 }
1181
1182 /* Write a half-closed (START) / half-open (END) interval of
1183    location_t to STREAM.  */
1184
1185 static void
1186 dump_location_range (FILE *stream,
1187                      location_t start, location_t end)
1188 {
1189   fprintf (stream,
1190            "  location_t interval: %u <= loc < %u\n",
1191            start, end);
1192 }
1193
1194 /* Write a labelled description of a half-closed (START) / half-open (END)
1195    interval of location_t to STREAM.  */
1196
1197 static void
1198 dump_labelled_location_range (FILE *stream,
1199                               const char *name,
1200                               location_t start, location_t end)
1201 {
1202   fprintf (stream, "%s\n", name);
1203   dump_location_range (stream, start, end);
1204   fprintf (stream, "\n");
1205 }
1206
1207 /* Write a visualization of the locations in the line_table to STREAM.  */
1208
1209 void
1210 dump_location_info (FILE *stream)
1211 {
1212   /* Visualize the reserved locations.  */
1213   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1214                                 0, RESERVED_LOCATION_COUNT);
1215
1216   /* Visualize the ordinary line_map instances, rendering the sources. */
1217   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1218     {
1219       location_t end_location = get_end_location (line_table, idx);
1220       /* half-closed: doesn't include this one. */
1221
1222       const line_map_ordinary *map
1223         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1224       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1225       dump_location_range (stream,
1226                            MAP_START_LOCATION (map), end_location);
1227       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1228       fprintf (stream, "  starting at line: %i\n",
1229                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1230       fprintf (stream, "  column and range bits: %i\n",
1231                map->m_column_and_range_bits);
1232       fprintf (stream, "  column bits: %i\n",
1233                map->m_column_and_range_bits - map->m_range_bits);
1234       fprintf (stream, "  range bits: %i\n",
1235                map->m_range_bits);
1236       const char * reason;
1237       switch (map->reason) {
1238       case LC_ENTER:
1239         reason = "LC_ENTER";
1240         break;
1241       case LC_LEAVE:
1242         reason = "LC_LEAVE";
1243         break;
1244       case LC_RENAME:
1245         reason = "LC_RENAME";
1246         break;
1247       case LC_RENAME_VERBATIM:
1248         reason = "LC_RENAME_VERBATIM";
1249         break;
1250       case LC_ENTER_MACRO:
1251         reason = "LC_RENAME_MACRO";
1252         break;
1253       default:
1254         reason = "Unknown";
1255       }
1256       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1257
1258       const line_map_ordinary *includer_map
1259         = linemap_included_from_linemap (line_table, map);
1260       fprintf (stream, "  included from location: %d",
1261                linemap_included_from (map));
1262       if (includer_map) {
1263         fprintf (stream, " (in ordinary map %d)",
1264                  int (includer_map - line_table->info_ordinary.maps));
1265       }
1266       fprintf (stream, "\n");
1267
1268       /* Render the span of source lines that this "map" covers.  */
1269       for (location_t loc = MAP_START_LOCATION (map);
1270            loc < end_location;
1271            loc += (1 << map->m_range_bits) )
1272         {
1273           gcc_assert (pure_location_p (line_table, loc) );
1274
1275           expanded_location exploc
1276             = linemap_expand_location (line_table, map, loc);
1277
1278           if (exploc.column == 0)
1279             {
1280               /* Beginning of a new source line: draw the line.  */
1281
1282               char_span line_text = location_get_source_line (exploc.file,
1283                                                               exploc.line);
1284               if (!line_text)
1285                 break;
1286               fprintf (stream,
1287                        "%s:%3i|loc:%5i|%.*s\n",
1288                        exploc.file, exploc.line,
1289                        loc,
1290                        (int)line_text.length (), line_text.get_buffer ());
1291
1292               /* "loc" is at column 0, which means "the whole line".
1293                  Render the locations *within* the line, by underlining
1294                  it, showing the location_t numeric values
1295                  at each column.  */
1296               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1297               if (max_col > line_text.length ())
1298                 max_col = line_text.length () + 1;
1299
1300               int len_lnum = num_digits (exploc.line);
1301               if (len_lnum < 3)
1302                 len_lnum = 3;
1303               int len_loc = num_digits (loc);
1304               if (len_loc < 5)
1305                 len_loc = 5;
1306
1307               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1308
1309               /* Thousands.  */
1310               if (end_location > 999)
1311                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1312
1313               /* Hundreds.  */
1314               if (end_location > 99)
1315                 write_digit_row (stream, indent, map, loc, max_col, 100);
1316
1317               /* Tens.  */
1318               write_digit_row (stream, indent, map, loc, max_col, 10);
1319
1320               /* Units.  */
1321               write_digit_row (stream, indent, map, loc, max_col, 1);
1322             }
1323         }
1324       fprintf (stream, "\n");
1325     }
1326
1327   /* Visualize unallocated values.  */
1328   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1329                                 line_table->highest_location,
1330                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1331
1332   /* Visualize the macro line_map instances, rendering the sources. */
1333   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1334     {
1335       /* Each macro map that is allocated owns location_t values
1336          that are *lower* that the one before them.
1337          Hence it's meaningful to view them either in order of ascending
1338          source locations, or in order of ascending macro map index.  */
1339       const bool ascending_location_ts = true;
1340       unsigned int idx = (ascending_location_ts
1341                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1342                           : i);
1343       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1344       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1345                idx,
1346                linemap_map_get_macro_name (map),
1347                MACRO_MAP_NUM_MACRO_TOKENS (map));
1348       dump_location_range (stream,
1349                            map->start_location,
1350                            (map->start_location
1351                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1352       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1353               "expansion point is location %i",
1354               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1355       fprintf (stream, "  map->start_location: %u\n",
1356                map->start_location);
1357
1358       fprintf (stream, "  macro_locations:\n");
1359       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1360         {
1361           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1362           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1363
1364           /* linemap_add_macro_token encodes token numbers in an expansion
1365              by putting them after MAP_START_LOCATION. */
1366
1367           /* I'm typically seeing 4 uninitialized entries at the end of
1368              0xafafafaf.
1369              This appears to be due to macro.c:replace_args
1370              adding 2 extra args for padding tokens; presumably there may
1371              be a leading and/or trailing padding token injected,
1372              each for 2 more location slots.
1373              This would explain there being up to 4 location_ts slots
1374              that may be uninitialized.  */
1375
1376           fprintf (stream, "    %u: %u, %u\n",
1377                    i,
1378                    x,
1379                    y);
1380           if (x == y)
1381             {
1382               if (x < MAP_START_LOCATION (map))
1383                 inform (x, "token %u has %<x-location == y-location == %u%>",
1384                         i, x);
1385               else
1386                 fprintf (stream,
1387                          "x-location == y-location == %u encodes token # %u\n",
1388                          x, x - MAP_START_LOCATION (map));
1389                 }
1390           else
1391             {
1392               inform (x, "token %u has %<x-location == %u%>", i, x);
1393               inform (x, "token %u has %<y-location == %u%>", i, y);
1394             }
1395         }
1396       fprintf (stream, "\n");
1397     }
1398
1399   /* It appears that MAX_LOCATION_T itself is never assigned to a
1400      macro map, presumably due to an off-by-one error somewhere
1401      between the logic in linemap_enter_macro and
1402      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1403   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1404                                 MAX_LOCATION_T,
1405                                 MAX_LOCATION_T + 1);
1406
1407   /* Visualize ad-hoc values.  */
1408   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1409                                 MAX_LOCATION_T + 1, UINT_MAX);
1410 }
1411
1412 /* string_concat's constructor.  */
1413
1414 string_concat::string_concat (int num, location_t *locs)
1415   : m_num (num)
1416 {
1417   m_locs = ggc_vec_alloc <location_t> (num);
1418   for (int i = 0; i < num; i++)
1419     m_locs[i] = locs[i];
1420 }
1421
1422 /* string_concat_db's constructor.  */
1423
1424 string_concat_db::string_concat_db ()
1425 {
1426   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1427 }
1428
1429 /* Record that a string concatenation occurred, covering NUM
1430    string literal tokens.  LOCS is an array of size NUM, containing the
1431    locations of the tokens.  A copy of LOCS is taken.  */
1432
1433 void
1434 string_concat_db::record_string_concatenation (int num, location_t *locs)
1435 {
1436   gcc_assert (num > 1);
1437   gcc_assert (locs);
1438
1439   location_t key_loc = get_key_loc (locs[0]);
1440   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1441      any data now recorded under key 'key_loc' would be overwritten by a
1442      subsequent call with the same key 'key_loc'.  */
1443   if (RESERVED_LOCATION_P (key_loc))
1444     return;
1445
1446   string_concat *concat
1447     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1448   m_table->put (key_loc, concat);
1449 }
1450
1451 /* Determine if LOC was the location of the initial token of a
1452    concatenation of string literal tokens.
1453    If so, *OUT_NUM is written to with the number of tokens, and
1454    *OUT_LOCS with the location of an array of locations of the
1455    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1456    storage owned by the string_concat_db.
1457    Otherwise, return false.  */
1458
1459 bool
1460 string_concat_db::get_string_concatenation (location_t loc,
1461                                             int *out_num,
1462                                             location_t **out_locs)
1463 {
1464   gcc_assert (out_num);
1465   gcc_assert (out_locs);
1466
1467   location_t key_loc = get_key_loc (loc);
1468   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1469      discussion in 'string_concat_db::record_string_concatenation'.  */
1470   if (RESERVED_LOCATION_P (key_loc))
1471     return false;
1472
1473   string_concat **concat = m_table->get (key_loc);
1474   if (!concat)
1475     return false;
1476
1477   *out_num = (*concat)->m_num;
1478   *out_locs =(*concat)->m_locs;
1479   return true;
1480 }
1481
1482 /* Internal function.  Canonicalize LOC into a form suitable for
1483    use as a key within the database, stripping away macro expansion,
1484    ad-hoc information, and range information, using the location of
1485    the start of LOC within an ordinary linemap.  */
1486
1487 location_t
1488 string_concat_db::get_key_loc (location_t loc)
1489 {
1490   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1491                                   NULL);
1492
1493   loc = get_range_from_loc (line_table, loc).m_start;
1494
1495   return loc;
1496 }
1497
1498 /* Helper class for use within get_substring_ranges_for_loc.
1499    An vec of cpp_string with responsibility for releasing all of the
1500    str->text for each str in the vector.  */
1501
1502 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1503 {
1504  public:
1505   auto_cpp_string_vec (int alloc)
1506     : auto_vec <cpp_string> (alloc) {}
1507
1508   ~auto_cpp_string_vec ()
1509   {
1510     /* Clean up the copies within this vec.  */
1511     int i;
1512     cpp_string *str;
1513     FOR_EACH_VEC_ELT (*this, i, str)
1514       free (const_cast <unsigned char *> (str->text));
1515   }
1516 };
1517
1518 /* Attempt to populate RANGES with source location information on the
1519    individual characters within the string literal found at STRLOC.
1520    If CONCATS is non-NULL, then any string literals that the token at
1521    STRLOC  was concatenated with are also added to RANGES.
1522
1523    Return NULL if successful, or an error message if any errors occurred (in
1524    which case RANGES may be only partially populated and should not
1525    be used).
1526
1527    This is implemented by re-parsing the relevant source line(s).  */
1528
1529 static const char *
1530 get_substring_ranges_for_loc (cpp_reader *pfile,
1531                               string_concat_db *concats,
1532                               location_t strloc,
1533                               enum cpp_ttype type,
1534                               cpp_substring_ranges &ranges)
1535 {
1536   gcc_assert (pfile);
1537
1538   if (strloc == UNKNOWN_LOCATION)
1539     return "unknown location";
1540
1541   /* Reparsing the strings requires accurate location information.
1542      If -ftrack-macro-expansion has been overridden from its default
1543      of 2, then we might have a location of a macro expansion point,
1544      rather than the location of the literal itself.
1545      Avoid this by requiring that we have full macro expansion tracking
1546      for substring locations to be available.  */
1547   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1548     return "track_macro_expansion != 2";
1549
1550   /* If #line or # 44 "file"-style directives are present, then there's
1551      no guarantee that the line numbers we have can be used to locate
1552      the strings.  For example, we might have a .i file with # directives
1553      pointing back to lines within a .c file, but the .c file might
1554      have been edited since the .i file was created.
1555      In such a case, the safest course is to disable on-demand substring
1556      locations.  */
1557   if (line_table->seen_line_directive)
1558     return "seen line directive";
1559
1560   /* If string concatenation has occurred at STRLOC, get the locations
1561      of all of the literal tokens making up the compound string.
1562      Otherwise, just use STRLOC.  */
1563   int num_locs = 1;
1564   location_t *strlocs = &strloc;
1565   if (concats)
1566     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1567
1568   auto_cpp_string_vec strs (num_locs);
1569   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1570   for (int i = 0; i < num_locs; i++)
1571     {
1572       /* Get range of strloc.  We will use it to locate the start and finish
1573          of the literal token within the line.  */
1574       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1575
1576       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1577         {
1578           /* If the string token was within a macro expansion, then we can
1579              cope with it for the simple case where we have a single token.
1580              Otherwise, bail out.  */
1581           if (src_range.m_start != src_range.m_finish)
1582             return "macro expansion";
1583         }
1584       else
1585         {
1586           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1587             /* If so, we can't reliably determine where the token started within
1588                its line.  */
1589             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1590
1591           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1592             /* If so, we can't reliably determine where the token finished
1593                within its line.  */
1594             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1595         }
1596
1597       expanded_location start
1598         = expand_location_to_spelling_point (src_range.m_start,
1599                                              LOCATION_ASPECT_START);
1600       expanded_location finish
1601         = expand_location_to_spelling_point (src_range.m_finish,
1602                                              LOCATION_ASPECT_FINISH);
1603       if (start.file != finish.file)
1604         return "range endpoints are in different files";
1605       if (start.line != finish.line)
1606         return "range endpoints are on different lines";
1607       if (start.column > finish.column)
1608         return "range endpoints are reversed";
1609
1610       char_span line = location_get_source_line (start.file, start.line);
1611       if (!line)
1612         return "unable to read source line";
1613
1614       /* Determine the location of the literal (including quotes
1615          and leading prefix chars, such as the 'u' in a u""
1616          token).  */
1617       size_t literal_length = finish.column - start.column + 1;
1618
1619       /* Ensure that we don't crash if we got the wrong location.  */
1620       if (start.column < 1)
1621         return "zero start column";
1622       if (line.length () < (start.column - 1 + literal_length))
1623         return "line is not wide enough";
1624
1625       char_span literal = line.subspan (start.column - 1, literal_length);
1626
1627       cpp_string from;
1628       from.len = literal_length;
1629       /* Make a copy of the literal, to avoid having to rely on
1630          the lifetime of the copy of the line within the cache.
1631          This will be released by the auto_cpp_string_vec dtor.  */
1632       from.text = (unsigned char *)literal.xstrdup ();
1633       strs.safe_push (from);
1634
1635       /* For very long lines, a new linemap could have started
1636          halfway through the token.
1637          Ensure that the loc_reader uses the linemap of the
1638          *end* of the token for its start location.  */
1639       const line_map_ordinary *start_ord_map;
1640       linemap_resolve_location (line_table, src_range.m_start,
1641                                 LRK_SPELLING_LOCATION, &start_ord_map);
1642       const line_map_ordinary *final_ord_map;
1643       linemap_resolve_location (line_table, src_range.m_finish,
1644                                 LRK_SPELLING_LOCATION, &final_ord_map);
1645       if (start_ord_map == NULL || final_ord_map == NULL)
1646         return "failed to get ordinary maps";
1647       /* Bulletproofing.  We ought to only have different ordinary maps
1648          for start vs finish due to line-length jumps.  */
1649       if (start_ord_map != final_ord_map
1650           && start_ord_map->to_file != final_ord_map->to_file)
1651         return "start and finish are spelled in different ordinary maps";
1652       /* The file from linemap_resolve_location ought to match that from
1653          expand_location_to_spelling_point.  */
1654       if (start_ord_map->to_file != start.file)
1655         return "mismatching file after resolving linemap";
1656
1657       location_t start_loc
1658         = linemap_position_for_line_and_column (line_table, final_ord_map,
1659                                                 start.line, start.column);
1660
1661       cpp_string_location_reader loc_reader (start_loc, line_table);
1662       loc_readers.safe_push (loc_reader);
1663     }
1664
1665   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1666   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1667                                                  loc_readers.address (),
1668                                                  num_locs, &ranges, type);
1669   if (err)
1670     return err;
1671
1672   /* Success: "ranges" should now contain information on the string.  */
1673   return NULL;
1674 }
1675
1676 /* Attempt to populate *OUT_LOC with source location information on the
1677    given characters within the string literal found at STRLOC.
1678    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1679    character set.
1680
1681    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1682    and string literal "012345\n789"
1683    *OUT_LOC is written to with:
1684      "012345\n789"
1685          ~^~~~~
1686
1687    If CONCATS is non-NULL, then any string literals that the token at
1688    STRLOC was concatenated with are also considered.
1689
1690    This is implemented by re-parsing the relevant source line(s).
1691
1692    Return NULL if successful, or an error message if any errors occurred.
1693    Error messages are intended for GCC developers (to help debugging) rather
1694    than for end-users.  */
1695
1696 const char *
1697 get_location_within_string (cpp_reader *pfile,
1698                             string_concat_db *concats,
1699                             location_t strloc,
1700                             enum cpp_ttype type,
1701                             int caret_idx, int start_idx, int end_idx,
1702                             location_t *out_loc)
1703 {
1704   gcc_checking_assert (caret_idx >= 0);
1705   gcc_checking_assert (start_idx >= 0);
1706   gcc_checking_assert (end_idx >= 0);
1707   gcc_assert (out_loc);
1708
1709   cpp_substring_ranges ranges;
1710   const char *err
1711     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1712   if (err)
1713     return err;
1714
1715   if (caret_idx >= ranges.get_num_ranges ())
1716     return "caret_idx out of range";
1717   if (start_idx >= ranges.get_num_ranges ())
1718     return "start_idx out of range";
1719   if (end_idx >= ranges.get_num_ranges ())
1720     return "end_idx out of range";
1721
1722   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1723                             ranges.get_range (start_idx).m_start,
1724                             ranges.get_range (end_idx).m_finish);
1725   return NULL;
1726 }
1727
1728 #if CHECKING_P
1729
1730 namespace selftest {
1731
1732 /* Selftests of location handling.  */
1733
1734 /* Attempt to populate *OUT_RANGE with source location information on the
1735    given character within the string literal found at STRLOC.
1736    CHAR_IDX refers to an offset within the execution character set.
1737    If CONCATS is non-NULL, then any string literals that the token at
1738    STRLOC was concatenated with are also considered.
1739
1740    This is implemented by re-parsing the relevant source line(s).
1741
1742    Return NULL if successful, or an error message if any errors occurred.
1743    Error messages are intended for GCC developers (to help debugging) rather
1744    than for end-users.  */
1745
1746 static const char *
1747 get_source_range_for_char (cpp_reader *pfile,
1748                            string_concat_db *concats,
1749                            location_t strloc,
1750                            enum cpp_ttype type,
1751                            int char_idx,
1752                            source_range *out_range)
1753 {
1754   gcc_checking_assert (char_idx >= 0);
1755   gcc_assert (out_range);
1756
1757   cpp_substring_ranges ranges;
1758   const char *err
1759     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1760   if (err)
1761     return err;
1762
1763   if (char_idx >= ranges.get_num_ranges ())
1764     return "char_idx out of range";
1765
1766   *out_range = ranges.get_range (char_idx);
1767   return NULL;
1768 }
1769
1770 /* As get_source_range_for_char, but write to *OUT the number
1771    of ranges that are available.  */
1772
1773 static const char *
1774 get_num_source_ranges_for_substring (cpp_reader *pfile,
1775                                      string_concat_db *concats,
1776                                      location_t strloc,
1777                                      enum cpp_ttype type,
1778                                      int *out)
1779 {
1780   gcc_assert (out);
1781
1782   cpp_substring_ranges ranges;
1783   const char *err
1784     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1785
1786   if (err)
1787     return err;
1788
1789   *out = ranges.get_num_ranges ();
1790   return NULL;
1791 }
1792
1793 /* Selftests of location handling.  */
1794
1795 /* Verify that compare() on linenum_type handles comparisons over the full
1796    range of the type.  */
1797
1798 static void
1799 test_linenum_comparisons ()
1800 {
1801   linenum_type min_line (0);
1802   linenum_type max_line (0xffffffff);
1803   ASSERT_EQ (0, compare (min_line, min_line));
1804   ASSERT_EQ (0, compare (max_line, max_line));
1805
1806   ASSERT_GT (compare (max_line, min_line), 0);
1807   ASSERT_LT (compare (min_line, max_line), 0);
1808 }
1809
1810 /* Helper function for verifying location data: when location_t
1811    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1812    as having column 0.  */
1813
1814 static bool
1815 should_have_column_data_p (location_t loc)
1816 {
1817   if (IS_ADHOC_LOC (loc))
1818     loc = get_location_from_adhoc_loc (line_table, loc);
1819   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1820     return false;
1821   return true;
1822 }
1823
1824 /* Selftest for should_have_column_data_p.  */
1825
1826 static void
1827 test_should_have_column_data_p ()
1828 {
1829   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1830   ASSERT_TRUE
1831     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1832   ASSERT_FALSE
1833     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1834 }
1835
1836 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1837    on LOC.  */
1838
1839 static void
1840 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1841               location_t loc)
1842 {
1843   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1844   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1845   /* If location_t values are sufficiently high, then column numbers
1846      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1847      When close to the threshold, column numbers *may* be present: if
1848      the final linemap before the threshold contains a line that straddles
1849      the threshold, locations in that line have column information.  */
1850   if (should_have_column_data_p (loc))
1851     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1852 }
1853
/* Many of the selftests build a line table and populate it with one or
   more line maps.

   To get the most out of them we want to run each such test under a
   range of conditions:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for location_t beyond which line-map.c changes behavior
   (disabling of the range-packing optimization, disabling of
   column-tracking).  Starting the line_table at interesting values at
   or near these thresholds exercises them.

   An instance of the following class describes one cell of that test
   matrix.  */

class line_table_case
{
public:
  /* Record the range-bits setting and the starting location for this
     case.  */
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* Location at which to start the line_table; zero leaves the
     line_table's own starting values untouched.  */
  int m_base_location;
};
1881
1882 /* Constructor.  Store the old value of line_table, and create a new
1883    one, using sane defaults.  */
1884
1885 line_table_test::line_table_test ()
1886 {
1887   gcc_assert (saved_line_table == NULL);
1888   saved_line_table = line_table;
1889   line_table = ggc_alloc<line_maps> ();
1890   linemap_init (line_table, BUILTINS_LOCATION);
1891   gcc_assert (saved_line_table->reallocator);
1892   line_table->reallocator = saved_line_table->reallocator;
1893   gcc_assert (saved_line_table->round_alloc_size);
1894   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1895   line_table->default_range_bits = 0;
1896 }
1897
1898 /* Constructor.  Store the old value of line_table, and create a new
1899    one, using the sitation described in CASE_.  */
1900
1901 line_table_test::line_table_test (const line_table_case &case_)
1902 {
1903   gcc_assert (saved_line_table == NULL);
1904   saved_line_table = line_table;
1905   line_table = ggc_alloc<line_maps> ();
1906   linemap_init (line_table, BUILTINS_LOCATION);
1907   gcc_assert (saved_line_table->reallocator);
1908   line_table->reallocator = saved_line_table->reallocator;
1909   gcc_assert (saved_line_table->round_alloc_size);
1910   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1911   line_table->default_range_bits = case_.m_default_range_bits;
1912   if (case_.m_base_location)
1913     {
1914       line_table->highest_location = case_.m_base_location;
1915       line_table->highest_line = case_.m_base_location;
1916     }
1917 }
1918
1919 /* Destructor.  Restore the old value of line_table.  */
1920
1921 line_table_test::~line_table_test ()
1922 {
1923   gcc_assert (saved_line_table != NULL);
1924   line_table = saved_line_table;
1925   saved_line_table = NULL;
1926 }
1927
1928 /* Verify basic operation of ordinary linemaps.  */
1929
1930 static void
1931 test_accessing_ordinary_linemaps (const line_table_case &case_)
1932 {
1933   line_table_test ltt (case_);
1934
1935   /* Build a simple linemap describing some locations. */
1936   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1937
1938   linemap_line_start (line_table, 1, 100);
1939   location_t loc_a = linemap_position_for_column (line_table, 1);
1940   location_t loc_b = linemap_position_for_column (line_table, 23);
1941
1942   linemap_line_start (line_table, 2, 100);
1943   location_t loc_c = linemap_position_for_column (line_table, 1);
1944   location_t loc_d = linemap_position_for_column (line_table, 17);
1945
1946   /* Example of a very long line.  */
1947   linemap_line_start (line_table, 3, 2000);
1948   location_t loc_e = linemap_position_for_column (line_table, 700);
1949
1950   /* Transitioning back to a short line.  */
1951   linemap_line_start (line_table, 4, 0);
1952   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1953
1954   if (should_have_column_data_p (loc_back_to_short))
1955     {
1956       /* Verify that we switched to short lines in the linemap.  */
1957       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1958       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1959     }
1960
1961   /* Example of a line that will eventually be seen to be longer
1962      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
1963      below that.  */
1964   linemap_line_start (line_table, 5, 2000);
1965
1966   location_t loc_start_of_very_long_line
1967     = linemap_position_for_column (line_table, 2000);
1968   location_t loc_too_wide
1969     = linemap_position_for_column (line_table, 4097);
1970   location_t loc_too_wide_2
1971     = linemap_position_for_column (line_table, 4098);
1972
1973   /* ...and back to a sane line length.  */
1974   linemap_line_start (line_table, 6, 100);
1975   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
1976
1977   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1978
1979   /* Multiple files.  */
1980   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1981   linemap_line_start (line_table, 1, 200);
1982   location_t loc_f = linemap_position_for_column (line_table, 150);
1983   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1984
1985   /* Verify that we can recover the location info.  */
1986   assert_loceq ("foo.c", 1, 1, loc_a);
1987   assert_loceq ("foo.c", 1, 23, loc_b);
1988   assert_loceq ("foo.c", 2, 1, loc_c);
1989   assert_loceq ("foo.c", 2, 17, loc_d);
1990   assert_loceq ("foo.c", 3, 700, loc_e);
1991   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1992
1993   /* In the very wide line, the initial location should be fully tracked.  */
1994   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
1995   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
1996      be disabled.  */
1997   assert_loceq ("foo.c", 5, 0, loc_too_wide);
1998   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
1999   /*...and column-tracking should be re-enabled for subsequent lines.  */
2000   assert_loceq ("foo.c", 6, 10, loc_sane_again);
2001
2002   assert_loceq ("bar.c", 1, 150, loc_f);
2003
2004   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2005   ASSERT_TRUE (pure_location_p (line_table, loc_a));
2006
2007   /* Verify using make_location to build a range, and extracting data
2008      back from it.  */
2009   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2010   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2011   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2012   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2013   ASSERT_EQ (loc_b, src_range.m_start);
2014   ASSERT_EQ (loc_d, src_range.m_finish);
2015 }
2016
2017 /* Verify various properties of UNKNOWN_LOCATION.  */
2018
2019 static void
2020 test_unknown_location ()
2021 {
2022   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2023   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2024   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2025 }
2026
2027 /* Verify various properties of BUILTINS_LOCATION.  */
2028
2029 static void
2030 test_builtins ()
2031 {
2032   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
2033   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2034 }
2035
2036 /* Regression test for make_location.
2037    Ensure that we use pure locations for the start/finish of the range,
2038    rather than storing a packed or ad-hoc range as the start/finish.  */
2039
2040 static void
2041 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2042 {
2043   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2044      with C++ frontend.
2045      ....................0000000001111111111222.
2046      ....................1234567890123456789012.  */
2047   const char *content = "     r += !aaa == bbb;\n";
2048   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2049   line_table_test ltt (case_);
2050   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2051
2052   const location_t c11 = linemap_position_for_column (line_table, 11);
2053   const location_t c12 = linemap_position_for_column (line_table, 12);
2054   const location_t c13 = linemap_position_for_column (line_table, 13);
2055   const location_t c14 = linemap_position_for_column (line_table, 14);
2056   const location_t c21 = linemap_position_for_column (line_table, 21);
2057
2058   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2059     return;
2060
2061   /* Use column 13 for the caret location, arbitrarily, to verify that we
2062      handle start != caret.  */
2063   const location_t aaa = make_location (c13, c12, c14);
2064   ASSERT_EQ (c13, get_pure_location (aaa));
2065   ASSERT_EQ (c12, get_start (aaa));
2066   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2067   ASSERT_EQ (c14, get_finish (aaa));
2068   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2069
2070   /* Make a location using a location with a range as the start-point.  */
2071   const location_t not_aaa = make_location (c11, aaa, c14);
2072   ASSERT_EQ (c11, get_pure_location (not_aaa));
2073   /* It should use the start location of the range, not store the range
2074      itself.  */
2075   ASSERT_EQ (c12, get_start (not_aaa));
2076   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2077   ASSERT_EQ (c14, get_finish (not_aaa));
2078   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2079
2080   /* Similarly, make a location with a range as the end-point.  */
2081   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2082   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2083   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2084   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2085   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2086   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2087   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2088   /* It should use the finish location of the range, not store the range
2089      itself.  */
2090   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2091   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2092   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2093   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2094   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2095 }
2096
2097 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
2098
2099 static void
2100 test_reading_source_line ()
2101 {
2102   /* Create a tempfile and write some text to it.  */
2103   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2104                         "01234567890123456789\n"
2105                         "This is the test text\n"
2106                         "This is the 3rd line");
2107
2108   /* Read back a specific line from the tempfile.  */
2109   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2110   ASSERT_TRUE (source_line);
2111   ASSERT_TRUE (source_line.get_buffer () != NULL);
2112   ASSERT_EQ (20, source_line.length ());
2113   ASSERT_TRUE (!strncmp ("This is the 3rd line",
2114                          source_line.get_buffer (), source_line.length ()));
2115
2116   source_line = location_get_source_line (tmp.get_filename (), 2);
2117   ASSERT_TRUE (source_line);
2118   ASSERT_TRUE (source_line.get_buffer () != NULL);
2119   ASSERT_EQ (21, source_line.length ());
2120   ASSERT_TRUE (!strncmp ("This is the test text",
2121                          source_line.get_buffer (), source_line.length ()));
2122
2123   source_line = location_get_source_line (tmp.get_filename (), 4);
2124   ASSERT_FALSE (source_line);
2125   ASSERT_TRUE (source_line.get_buffer () == NULL);
2126 }
2127
2128 /* Tests of lexing.  */
2129
/* Assert that cpp_token_as_text for token TOK of PARSER gives
   EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    const char *tok_text_                                               \
      = (const char *)cpp_token_as_text ((PARSER), (TOK));              \
    ASSERT_STREQ ((EXPECTED_TEXT), tok_text_);                          \
  SELFTEST_END_STMT
2138
2139 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2140    and ranges from EXP_START_COL to EXP_FINISH_COL.
2141    Use LOC as the effective location of the selftest.  */
2142
2143 static void
2144 assert_token_loc_eq (const location &loc,
2145                      const cpp_token *tok,
2146                      const char *exp_filename, int exp_linenum,
2147                      int exp_start_col, int exp_finish_col)
2148 {
2149   location_t tok_loc = tok->src_loc;
2150   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2151   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2152
2153   /* If location_t values are sufficiently high, then column numbers
2154      will be unavailable.  */
2155   if (!should_have_column_data_p (tok_loc))
2156     return;
2157
2158   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2159   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2160   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2161   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2162 }
2163
/* Wrapper around assert_token_loc_eq for checking TOK->src_loc, with
   SELFTEST_LOCATION supplied as the effective location of the
   selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
                            EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION,                               \
                       (TOK),                                           \
                       (EXP_FILENAME), (EXP_LINENUM),                   \
                       (EXP_START_COL), (EXP_FINISH_COL))
2171
2172 /* Test of lexing a file using libcpp, verifying tokens and their
2173    location information.  */
2174
2175 static void
2176 test_lexer (const line_table_case &case_)
2177 {
2178   /* Create a tempfile and write some text to it.  */
2179   const char *content =
2180     /*00000000011111111112222222222333333.3333444444444.455555555556
2181       12345678901234567890123456789012345.6789012345678.901234567890.  */
2182     ("test_name /* c-style comment */\n"
2183      "                                  \"test literal\"\n"
2184      " // test c++-style comment\n"
2185      "   42\n");
2186   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2187
2188   line_table_test ltt (case_);
2189
2190   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2191
2192   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2193   ASSERT_NE (fname, NULL);
2194
2195   /* Verify that we get the expected tokens back, with the correct
2196      location information.  */
2197
2198   location_t loc;
2199   const cpp_token *tok;
2200   tok = cpp_get_token_with_location (parser, &loc);
2201   ASSERT_NE (tok, NULL);
2202   ASSERT_EQ (tok->type, CPP_NAME);
2203   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2204   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2205
2206   tok = cpp_get_token_with_location (parser, &loc);
2207   ASSERT_NE (tok, NULL);
2208   ASSERT_EQ (tok->type, CPP_STRING);
2209   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2210   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2211
2212   tok = cpp_get_token_with_location (parser, &loc);
2213   ASSERT_NE (tok, NULL);
2214   ASSERT_EQ (tok->type, CPP_NUMBER);
2215   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2216   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2217
2218   tok = cpp_get_token_with_location (parser, &loc);
2219   ASSERT_NE (tok, NULL);
2220   ASSERT_EQ (tok->type, CPP_EOF);
2221
2222   cpp_finish (parser, NULL);
2223   cpp_destroy (parser);
2224 }
2225
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* Abstract base class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor, giving
   the subclass an opportunity to adjust the test that is passed to it.  */

class lexer_test_options
{
 public:
  /* This is a polymorphic base class, so give it a virtual destructor:
     destroying a subclass instance through a lexer_test_options pointer
     is then well-defined.  */
  virtual ~lexer_test_options () {}

  virtual void apply (lexer_test &) = 0;
};
2239
2240 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2241    in its dtor.
2242
2243    This is needed by struct lexer_test to ensure that the cleanup of the
2244    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2245
2246 class cpp_reader_ptr
2247 {
2248  public:
2249   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2250
2251   ~cpp_reader_ptr ()
2252   {
2253     cpp_finish (m_ptr, NULL);
2254     cpp_destroy (m_ptr);
2255   }
2256
2257   operator cpp_reader * () const { return m_ptr; }
2258
2259  private:
2260   cpp_reader *m_ptr;
2261 };
2262
2263 /* A struct for writing lexer tests.  */
2264
2265 class lexer_test
2266 {
2267 public:
2268   lexer_test (const line_table_case &case_, const char *content,
2269               lexer_test_options *options);
2270   ~lexer_test ();
2271
2272   const cpp_token *get_token ();
2273
2274   /* The ordering of these fields matters.
2275      The line_table_test must be first, since the cpp_reader_ptr
2276      uses it.
2277      The cpp_reader must be cleaned up *after* the temp_source_file
2278      since the filenames in input.c's input cache are owned by the
2279      cpp_reader; in particular, when ~temp_source_file evicts the
2280      filename the filenames must still be alive.  */
2281   line_table_test m_ltt;
2282   cpp_reader_ptr m_parser;
2283   temp_source_file m_tempfile;
2284   string_concat_db m_concats;
2285   bool m_implicitly_expect_EOF;
2286 };
2287
2288 /* Use an EBCDIC encoding for the execution charset, specifically
2289    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2290
2291    This exercises iconv integration within libcpp.
2292    Not every build of iconv supports the given charset,
2293    so we need to flag this error and handle it gracefully.  */
2294
2295 class ebcdic_execution_charset : public lexer_test_options
2296 {
2297  public:
2298   ebcdic_execution_charset () : m_num_iconv_errors (0)
2299     {
2300       gcc_assert (s_singleton == NULL);
2301       s_singleton = this;
2302     }
2303   ~ebcdic_execution_charset ()
2304     {
2305       gcc_assert (s_singleton == this);
2306       s_singleton = NULL;
2307     }
2308
2309   void apply (lexer_test &test) FINAL OVERRIDE
2310   {
2311     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2312     cpp_opts->narrow_charset = "IBM1047";
2313
2314     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2315     callbacks->diagnostic = on_diagnostic;
2316   }
2317
2318   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2319                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2320                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2321                              rich_location *richloc ATTRIBUTE_UNUSED,
2322                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2323     ATTRIBUTE_FPTR_PRINTF(5,0)
2324   {
2325     gcc_assert (s_singleton);
2326     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2327     const char *msg = "conversion from %s to %s not supported by iconv";
2328 #ifdef ENABLE_NLS
2329     msg = dgettext ("cpplib", msg);
2330 #endif
2331     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2332        when the local iconv build doesn't support the conversion.  */
2333     if (strcmp (msgid, msg) == 0)
2334       {
2335         s_singleton->m_num_iconv_errors++;
2336         return true;
2337       }
2338
2339     /* Otherwise, we have an unexpected error.  */
2340     abort ();
2341   }
2342
2343   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2344
2345  private:
2346   static ebcdic_execution_charset *s_singleton;
2347   int m_num_iconv_errors;
2348 };
2349
2350 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2351
2352 /* A lexer_test_options subclass that records a list of diagnostic
2353    messages emitted by the lexer.  */
2354
2355 class lexer_diagnostic_sink : public lexer_test_options
2356 {
2357  public:
2358   lexer_diagnostic_sink ()
2359   {
2360     gcc_assert (s_singleton == NULL);
2361     s_singleton = this;
2362   }
2363   ~lexer_diagnostic_sink ()
2364   {
2365     gcc_assert (s_singleton == this);
2366     s_singleton = NULL;
2367
2368     int i;
2369     char *str;
2370     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2371       free (str);
2372   }
2373
2374   void apply (lexer_test &test) FINAL OVERRIDE
2375   {
2376     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2377     callbacks->diagnostic = on_diagnostic;
2378   }
2379
2380   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2381                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2382                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2383                              rich_location *richloc ATTRIBUTE_UNUSED,
2384                              const char *msgid, va_list *ap)
2385     ATTRIBUTE_FPTR_PRINTF(5,0)
2386   {
2387     char *msg = xvasprintf (msgid, *ap);
2388     s_singleton->m_diagnostics.safe_push (msg);
2389     return true;
2390   }
2391
2392   auto_vec<char *> m_diagnostics;
2393
2394  private:
2395   static lexer_diagnostic_sink *s_singleton;
2396 };
2397
2398 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2399
2400 /* Constructor.  Override line_table with a new instance based on CASE_,
2401    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2402    start parsing the tempfile.  */
2403
2404 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2405                         lexer_test_options *options)
2406 : m_ltt (case_),
2407   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2408   /* Create a tempfile and write the text to it.  */
2409   m_tempfile (SELFTEST_LOCATION, ".c", content),
2410   m_concats (),
2411   m_implicitly_expect_EOF (true)
2412 {
2413   if (options)
2414     options->apply (*this);
2415
2416   cpp_init_iconv (m_parser);
2417
2418   /* Parse the file.  */
2419   const char *fname = cpp_read_main_file (m_parser,
2420                                           m_tempfile.get_filename ());
2421   ASSERT_NE (fname, NULL);
2422 }
2423
2424 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2425
2426 lexer_test::~lexer_test ()
2427 {
2428   location_t loc;
2429   const cpp_token *tok;
2430
2431   if (m_implicitly_expect_EOF)
2432     {
2433       tok = cpp_get_token_with_location (m_parser, &loc);
2434       ASSERT_NE (tok, NULL);
2435       ASSERT_EQ (tok->type, CPP_EOF);
2436     }
2437 }
2438
2439 /* Get the next token from m_parser.  */
2440
2441 const cpp_token *
2442 lexer_test::get_token ()
2443 {
2444   location_t loc;
2445   const cpp_token *tok;
2446
2447   tok = cpp_get_token_with_location (m_parser, &loc);
2448   ASSERT_NE (tok, NULL);
2449   return tok;
2450 }
2451
2452 /* Verify that locations within string literals are correctly handled.  */
2453
2454 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2455    using the string concatenation database for TEST.
2456
2457    Assert that the character at index IDX is on EXPECTED_LINE,
2458    and that it begins at column EXPECTED_START_COL and ends at
2459    EXPECTED_FINISH_COL (unless the locations are beyond
2460    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2461    columns).  */
2462
2463 static void
2464 assert_char_at_range (const location &loc,
2465                       lexer_test& test,
2466                       location_t strloc, enum cpp_ttype type, int idx,
2467                       int expected_line, int expected_start_col,
2468                       int expected_finish_col)
2469 {
2470   cpp_reader *pfile = test.m_parser;
2471   string_concat_db *concats = &test.m_concats;
2472
2473   source_range actual_range = source_range();
2474   const char *err
2475     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2476                                  &actual_range);
2477   if (should_have_column_data_p (strloc))
2478     ASSERT_EQ_AT (loc, NULL, err);
2479   else
2480     {
2481       ASSERT_STREQ_AT (loc,
2482                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2483                        err);
2484       return;
2485     }
2486
2487   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2488   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2489   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2490   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2491
2492   if (should_have_column_data_p (actual_range.m_start))
2493     {
2494       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2495       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2496     }
2497   if (should_have_column_data_p (actual_range.m_finish))
2498     {
2499       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2500       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2501     }
2502 }
2503
/* Convenience wrapper around assert_char_at_range: forwards all of its
   arguments and passes SELFTEST_LOCATION as the location at which any
   failure is reported.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST_, STR_LOC_, TTYPE_, CHAR_IDX_,    \
                             WANT_LINE_, WANT_START_COL_,                 \
                             WANT_FINISH_COL_)                            \
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST_), (STR_LOC_),     \
                        (TTYPE_), (CHAR_IDX_), (WANT_LINE_),              \
                        (WANT_START_COL_), (WANT_FINISH_COL_))
2512
2513 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2514    using the string concatenation database for TEST.
2515
2516    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2517
2518 static void
2519 assert_num_substring_ranges (const location &loc,
2520                              lexer_test& test,
2521                              location_t strloc,
2522                              enum cpp_ttype type,
2523                              int expected_num_ranges)
2524 {
2525   cpp_reader *pfile = test.m_parser;
2526   string_concat_db *concats = &test.m_concats;
2527
2528   int actual_num_ranges = -1;
2529   const char *err
2530     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2531                                            &actual_num_ranges);
2532   if (should_have_column_data_p (strloc))
2533     ASSERT_EQ_AT (loc, NULL, err);
2534   else
2535     {
2536       ASSERT_STREQ_AT (loc,
2537                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2538                        err);
2539       return;
2540     }
2541   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2542 }
2543
/* Convenience wrapper around assert_num_substring_ranges: forwards all
   of its arguments and passes SELFTEST_LOCATION as the location at
   which any failure is reported.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST_, STR_LOC_, TTYPE_,        \
                                    WANT_NUM_RANGES_)                     \
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST_),          \
                               (STR_LOC_), (TTYPE_), (WANT_NUM_RANGES_))
2551
2552
2553 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2554    returns an error (using the string concatenation database for TEST).  */
2555
2556 static void
2557 assert_has_no_substring_ranges (const location &loc,
2558                                 lexer_test& test,
2559                                 location_t strloc,
2560                                 enum cpp_ttype type,
2561                                 const char *expected_err)
2562 {
2563   cpp_reader *pfile = test.m_parser;
2564   string_concat_db *concats = &test.m_concats;
2565   cpp_substring_ranges ranges;
2566   const char *actual_err
2567     = get_substring_ranges_for_loc (pfile, concats, strloc,
2568                                     type, ranges);
2569   if (should_have_column_data_p (strloc))
2570     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2571   else
2572     ASSERT_STREQ_AT (loc,
2573                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2574                      actual_err);
2575 }
2576
/* Convenience wrapper around assert_has_no_substring_ranges: forwards
   all of its arguments and passes SELFTEST_LOCATION as the location at
   which any failure is reported.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST_, STR_LOC_, TTYPE_,     \
                                       WANT_ERR_)                         \
  assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST_),       \
                                  (STR_LOC_), (TTYPE_), (WANT_ERR_))
2580
2581 /* Lex a simple string literal.  Verify the substring location data, before
2582    and after running cpp_interpret_string on it.  */
2583
2584 static void
2585 test_lexer_string_locations_simple (const line_table_case &case_)
2586 {
2587   /* Digits 0-9 (with 0 at column 10), the simple way.
2588      ....................000000000.11111111112.2222222223333333333
2589      ....................123456789.01234567890.1234567890123456789
2590      We add a trailing comment to ensure that we correctly locate
2591      the end of the string literal token.  */
2592   const char *content = "        \"0123456789\" /* not a string */\n";
2593   lexer_test test (case_, content, NULL);
2594
2595   /* Verify that we get the expected token back, with the correct
2596      location information.  */
2597   const cpp_token *tok = test.get_token ();
2598   ASSERT_EQ (tok->type, CPP_STRING);
2599   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2600   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2601
2602   /* At this point in lexing, the quote characters are treated as part of
2603      the string (they are stripped off by cpp_interpret_string).  */
2604
2605   ASSERT_EQ (tok->val.str.len, 12);
2606
2607   /* Verify that cpp_interpret_string works.  */
2608   cpp_string dst_string;
2609   const enum cpp_ttype type = CPP_STRING;
2610   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2611                                       &dst_string, type);
2612   ASSERT_TRUE (result);
2613   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2614   free (const_cast <unsigned char *> (dst_string.text));
2615
2616   /* Verify ranges of individual characters.  This no longer includes the
2617      opening quote, but does include the closing quote.  */
2618   for (int i = 0; i <= 10; i++)
2619     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2620                           10 + i, 10 + i);
2621
2622   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2623 }
2624
2625 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2626    encoding.  */
2627
2628 static void
2629 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2630 {
2631   /* EBCDIC support requires iconv.  */
2632   if (!HAVE_ICONV)
2633     return;
2634
2635   /* Digits 0-9 (with 0 at column 10), the simple way.
2636      ....................000000000.11111111112.2222222223333333333
2637      ....................123456789.01234567890.1234567890123456789
2638      We add a trailing comment to ensure that we correctly locate
2639      the end of the string literal token.  */
2640   const char *content = "        \"0123456789\" /* not a string */\n";
2641   ebcdic_execution_charset use_ebcdic;
2642   lexer_test test (case_, content, &use_ebcdic);
2643
2644   /* Verify that we get the expected token back, with the correct
2645      location information.  */
2646   const cpp_token *tok = test.get_token ();
2647   ASSERT_EQ (tok->type, CPP_STRING);
2648   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2649   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2650
2651   /* At this point in lexing, the quote characters are treated as part of
2652      the string (they are stripped off by cpp_interpret_string).  */
2653
2654   ASSERT_EQ (tok->val.str.len, 12);
2655
2656   /* The remainder of the test requires an iconv implementation that
2657      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2658   if (use_ebcdic.iconv_errors_occurred_p ())
2659     return;
2660
2661   /* Verify that cpp_interpret_string works.  */
2662   cpp_string dst_string;
2663   const enum cpp_ttype type = CPP_STRING;
2664   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2665                                       &dst_string, type);
2666   ASSERT_TRUE (result);
2667   /* We should now have EBCDIC-encoded text, specifically
2668      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2669      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2670   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2671                 (const char *)dst_string.text);
2672   free (const_cast <unsigned char *> (dst_string.text));
2673
2674   /* Verify that we don't attempt to record substring location information
2675      for such cases.  */
2676   ASSERT_HAS_NO_SUBSTRING_RANGES
2677     (test, tok->src_loc, type,
2678      "execution character set != source character set");
2679 }
2680
2681 /* Lex a string literal containing a hex-escaped character.
2682    Verify the substring location data, before and after running
2683    cpp_interpret_string on it.  */
2684
2685 static void
2686 test_lexer_string_locations_hex (const line_table_case &case_)
2687 {
2688   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2689      and with a space in place of digit 6, to terminate the escaped
2690      hex code.
2691      ....................000000000.111111.11112222.
2692      ....................123456789.012345.67890123.  */
2693   const char *content = "        \"01234\\x35 789\"\n";
2694   lexer_test test (case_, content, NULL);
2695
2696   /* Verify that we get the expected token back, with the correct
2697      location information.  */
2698   const cpp_token *tok = test.get_token ();
2699   ASSERT_EQ (tok->type, CPP_STRING);
2700   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2701   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2702
2703   /* At this point in lexing, the quote characters are treated as part of
2704      the string (they are stripped off by cpp_interpret_string).  */
2705   ASSERT_EQ (tok->val.str.len, 15);
2706
2707   /* Verify that cpp_interpret_string works.  */
2708   cpp_string dst_string;
2709   const enum cpp_ttype type = CPP_STRING;
2710   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2711                                       &dst_string, type);
2712   ASSERT_TRUE (result);
2713   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2714   free (const_cast <unsigned char *> (dst_string.text));
2715
2716   /* Verify ranges of individual characters.  This no longer includes the
2717      opening quote, but does include the closing quote.  */
2718   for (int i = 0; i <= 4; i++)
2719     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2720   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2721   for (int i = 6; i <= 10; i++)
2722     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2723
2724   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2725 }
2726
2727 /* Lex a string literal containing an octal-escaped character.
2728    Verify the substring location data after running cpp_interpret_string
2729    on it.  */
2730
2731 static void
2732 test_lexer_string_locations_oct (const line_table_case &case_)
2733 {
2734   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2735      and with a space in place of digit 6, to terminate the escaped
2736      octal code.
2737      ....................000000000.111111.11112222.2222223333333333444
2738      ....................123456789.012345.67890123.4567890123456789012  */
2739   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2740   lexer_test test (case_, content, NULL);
2741
2742   /* Verify that we get the expected token back, with the correct
2743      location information.  */
2744   const cpp_token *tok = test.get_token ();
2745   ASSERT_EQ (tok->type, CPP_STRING);
2746   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2747
2748   /* Verify that cpp_interpret_string works.  */
2749   cpp_string dst_string;
2750   const enum cpp_ttype type = CPP_STRING;
2751   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2752                                       &dst_string, type);
2753   ASSERT_TRUE (result);
2754   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2755   free (const_cast <unsigned char *> (dst_string.text));
2756
2757   /* Verify ranges of individual characters.  This no longer includes the
2758      opening quote, but does include the closing quote.  */
2759   for (int i = 0; i < 5; i++)
2760     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2761   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2762   for (int i = 6; i <= 10; i++)
2763     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2764
2765   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2766 }
2767
2768 /* Test of string literal containing letter escapes.  */
2769
2770 static void
2771 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2772 {
2773   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2774      .....................000000000.1.11111.1.1.11222.22222223333333
2775      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2776   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2777   lexer_test test (case_, content, NULL);
2778
2779   /* Verify that we get the expected tokens back.  */
2780   const cpp_token *tok = test.get_token ();
2781   ASSERT_EQ (tok->type, CPP_STRING);
2782   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2783
2784   /* Verify ranges of individual characters. */
2785   /* "\t".  */
2786   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2787                         0, 1, 10, 11);
2788   /* "foo". */
2789   for (int i = 1; i <= 3; i++)
2790     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2791                           i, 1, 11 + i, 11 + i);
2792   /* "\\" and "\n".  */
2793   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2794                         4, 1, 15, 16);
2795   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2796                         5, 1, 17, 18);
2797
2798   /* "bar" and closing quote for nul-terminator.  */
2799   for (int i = 6; i <= 9; i++)
2800     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2801                           i, 1, 13 + i, 13 + i);
2802
2803   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2804 }
2805
2806 /* Another test of a string literal containing a letter escape.
2807    Based on string seen in
2808      printf ("%-%\n");
2809    in gcc.dg/format/c90-printf-1.c.  */
2810
2811 static void
2812 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2813 {
2814   /* .....................000000000.1111.11.1111.22222222223.
2815      .....................123456789.0123.45.6789.01234567890.  */
2816   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2817   lexer_test test (case_, content, NULL);
2818
2819   /* Verify that we get the expected tokens back.  */
2820   const cpp_token *tok = test.get_token ();
2821   ASSERT_EQ (tok->type, CPP_STRING);
2822   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2823
2824   /* Verify ranges of individual characters. */
2825   /* "%-%".  */
2826   for (int i = 0; i < 3; i++)
2827     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2828                           i, 1, 10 + i, 10 + i);
2829   /* "\n".  */
2830   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2831                         3, 1, 13, 14);
2832
2833   /* Closing quote for nul-terminator.  */
2834   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2835                         4, 1, 15, 15);
2836
2837   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2838 }
2839
2840 /* Lex a string literal containing UCN 4 characters.
2841    Verify the substring location data after running cpp_interpret_string
2842    on it.  */
2843
2844 static void
2845 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2846 {
2847   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2848      as UCN 4.
2849      ....................000000000.111111.111122.222222223.33333333344444
2850      ....................123456789.012345.678901.234567890.12345678901234  */
2851   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2852   lexer_test test (case_, content, NULL);
2853
2854   /* Verify that we get the expected token back, with the correct
2855      location information.  */
2856   const cpp_token *tok = test.get_token ();
2857   ASSERT_EQ (tok->type, CPP_STRING);
2858   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2859
2860   /* Verify that cpp_interpret_string works.
2861      The string should be encoded in the execution character
2862      set.  Assuming that is UTF-8, we should have the following:
2863      -----------  ----  -----  -------  ----------------
2864      Byte offset  Byte  Octal  Unicode  Source Column(s)
2865      -----------  ----  -----  -------  ----------------
2866      0            0x30         '0'      10
2867      1            0x31         '1'      11
2868      2            0x32         '2'      12
2869      3            0x33         '3'      13
2870      4            0x34         '4'      14
2871      5            0xE2  \342   U+2174   15-20
2872      6            0x85  \205    (cont)  15-20
2873      7            0xB4  \264    (cont)  15-20
2874      8            0xE2  \342   U+2175   21-26
2875      9            0x85  \205    (cont)  21-26
2876      10           0xB5  \265    (cont)  21-26
2877      11           0x37         '7'      27
2878      12           0x38         '8'      28
2879      13           0x39         '9'      29
2880      14           0x00                  30 (closing quote)
2881      -----------  ----  -----  -------  ---------------.  */
2882
2883   cpp_string dst_string;
2884   const enum cpp_ttype type = CPP_STRING;
2885   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2886                                       &dst_string, type);
2887   ASSERT_TRUE (result);
2888   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2889                 (const char *)dst_string.text);
2890   free (const_cast <unsigned char *> (dst_string.text));
2891
2892   /* Verify ranges of individual characters.  This no longer includes the
2893      opening quote, but does include the closing quote.
2894      '01234'.  */
2895   for (int i = 0; i <= 4; i++)
2896     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2897   /* U+2174.  */
2898   for (int i = 5; i <= 7; i++)
2899     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2900   /* U+2175.  */
2901   for (int i = 8; i <= 10; i++)
2902     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2903   /* '789' and nul terminator  */
2904   for (int i = 11; i <= 14; i++)
2905     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2906
2907   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2908 }
2909
2910 /* Lex a string literal containing UCN 8 characters.
2911    Verify the substring location data after running cpp_interpret_string
2912    on it.  */
2913
2914 static void
2915 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2916 {
2917   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2918      ....................000000000.111111.1111222222.2222333333333.344444
2919      ....................123456789.012345.6789012345.6789012345678.901234  */
2920   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2921   lexer_test test (case_, content, NULL);
2922
2923   /* Verify that we get the expected token back, with the correct
2924      location information.  */
2925   const cpp_token *tok = test.get_token ();
2926   ASSERT_EQ (tok->type, CPP_STRING);
2927   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2928                            "\"01234\\U00002174\\U00002175789\"");
2929
2930   /* Verify that cpp_interpret_string works.
2931      The UTF-8 encoding of the string is identical to that from
2932      the ucn4 testcase above; the only difference is the column
2933      locations.  */
2934   cpp_string dst_string;
2935   const enum cpp_ttype type = CPP_STRING;
2936   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2937                                       &dst_string, type);
2938   ASSERT_TRUE (result);
2939   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2940                 (const char *)dst_string.text);
2941   free (const_cast <unsigned char *> (dst_string.text));
2942
2943   /* Verify ranges of individual characters.  This no longer includes the
2944      opening quote, but does include the closing quote.
2945      '01234'.  */
2946   for (int i = 0; i <= 4; i++)
2947     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2948   /* U+2174.  */
2949   for (int i = 5; i <= 7; i++)
2950     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2951   /* U+2175.  */
2952   for (int i = 8; i <= 10; i++)
2953     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2954   /* '789' at columns 35-37  */
2955   for (int i = 11; i <= 13; i++)
2956     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2957   /* Closing quote/nul-terminator at column 38.  */
2958   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2959
2960   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2961 }
2962
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   and return it as a value in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *octets
    = reinterpret_cast <const unsigned char *> (ptr_be_value);

  /* Accumulate from the most significant byte downwards.  */
  uint32_t host_value = 0;
  for (int i = 0; i < 4; i++)
    host_value = (host_value << 8) | octets[i];
  return host_value;
}
2974
2975 /* Lex a wide string literal and verify that attempts to read substring
2976    location data from it fail gracefully.  */
2977
2978 static void
2979 test_lexer_string_locations_wide_string (const line_table_case &case_)
2980 {
2981   /* Digits 0-9.
2982      ....................000000000.11111111112.22222222233333
2983      ....................123456789.01234567890.12345678901234  */
2984   const char *content = "       L\"0123456789\" /* non-str */\n";
2985   lexer_test test (case_, content, NULL);
2986
2987   /* Verify that we get the expected token back, with the correct
2988      location information.  */
2989   const cpp_token *tok = test.get_token ();
2990   ASSERT_EQ (tok->type, CPP_WSTRING);
2991   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2992
2993   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2994   cpp_string dst_string;
2995   const enum cpp_ttype type = CPP_WSTRING;
2996   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2997                                       &dst_string, type);
2998   ASSERT_TRUE (result);
2999   /* The cpp_reader defaults to big-endian with
3000      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3001      now be encoded as UTF-32BE.  */
3002   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3003   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3004   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3005   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3006   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3007   free (const_cast <unsigned char *> (dst_string.text));
3008
3009   /* We don't yet support generating substring location information
3010      for L"" strings.  */
3011   ASSERT_HAS_NO_SUBSTRING_RANGES
3012     (test, tok->src_loc, type,
3013      "execution character set != source character set");
3014 }
3015
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   and return it as a value in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *octets
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  const unsigned int high = octets[0];
  const unsigned int low = octets[1];
  return static_cast <uint16_t> ((high << 8) | low);
}
3024
3025 /* Lex a u"" string literal and verify that attempts to read substring
3026    location data from it fail gracefully.  */
3027
3028 static void
3029 test_lexer_string_locations_string16 (const line_table_case &case_)
3030 {
3031   /* Digits 0-9.
3032      ....................000000000.11111111112.22222222233333
3033      ....................123456789.01234567890.12345678901234  */
3034   const char *content = "       u\"0123456789\" /* non-str */\n";
3035   lexer_test test (case_, content, NULL);
3036
3037   /* Verify that we get the expected token back, with the correct
3038      location information.  */
3039   const cpp_token *tok = test.get_token ();
3040   ASSERT_EQ (tok->type, CPP_STRING16);
3041   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3042
3043   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3044   cpp_string dst_string;
3045   const enum cpp_ttype type = CPP_STRING16;
3046   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3047                                       &dst_string, type);
3048   ASSERT_TRUE (result);
3049
3050   /* The cpp_reader defaults to big-endian, so dst_string should
3051      now be encoded as UTF-16BE.  */
3052   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3053   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3054   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3055   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3056   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3057   free (const_cast <unsigned char *> (dst_string.text));
3058
3059   /* We don't yet support generating substring location information
3060      for L"" strings.  */
3061   ASSERT_HAS_NO_SUBSTRING_RANGES
3062     (test, tok->src_loc, type,
3063      "execution character set != source character set");
3064 }
3065
3066 /* Lex a U"" string literal and verify that attempts to read substring
3067    location data from it fail gracefully.  */
3068
3069 static void
3070 test_lexer_string_locations_string32 (const line_table_case &case_)
3071 {
3072   /* Digits 0-9.
3073      ....................000000000.11111111112.22222222233333
3074      ....................123456789.01234567890.12345678901234  */
3075   const char *content = "       U\"0123456789\" /* non-str */\n";
3076   lexer_test test (case_, content, NULL);
3077
3078   /* Verify that we get the expected token back, with the correct
3079      location information.  */
3080   const cpp_token *tok = test.get_token ();
3081   ASSERT_EQ (tok->type, CPP_STRING32);
3082   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3083
3084   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3085   cpp_string dst_string;
3086   const enum cpp_ttype type = CPP_STRING32;
3087   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3088                                       &dst_string, type);
3089   ASSERT_TRUE (result);
3090
3091   /* The cpp_reader defaults to big-endian, so dst_string should
3092      now be encoded as UTF-32BE.  */
3093   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3094   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3095   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3096   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3097   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3098   free (const_cast <unsigned char *> (dst_string.text));
3099
3100   /* We don't yet support generating substring location information
3101      for L"" strings.  */
3102   ASSERT_HAS_NO_SUBSTRING_RANGES
3103     (test, tok->src_loc, type,
3104      "execution character set != source character set");
3105 }
3106
3107 /* Lex a u8-string literal.
3108    Verify the substring location data after running cpp_interpret_string
3109    on it.  */
3110
3111 static void
3112 test_lexer_string_locations_u8 (const line_table_case &case_)
3113 {
3114   /* Digits 0-9.
3115      ....................000000000.11111111112.22222222233333
3116      ....................123456789.01234567890.12345678901234  */
3117   const char *content = "      u8\"0123456789\" /* non-str */\n";
3118   lexer_test test (case_, content, NULL);
3119
3120   /* Verify that we get the expected token back, with the correct
3121      location information.  */
3122   const cpp_token *tok = test.get_token ();
3123   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3124   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3125
3126   /* Verify that cpp_interpret_string works.  */
3127   cpp_string dst_string;
3128   const enum cpp_ttype type = CPP_STRING;
3129   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3130                                       &dst_string, type);
3131   ASSERT_TRUE (result);
3132   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3133   free (const_cast <unsigned char *> (dst_string.text));
3134
3135   /* Verify ranges of individual characters.  This no longer includes the
3136      opening quote, but does include the closing quote.  */
3137   for (int i = 0; i <= 10; i++)
3138     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3139 }
3140
3141 /* Lex a string literal containing UTF-8 source characters.
3142    Verify the substring location data after running cpp_interpret_string
3143    on it.  */
3144
3145 static void
3146 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3147 {
3148  /* This string literal is written out to the source file as UTF-8,
3149     and is of the form "before mojibake after", where "mojibake"
3150     is written as the following four unicode code points:
3151        U+6587 CJK UNIFIED IDEOGRAPH-6587
3152        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3153        U+5316 CJK UNIFIED IDEOGRAPH-5316
3154        U+3051 HIRAGANA LETTER KE.
3155      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3156      "before" and "after" are 1 byte per unicode character.
3157
3158      The numbering shown are "columns", which are *byte* numbers within
3159      the line, rather than unicode character numbers.
3160
3161      .................... 000000000.1111111.
3162      .................... 123456789.0123456.  */
3163   const char *content = ("        \"before "
3164                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3165                               UTF-8: 0xE6 0x96 0x87
3166                               C octal escaped UTF-8: \346\226\207
3167                             "column" numbers: 17-19.  */
3168                          "\346\226\207"
3169
3170                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3171                               UTF-8: 0xE5 0xAD 0x97
3172                               C octal escaped UTF-8: \345\255\227
3173                             "column" numbers: 20-22.  */
3174                          "\345\255\227"
3175
3176                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3177                               UTF-8: 0xE5 0x8C 0x96
3178                               C octal escaped UTF-8: \345\214\226
3179                             "column" numbers: 23-25.  */
3180                          "\345\214\226"
3181
3182                          /* U+3051 HIRAGANA LETTER KE
3183                               UTF-8: 0xE3 0x81 0x91
3184                               C octal escaped UTF-8: \343\201\221
3185                             "column" numbers: 26-28.  */
3186                          "\343\201\221"
3187
3188                          /* column numbers 29 onwards
3189                           2333333.33334444444444
3190                           9012345.67890123456789. */
3191                          " after\" /* non-str */\n");
3192   lexer_test test (case_, content, NULL);
3193
3194   /* Verify that we get the expected token back, with the correct
3195      location information.  */
3196   const cpp_token *tok = test.get_token ();
3197   ASSERT_EQ (tok->type, CPP_STRING);
3198   ASSERT_TOKEN_AS_TEXT_EQ
3199     (test.m_parser, tok,
3200      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3201
3202   /* Verify that cpp_interpret_string works.  */
3203   cpp_string dst_string;
3204   const enum cpp_ttype type = CPP_STRING;
3205   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3206                                       &dst_string, type);
3207   ASSERT_TRUE (result);
3208   ASSERT_STREQ
3209     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3210      (const char *)dst_string.text);
3211   free (const_cast <unsigned char *> (dst_string.text));
3212
3213   /* Verify ranges of individual characters.  This no longer includes the
3214      opening quote, but does include the closing quote.
3215      Assuming that both source and execution encodings are UTF-8, we have
3216      a run of 25 octets in each, plus the NUL terminator.  */
3217   for (int i = 0; i < 25; i++)
3218     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3219   /* NUL-terminator should use the closing quote at column 35.  */
3220   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3221
3222   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3223 }
3224
3225 /* Test of string literal concatenation.  */
3226
3227 static void
3228 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3229 {
3230   /* Digits 0-9.
3231      .....................000000000.111111.11112222222222
3232      .....................123456789.012345.67890123456789.  */
3233   const char *content = ("        \"01234\" /* non-str */\n"
3234                          "        \"56789\" /* non-str */\n");
3235   lexer_test test (case_, content, NULL);
3236
3237   location_t input_locs[2];
3238
3239   /* Verify that we get the expected tokens back.  */
3240   auto_vec <cpp_string> input_strings;
3241   const cpp_token *tok_a = test.get_token ();
3242   ASSERT_EQ (tok_a->type, CPP_STRING);
3243   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3244   input_strings.safe_push (tok_a->val.str);
3245   input_locs[0] = tok_a->src_loc;
3246
3247   const cpp_token *tok_b = test.get_token ();
3248   ASSERT_EQ (tok_b->type, CPP_STRING);
3249   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3250   input_strings.safe_push (tok_b->val.str);
3251   input_locs[1] = tok_b->src_loc;
3252
3253   /* Verify that cpp_interpret_string works.  */
3254   cpp_string dst_string;
3255   const enum cpp_ttype type = CPP_STRING;
3256   bool result = cpp_interpret_string (test.m_parser,
3257                                       input_strings.address (), 2,
3258                                       &dst_string, type);
3259   ASSERT_TRUE (result);
3260   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3261   free (const_cast <unsigned char *> (dst_string.text));
3262
3263   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3264   test.m_concats.record_string_concatenation (2, input_locs);
3265
3266   location_t initial_loc = input_locs[0];
3267
3268   /* "01234" on line 1.  */
3269   for (int i = 0; i <= 4; i++)
3270     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3271   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3272   for (int i = 5; i <= 10; i++)
3273     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3274
3275   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3276 }
3277
3278 /* Another test of string literal concatenation.  */
3279
3280 static void
3281 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3282 {
3283   /* Digits 0-9.
3284      .....................000000000.111.11111112222222
3285      .....................123456789.012.34567890123456.  */
3286   const char *content = ("        \"01\" /* non-str */\n"
3287                          "        \"23\" /* non-str */\n"
3288                          "        \"45\" /* non-str */\n"
3289                          "        \"67\" /* non-str */\n"
3290                          "        \"89\" /* non-str */\n");
3291   lexer_test test (case_, content, NULL);
3292
3293   auto_vec <cpp_string> input_strings;
3294   location_t input_locs[5];
3295
3296   /* Verify that we get the expected tokens back.  */
3297   for (int i = 0; i < 5; i++)
3298     {
3299       const cpp_token *tok = test.get_token ();
3300       ASSERT_EQ (tok->type, CPP_STRING);
3301       input_strings.safe_push (tok->val.str);
3302       input_locs[i] = tok->src_loc;
3303     }
3304
3305   /* Verify that cpp_interpret_string works.  */
3306   cpp_string dst_string;
3307   const enum cpp_ttype type = CPP_STRING;
3308   bool result = cpp_interpret_string (test.m_parser,
3309                                       input_strings.address (), 5,
3310                                       &dst_string, type);
3311   ASSERT_TRUE (result);
3312   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3313   free (const_cast <unsigned char *> (dst_string.text));
3314
3315   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3316   test.m_concats.record_string_concatenation (5, input_locs);
3317
3318   location_t initial_loc = input_locs[0];
3319
3320   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3321      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3322      and expect get_source_range_for_substring to fail.
3323      However, for a string concatenation test, we can have a case
3324      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3325      but subsequent strings can be after it.
3326      Attempting to detect this within assert_char_at_range
3327      would overcomplicate the logic for the common test cases, so
3328      we detect it here.  */
3329   if (should_have_column_data_p (input_locs[0])
3330       && !should_have_column_data_p (input_locs[4]))
3331     {
3332       /* Verify that get_source_range_for_substring gracefully rejects
3333          this case.  */
3334       source_range actual_range;
3335       const char *err
3336         = get_source_range_for_char (test.m_parser, &test.m_concats,
3337                                      initial_loc, type, 0, &actual_range);
3338       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3339       return;
3340     }
3341
3342   for (int i = 0; i < 5; i++)
3343     for (int j = 0; j < 2; j++)
3344       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3345                             i + 1, 10 + j, 10 + j);
3346
3347   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3348   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3349
3350   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3351 }
3352
3353 /* Another test of string literal concatenation, this time combined with
3354    various kinds of escaped characters.  */
3355
3356 static void
3357 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3358 {
3359   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3360      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3361   const char *content
3362     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3363        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3364     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3365   lexer_test test (case_, content, NULL);
3366
3367   auto_vec <cpp_string> input_strings;
3368   location_t input_locs[4];
3369
3370   /* Verify that we get the expected tokens back.  */
3371   for (int i = 0; i < 4; i++)
3372     {
3373       const cpp_token *tok = test.get_token ();
3374       ASSERT_EQ (tok->type, CPP_STRING);
3375       input_strings.safe_push (tok->val.str);
3376       input_locs[i] = tok->src_loc;
3377     }
3378
3379   /* Verify that cpp_interpret_string works.  */
3380   cpp_string dst_string;
3381   const enum cpp_ttype type = CPP_STRING;
3382   bool result = cpp_interpret_string (test.m_parser,
3383                                       input_strings.address (), 4,
3384                                       &dst_string, type);
3385   ASSERT_TRUE (result);
3386   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3387   free (const_cast <unsigned char *> (dst_string.text));
3388
3389   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3390   test.m_concats.record_string_concatenation (4, input_locs);
3391
3392   location_t initial_loc = input_locs[0];
3393
3394   for (int i = 0; i <= 4; i++)
3395     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3396   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3397   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3398   for (int i = 7; i <= 9; i++)
3399     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3400
3401   /* NUL-terminator should use the location of the final closing quote.  */
3402   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3403
3404   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3405 }
3406
3407 /* Test of string literal in a macro.  */
3408
3409 static void
3410 test_lexer_string_locations_macro (const line_table_case &case_)
3411 {
3412   /* Digits 0-9.
3413      .....................0000000001111111111.22222222223.
3414      .....................1234567890123456789.01234567890.  */
3415   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3416                          "  MACRO");
3417   lexer_test test (case_, content, NULL);
3418
3419   /* Verify that we get the expected tokens back.  */
3420   const cpp_token *tok = test.get_token ();
3421   ASSERT_EQ (tok->type, CPP_PADDING);
3422
3423   tok = test.get_token ();
3424   ASSERT_EQ (tok->type, CPP_STRING);
3425   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3426
3427   /* Verify ranges of individual characters.  We ought to
3428      see columns within the macro definition.  */
3429   for (int i = 0; i <= 10; i++)
3430     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3431                           i, 1, 20 + i, 20 + i);
3432
3433   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3434
3435   tok = test.get_token ();
3436   ASSERT_EQ (tok->type, CPP_PADDING);
3437 }
3438
3439 /* Test of stringification of a macro argument.  */
3440
3441 static void
3442 test_lexer_string_locations_stringified_macro_argument
3443   (const line_table_case &case_)
3444 {
3445   /* .....................000000000111111111122222222223.
3446      .....................123456789012345678901234567890.  */
3447   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3448                          "MACRO(foo)\n");
3449   lexer_test test (case_, content, NULL);
3450
3451   /* Verify that we get the expected token back.  */
3452   const cpp_token *tok = test.get_token ();
3453   ASSERT_EQ (tok->type, CPP_PADDING);
3454
3455   tok = test.get_token ();
3456   ASSERT_EQ (tok->type, CPP_STRING);
3457   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3458
3459   /* We don't support getting the location of a stringified macro
3460      argument.  Verify that it fails gracefully.  */
3461   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3462                                   "cpp_interpret_string_1 failed");
3463
3464   tok = test.get_token ();
3465   ASSERT_EQ (tok->type, CPP_PADDING);
3466
3467   tok = test.get_token ();
3468   ASSERT_EQ (tok->type, CPP_PADDING);
3469 }
3470
3471 /* Ensure that we fail gracefully if something attempts to pass
3472    in a location that isn't a string literal token.  Seen on this code:
3473
3474      const char a[] = " %d ";
3475      __builtin_printf (a, 0.5);
3476                        ^
3477
3478    when c-format.c erroneously used the indicated one-character
3479    location as the format string location, leading to a read past the
3480    end of a string buffer in cpp_interpret_string_1.  */
3481
3482 static void
3483 test_lexer_string_locations_non_string (const line_table_case &case_)
3484 {
3485   /* .....................000000000111111111122222222223.
3486      .....................123456789012345678901234567890.  */
3487   const char *content = ("         a\n");
3488   lexer_test test (case_, content, NULL);
3489
3490   /* Verify that we get the expected token back.  */
3491   const cpp_token *tok = test.get_token ();
3492   ASSERT_EQ (tok->type, CPP_NAME);
3493   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3494
3495   /* At this point, libcpp is attempting to interpret the name as a
3496      string literal, despite it not starting with a quote.  We don't detect
3497      that, but we should at least fail gracefully.  */
3498   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3499                                   "cpp_interpret_string_1 failed");
3500 }
3501
3502 /* Ensure that we can read substring information for a token which
3503    starts in one linemap and ends in another.  Adapted from
3504    gcc.dg/cpp/pr69985.c.  */
3505
3506 static void
3507 test_lexer_string_locations_long_line (const line_table_case &case_)
3508 {
3509   /* .....................000000.000111111111
3510      .....................123456.789012346789.  */
3511   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3512                          "     \"0123456789012345678901234567890123456789"
3513                          "0123456789012345678901234567890123456789"
3514                          "0123456789012345678901234567890123456789"
3515                          "0123456789\"\n");
3516
3517   lexer_test test (case_, content, NULL);
3518
3519   /* Verify that we get the expected token back.  */
3520   const cpp_token *tok = test.get_token ();
3521   ASSERT_EQ (tok->type, CPP_STRING);
3522
3523   if (!should_have_column_data_p (line_table->highest_location))
3524     return;
3525
3526   /* Verify ranges of individual characters.  */
3527   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3528   for (int i = 0; i < 131; i++)
3529     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3530                           i, 2, 7 + i, 7 + i);
3531 }
3532
3533 /* Test of locations within a raw string that doesn't contain a newline.  */
3534
3535 static void
3536 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3537 {
3538   /* .....................00.0000000111111111122.
3539      .....................12.3456789012345678901.  */
3540   const char *content = ("R\"foo(0123456789)foo\"\n");
3541   lexer_test test (case_, content, NULL);
3542
3543   /* Verify that we get the expected token back.  */
3544   const cpp_token *tok = test.get_token ();
3545   ASSERT_EQ (tok->type, CPP_STRING);
3546
3547   /* Verify that cpp_interpret_string works.  */
3548   cpp_string dst_string;
3549   const enum cpp_ttype type = CPP_STRING;
3550   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3551                                       &dst_string, type);
3552   ASSERT_TRUE (result);
3553   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3554   free (const_cast <unsigned char *> (dst_string.text));
3555
3556   if (!should_have_column_data_p (line_table->highest_location))
3557     return;
3558
3559   /* 0-9, plus the nil terminator.  */
3560   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3561   for (int i = 0; i < 11; i++)
3562     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3563                           i, 1, 7 + i, 7 + i);
3564 }
3565
3566 /* Test of locations within a raw string that contains a newline.  */
3567
3568 static void
3569 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3570 {
3571   /* .....................00.0000.
3572      .....................12.3456.  */
3573   const char *content = ("R\"foo(\n"
3574   /* .....................00000.
3575      .....................12345.  */
3576                          "hello\n"
3577                          "world\n"
3578   /* .....................00000.
3579      .....................12345.  */
3580                          ")foo\"\n");
3581   lexer_test test (case_, content, NULL);
3582
3583   /* Verify that we get the expected token back.  */
3584   const cpp_token *tok = test.get_token ();
3585   ASSERT_EQ (tok->type, CPP_STRING);
3586
3587   /* Verify that cpp_interpret_string works.  */
3588   cpp_string dst_string;
3589   const enum cpp_ttype type = CPP_STRING;
3590   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3591                                       &dst_string, type);
3592   ASSERT_TRUE (result);
3593   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3594   free (const_cast <unsigned char *> (dst_string.text));
3595
3596   if (!should_have_column_data_p (line_table->highest_location))
3597     return;
3598
3599   /* Currently we don't support locations within raw strings that
3600      contain newlines.  */
3601   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3602                                   "range endpoints are on different lines");
3603 }
3604
3605 /* Test of parsing an unterminated raw string.  */
3606
3607 static void
3608 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3609 {
3610   const char *content = "R\"ouch()ouCh\" /* etc */";
3611
3612   lexer_diagnostic_sink diagnostics;
3613   lexer_test test (case_, content, &diagnostics);
3614   test.m_implicitly_expect_EOF = false;
3615
3616   /* Attempt to parse the raw string.  */
3617   const cpp_token *tok = test.get_token ();
3618   ASSERT_EQ (tok->type, CPP_EOF);
3619
3620   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3621   /* We expect the message "unterminated raw string"
3622      in the "cpplib" translation domain.
3623      It's not clear that dgettext is available on all supported hosts,
3624      so this assertion is commented-out for now.
3625        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3626                      diagnostics.m_diagnostics[0]);
3627   */
3628 }
3629
3630 /* Test of lexing char constants.  */
3631
3632 static void
3633 test_lexer_char_constants (const line_table_case &case_)
3634 {
3635   /* Various char constants.
3636      .....................0000000001111111111.22222222223.
3637      .....................1234567890123456789.01234567890.  */
3638   const char *content = ("         'a'\n"
3639                          "        u'a'\n"
3640                          "        U'a'\n"
3641                          "        L'a'\n"
3642                          "         'abc'\n");
3643   lexer_test test (case_, content, NULL);
3644
3645   /* Verify that we get the expected tokens back.  */
3646   /* 'a'.  */
3647   const cpp_token *tok = test.get_token ();
3648   ASSERT_EQ (tok->type, CPP_CHAR);
3649   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3650
3651   unsigned int chars_seen;
3652   int unsignedp;
3653   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3654                                           &chars_seen, &unsignedp);
3655   ASSERT_EQ (cc, 'a');
3656   ASSERT_EQ (chars_seen, 1);
3657
3658   /* u'a'.  */
3659   tok = test.get_token ();
3660   ASSERT_EQ (tok->type, CPP_CHAR16);
3661   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3662
3663   /* U'a'.  */
3664   tok = test.get_token ();
3665   ASSERT_EQ (tok->type, CPP_CHAR32);
3666   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3667
3668   /* L'a'.  */
3669   tok = test.get_token ();
3670   ASSERT_EQ (tok->type, CPP_WCHAR);
3671   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3672
3673   /* 'abc' (c-char-sequence).  */
3674   tok = test.get_token ();
3675   ASSERT_EQ (tok->type, CPP_CHAR);
3676   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3677 }
3678 /* A table of interesting location_t values, giving one axis of our test
3679    matrix.  */
3680
3681 static const location_t boundary_locations[] = {
3682   /* Zero means "don't override the default values for a new line_table".  */
3683   0,
3684
3685   /* An arbitrary non-zero value that isn't close to one of
3686      the boundary values below.  */
3687   0x10000,
3688
3689   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3690   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3691   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3692   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3693   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3694   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3695
3696   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3697   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3698   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3699   LINE_MAP_MAX_LOCATION_WITH_COLS,
3700   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3701   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3702 };
3703
3704 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3705
3706 void
3707 for_each_line_table_case (void (*testcase) (const line_table_case &))
3708 {
3709   /* As noted above in the description of struct line_table_case,
3710      we want to explore a test matrix of interesting line_table
3711      situations, running various selftests for each case within the
3712      matrix.  */
3713
3714   /* Run all tests with:
3715      (a) line_table->default_range_bits == 0, and
3716      (b) line_table->default_range_bits == 5.  */
3717   int num_cases_tested = 0;
3718   for (int default_range_bits = 0; default_range_bits <= 5;
3719        default_range_bits += 5)
3720     {
3721       /* ...and use each of the "interesting" location values as
3722          the starting location within line_table.  */
3723       const int num_boundary_locations
3724         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3725       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3726         {
3727           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3728
3729           testcase (c);
3730
3731           num_cases_tested++;
3732         }
3733     }
3734
3735   /* Verify that we fully covered the test matrix.  */
3736   ASSERT_EQ (num_cases_tested, 2 * 12);
3737 }
3738
3739 /* Verify that when presented with a consecutive pair of locations with
3740    a very large line offset, we don't attempt to consolidate them into
3741    a single ordinary linemap where the line offsets within the line map
3742    would lead to overflow (PR lto/88147).  */
3743
3744 static void
3745 test_line_offset_overflow ()
3746 {
3747   line_table_test ltt (line_table_case (5, 0));
3748
3749   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3750   linemap_line_start (line_table, 1, 100);
3751   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3752   assert_loceq ("foo.c", 2578, 0, loc_a);
3753
3754   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3755   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3756   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3757
3758   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3759   assert_loceq ("foo.c", 404198, 0, loc_b);
3760
3761   /* We should have started a new linemap, rather than attempting to store
3762      a very large line offset.  */
3763   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3764   ASSERT_NE (ordmap_a, ordmap_b);
3765 }
3766
3767 void test_cpp_utf8 ()
3768 {
3769   const int def_tabstop = 8;
3770   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3771   {
3772     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop);
3773     ASSERT_EQ (8, w_bad);
3774     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop);
3775     ASSERT_EQ (5, w_ctrl);
3776   }
3777
3778   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3779   {
3780     const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop);
3781     ASSERT_EQ (1, w_pi);
3782     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop);
3783     ASSERT_EQ (2, w_emoji);
3784     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3785                                                         def_tabstop);
3786     ASSERT_EQ (1, w_umlaut_precomposed);
3787     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3788                                                       def_tabstop);
3789     ASSERT_EQ (1, w_umlaut_combining);
3790     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop);
3791     ASSERT_EQ (2, w_han);
3792     const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop);
3793     ASSERT_EQ (3, w_ascii);
3794     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3795                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
3796                                            24, def_tabstop);
3797     ASSERT_EQ (18, w_mixed);
3798   }
3799
3800   /* Verify that display width properly expands tabs.  */
3801   {
3802     const char *tstr = "\tabc\td";
3803     ASSERT_EQ (6, cpp_display_width (tstr, 6, 1));
3804     ASSERT_EQ (10, cpp_display_width (tstr, 6, 3));
3805     ASSERT_EQ (17, cpp_display_width (tstr, 6, 8));
3806     ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8));
3807   }
3808
3809   /* Verify that cpp_byte_column_to_display_column can go past the end,
3810      and similar edge cases.  */
3811   {
3812     const char *str
3813       /* Display columns.
3814          111111112345  */
3815       = "\xcf\x80 abc";
3816       /* 111122223456
3817          Byte columns.  */
3818
3819     ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop));
3820     ASSERT_EQ (105,
3821                cpp_byte_column_to_display_column (str, 6, 106, def_tabstop));
3822     ASSERT_EQ (10000,
3823                cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop));
3824     ASSERT_EQ (0,
3825                cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop));
3826   }
3827
3828   /* Verify that cpp_display_column_to_byte_column can go past the end,
3829      and similar edge cases, and check invertibility.  */
3830   {
3831     const char *str
3832       /* Display columns.
3833          000000000000000000000000000000000000011
3834          111111112222222234444444455555555678901  */
3835       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3836       /* 000000000000000000000000000000000111111
3837          111122223333444456666777788889999012345
3838          Byte columns.  */
3839     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop));
3840     ASSERT_EQ (15,
3841                cpp_display_column_to_byte_column (str, 15, 11, def_tabstop));
3842     ASSERT_EQ (115,
3843                cpp_display_column_to_byte_column (str, 15, 111, def_tabstop));
3844     ASSERT_EQ (10000,
3845                cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop));
3846     ASSERT_EQ (0,
3847                cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop));
3848
3849     /* Verify that we do not interrupt a UTF-8 sequence.  */
3850     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop));
3851
3852     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3853       {
3854         const int disp_col
3855           = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop);
3856         const int byte_col2
3857           = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop);
3858
3859         /* If we ask for the display column in the middle of a UTF-8
3860            sequence, it will return the length of the partial sequence,
3861            matching the behavior of GCC before display column support.
3862            Otherwise check the round trip was successful.  */
3863         if (byte_col < 4)
3864           ASSERT_EQ (byte_col, disp_col);
3865         else if (byte_col >= 6 && byte_col < 9)
3866           ASSERT_EQ (3 + (byte_col - 5), disp_col);
3867         else
3868           ASSERT_EQ (byte_col2, byte_col);
3869       }
3870   }
3871
3872 }
3873
3874 /* Run all of the selftests within this file.  */
3875
3876 void
3877 input_c_tests ()
3878 {
3879   test_linenum_comparisons ();
3880   test_should_have_column_data_p ();
3881   test_unknown_location ();
3882   test_builtins ();
3883   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3884
3885   for_each_line_table_case (test_accessing_ordinary_linemaps);
3886   for_each_line_table_case (test_lexer);
3887   for_each_line_table_case (test_lexer_string_locations_simple);
3888   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3889   for_each_line_table_case (test_lexer_string_locations_hex);
3890   for_each_line_table_case (test_lexer_string_locations_oct);
3891   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3892   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3893   for_each_line_table_case (test_lexer_string_locations_ucn4);
3894   for_each_line_table_case (test_lexer_string_locations_ucn8);
3895   for_each_line_table_case (test_lexer_string_locations_wide_string);
3896   for_each_line_table_case (test_lexer_string_locations_string16);
3897   for_each_line_table_case (test_lexer_string_locations_string32);
3898   for_each_line_table_case (test_lexer_string_locations_u8);
3899   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3900   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3901   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3902   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3903   for_each_line_table_case (test_lexer_string_locations_macro);
3904   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3905   for_each_line_table_case (test_lexer_string_locations_non_string);
3906   for_each_line_table_case (test_lexer_string_locations_long_line);
3907   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3908   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3909   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3910   for_each_line_table_case (test_lexer_char_constants);
3911
3912   test_reading_source_line ();
3913
3914   test_line_offset_overflow ();
3915
3916   test_cpp_utf8 ();
3917 }
3918
3919 } // namespace selftest
3920
3921 #endif /* CHECKING_P */