gcc/tree-ssa-loop-prefetch.c

   1 /* Array prefetching.
   2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "backend.h"
  24 #include "predict.h"
  25 #include "tree.h"
  26 #include "gimple.h"
  27 #include "rtl.h"
  28 #include "alias.h"
  29 #include "fold-const.h"
  30 #include "stor-layout.h"
  31 #include "tm_p.h"
  32 #include "tree-pretty-print.h"
  33 #include "internal-fn.h"
  34 #include "gimplify.h"
  35 #include "gimple-iterator.h"
  36 #include "gimplify-me.h"
  37 #include "gimple-ssa.h"
  38 #include "tree-ssa-loop-ivopts.h"
  39 #include "tree-ssa-loop-manip.h"
  40 #include "tree-ssa-loop-niter.h"
  41 #include "tree-ssa-loop.h"
  42 #include "tree-into-ssa.h"
  43 #include "cfgloop.h"
  44 #include "tree-pass.h"
  45 #include "insn-config.h"
  46 #include "tree-chrec.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "diagnostic-core.h"
  49 #include "params.h"
  50 #include "langhooks.h"
  51 #include "tree-inline.h"
  52 #include "tree-data-ref.h"
  53 #include "target.h"
  54
  55
  56 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
  57    between the GIMPLE and RTL worlds.  */
  58 #include "flags.h"
  59 #include "expmed.h"
  60 #include "dojump.h"
  61 #include "explow.h"
  62 #include "calls.h"
  63 #include "emit-rtl.h"
  64 #include "varasm.h"
  65 #include "stmt.h"
  66 #include "expr.h"
  67 #include "insn-codes.h"
  68 #include "optabs.h"
  69 #include "recog.h"
  70
  71 /* This pass inserts prefetch instructions to optimize cache usage during
  72    accesses to arrays in loops.  It processes loops sequentially and:
  73
  74    1) Gathers all memory references in the single loop.
  75    2) For each of the references it decides when it is profitable to prefetch
  76       it.  To do it, we evaluate the reuse among the accesses, and determines
  77       two values: PREFETCH_BEFORE (meaning that it only makes sense to do
  78       prefetching in the first PREFETCH_BEFORE iterations of the loop) and
  79       PREFETCH_MOD (meaning that it only makes sense to prefetch in the
  80       iterations of the loop that are zero modulo PREFETCH_MOD).  For example
  81       (assuming cache line size is 64 bytes, char has size 1 byte and there
  82       is no hardware sequential prefetch):
  83
  84       char *a;
  85       for (i = 0; i < max; i++)
  86         {
  87           a[255] = ...;         (0)
  88           a[i] = ...;           (1)
  89           a[i + 64] = ...;      (2)
  90           a[16*i] = ...;        (3)
  91           a[187*i] = ...;       (4)
  92           a[187*i + 50] = ...;  (5)
  93         }
  94
  95        (0) obviously has PREFETCH_BEFORE 1
  96        (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
  97            location 64 iterations before it, and PREFETCH_MOD 64 (since
  98            it hits the same cache line otherwise).
  99        (2) has PREFETCH_MOD 64
 100        (3) has PREFETCH_MOD 4
 101        (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
 102            the cache line accessed by (5) is the same with probability only
 103            7/32.
 104        (5) has PREFETCH_MOD 1 as well.
 105
 106       Additionally, we use data dependence analysis to determine for each
 107       reference the distance till the first reuse; this information is used
 108       to determine the temporality of the issued prefetch instruction.
 109
 110    3) We determine how much ahead we need to prefetch.  The number of
 111       iterations needed is time to fetch / time spent in one iteration of
 112       the loop.  The problem is that we do not know either of these values,
 113       so we just make a heuristic guess based on a magic (possibly)
 114       target-specific constant and size of the loop.
 115
 116    4) Determine which of the references we prefetch.  We take into account
 117       that there is a maximum number of simultaneous prefetches (provided
 118       by machine description).  We prefetch as many prefetches as possible
 119       while still within this bound (starting with those with lowest
 120       prefetch_mod, since they are responsible for most of the cache
 121       misses).
 122
 123    5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
 124       and PREFETCH_BEFORE requirements (within some bounds), and to avoid
 125       prefetching nonaccessed memory.
 126       TODO -- actually implement peeling.
 127
 128    6) We actually emit the prefetch instructions.  ??? Perhaps emit the
 129       prefetch instructions with guards in cases where 5) was not sufficient
 130       to satisfy the constraints?
 131
 132    A cost model is implemented to determine whether or not prefetching is
 133    profitable for a given loop.  The cost model has three heuristics:
 134
 135    1. Function trip_count_to_ahead_ratio_too_small_p implements a
 136       heuristic that determines whether or not the loop has too few
 137       iterations (compared to ahead).  Prefetching is not likely to be
 138       beneficial if the trip count to ahead ratio is below a certain
 139       minimum.
 140
 141    2. Function mem_ref_count_reasonable_p implements a heuristic that
 142       determines whether the given loop has enough CPU ops that can be
 143       overlapped with cache missing memory ops.  If not, the loop
 144       won't benefit from prefetching.  In the implementation,
 145       prefetching is not considered beneficial if the ratio between
 146       the instruction count and the mem ref count is below a certain
 147       minimum.
 148
 149    3. Function insn_to_prefetch_ratio_too_small_p implements a
 150       heuristic that disables prefetching in a loop if the prefetching
 151       cost is above a certain limit.  The relative prefetching cost is
 152       estimated by taking the ratio between the prefetch count and the
 153       total intruction count (this models the I-cache cost).
 154
 155    The limits used in these heuristics are defined as parameters with
 156    reasonable default values. Machine-specific default values will be
 157    added later.
 158
 159    Some other TODO:
 160       -- write and use more general reuse analysis (that could be also used
 161          in other cache aimed loop optimizations)
 162       -- make it behave sanely together with the prefetches given by user
 163          (now we just ignore them; at the very least we should avoid
 164          optimizing loops in that user put his own prefetches)
 165       -- we assume cache line size alignment of arrays; this could be
 166          improved.  */
 167
 168 /* Magic constants follow.  These should be replaced by machine specific
 169    numbers.  */
 170
 171 /* True if write can be prefetched by a read prefetch.  */
 172
 173 #ifndef WRITE_CAN_USE_READ_PREFETCH
 174 #define WRITE_CAN_USE_READ_PREFETCH 1
 175 #endif
 176
 177 /* True if read can be prefetched by a write prefetch. */
 178
 179 #ifndef READ_CAN_USE_WRITE_PREFETCH
 180 #define READ_CAN_USE_WRITE_PREFETCH 0
 181 #endif
 182
 183 /* The size of the block loaded by a single prefetch.  Usually, this is
 184    the same as cache line size (at the moment, we only consider one level
 185    of cache hierarchy).  */
 186
 187 #ifndef PREFETCH_BLOCK
 188 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
 189 #endif
 190
 191 /* Do we have a forward hardware sequential prefetching?  */
 192
 193 #ifndef HAVE_FORWARD_PREFETCH
 194 #define HAVE_FORWARD_PREFETCH 0
 195 #endif
 196
 197 /* Do we have a backward hardware sequential prefetching?  */
 198
 199 #ifndef HAVE_BACKWARD_PREFETCH
 200 #define HAVE_BACKWARD_PREFETCH 0
 201 #endif
 202
 203 /* In some cases we are only able to determine that there is a certain
 204    probability that the two accesses hit the same cache line.  In this
 205    case, we issue the prefetches for both of them if this probability
 206    is less then (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
 207
 208 #ifndef ACCEPTABLE_MISS_RATE
 209 #define ACCEPTABLE_MISS_RATE 50
 210 #endif
 211
 212 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
 213 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
 214
 215 /* We consider a memory access nontemporal if it is not reused sooner than
 216    after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
 217    accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
 218    so that we use nontemporal prefetches e.g. if single memory location
 219    is accessed several times in a single iteration of the loop.  */
 220 #define NONTEMPORAL_FRACTION 16
 221
 222 /* In case we have to emit a memory fence instruction after the loop that
 223    uses nontemporal stores, this defines the builtin to use.  */
 224
 225 #ifndef FENCE_FOLLOWING_MOVNT
 226 #define FENCE_FOLLOWING_MOVNT NULL_TREE
 227 #endif
 228
 229 /* It is not profitable to prefetch when the trip count is not at
 230    least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
 231    For example, in a loop with a prefetch ahead distance of 10,
 232    supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
 233    profitable to prefetch when the trip count is greater or equal to
 234    40.  In that case, 30 out of the 40 iterations will benefit from
 235    prefetching.  */
 236
 237 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
 238 #define TRIP_COUNT_TO_AHEAD_RATIO 4
 239 #endif
 240
 241 /* The group of references between that reuse may occur.  */
 242
 243 struct mem_ref_group
 244 {
 245   tree base;                    /* Base of the reference.  */
 246   tree step;                    /* Step of the reference.  */
 247   struct mem_ref *refs;         /* References in the group.  */
 248   struct mem_ref_group *next;   /* Next group of references.  */
 249 };
 250
 251 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
 252
 253 #define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)
 254
 255 /* Do not generate a prefetch if the unroll factor is significantly less
 256    than what is required by the prefetch.  This is to avoid redundant
 257    prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
 258    2, prefetching requires unrolling the loop 16 times, but
 259    the loop is actually unrolled twice.  In this case (ratio = 8),
 260    prefetching is not likely to be beneficial.  */
 261
 262 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
 263 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
 264 #endif
 265
 266 /* Some of the prefetch computations have quadratic complexity.  We want to
 267    avoid huge compile times and, therefore, want to limit the amount of
 268    memory references per loop where we consider prefetching.  */
 269
 270 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
 271 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
 272 #endif
 273
 274 /* The memory reference.  */
 275
 276 struct mem_ref
 277 {
 278   gimple stmt;                  /* Statement in that the reference appears.  */
 279   tree mem;                     /* The reference.  */
 280   HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
 281   struct mem_ref_group *group;  /* The group of references it belongs to.  */
 282   unsigned HOST_WIDE_INT prefetch_mod;
 283                                 /* Prefetch only each PREFETCH_MOD-th
 284                                    iteration.  */
 285   unsigned HOST_WIDE_INT prefetch_before;
 286                                 /* Prefetch only first PREFETCH_BEFORE
 287                                    iterations.  */
 288   unsigned reuse_distance;      /* The amount of data accessed before the first
 289                                    reuse of this value.  */
 290   struct mem_ref *next;         /* The next reference in the group.  */
 291   unsigned write_p : 1;         /* Is it a write?  */
 292   unsigned independent_p : 1;   /* True if the reference is independent on
 293                                    all other references inside the loop.  */
 294   unsigned issue_prefetch_p : 1;        /* Should we really issue the prefetch?  */
 295   unsigned storent_p : 1;       /* True if we changed the store to a
 296                                    nontemporal one.  */
 297 };
 298
 299 /* Dumps information about memory reference */
 300 static void
 301 dump_mem_details (FILE *file, tree base, tree step,
 302             HOST_WIDE_INT delta, bool write_p)
 303 {
 304   fprintf (file, "(base ");
 305   print_generic_expr (file, base, TDF_SLIM);
 306   fprintf (file, ", step ");
 307   if (cst_and_fits_in_hwi (step))
 308     fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
 309   else
 310     print_generic_expr (file, step, TDF_TREE);
 311   fprintf (file, ")\n");
 312   fprintf (file, "  delta ");
 313   fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
 314   fprintf (file, "\n");
 315   fprintf (file, "  %s\n", write_p ? "write" : "read");
 316   fprintf (file, "\n");
 317 }
 318
 319 /* Dumps information about reference REF to FILE.  */
 320
 321 static void
 322 dump_mem_ref (FILE *file, struct mem_ref *ref)
 323 {
 324   fprintf (file, "Reference %p:\n", (void *) ref);
 325
 326   fprintf (file, "  group %p ", (void *) ref->group);
 327
 328   dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
 329                    ref->write_p);
 330 }
 331
 332 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
 333    exist.  */
 334
 335 static struct mem_ref_group *
 336 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
 337 {
 338   struct mem_ref_group *group;
 339
 340   for (; *groups; groups = &(*groups)->next)
 341     {
 342       if (operand_equal_p ((*groups)->step, step, 0)
 343           && operand_equal_p ((*groups)->base, base, 0))
 344         return *groups;
 345
 346       /* If step is an integer constant, keep the list of groups sorted
 347          by decreasing step.  */
 348       if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
 349           && int_cst_value ((*groups)->step) < int_cst_value (step))
 350         break;
 351     }
 352
 353   group = XNEW (struct mem_ref_group);
 354   group->base = base;
 355   group->step = step;
 356   group->refs = NULL;
 357   group->next = *groups;
 358   *groups = group;
 359
 360   return group;
 361 }
 362
 363 /* Records a memory reference MEM in GROUP with offset DELTA and write status
 364    WRITE_P.  The reference occurs in statement STMT.  */
 365
 366 static void
 367 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
 368             HOST_WIDE_INT delta, bool write_p)
 369 {
 370   struct mem_ref **aref;
 371
 372   /* Do not record the same address twice.  */
 373   for (aref = &group->refs; *aref; aref = &(*aref)->next)
 374     {
 375       /* It does not have to be possible for write reference to reuse the read
 376          prefetch, or vice versa.  */
 377       if (!WRITE_CAN_USE_READ_PREFETCH
 378           && write_p
 379           && !(*aref)->write_p)
 380         continue;
 381       if (!READ_CAN_USE_WRITE_PREFETCH
 382           && !write_p
 383           && (*aref)->write_p)
 384         continue;
 385
 386       if ((*aref)->delta == delta)
 387         return;
 388     }
 389
 390   (*aref) = XNEW (struct mem_ref);
 391   (*aref)->stmt = stmt;
 392   (*aref)->mem = mem;
 393   (*aref)->delta = delta;
 394   (*aref)->write_p = write_p;
 395   (*aref)->prefetch_before = PREFETCH_ALL;
 396   (*aref)->prefetch_mod = 1;
 397   (*aref)->reuse_distance = 0;
 398   (*aref)->issue_prefetch_p = false;
 399   (*aref)->group = group;
 400   (*aref)->next = NULL;
 401   (*aref)->independent_p = false;
 402   (*aref)->storent_p = false;
 403
 404   if (dump_file && (dump_flags & TDF_DETAILS))
 405     dump_mem_ref (dump_file, *aref);
 406 }
 407
 408 /* Release memory references in GROUPS.  */
 409
 410 static void
 411 release_mem_refs (struct mem_ref_group *groups)
 412 {
 413   struct mem_ref_group *next_g;
 414   struct mem_ref *ref, *next_r;
 415
 416   for (; groups; groups = next_g)
 417     {
 418       next_g = groups->next;
 419       for (ref = groups->refs; ref; ref = next_r)
 420         {
 421           next_r = ref->next;
 422           free (ref);
 423         }
 424       free (groups);
 425     }
 426 }
 427
 428 /* A structure used to pass arguments to idx_analyze_ref.  */
 429
 430 struct ar_data
 431 {
 432   struct loop *loop;                    /* Loop of the reference.  */
 433   gimple stmt;                          /* Statement of the reference.  */
 434   tree *step;                           /* Step of the memory reference.  */
 435   HOST_WIDE_INT *delta;                 /* Offset of the memory reference.  */
 436 };
 437
 438 /* Analyzes a single INDEX of a memory reference to obtain information
 439    described at analyze_ref.  Callback for for_each_index.  */
 440
 441 static bool
 442 idx_analyze_ref (tree base, tree *index, void *data)
 443 {
 444   struct ar_data *ar_data = (struct ar_data *) data;
 445   tree ibase, step, stepsize;
 446   HOST_WIDE_INT idelta = 0, imult = 1;
 447   affine_iv iv;
 448
 449   if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
 450                   *index, &iv, true))
 451     return false;
 452   ibase = iv.base;
 453   step = iv.step;
 454
 455   if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
 456       && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
 457     {
 458       idelta = int_cst_value (TREE_OPERAND (ibase, 1));
 459       ibase = TREE_OPERAND (ibase, 0);
 460     }
 461   if (cst_and_fits_in_hwi (ibase))
 462     {
 463       idelta += int_cst_value (ibase);
 464       ibase = build_int_cst (TREE_TYPE (ibase), 0);
 465     }
 466
 467   if (TREE_CODE (base) == ARRAY_REF)
 468     {
 469       stepsize = array_ref_element_size (base);
 470       if (!cst_and_fits_in_hwi (stepsize))
 471         return false;
 472       imult = int_cst_value (stepsize);
 473       step = fold_build2 (MULT_EXPR, sizetype,
 474                           fold_convert (sizetype, step),
 475                           fold_convert (sizetype, stepsize));
 476       idelta *= imult;
 477     }
 478
 479   if (*ar_data->step == NULL_TREE)
 480     *ar_data->step = step;
 481   else
 482     *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
 483                                   fold_convert (sizetype, *ar_data->step),
 484                                   fold_convert (sizetype, step));
 485   *ar_data->delta += idelta;
 486   *index = ibase;
 487
 488   return true;
 489 }
 490
 491 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
 492    STEP are integer constants and iter is number of iterations of LOOP.  The
 493    reference occurs in statement STMT.  Strips nonaddressable component
 494    references from REF_P.  */
 495
 496 static bool
 497 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
 498              tree *step, HOST_WIDE_INT *delta,
 499              gimple stmt)
 500 {
 501   struct ar_data ar_data;
 502   tree off;
 503   HOST_WIDE_INT bit_offset;
 504   tree ref = *ref_p;
 505
 506   *step = NULL_TREE;
 507   *delta = 0;
 508
 509   /* First strip off the component references.  Ignore bitfields.
 510      Also strip off the real and imagine parts of a complex, so that
 511      they can have the same base.  */
 512   if (TREE_CODE (ref) == REALPART_EXPR
 513       || TREE_CODE (ref) == IMAGPART_EXPR
 514       || (TREE_CODE (ref) == COMPONENT_REF
 515           && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
 516     {
 517       if (TREE_CODE (ref) == IMAGPART_EXPR)
 518         *delta += int_size_in_bytes (TREE_TYPE (ref));
 519       ref = TREE_OPERAND (ref, 0);
 520     }
 521
 522   *ref_p = ref;
 523
 524   for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
 525     {
 526       off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
 527       bit_offset = TREE_INT_CST_LOW (off);
 528       gcc_assert (bit_offset % BITS_PER_UNIT == 0);
 529
 530       *delta += bit_offset / BITS_PER_UNIT;
 531     }
 532
 533   *base = unshare_expr (ref);
 534   ar_data.loop = loop;
 535   ar_data.stmt = stmt;
 536   ar_data.step = step;
 537   ar_data.delta = delta;
 538   return for_each_index (base, idx_analyze_ref, &ar_data);
 539 }
 540
 541 /* Record a memory reference REF to the list REFS.  The reference occurs in
 542    LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
 543    reference was recorded, false otherwise.  */
 544
 545 static bool
 546 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
 547                               tree ref, bool write_p, gimple stmt)
 548 {
 549   tree base, step;
 550   HOST_WIDE_INT delta;
 551   struct mem_ref_group *agrp;
 552
 553   if (get_base_address (ref) == NULL)
 554     return false;
 555
 556   if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
 557     return false;
 558   /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
 559   if (step == NULL_TREE)
 560     return false;
 561
 562   /* Stop if the address of BASE could not be taken.  */
 563   if (may_be_nonaddressable_p (base))
 564     return false;
 565
 566   /* Limit non-constant step prefetching only to the innermost loops and
 567      only when the step is loop invariant in the entire loop nest. */
 568   if (!cst_and_fits_in_hwi (step))
 569     {
 570       if (loop->inner != NULL)
 571         {
 572           if (dump_file && (dump_flags & TDF_DETAILS))
 573             {
 574               fprintf (dump_file, "Memory expression %p\n",(void *) ref );
 575               print_generic_expr (dump_file, ref, TDF_TREE);
 576               fprintf (dump_file,":");
 577               dump_mem_details (dump_file, base, step, delta, write_p);
 578               fprintf (dump_file,
 579                        "Ignoring %p, non-constant step prefetching is "
 580                        "limited to inner most loops \n",
 581                        (void *) ref);
 582             }
 583             return false;
 584          }
 585       else
 586         {
 587           if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
 588           {
 589             if (dump_file && (dump_flags & TDF_DETAILS))
 590               {
 591                 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
 592                 print_generic_expr (dump_file, ref, TDF_TREE);
 593                 fprintf (dump_file,":");
 594                 dump_mem_details (dump_file, base, step, delta, write_p);
 595                 fprintf (dump_file,
 596                          "Not prefetching, ignoring %p due to "
 597                          "loop variant step\n",
 598                          (void *) ref);
 599               }
 600               return false;
 601             }
 602         }
 603     }
 604
 605   /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
 606      are integer constants.  */
 607   agrp = find_or_create_group (refs, base, step);
 608   record_ref (agrp, stmt, ref, delta, write_p);
 609
 610   return true;
 611 }
 612
 613 /* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
 614    true if there are no other memory references inside the loop.  */
 615
 616 static struct mem_ref_group *
 617 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
 618 {
 619   basic_block *body = get_loop_body_in_dom_order (loop);
 620   basic_block bb;
 621   unsigned i;
 622   gimple_stmt_iterator bsi;
 623   gimple stmt;
 624   tree lhs, rhs;
 625   struct mem_ref_group *refs = NULL;
 626
 627   *no_other_refs = true;
 628   *ref_count = 0;
 629
 630   /* Scan the loop body in order, so that the former references precede the
 631      later ones.  */
 632   for (i = 0; i < loop->num_nodes; i++)
 633     {
 634       bb = body[i];
 635       if (bb->loop_father != loop)
 636         continue;
 637
 638       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
 639         {
 640           stmt = gsi_stmt (bsi);
 641
 642           if (gimple_code (stmt) != GIMPLE_ASSIGN)
 643             {
 644               if (gimple_vuse (stmt)
 645                   || (is_gimple_call (stmt)
 646                       && !(gimple_call_flags (stmt) & ECF_CONST)))
 647                 *no_other_refs = false;
 648               continue;
 649             }
 650
 651           lhs = gimple_assign_lhs (stmt);
 652           rhs = gimple_assign_rhs1 (stmt);
 653
 654           if (REFERENCE_CLASS_P (rhs))
 655             {
 656             *no_other_refs &= gather_memory_references_ref (loop, &refs,
 657                                                             rhs, false, stmt);
 658             *ref_count += 1;
 659             }
 660           if (REFERENCE_CLASS_P (lhs))
 661             {
 662             *no_other_refs &= gather_memory_references_ref (loop, &refs,
 663                                                             lhs, true, stmt);
 664             *ref_count += 1;
 665             }
 666         }
 667     }
 668   free (body);
 669
 670   return refs;
 671 }
 672
 673 /* Prune the prefetch candidate REF using the self-reuse.  */
 674
 675 static void
 676 prune_ref_by_self_reuse (struct mem_ref *ref)
 677 {
 678   HOST_WIDE_INT step;
 679   bool backward;
 680
 681   /* If the step size is non constant, we cannot calculate prefetch_mod.  */
 682   if (!cst_and_fits_in_hwi (ref->group->step))
 683     return;
 684
 685   step = int_cst_value (ref->group->step);
 686
 687   backward = step < 0;
 688
 689   if (step == 0)
 690     {
 691       /* Prefetch references to invariant address just once.  */
 692       ref->prefetch_before = 1;
 693       return;
 694     }
 695
 696   if (backward)
 697     step = -step;
 698
 699   if (step > PREFETCH_BLOCK)
 700     return;
 701
 702   if ((backward && HAVE_BACKWARD_PREFETCH)
 703       || (!backward && HAVE_FORWARD_PREFETCH))
 704     {
 705       ref->prefetch_before = 1;
 706       return;
 707     }
 708
 709   ref->prefetch_mod = PREFETCH_BLOCK / step;
 710 }
 711
 712 /* Divides X by BY, rounding down.  */
 713
 714 static HOST_WIDE_INT
 715 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
 716 {
 717   gcc_assert (by > 0);
 718
 719   if (x >= 0)
 720     return x / by;
 721   else
 722     return (x + by - 1) / by;
 723 }
 724
 725 /* Given a CACHE_LINE_SIZE and two inductive memory references
 726    with a common STEP greater than CACHE_LINE_SIZE and an address
 727    difference DELTA, compute the probability that they will fall
 728    in different cache lines.  Return true if the computed miss rate
 729    is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
 730    number of distinct iterations after which the pattern repeats itself.
 731    ALIGN_UNIT is the unit of alignment in bytes.  */
 732
 733 static bool
 734 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
 735                    HOST_WIDE_INT step, HOST_WIDE_INT delta,
 736                    unsigned HOST_WIDE_INT distinct_iters,
 737                    int align_unit)
 738 {
 739   unsigned align, iter;
 740   int total_positions, miss_positions, max_allowed_miss_positions;
 741   int address1, address2, cache_line1, cache_line2;
 742
 743   /* It always misses if delta is greater than or equal to the cache
 744      line size.  */
 745   if (delta >= (HOST_WIDE_INT) cache_line_size)
 746     return false;
 747
 748   miss_positions = 0;
 749   total_positions = (cache_line_size / align_unit) * distinct_iters;
 750   max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
 751
 752   /* Iterate through all possible alignments of the first
 753      memory reference within its cache line.  */
 754   for (align = 0; align < cache_line_size; align += align_unit)
 755
 756     /* Iterate through all distinct iterations.  */
 757     for (iter = 0; iter < distinct_iters; iter++)
 758       {
 759         address1 = align + step * iter;
 760         address2 = address1 + delta;
 761         cache_line1 = address1 / cache_line_size;
 762         cache_line2 = address2 / cache_line_size;
 763         if (cache_line1 != cache_line2)
 764           {
 765             miss_positions += 1;
 766             if (miss_positions > max_allowed_miss_positions)
 767               return false;
 768           }
 769       }
 770   return true;
 771 }
 772
 773 /* Prune the prefetch candidate REF using the reuse with BY.
 774    If BY_IS_BEFORE is true, BY is before REF in the loop.  */
 775
 776 static void
 777 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
 778                           bool by_is_before)
 779 {
 780   HOST_WIDE_INT step;
 781   bool backward;
 782   HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
 783   HOST_WIDE_INT delta = delta_b - delta_r;
 784   HOST_WIDE_INT hit_from;
 785   unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
 786   HOST_WIDE_INT reduced_step;
 787   unsigned HOST_WIDE_INT reduced_prefetch_block;
 788   tree ref_type;
 789   int align_unit;
 790
 791   /* If the step is non constant we cannot calculate prefetch_before.  */
 792   if (!cst_and_fits_in_hwi (ref->group->step)) {
 793     return;
 794   }
 795
 796   step = int_cst_value (ref->group->step);
 797
 798   backward = step < 0;
 799
 800
 801   if (delta == 0)
 802     {
 803       /* If the references has the same address, only prefetch the
 804          former.  */
 805       if (by_is_before)
 806         ref->prefetch_before = 0;
 807
 808       return;
 809     }
 810
 811   if (!step)
 812     {
 813       /* If the reference addresses are invariant and fall into the
 814          same cache line, prefetch just the first one.  */
 815       if (!by_is_before)
 816         return;
 817
 818       if (ddown (ref->delta, PREFETCH_BLOCK)
 819           != ddown (by->delta, PREFETCH_BLOCK))
 820         return;
 821
 822       ref->prefetch_before = 0;
 823       return;
 824     }
 825
 826   /* Only prune the reference that is behind in the array.  */
 827   if (backward)
 828     {
 829       if (delta > 0)
 830         return;
 831
 832       /* Transform the data so that we may assume that the accesses
 833          are forward.  */
 834       delta = - delta;
 835       step = -step;
 836       delta_r = PREFETCH_BLOCK - 1 - delta_r;
 837       delta_b = PREFETCH_BLOCK - 1 - delta_b;
 838     }
 839   else
 840     {
 841       if (delta < 0)
 842         return;
 843     }
 844
 845   /* Check whether the two references are likely to hit the same cache
 846      line, and how distant the iterations in that it occurs are from
 847      each other.  */
 848
 849   if (step <= PREFETCH_BLOCK)
 850     {
 851       /* The accesses are sure to meet.  Let us check when.  */
 852       hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
 853       prefetch_before = (hit_from - delta_r + step - 1) / step;
 854
 855       /* Do not reduce prefetch_before if we meet beyond cache size.  */
 856       if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
 857         prefetch_before = PREFETCH_ALL;
 858       if (prefetch_before < ref->prefetch_before)
 859         ref->prefetch_before = prefetch_before;
 860
 861       return;
 862     }
 863
 864   /* A more complicated case with step > prefetch_block.  First reduce
 865      the ratio between the step and the cache line size to its simplest
 866      terms.  The resulting denominator will then represent the number of
 867      distinct iterations after which each address will go back to its
 868      initial location within the cache line.  This computation assumes
 869      that PREFETCH_BLOCK is a power of two.  */
 870   prefetch_block = PREFETCH_BLOCK;
 871   reduced_prefetch_block = prefetch_block;
 872   reduced_step = step;
 873   while ((reduced_step & 1) == 0
 874          && reduced_prefetch_block > 1)
 875     {
 876       reduced_step >>= 1;
 877       reduced_prefetch_block >>= 1;
 878     }
 879
 880   prefetch_before = delta / step;
 881   delta %= step;
 882   ref_type = TREE_TYPE (ref->mem);
 883   align_unit = TYPE_ALIGN (ref_type) / 8;
 884   if (is_miss_rate_acceptable (prefetch_block, step, delta,
 885                                reduced_prefetch_block, align_unit))
 886     {
 887       /* Do not reduce prefetch_before if we meet beyond cache size.  */
 888       if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
 889         prefetch_before = PREFETCH_ALL;
 890       if (prefetch_before < ref->prefetch_before)
 891         ref->prefetch_before = prefetch_before;
 892
 893       return;
 894     }
 895
 896   /* Try also the following iteration.  */
 897   prefetch_before++;
 898   delta = step - delta;
 899   if (is_miss_rate_acceptable (prefetch_block, step, delta,
 900                                reduced_prefetch_block, align_unit))
 901     {
 902       if (prefetch_before < ref->prefetch_before)
 903         ref->prefetch_before = prefetch_before;
 904
 905       return;
 906     }
 907
 908   /* The ref probably does not reuse by.  */
 909   return;
 910 }
 911
 912 /* Prune the prefetch candidate REF using the reuses with other references
 913    in REFS.  */
 914
 915 static void
 916 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
 917 {
 918   struct mem_ref *prune_by;
 919   bool before = true;
 920
 921   prune_ref_by_self_reuse (ref);
 922
 923   for (prune_by = refs; prune_by; prune_by = prune_by->next)
 924     {
 925       if (prune_by == ref)
 926         {
 927           before = false;
 928           continue;
 929         }
 930
 931       if (!WRITE_CAN_USE_READ_PREFETCH
 932           && ref->write_p
 933           && !prune_by->write_p)
 934         continue;
 935       if (!READ_CAN_USE_WRITE_PREFETCH
 936           && !ref->write_p
 937           && prune_by->write_p)
 938         continue;
 939
 940       prune_ref_by_group_reuse (ref, prune_by, before);
 941     }
 942 }
 943
 944 /* Prune the prefetch candidates in GROUP using the reuse analysis.  */
 945
 946 static void
 947 prune_group_by_reuse (struct mem_ref_group *group)
 948 {
 949   struct mem_ref *ref_pruned;
 950
 951   for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
 952     {
 953       prune_ref_by_reuse (ref_pruned, group->refs);
 954
 955       if (dump_file && (dump_flags & TDF_DETAILS))
 956         {
 957           fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
 958
 959           if (ref_pruned->prefetch_before == PREFETCH_ALL
 960               && ref_pruned->prefetch_mod == 1)
 961             fprintf (dump_file, " no restrictions");
 962           else if (ref_pruned->prefetch_before == 0)
 963             fprintf (dump_file, " do not prefetch");
 964           else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
 965             fprintf (dump_file, " prefetch once");
 966           else
 967             {
 968               if (ref_pruned->prefetch_before != PREFETCH_ALL)
 969                 {
 970                   fprintf (dump_file, " prefetch before ");
 971                   fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
 972                            ref_pruned->prefetch_before);
 973                 }
 974               if (ref_pruned->prefetch_mod != 1)
 975                 {
 976                   fprintf (dump_file, " prefetch mod ");
 977                   fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
 978                            ref_pruned->prefetch_mod);
 979                 }
 980             }
 981           fprintf (dump_file, "\n");
 982         }
 983     }
 984 }
 985
 986 /* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
 987
 988 static void
 989 prune_by_reuse (struct mem_ref_group *groups)
 990 {
 991   for (; groups; groups = groups->next)
 992     prune_group_by_reuse (groups);
 993 }
 994
 995 /* Returns true if we should issue prefetch for REF.  */
 996
 997 static bool
 998 should_issue_prefetch_p (struct mem_ref *ref)
 999 {
1000   /* For now do not issue prefetches for only first few of the
1001      iterations.  */
1002   if (ref->prefetch_before != PREFETCH_ALL)
1003     {
1004       if (dump_file && (dump_flags & TDF_DETAILS))
1005         fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
1006                  (void *) ref);
1007       return false;
1008     }
1009
1010   /* Do not prefetch nontemporal stores.  */
1011   if (ref->storent_p)
1012     {
1013       if (dump_file && (dump_flags & TDF_DETAILS))
1014         fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1015       return false;
1016     }
1017
1018   return true;
1019 }
1020
1021 /* Decide which of the prefetch candidates in GROUPS to prefetch.
1022    AHEAD is the number of iterations to prefetch ahead (which corresponds
1023    to the number of simultaneous instances of one prefetch running at a
1024    time).  UNROLL_FACTOR is the factor by that the loop is going to be
1025    unrolled.  Returns true if there is anything to prefetch.  */
1026
1027 static bool
1028 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1029                      unsigned ahead)
1030 {
1031   unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1032   unsigned slots_per_prefetch;
1033   struct mem_ref *ref;
1034   bool any = false;
1035
1036   /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
1037   remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1038
1039   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1040      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
1041      it will need a prefetch slot.  */
1042   slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1043   if (dump_file && (dump_flags & TDF_DETAILS))
1044     fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1045              slots_per_prefetch);
1046
1047   /* For now we just take memory references one by one and issue
1048      prefetches for as many as possible.  The groups are sorted
1049      starting with the largest step, since the references with
1050      large step are more likely to cause many cache misses.  */
1051
1052   for (; groups; groups = groups->next)
1053     for (ref = groups->refs; ref; ref = ref->next)
1054       {
1055         if (!should_issue_prefetch_p (ref))
1056           continue;
1057
1058         /* The loop is far from being sufficiently unrolled for this
1059            prefetch.  Do not generate prefetch to avoid many redudant
1060            prefetches.  */
1061         if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1062           continue;
1063
1064         /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1065            and we unroll the loop UNROLL_FACTOR times, we need to insert
1066            ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1067            iteration.  */
1068         n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1069                         / ref->prefetch_mod);
1070         prefetch_slots = n_prefetches * slots_per_prefetch;
1071
1072         /* If more than half of the prefetches would be lost anyway, do not
1073            issue the prefetch.  */
1074         if (2 * remaining_prefetch_slots < prefetch_slots)
1075           continue;
1076
1077         ref->issue_prefetch_p = true;
1078
1079         if (remaining_prefetch_slots <= prefetch_slots)
1080           return true;
1081         remaining_prefetch_slots -= prefetch_slots;
1082         any = true;
1083       }
1084
1085   return any;
1086 }
1087
1088 /* Return TRUE if no prefetch is going to be generated in the given
1089    GROUPS.  */
1090
1091 static bool
1092 nothing_to_prefetch_p (struct mem_ref_group *groups)
1093 {
1094   struct mem_ref *ref;
1095
1096   for (; groups; groups = groups->next)
1097     for (ref = groups->refs; ref; ref = ref->next)
1098       if (should_issue_prefetch_p (ref))
1099         return false;
1100
1101   return true;
1102 }
1103
1104 /* Estimate the number of prefetches in the given GROUPS.
1105    UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1106
1107 static int
1108 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1109 {
1110   struct mem_ref *ref;
1111   unsigned n_prefetches;
1112   int prefetch_count = 0;
1113
1114   for (; groups; groups = groups->next)
1115     for (ref = groups->refs; ref; ref = ref->next)
1116       if (should_issue_prefetch_p (ref))
1117         {
1118           n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1119                           / ref->prefetch_mod);
1120           prefetch_count += n_prefetches;
1121         }
1122
1123   return prefetch_count;
1124 }
1125
1126 /* Issue prefetches for the reference REF into loop as decided before.
1127    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
1128    is the factor by which LOOP was unrolled.  */
1129
1130 static void
1131 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1132 {
1133   HOST_WIDE_INT delta;
1134   tree addr, addr_base, write_p, local, forward;
1135   gcall *prefetch;
1136   gimple_stmt_iterator bsi;
1137   unsigned n_prefetches, ap;
1138   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1139
1140   if (dump_file && (dump_flags & TDF_DETAILS))
1141     fprintf (dump_file, "Issued%s prefetch for %p.\n",
1142              nontemporal ? " nontemporal" : "",
1143              (void *) ref);
1144
1145   bsi = gsi_for_stmt (ref->stmt);
1146
1147   n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1148                   / ref->prefetch_mod);
1149   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1150   addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1151                                         true, NULL, true, GSI_SAME_STMT);
1152   write_p = ref->write_p ? integer_one_node : integer_zero_node;
1153   local = nontemporal ? integer_zero_node : integer_three_node;
1154
1155   for (ap = 0; ap < n_prefetches; ap++)
1156     {
1157       if (cst_and_fits_in_hwi (ref->group->step))
1158         {
1159           /* Determine the address to prefetch.  */
1160           delta = (ahead + ap * ref->prefetch_mod) *
1161                    int_cst_value (ref->group->step);
1162           addr = fold_build_pointer_plus_hwi (addr_base, delta);
1163           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1164                                            true, GSI_SAME_STMT);
1165         }
1166       else
1167         {
1168           /* The step size is non-constant but loop-invariant.  We use the
1169              heuristic to simply prefetch ahead iterations ahead.  */
1170           forward = fold_build2 (MULT_EXPR, sizetype,
1171                                  fold_convert (sizetype, ref->group->step),
1172                                  fold_convert (sizetype, size_int (ahead)));
1173           addr = fold_build_pointer_plus (addr_base, forward);
1174           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1175                                            NULL, true, GSI_SAME_STMT);
1176       }
1177       /* Create the prefetch instruction.  */
1178       prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1179                                     3, addr, write_p, local);
1180       gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1181     }
1182 }
1183
1184 /* Issue prefetches for the references in GROUPS into loop as decided before.
1185    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
1186    factor by that LOOP was unrolled.  */
1187
1188 static void
1189 issue_prefetches (struct mem_ref_group *groups,
1190                   unsigned unroll_factor, unsigned ahead)
1191 {
1192   struct mem_ref *ref;
1193
1194   for (; groups; groups = groups->next)
1195     for (ref = groups->refs; ref; ref = ref->next)
1196       if (ref->issue_prefetch_p)
1197         issue_prefetch_ref (ref, unroll_factor, ahead);
1198 }
1199
1200 /* Returns true if REF is a memory write for that a nontemporal store insn
1201    can be used.  */
1202
1203 static bool
1204 nontemporal_store_p (struct mem_ref *ref)
1205 {
1206   machine_mode mode;
1207   enum insn_code code;
1208
1209   /* REF must be a write that is not reused.  We require it to be independent
1210      on all other memory references in the loop, as the nontemporal stores may
1211      be reordered with respect to other memory references.  */
1212   if (!ref->write_p
1213       || !ref->independent_p
1214       || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1215     return false;
1216
1217   /* Check that we have the storent instruction for the mode.  */
1218   mode = TYPE_MODE (TREE_TYPE (ref->mem));
1219   if (mode == BLKmode)
1220     return false;
1221
1222   code = optab_handler (storent_optab, mode);
1223   return code != CODE_FOR_nothing;
1224 }
1225
1226 /* If REF is a nontemporal store, we mark the corresponding modify statement
1227    and return true.  Otherwise, we return false.  */
1228
1229 static bool
1230 mark_nontemporal_store (struct mem_ref *ref)
1231 {
1232   if (!nontemporal_store_p (ref))
1233     return false;
1234
1235   if (dump_file && (dump_flags & TDF_DETAILS))
1236     fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1237              (void *) ref);
1238
1239   gimple_assign_set_nontemporal_move (ref->stmt, true);
1240   ref->storent_p = true;
1241
1242   return true;
1243 }
1244
1245 /* Issue a memory fence instruction after LOOP.  */
1246
1247 static void
1248 emit_mfence_after_loop (struct loop *loop)
1249 {
1250   vec<edge> exits = get_loop_exit_edges (loop);
1251   edge exit;
1252   gcall *call;
1253   gimple_stmt_iterator bsi;
1254   unsigned i;
1255
1256   FOR_EACH_VEC_ELT (exits, i, exit)
1257     {
1258       call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1259
1260       if (!single_pred_p (exit->dest)
1261           /* If possible, we prefer not to insert the fence on other paths
1262              in cfg.  */
1263           && !(exit->flags & EDGE_ABNORMAL))
1264         split_loop_exit_edge (exit);
1265       bsi = gsi_after_labels (exit->dest);
1266
1267       gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1268     }
1269
1270   exits.release ();
1271   update_ssa (TODO_update_ssa_only_virtuals);
1272 }
1273
1274 /* Returns true if we can use storent in loop, false otherwise.  */
1275
1276 static bool
1277 may_use_storent_in_loop_p (struct loop *loop)
1278 {
1279   bool ret = true;
1280
1281   if (loop->inner != NULL)
1282     return false;
1283
1284   /* If we must issue a mfence insn after using storent, check that there
1285      is a suitable place for it at each of the loop exits.  */
1286   if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1287     {
1288       vec<edge> exits = get_loop_exit_edges (loop);
1289       unsigned i;
1290       edge exit;
1291
1292       FOR_EACH_VEC_ELT (exits, i, exit)
1293         if ((exit->flags & EDGE_ABNORMAL)
1294             && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1295           ret = false;
1296
1297       exits.release ();
1298     }
1299
1300   return ret;
1301 }
1302
1303 /* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1304    references in the loop.  */
1305
1306 static void
1307 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1308 {
1309   struct mem_ref *ref;
1310   bool any = false;
1311
1312   if (!may_use_storent_in_loop_p (loop))
1313     return;
1314
1315   for (; groups; groups = groups->next)
1316     for (ref = groups->refs; ref; ref = ref->next)
1317       any |= mark_nontemporal_store (ref);
1318
1319   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1320     emit_mfence_after_loop (loop);
1321 }
1322
1323 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1324    this is the case, fill in DESC by the description of number of
1325    iterations.  */
1326
1327 static bool
1328 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1329                       unsigned factor)
1330 {
1331   if (!can_unroll_loop_p (loop, factor, desc))
1332     return false;
1333
1334   /* We only consider loops without control flow for unrolling.  This is not
1335      a hard restriction -- tree_unroll_loop works with arbitrary loops
1336      as well; but the unrolling/prefetching is usually more profitable for
1337      loops consisting of a single basic block, and we want to limit the
1338      code growth.  */
1339   if (loop->num_nodes > 2)
1340     return false;
1341
1342   return true;
1343 }
1344
1345 /* Determine the coefficient by that unroll LOOP, from the information
1346    contained in the list of memory references REFS.  Description of
1347    umber of iterations of LOOP is stored to DESC.  NINSNS is the number of
1348    insns of the LOOP.  EST_NITER is the estimated number of iterations of
1349    the loop, or -1 if no estimate is available.  */
1350
1351 static unsigned
1352 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1353                          unsigned ninsns, struct tree_niter_desc *desc,
1354                          HOST_WIDE_INT est_niter)
1355 {
1356   unsigned upper_bound;
1357   unsigned nfactor, factor, mod_constraint;
1358   struct mem_ref_group *agp;
1359   struct mem_ref *ref;
1360
1361   /* First check whether the loop is not too large to unroll.  We ignore
1362      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1363      from unrolling them enough to make exactly one cache line covered by each
1364      iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1365      us from unrolling the loops too many times in cases where we only expect
1366      gains from better scheduling and decreasing loop overhead, which is not
1367      the case here.  */
1368   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1369
1370   /* If we unrolled the loop more times than it iterates, the unrolled version
1371      of the loop would be never entered.  */
1372   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1373     upper_bound = est_niter;
1374
1375   if (upper_bound <= 1)
1376     return 1;
1377
1378   /* Choose the factor so that we may prefetch each cache just once,
1379      but bound the unrolling by UPPER_BOUND.  */
1380   factor = 1;
1381   for (agp = refs; agp; agp = agp->next)
1382     for (ref = agp->refs; ref; ref = ref->next)
1383       if (should_issue_prefetch_p (ref))
1384         {
1385           mod_constraint = ref->prefetch_mod;
1386           nfactor = least_common_multiple (mod_constraint, factor);
1387           if (nfactor <= upper_bound)
1388             factor = nfactor;
1389         }
1390
1391   if (!should_unroll_loop_p (loop, desc, factor))
1392     return 1;
1393
1394   return factor;
1395 }
1396
1397 /* Returns the total volume of the memory references REFS, taking into account
1398    reuses in the innermost loop and cache line size.  TODO -- we should also
1399    take into account reuses across the iterations of the loops in the loop
1400    nest.  */
1401
1402 static unsigned
1403 volume_of_references (struct mem_ref_group *refs)
1404 {
1405   unsigned volume = 0;
1406   struct mem_ref_group *gr;
1407   struct mem_ref *ref;
1408
1409   for (gr = refs; gr; gr = gr->next)
1410     for (ref = gr->refs; ref; ref = ref->next)
1411       {
1412         /* Almost always reuses another value?  */
1413         if (ref->prefetch_before != PREFETCH_ALL)
1414           continue;
1415
1416         /* If several iterations access the same cache line, use the size of
1417            the line divided by this number.  Otherwise, a cache line is
1418            accessed in each iteration.  TODO -- in the latter case, we should
1419            take the size of the reference into account, rounding it up on cache
1420            line size multiple.  */
1421         volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1422       }
1423   return volume;
1424 }
1425
1426 /* Returns the volume of memory references accessed across VEC iterations of
1427    loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1428    of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1429
1430 static unsigned
1431 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1432 {
1433   unsigned i;
1434
1435   for (i = 0; i < n; i++)
1436     if (vec[i] != 0)
1437       break;
1438
1439   if (i == n)
1440     return 0;
1441
1442   gcc_assert (vec[i] > 0);
1443
1444   /* We ignore the parts of the distance vector in subloops, since usually
1445      the numbers of iterations are much smaller.  */
1446   return loop_sizes[i] * vec[i];
1447 }
1448
1449 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1450    at the position corresponding to the loop of the step.  N is the depth
1451    of the considered loop nest, and, LOOP is its innermost loop.  */
1452
1453 static void
1454 add_subscript_strides (tree access_fn, unsigned stride,
1455                        HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1456 {
1457   struct loop *aloop;
1458   tree step;
1459   HOST_WIDE_INT astep;
1460   unsigned min_depth = loop_depth (loop) - n;
1461
1462   while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1463     {
1464       aloop = get_chrec_loop (access_fn);
1465       step = CHREC_RIGHT (access_fn);
1466       access_fn = CHREC_LEFT (access_fn);
1467
1468       if ((unsigned) loop_depth (aloop) <= min_depth)
1469         continue;
1470
1471       if (tree_fits_shwi_p (step))
1472         astep = tree_to_shwi (step);
1473       else
1474         astep = L1_CACHE_LINE_SIZE;
1475
1476       strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1477
1478     }
1479 }
1480
1481 /* Returns the volume of memory references accessed between two consecutive
1482    self-reuses of the reference DR.  We consider the subscripts of DR in N
1483    loops, and LOOP_SIZES contains the volumes of accesses in each of the
1484    loops.  LOOP is the innermost loop of the current loop nest.  */
1485
1486 static unsigned
1487 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1488                      struct loop *loop)
1489 {
1490   tree stride, access_fn;
1491   HOST_WIDE_INT *strides, astride;
1492   vec<tree> access_fns;
1493   tree ref = DR_REF (dr);
1494   unsigned i, ret = ~0u;
1495
1496   /* In the following example:
1497
1498      for (i = 0; i < N; i++)
1499        for (j = 0; j < N; j++)
1500          use (a[j][i]);
1501      the same cache line is accessed each N steps (except if the change from
1502      i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1503      we cannot rely purely on the results of the data dependence analysis.
1504
1505      Instead, we compute the stride of the reference in each loop, and consider
1506      the innermost loop in that the stride is less than cache size.  */
1507
1508   strides = XCNEWVEC (HOST_WIDE_INT, n);
1509   access_fns = DR_ACCESS_FNS (dr);
1510
1511   FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1512     {
1513       /* Keep track of the reference corresponding to the subscript, so that we
1514          know its stride.  */
1515       while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1516         ref = TREE_OPERAND (ref, 0);
1517
1518       if (TREE_CODE (ref) == ARRAY_REF)
1519         {
1520           stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1521           if (tree_fits_uhwi_p (stride))
1522             astride = tree_to_uhwi (stride);
1523           else
1524             astride = L1_CACHE_LINE_SIZE;
1525
1526           ref = TREE_OPERAND (ref, 0);
1527         }
1528       else
1529         astride = 1;
1530
1531       add_subscript_strides (access_fn, astride, strides, n, loop);
1532     }
1533
1534   for (i = n; i-- > 0; )
1535     {
1536       unsigned HOST_WIDE_INT s;
1537
1538       s = strides[i] < 0 ?  -strides[i] : strides[i];
1539
1540       if (s < (unsigned) L1_CACHE_LINE_SIZE
1541           && (loop_sizes[i]
1542               > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1543         {
1544           ret = loop_sizes[i];
1545           break;
1546         }
1547     }
1548
1549   free (strides);
1550   return ret;
1551 }
1552
1553 /* Determines the distance till the first reuse of each reference in REFS
1554    in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1555    memory references in the loop.  Return false if the analysis fails.  */
1556
1557 static bool
1558 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1559                            bool no_other_refs)
1560 {
1561   struct loop *nest, *aloop;
1562   vec<data_reference_p> datarefs = vNULL;
1563   vec<ddr_p> dependences = vNULL;
1564   struct mem_ref_group *gr;
1565   struct mem_ref *ref, *refb;
1566   vec<loop_p> vloops = vNULL;
1567   unsigned *loop_data_size;
1568   unsigned i, j, n;
1569   unsigned volume, dist, adist;
1570   HOST_WIDE_INT vol;
1571   data_reference_p dr;
1572   ddr_p dep;
1573
1574   if (loop->inner)
1575     return true;
1576
1577   /* Find the outermost loop of the loop nest of loop (we require that
1578      there are no sibling loops inside the nest).  */
1579   nest = loop;
1580   while (1)
1581     {
1582       aloop = loop_outer (nest);
1583
1584       if (aloop == current_loops->tree_root
1585           || aloop->inner->next)
1586         break;
1587
1588       nest = aloop;
1589     }
1590
1591   /* For each loop, determine the amount of data accessed in each iteration.
1592      We use this to estimate whether the reference is evicted from the
1593      cache before its reuse.  */
1594   find_loop_nest (nest, &vloops);
1595   n = vloops.length ();
1596   loop_data_size = XNEWVEC (unsigned, n);
1597   volume = volume_of_references (refs);
1598   i = n;
1599   while (i-- != 0)
1600     {
1601       loop_data_size[i] = volume;
1602       /* Bound the volume by the L2 cache size, since above this bound,
1603          all dependence distances are equivalent.  */
1604       if (volume > L2_CACHE_SIZE_BYTES)
1605         continue;
1606
1607       aloop = vloops[i];
1608       vol = estimated_stmt_executions_int (aloop);
1609       if (vol == -1)
1610         vol = expected_loop_iterations (aloop);
1611       volume *= vol;
1612     }
1613
1614   /* Prepare the references in the form suitable for data dependence
1615      analysis.  We ignore unanalyzable data references (the results
1616      are used just as a heuristics to estimate temporality of the
1617      references, hence we do not need to worry about correctness).  */
1618   for (gr = refs; gr; gr = gr->next)
1619     for (ref = gr->refs; ref; ref = ref->next)
1620       {
1621         dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1622                               ref->mem, ref->stmt, !ref->write_p);
1623
1624         if (dr)
1625           {
1626             ref->reuse_distance = volume;
1627             dr->aux = ref;
1628             datarefs.safe_push (dr);
1629           }
1630         else
1631           no_other_refs = false;
1632       }
1633
1634   FOR_EACH_VEC_ELT (datarefs, i, dr)
1635     {
1636       dist = self_reuse_distance (dr, loop_data_size, n, loop);
1637       ref = (struct mem_ref *) dr->aux;
1638       if (ref->reuse_distance > dist)
1639         ref->reuse_distance = dist;
1640
1641       if (no_other_refs)
1642         ref->independent_p = true;
1643     }
1644
1645   if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1646     return false;
1647
1648   FOR_EACH_VEC_ELT (dependences, i, dep)
1649     {
1650       if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1651         continue;
1652
1653       ref = (struct mem_ref *) DDR_A (dep)->aux;
1654       refb = (struct mem_ref *) DDR_B (dep)->aux;
1655
1656       if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1657           || DDR_NUM_DIST_VECTS (dep) == 0)
1658         {
1659           /* If the dependence cannot be analyzed, assume that there might be
1660              a reuse.  */
1661           dist = 0;
1662
1663           ref->independent_p = false;
1664           refb->independent_p = false;
1665         }
1666       else
1667         {
1668           /* The distance vectors are normalized to be always lexicographically
1669              positive, hence we cannot tell just from them whether DDR_A comes
1670              before DDR_B or vice versa.  However, it is not important,
1671              anyway -- if DDR_A is close to DDR_B, then it is either reused in
1672              DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1673              in cache (and marking it as nontemporal would not affect
1674              anything).  */
1675
1676           dist = volume;
1677           for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1678             {
1679               adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1680                                              loop_data_size, n);
1681
1682               /* If this is a dependence in the innermost loop (i.e., the
1683                  distances in all superloops are zero) and it is not
1684                  the trivial self-dependence with distance zero, record that
1685                  the references are not completely independent.  */
1686               if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1687                   && (ref != refb
1688                       || DDR_DIST_VECT (dep, j)[n-1] != 0))
1689                 {
1690                   ref->independent_p = false;
1691                   refb->independent_p = false;
1692                 }
1693
1694               /* Ignore accesses closer than
1695                  L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1696                  so that we use nontemporal prefetches e.g. if single memory
1697                  location is accessed several times in a single iteration of
1698                  the loop.  */
1699               if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1700                 continue;
1701
1702               if (adist < dist)
1703                 dist = adist;
1704             }
1705         }
1706
1707       if (ref->reuse_distance > dist)
1708         ref->reuse_distance = dist;
1709       if (refb->reuse_distance > dist)
1710         refb->reuse_distance = dist;
1711     }
1712
1713   free_dependence_relations (dependences);
1714   free_data_refs (datarefs);
1715   free (loop_data_size);
1716
1717   if (dump_file && (dump_flags & TDF_DETAILS))
1718     {
1719       fprintf (dump_file, "Reuse distances:\n");
1720       for (gr = refs; gr; gr = gr->next)
1721         for (ref = gr->refs; ref; ref = ref->next)
1722           fprintf (dump_file, " ref %p distance %u\n",
1723                    (void *) ref, ref->reuse_distance);
1724     }
1725
1726   return true;
1727 }
1728
1729 /* Determine whether or not the trip count to ahead ratio is too small based
1730    on prefitablility consideration.
1731    AHEAD: the iteration ahead distance,
1732    EST_NITER: the estimated trip count.  */
1733
1734 static bool
1735 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1736 {
1737   /* Assume trip count to ahead ratio is big enough if the trip count could not
1738      be estimated at compile time.  */
1739   if (est_niter < 0)
1740     return false;
1741
1742   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1743     {
1744       if (dump_file && (dump_flags & TDF_DETAILS))
1745         fprintf (dump_file,
1746                  "Not prefetching -- loop estimated to roll only %d times\n",
1747                  (int) est_niter);
1748       return true;
1749     }
1750
1751   return false;
1752 }
1753
1754 /* Determine whether or not the number of memory references in the loop is
1755    reasonable based on the profitablity and compilation time considerations.
1756    NINSNS: estimated number of instructions in the loop,
1757    MEM_REF_COUNT: total number of memory references in the loop.  */
1758
1759 static bool
1760 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1761 {
1762   int insn_to_mem_ratio;
1763
1764   if (mem_ref_count == 0)
1765     return false;
1766
1767   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1768      (compute_all_dependences) have high costs based on quadratic complexity.
1769      To avoid huge compilation time, we give up prefetching if mem_ref_count
1770      is too large.  */
1771   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1772     return false;
1773
1774   /* Prefetching improves performance by overlapping cache missing
1775      memory accesses with CPU operations.  If the loop does not have
1776      enough CPU operations to overlap with memory operations, prefetching
1777      won't give a significant benefit.  One approximate way of checking
1778      this is to require the ratio of instructions to memory references to
1779      be above a certain limit.  This approximation works well in practice.
1780      TODO: Implement a more precise computation by estimating the time
1781      for each CPU or memory op in the loop. Time estimates for memory ops
1782      should account for cache misses.  */
1783   insn_to_mem_ratio = ninsns / mem_ref_count;
1784
1785   if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1786     {
1787       if (dump_file && (dump_flags & TDF_DETAILS))
1788         fprintf (dump_file,
1789                  "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1790                  insn_to_mem_ratio);
1791       return false;
1792     }
1793
1794   return true;
1795 }
1796
1797 /* Determine whether or not the instruction to prefetch ratio in the loop is
1798    too small based on the profitablity consideration.
1799    NINSNS: estimated number of instructions in the loop,
1800    PREFETCH_COUNT: an estimate of the number of prefetches,
1801    UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
1802
1803 static bool
1804 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1805                                      unsigned unroll_factor)
1806 {
1807   int insn_to_prefetch_ratio;
1808
1809   /* Prefetching most likely causes performance degradation when the instruction
1810      to prefetch ratio is too small.  Too many prefetch instructions in a loop
1811      may reduce the I-cache performance.
1812      (unroll_factor * ninsns) is used to estimate the number of instructions in
1813      the unrolled loop.  This implementation is a bit simplistic -- the number
1814      of issued prefetch instructions is also affected by unrolling.  So,
1815      prefetch_mod and the unroll factor should be taken into account when
1816      determining prefetch_count.  Also, the number of insns of the unrolled
1817      loop will usually be significantly smaller than the number of insns of the
1818      original loop * unroll_factor (at least the induction variable increases
1819      and the exit branches will get eliminated), so it might be better to use
1820      tree_estimate_loop_size + estimated_unrolled_size.  */
1821   insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1822   if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1823     {
1824       if (dump_file && (dump_flags & TDF_DETAILS))
1825         fprintf (dump_file,
1826                  "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1827                  insn_to_prefetch_ratio);
1828       return true;
1829     }
1830
1831   return false;
1832 }
1833
1834
1835 /* Issue prefetch instructions for array references in LOOP.  Returns
1836    true if the LOOP was unrolled.  */
1837
1838 static bool
1839 loop_prefetch_arrays (struct loop *loop)
1840 {
1841   struct mem_ref_group *refs;
1842   unsigned ahead, ninsns, time, unroll_factor;
1843   HOST_WIDE_INT est_niter;
1844   struct tree_niter_desc desc;
1845   bool unrolled = false, no_other_refs;
1846   unsigned prefetch_count;
1847   unsigned mem_ref_count;
1848
1849   if (optimize_loop_nest_for_size_p (loop))
1850     {
1851       if (dump_file && (dump_flags & TDF_DETAILS))
1852         fprintf (dump_file, "  ignored (cold area)\n");
1853       return false;
1854     }
1855
1856   /* FIXME: the time should be weighted by the probabilities of the blocks in
1857      the loop body.  */
1858   time = tree_num_loop_insns (loop, &eni_time_weights);
1859   if (time == 0)
1860     return false;
1861
1862   ahead = (PREFETCH_LATENCY + time - 1) / time;
1863   est_niter = estimated_stmt_executions_int (loop);
1864   if (est_niter == -1)
1865     est_niter = max_stmt_executions_int (loop);
1866
1867   /* Prefetching is not likely to be profitable if the trip count to ahead
1868      ratio is too small.  */
1869   if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1870     return false;
1871
1872   ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1873
1874   /* Step 1: gather the memory references.  */
1875   refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1876
1877   /* Give up prefetching if the number of memory references in the
1878      loop is not reasonable based on profitablity and compilation time
1879      considerations.  */
1880   if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1881     goto fail;
1882
1883   /* Step 2: estimate the reuse effects.  */
1884   prune_by_reuse (refs);
1885
1886   if (nothing_to_prefetch_p (refs))
1887     goto fail;
1888
1889   if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1890     goto fail;
1891
1892   /* Step 3: determine unroll factor.  */
1893   unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1894                                            est_niter);
1895
1896   /* Estimate prefetch count for the unrolled loop.  */
1897   prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1898   if (prefetch_count == 0)
1899     goto fail;
1900
1901   if (dump_file && (dump_flags & TDF_DETAILS))
1902     fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1903              HOST_WIDE_INT_PRINT_DEC "\n"
1904              "insn count %d, mem ref count %d, prefetch count %d\n",
1905              ahead, unroll_factor, est_niter,
1906              ninsns, mem_ref_count, prefetch_count);
1907
1908   /* Prefetching is not likely to be profitable if the instruction to prefetch
1909      ratio is too small.  */
1910   if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1911                                           unroll_factor))
1912     goto fail;
1913
1914   mark_nontemporal_stores (loop, refs);
1915
1916   /* Step 4: what to prefetch?  */
1917   if (!schedule_prefetches (refs, unroll_factor, ahead))
1918     goto fail;
1919
1920   /* Step 5: unroll the loop.  TODO -- peeling of first and last few
1921      iterations so that we do not issue superfluous prefetches.  */
1922   if (unroll_factor != 1)
1923     {
1924       tree_unroll_loop (loop, unroll_factor,
1925                         single_dom_exit (loop), &desc);
1926       unrolled = true;
1927     }
1928
1929   /* Step 6: issue the prefetches.  */
1930   issue_prefetches (refs, unroll_factor, ahead);
1931
1932 fail:
1933   release_mem_refs (refs);
1934   return unrolled;
1935 }
1936
1937 /* Issue prefetch instructions for array references in loops.  */
1938
1939 unsigned int
1940 tree_ssa_prefetch_arrays (void)
1941 {
1942   struct loop *loop;
1943   bool unrolled = false;
1944   int todo_flags = 0;
1945
1946   if (!targetm.have_prefetch ()
1947       /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1948          -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1949          of processor costs and i486 does not have prefetch, but
1950          -march=pentium4 causes targetm.have_prefetch to be true.  Ugh.  */
1951       || PREFETCH_BLOCK == 0)
1952     return 0;
1953
1954   if (dump_file && (dump_flags & TDF_DETAILS))
1955     {
1956       fprintf (dump_file, "Prefetching parameters:\n");
1957       fprintf (dump_file, "    simultaneous prefetches: %d\n",
1958                SIMULTANEOUS_PREFETCHES);
1959       fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
1960       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
1961       fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
1962                L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1963       fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1964       fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
1965       fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
1966                MIN_INSN_TO_PREFETCH_RATIO);
1967       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
1968                PREFETCH_MIN_INSN_TO_MEM_RATIO);
1969       fprintf (dump_file, "\n");
1970     }
1971
1972   initialize_original_copy_tables ();
1973
1974   if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1975     {
1976       tree type = build_function_type_list (void_type_node,
1977                                             const_ptr_type_node, NULL_TREE);
1978       tree decl = add_builtin_function ("__builtin_prefetch", type,
1979                                         BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1980                                         NULL, NULL_TREE);
1981       DECL_IS_NOVOPS (decl) = true;
1982       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1983     }
1984
1985   /* We assume that size of cache line is a power of two, so verify this
1986      here.  */
1987   gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1988
1989   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1990     {
1991       if (dump_file && (dump_flags & TDF_DETAILS))
1992         fprintf (dump_file, "Processing loop %d:\n", loop->num);
1993
1994       unrolled |= loop_prefetch_arrays (loop);
1995
1996       if (dump_file && (dump_flags & TDF_DETAILS))
1997         fprintf (dump_file, "\n\n");
1998     }
1999
2000   if (unrolled)
2001     {
2002       scev_reset ();
2003       todo_flags |= TODO_cleanup_cfg;
2004     }
2005
2006   free_original_copy_tables ();
2007   return todo_flags;
2008 }
2009
2010 /* Prefetching.  */
2011
2012 namespace {
2013
2014 const pass_data pass_data_loop_prefetch =
2015 {
2016   GIMPLE_PASS, /* type */
2017   "aprefetch", /* name */
2018   OPTGROUP_LOOP, /* optinfo_flags */
2019   TV_TREE_PREFETCH, /* tv_id */
2020   ( PROP_cfg | PROP_ssa ), /* properties_required */
2021   0, /* properties_provided */
2022   0, /* properties_destroyed */
2023   0, /* todo_flags_start */
2024   0, /* todo_flags_finish */
2025 };
2026
2027 class pass_loop_prefetch : public gimple_opt_pass
2028 {
2029 public:
2030   pass_loop_prefetch (gcc::context *ctxt)
2031     : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2032   {}
2033
2034   /* opt_pass methods: */
2035   virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2036   virtual unsigned int execute (function *);
2037
2038 }; // class pass_loop_prefetch
2039
2040 unsigned int
2041 pass_loop_prefetch::execute (function *fun)
2042 {
2043   if (number_of_loops (fun) <= 1)
2044     return 0;
2045
2046   return tree_ssa_prefetch_arrays ();
2047 }
2048
2049 } // anon namespace
2050
2051 gimple_opt_pass *
2052 make_pass_loop_prefetch (gcc::context *ctxt)
2053 {
2054   return new pass_loop_prefetch (ctxt);
2055 }
2056
2057