PR libstdc++/67078
[official-gcc.git] / gcc / tree-ssa-loop-prefetch.c
blob1adaed598c934e846b68ac85df1c9c5c747f5ec6
1 /* Array prefetching.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "predict.h"
25 #include "tree.h"
26 #include "gimple.h"
27 #include "rtl.h"
28 #include "alias.h"
29 #include "fold-const.h"
30 #include "stor-layout.h"
31 #include "tm_p.h"
32 #include "tree-pretty-print.h"
33 #include "internal-fn.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-ssa.h"
38 #include "tree-ssa-loop-ivopts.h"
39 #include "tree-ssa-loop-manip.h"
40 #include "tree-ssa-loop-niter.h"
41 #include "tree-ssa-loop.h"
42 #include "tree-into-ssa.h"
43 #include "cfgloop.h"
44 #include "tree-pass.h"
45 #include "insn-config.h"
46 #include "tree-chrec.h"
47 #include "tree-scalar-evolution.h"
48 #include "diagnostic-core.h"
49 #include "params.h"
50 #include "langhooks.h"
51 #include "tree-inline.h"
52 #include "tree-data-ref.h"
53 #include "target.h"
56 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
57 between the GIMPLE and RTL worlds. */
58 #include "flags.h"
59 #include "expmed.h"
60 #include "dojump.h"
61 #include "explow.h"
62 #include "calls.h"
63 #include "emit-rtl.h"
64 #include "varasm.h"
65 #include "stmt.h"
66 #include "expr.h"
67 #include "insn-codes.h"
68 #include "optabs.h"
69 #include "recog.h"
71 /* This pass inserts prefetch instructions to optimize cache usage during
72 accesses to arrays in loops. It processes loops sequentially and:
74 1) Gathers all memory references in the single loop.
75 2) For each of the references it decides when it is profitable to prefetch
76 it. To do it, we evaluate the reuse among the accesses, and determines
77 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
78 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
79 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
80 iterations of the loop that are zero modulo PREFETCH_MOD). For example
81 (assuming cache line size is 64 bytes, char has size 1 byte and there
82 is no hardware sequential prefetch):
84 char *a;
85 for (i = 0; i < max; i++)
87 a[255] = ...; (0)
88 a[i] = ...; (1)
89 a[i + 64] = ...; (2)
90 a[16*i] = ...; (3)
91 a[187*i] = ...; (4)
92 a[187*i + 50] = ...; (5)
95 (0) obviously has PREFETCH_BEFORE 1
96 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
97 location 64 iterations before it, and PREFETCH_MOD 64 (since
98 it hits the same cache line otherwise).
99 (2) has PREFETCH_MOD 64
100 (3) has PREFETCH_MOD 4
101 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
102 the cache line accessed by (5) is the same with probability only
103 7/32.
104 (5) has PREFETCH_MOD 1 as well.
106 Additionally, we use data dependence analysis to determine for each
107 reference the distance till the first reuse; this information is used
108 to determine the temporality of the issued prefetch instruction.
110 3) We determine how much ahead we need to prefetch. The number of
111 iterations needed is time to fetch / time spent in one iteration of
112 the loop. The problem is that we do not know either of these values,
113 so we just make a heuristic guess based on a magic (possibly)
114 target-specific constant and size of the loop.
116 4) Determine which of the references we prefetch. We take into account
117 that there is a maximum number of simultaneous prefetches (provided
118 by machine description). We prefetch as many prefetches as possible
119 while still within this bound (starting with those with lowest
120 prefetch_mod, since they are responsible for most of the cache
121 misses).
123 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
124 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
125 prefetching nonaccessed memory.
126 TODO -- actually implement peeling.
128 6) We actually emit the prefetch instructions. ??? Perhaps emit the
129 prefetch instructions with guards in cases where 5) was not sufficient
130 to satisfy the constraints?
132 A cost model is implemented to determine whether or not prefetching is
133 profitable for a given loop. The cost model has three heuristics:
135 1. Function trip_count_to_ahead_ratio_too_small_p implements a
136 heuristic that determines whether or not the loop has too few
137 iterations (compared to ahead). Prefetching is not likely to be
138 beneficial if the trip count to ahead ratio is below a certain
139 minimum.
141 2. Function mem_ref_count_reasonable_p implements a heuristic that
142 determines whether the given loop has enough CPU ops that can be
143 overlapped with cache missing memory ops. If not, the loop
144 won't benefit from prefetching. In the implementation,
145 prefetching is not considered beneficial if the ratio between
146 the instruction count and the mem ref count is below a certain
147 minimum.
149 3. Function insn_to_prefetch_ratio_too_small_p implements a
150 heuristic that disables prefetching in a loop if the prefetching
151 cost is above a certain limit. The relative prefetching cost is
152 estimated by taking the ratio between the prefetch count and the
153 total intruction count (this models the I-cache cost).
155 The limits used in these heuristics are defined as parameters with
156 reasonable default values. Machine-specific default values will be
157 added later.
159 Some other TODO:
160 -- write and use more general reuse analysis (that could be also used
161 in other cache aimed loop optimizations)
162 -- make it behave sanely together with the prefetches given by user
163 (now we just ignore them; at the very least we should avoid
164 optimizing loops in that user put his own prefetches)
165 -- we assume cache line size alignment of arrays; this could be
166 improved. */
168 /* Magic constants follow. These should be replaced by machine specific
169 numbers. */
171 /* True if write can be prefetched by a read prefetch. */
173 #ifndef WRITE_CAN_USE_READ_PREFETCH
174 #define WRITE_CAN_USE_READ_PREFETCH 1
175 #endif
177 /* True if read can be prefetched by a write prefetch. */
179 #ifndef READ_CAN_USE_WRITE_PREFETCH
180 #define READ_CAN_USE_WRITE_PREFETCH 0
181 #endif
183 /* The size of the block loaded by a single prefetch. Usually, this is
184 the same as cache line size (at the moment, we only consider one level
185 of cache hierarchy). */
187 #ifndef PREFETCH_BLOCK
188 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
189 #endif
191 /* Do we have a forward hardware sequential prefetching? */
193 #ifndef HAVE_FORWARD_PREFETCH
194 #define HAVE_FORWARD_PREFETCH 0
195 #endif
197 /* Do we have a backward hardware sequential prefetching? */
199 #ifndef HAVE_BACKWARD_PREFETCH
200 #define HAVE_BACKWARD_PREFETCH 0
201 #endif
203 /* In some cases we are only able to determine that there is a certain
204 probability that the two accesses hit the same cache line. In this
205 case, we issue the prefetches for both of them if this probability
206 is less then (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
208 #ifndef ACCEPTABLE_MISS_RATE
209 #define ACCEPTABLE_MISS_RATE 50
210 #endif
212 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
213 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
215 /* We consider a memory access nontemporal if it is not reused sooner than
216 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
217 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
218 so that we use nontemporal prefetches e.g. if single memory location
219 is accessed several times in a single iteration of the loop. */
220 #define NONTEMPORAL_FRACTION 16
222 /* In case we have to emit a memory fence instruction after the loop that
223 uses nontemporal stores, this defines the builtin to use. */
225 #ifndef FENCE_FOLLOWING_MOVNT
226 #define FENCE_FOLLOWING_MOVNT NULL_TREE
227 #endif
229 /* It is not profitable to prefetch when the trip count is not at
230 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
231 For example, in a loop with a prefetch ahead distance of 10,
232 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
233 profitable to prefetch when the trip count is greater or equal to
234 40. In that case, 30 out of the 40 iterations will benefit from
235 prefetching. */
237 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
238 #define TRIP_COUNT_TO_AHEAD_RATIO 4
239 #endif
241 /* The group of references between that reuse may occur. */
243 struct mem_ref_group
245 tree base; /* Base of the reference. */
246 tree step; /* Step of the reference. */
247 struct mem_ref *refs; /* References in the group. */
248 struct mem_ref_group *next; /* Next group of references. */
251 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
253 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
255 /* Do not generate a prefetch if the unroll factor is significantly less
256 than what is required by the prefetch. This is to avoid redundant
257 prefetches. For example, when prefetch_mod is 16 and unroll_factor is
258 2, prefetching requires unrolling the loop 16 times, but
259 the loop is actually unrolled twice. In this case (ratio = 8),
260 prefetching is not likely to be beneficial. */
262 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
263 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
264 #endif
266 /* Some of the prefetch computations have quadratic complexity. We want to
267 avoid huge compile times and, therefore, want to limit the amount of
268 memory references per loop where we consider prefetching. */
270 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
271 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
272 #endif
274 /* The memory reference. */
276 struct mem_ref
278 gimple stmt; /* Statement in that the reference appears. */
279 tree mem; /* The reference. */
280 HOST_WIDE_INT delta; /* Constant offset of the reference. */
281 struct mem_ref_group *group; /* The group of references it belongs to. */
282 unsigned HOST_WIDE_INT prefetch_mod;
283 /* Prefetch only each PREFETCH_MOD-th
284 iteration. */
285 unsigned HOST_WIDE_INT prefetch_before;
286 /* Prefetch only first PREFETCH_BEFORE
287 iterations. */
288 unsigned reuse_distance; /* The amount of data accessed before the first
289 reuse of this value. */
290 struct mem_ref *next; /* The next reference in the group. */
291 unsigned write_p : 1; /* Is it a write? */
292 unsigned independent_p : 1; /* True if the reference is independent on
293 all other references inside the loop. */
294 unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
295 unsigned storent_p : 1; /* True if we changed the store to a
296 nontemporal one. */
299 /* Dumps information about memory reference */
300 static void
301 dump_mem_details (FILE *file, tree base, tree step,
302 HOST_WIDE_INT delta, bool write_p)
304 fprintf (file, "(base ");
305 print_generic_expr (file, base, TDF_SLIM);
306 fprintf (file, ", step ");
307 if (cst_and_fits_in_hwi (step))
308 fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
309 else
310 print_generic_expr (file, step, TDF_TREE);
311 fprintf (file, ")\n");
312 fprintf (file, " delta ");
313 fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
314 fprintf (file, "\n");
315 fprintf (file, " %s\n", write_p ? "write" : "read");
316 fprintf (file, "\n");
319 /* Dumps information about reference REF to FILE. */
321 static void
322 dump_mem_ref (FILE *file, struct mem_ref *ref)
324 fprintf (file, "Reference %p:\n", (void *) ref);
326 fprintf (file, " group %p ", (void *) ref->group);
328 dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
329 ref->write_p);
332 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
333 exist. */
335 static struct mem_ref_group *
336 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
338 struct mem_ref_group *group;
340 for (; *groups; groups = &(*groups)->next)
342 if (operand_equal_p ((*groups)->step, step, 0)
343 && operand_equal_p ((*groups)->base, base, 0))
344 return *groups;
346 /* If step is an integer constant, keep the list of groups sorted
347 by decreasing step. */
348 if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
349 && int_cst_value ((*groups)->step) < int_cst_value (step))
350 break;
353 group = XNEW (struct mem_ref_group);
354 group->base = base;
355 group->step = step;
356 group->refs = NULL;
357 group->next = *groups;
358 *groups = group;
360 return group;
363 /* Records a memory reference MEM in GROUP with offset DELTA and write status
364 WRITE_P. The reference occurs in statement STMT. */
366 static void
367 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
368 HOST_WIDE_INT delta, bool write_p)
370 struct mem_ref **aref;
372 /* Do not record the same address twice. */
373 for (aref = &group->refs; *aref; aref = &(*aref)->next)
375 /* It does not have to be possible for write reference to reuse the read
376 prefetch, or vice versa. */
377 if (!WRITE_CAN_USE_READ_PREFETCH
378 && write_p
379 && !(*aref)->write_p)
380 continue;
381 if (!READ_CAN_USE_WRITE_PREFETCH
382 && !write_p
383 && (*aref)->write_p)
384 continue;
386 if ((*aref)->delta == delta)
387 return;
390 (*aref) = XNEW (struct mem_ref);
391 (*aref)->stmt = stmt;
392 (*aref)->mem = mem;
393 (*aref)->delta = delta;
394 (*aref)->write_p = write_p;
395 (*aref)->prefetch_before = PREFETCH_ALL;
396 (*aref)->prefetch_mod = 1;
397 (*aref)->reuse_distance = 0;
398 (*aref)->issue_prefetch_p = false;
399 (*aref)->group = group;
400 (*aref)->next = NULL;
401 (*aref)->independent_p = false;
402 (*aref)->storent_p = false;
404 if (dump_file && (dump_flags & TDF_DETAILS))
405 dump_mem_ref (dump_file, *aref);
408 /* Release memory references in GROUPS. */
410 static void
411 release_mem_refs (struct mem_ref_group *groups)
413 struct mem_ref_group *next_g;
414 struct mem_ref *ref, *next_r;
416 for (; groups; groups = next_g)
418 next_g = groups->next;
419 for (ref = groups->refs; ref; ref = next_r)
421 next_r = ref->next;
422 free (ref);
424 free (groups);
428 /* A structure used to pass arguments to idx_analyze_ref. */
430 struct ar_data
432 struct loop *loop; /* Loop of the reference. */
433 gimple stmt; /* Statement of the reference. */
434 tree *step; /* Step of the memory reference. */
435 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
438 /* Analyzes a single INDEX of a memory reference to obtain information
439 described at analyze_ref. Callback for for_each_index. */
441 static bool
442 idx_analyze_ref (tree base, tree *index, void *data)
444 struct ar_data *ar_data = (struct ar_data *) data;
445 tree ibase, step, stepsize;
446 HOST_WIDE_INT idelta = 0, imult = 1;
447 affine_iv iv;
449 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
450 *index, &iv, true))
451 return false;
452 ibase = iv.base;
453 step = iv.step;
455 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
456 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
458 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
459 ibase = TREE_OPERAND (ibase, 0);
461 if (cst_and_fits_in_hwi (ibase))
463 idelta += int_cst_value (ibase);
464 ibase = build_int_cst (TREE_TYPE (ibase), 0);
467 if (TREE_CODE (base) == ARRAY_REF)
469 stepsize = array_ref_element_size (base);
470 if (!cst_and_fits_in_hwi (stepsize))
471 return false;
472 imult = int_cst_value (stepsize);
473 step = fold_build2 (MULT_EXPR, sizetype,
474 fold_convert (sizetype, step),
475 fold_convert (sizetype, stepsize));
476 idelta *= imult;
479 if (*ar_data->step == NULL_TREE)
480 *ar_data->step = step;
481 else
482 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
483 fold_convert (sizetype, *ar_data->step),
484 fold_convert (sizetype, step));
485 *ar_data->delta += idelta;
486 *index = ibase;
488 return true;
491 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
492 STEP are integer constants and iter is number of iterations of LOOP. The
493 reference occurs in statement STMT. Strips nonaddressable component
494 references from REF_P. */
496 static bool
497 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
498 tree *step, HOST_WIDE_INT *delta,
499 gimple stmt)
501 struct ar_data ar_data;
502 tree off;
503 HOST_WIDE_INT bit_offset;
504 tree ref = *ref_p;
506 *step = NULL_TREE;
507 *delta = 0;
509 /* First strip off the component references. Ignore bitfields.
510 Also strip off the real and imagine parts of a complex, so that
511 they can have the same base. */
512 if (TREE_CODE (ref) == REALPART_EXPR
513 || TREE_CODE (ref) == IMAGPART_EXPR
514 || (TREE_CODE (ref) == COMPONENT_REF
515 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
517 if (TREE_CODE (ref) == IMAGPART_EXPR)
518 *delta += int_size_in_bytes (TREE_TYPE (ref));
519 ref = TREE_OPERAND (ref, 0);
522 *ref_p = ref;
524 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
526 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
527 bit_offset = TREE_INT_CST_LOW (off);
528 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
530 *delta += bit_offset / BITS_PER_UNIT;
533 *base = unshare_expr (ref);
534 ar_data.loop = loop;
535 ar_data.stmt = stmt;
536 ar_data.step = step;
537 ar_data.delta = delta;
538 return for_each_index (base, idx_analyze_ref, &ar_data);
541 /* Record a memory reference REF to the list REFS. The reference occurs in
542 LOOP in statement STMT and it is write if WRITE_P. Returns true if the
543 reference was recorded, false otherwise. */
545 static bool
546 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
547 tree ref, bool write_p, gimple stmt)
549 tree base, step;
550 HOST_WIDE_INT delta;
551 struct mem_ref_group *agrp;
553 if (get_base_address (ref) == NULL)
554 return false;
556 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
557 return false;
558 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
559 if (step == NULL_TREE)
560 return false;
562 /* Stop if the address of BASE could not be taken. */
563 if (may_be_nonaddressable_p (base))
564 return false;
566 /* Limit non-constant step prefetching only to the innermost loops and
567 only when the step is loop invariant in the entire loop nest. */
568 if (!cst_and_fits_in_hwi (step))
570 if (loop->inner != NULL)
572 if (dump_file && (dump_flags & TDF_DETAILS))
574 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
575 print_generic_expr (dump_file, ref, TDF_TREE);
576 fprintf (dump_file,":");
577 dump_mem_details (dump_file, base, step, delta, write_p);
578 fprintf (dump_file,
579 "Ignoring %p, non-constant step prefetching is "
580 "limited to inner most loops \n",
581 (void *) ref);
583 return false;
585 else
587 if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
589 if (dump_file && (dump_flags & TDF_DETAILS))
591 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
592 print_generic_expr (dump_file, ref, TDF_TREE);
593 fprintf (dump_file,":");
594 dump_mem_details (dump_file, base, step, delta, write_p);
595 fprintf (dump_file,
596 "Not prefetching, ignoring %p due to "
597 "loop variant step\n",
598 (void *) ref);
600 return false;
605 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
606 are integer constants. */
607 agrp = find_or_create_group (refs, base, step);
608 record_ref (agrp, stmt, ref, delta, write_p);
610 return true;
613 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
614 true if there are no other memory references inside the loop. */
616 static struct mem_ref_group *
617 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
619 basic_block *body = get_loop_body_in_dom_order (loop);
620 basic_block bb;
621 unsigned i;
622 gimple_stmt_iterator bsi;
623 gimple stmt;
624 tree lhs, rhs;
625 struct mem_ref_group *refs = NULL;
627 *no_other_refs = true;
628 *ref_count = 0;
630 /* Scan the loop body in order, so that the former references precede the
631 later ones. */
632 for (i = 0; i < loop->num_nodes; i++)
634 bb = body[i];
635 if (bb->loop_father != loop)
636 continue;
638 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
640 stmt = gsi_stmt (bsi);
642 if (gimple_code (stmt) != GIMPLE_ASSIGN)
644 if (gimple_vuse (stmt)
645 || (is_gimple_call (stmt)
646 && !(gimple_call_flags (stmt) & ECF_CONST)))
647 *no_other_refs = false;
648 continue;
651 lhs = gimple_assign_lhs (stmt);
652 rhs = gimple_assign_rhs1 (stmt);
654 if (REFERENCE_CLASS_P (rhs))
656 *no_other_refs &= gather_memory_references_ref (loop, &refs,
657 rhs, false, stmt);
658 *ref_count += 1;
660 if (REFERENCE_CLASS_P (lhs))
662 *no_other_refs &= gather_memory_references_ref (loop, &refs,
663 lhs, true, stmt);
664 *ref_count += 1;
668 free (body);
670 return refs;
673 /* Prune the prefetch candidate REF using the self-reuse. */
675 static void
676 prune_ref_by_self_reuse (struct mem_ref *ref)
678 HOST_WIDE_INT step;
679 bool backward;
681 /* If the step size is non constant, we cannot calculate prefetch_mod. */
682 if (!cst_and_fits_in_hwi (ref->group->step))
683 return;
685 step = int_cst_value (ref->group->step);
687 backward = step < 0;
689 if (step == 0)
691 /* Prefetch references to invariant address just once. */
692 ref->prefetch_before = 1;
693 return;
696 if (backward)
697 step = -step;
699 if (step > PREFETCH_BLOCK)
700 return;
702 if ((backward && HAVE_BACKWARD_PREFETCH)
703 || (!backward && HAVE_FORWARD_PREFETCH))
705 ref->prefetch_before = 1;
706 return;
709 ref->prefetch_mod = PREFETCH_BLOCK / step;
712 /* Divides X by BY, rounding down. */
714 static HOST_WIDE_INT
715 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
717 gcc_assert (by > 0);
719 if (x >= 0)
720 return x / by;
721 else
722 return (x + by - 1) / by;
725 /* Given a CACHE_LINE_SIZE and two inductive memory references
726 with a common STEP greater than CACHE_LINE_SIZE and an address
727 difference DELTA, compute the probability that they will fall
728 in different cache lines. Return true if the computed miss rate
729 is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
730 number of distinct iterations after which the pattern repeats itself.
731 ALIGN_UNIT is the unit of alignment in bytes. */
733 static bool
734 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
735 HOST_WIDE_INT step, HOST_WIDE_INT delta,
736 unsigned HOST_WIDE_INT distinct_iters,
737 int align_unit)
739 unsigned align, iter;
740 int total_positions, miss_positions, max_allowed_miss_positions;
741 int address1, address2, cache_line1, cache_line2;
743 /* It always misses if delta is greater than or equal to the cache
744 line size. */
745 if (delta >= (HOST_WIDE_INT) cache_line_size)
746 return false;
748 miss_positions = 0;
749 total_positions = (cache_line_size / align_unit) * distinct_iters;
750 max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
752 /* Iterate through all possible alignments of the first
753 memory reference within its cache line. */
754 for (align = 0; align < cache_line_size; align += align_unit)
756 /* Iterate through all distinct iterations. */
757 for (iter = 0; iter < distinct_iters; iter++)
759 address1 = align + step * iter;
760 address2 = address1 + delta;
761 cache_line1 = address1 / cache_line_size;
762 cache_line2 = address2 / cache_line_size;
763 if (cache_line1 != cache_line2)
765 miss_positions += 1;
766 if (miss_positions > max_allowed_miss_positions)
767 return false;
770 return true;
773 /* Prune the prefetch candidate REF using the reuse with BY.
774 If BY_IS_BEFORE is true, BY is before REF in the loop. */
776 static void
777 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
778 bool by_is_before)
780 HOST_WIDE_INT step;
781 bool backward;
782 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
783 HOST_WIDE_INT delta = delta_b - delta_r;
784 HOST_WIDE_INT hit_from;
785 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
786 HOST_WIDE_INT reduced_step;
787 unsigned HOST_WIDE_INT reduced_prefetch_block;
788 tree ref_type;
789 int align_unit;
791 /* If the step is non constant we cannot calculate prefetch_before. */
792 if (!cst_and_fits_in_hwi (ref->group->step)) {
793 return;
796 step = int_cst_value (ref->group->step);
798 backward = step < 0;
801 if (delta == 0)
803 /* If the references has the same address, only prefetch the
804 former. */
805 if (by_is_before)
806 ref->prefetch_before = 0;
808 return;
811 if (!step)
813 /* If the reference addresses are invariant and fall into the
814 same cache line, prefetch just the first one. */
815 if (!by_is_before)
816 return;
818 if (ddown (ref->delta, PREFETCH_BLOCK)
819 != ddown (by->delta, PREFETCH_BLOCK))
820 return;
822 ref->prefetch_before = 0;
823 return;
826 /* Only prune the reference that is behind in the array. */
827 if (backward)
829 if (delta > 0)
830 return;
832 /* Transform the data so that we may assume that the accesses
833 are forward. */
834 delta = - delta;
835 step = -step;
836 delta_r = PREFETCH_BLOCK - 1 - delta_r;
837 delta_b = PREFETCH_BLOCK - 1 - delta_b;
839 else
841 if (delta < 0)
842 return;
845 /* Check whether the two references are likely to hit the same cache
846 line, and how distant the iterations in that it occurs are from
847 each other. */
849 if (step <= PREFETCH_BLOCK)
851 /* The accesses are sure to meet. Let us check when. */
852 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
853 prefetch_before = (hit_from - delta_r + step - 1) / step;
855 /* Do not reduce prefetch_before if we meet beyond cache size. */
856 if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
857 prefetch_before = PREFETCH_ALL;
858 if (prefetch_before < ref->prefetch_before)
859 ref->prefetch_before = prefetch_before;
861 return;
864 /* A more complicated case with step > prefetch_block. First reduce
865 the ratio between the step and the cache line size to its simplest
866 terms. The resulting denominator will then represent the number of
867 distinct iterations after which each address will go back to its
868 initial location within the cache line. This computation assumes
869 that PREFETCH_BLOCK is a power of two. */
870 prefetch_block = PREFETCH_BLOCK;
871 reduced_prefetch_block = prefetch_block;
872 reduced_step = step;
873 while ((reduced_step & 1) == 0
874 && reduced_prefetch_block > 1)
876 reduced_step >>= 1;
877 reduced_prefetch_block >>= 1;
880 prefetch_before = delta / step;
881 delta %= step;
882 ref_type = TREE_TYPE (ref->mem);
883 align_unit = TYPE_ALIGN (ref_type) / 8;
884 if (is_miss_rate_acceptable (prefetch_block, step, delta,
885 reduced_prefetch_block, align_unit))
887 /* Do not reduce prefetch_before if we meet beyond cache size. */
888 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
889 prefetch_before = PREFETCH_ALL;
890 if (prefetch_before < ref->prefetch_before)
891 ref->prefetch_before = prefetch_before;
893 return;
896 /* Try also the following iteration. */
897 prefetch_before++;
898 delta = step - delta;
899 if (is_miss_rate_acceptable (prefetch_block, step, delta,
900 reduced_prefetch_block, align_unit))
902 if (prefetch_before < ref->prefetch_before)
903 ref->prefetch_before = prefetch_before;
905 return;
908 /* The ref probably does not reuse by. */
909 return;
912 /* Prune the prefetch candidate REF using the reuses with other references
913 in REFS. */
915 static void
916 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
918 struct mem_ref *prune_by;
919 bool before = true;
921 prune_ref_by_self_reuse (ref);
923 for (prune_by = refs; prune_by; prune_by = prune_by->next)
925 if (prune_by == ref)
927 before = false;
928 continue;
931 if (!WRITE_CAN_USE_READ_PREFETCH
932 && ref->write_p
933 && !prune_by->write_p)
934 continue;
935 if (!READ_CAN_USE_WRITE_PREFETCH
936 && !ref->write_p
937 && prune_by->write_p)
938 continue;
940 prune_ref_by_group_reuse (ref, prune_by, before);
944 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
946 static void
947 prune_group_by_reuse (struct mem_ref_group *group)
949 struct mem_ref *ref_pruned;
951 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
953 prune_ref_by_reuse (ref_pruned, group->refs);
955 if (dump_file && (dump_flags & TDF_DETAILS))
957 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
959 if (ref_pruned->prefetch_before == PREFETCH_ALL
960 && ref_pruned->prefetch_mod == 1)
961 fprintf (dump_file, " no restrictions");
962 else if (ref_pruned->prefetch_before == 0)
963 fprintf (dump_file, " do not prefetch");
964 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
965 fprintf (dump_file, " prefetch once");
966 else
968 if (ref_pruned->prefetch_before != PREFETCH_ALL)
970 fprintf (dump_file, " prefetch before ");
971 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
972 ref_pruned->prefetch_before);
974 if (ref_pruned->prefetch_mod != 1)
976 fprintf (dump_file, " prefetch mod ");
977 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
978 ref_pruned->prefetch_mod);
981 fprintf (dump_file, "\n");
986 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
988 static void
989 prune_by_reuse (struct mem_ref_group *groups)
991 for (; groups; groups = groups->next)
992 prune_group_by_reuse (groups);
995 /* Returns true if we should issue prefetch for REF. */
997 static bool
998 should_issue_prefetch_p (struct mem_ref *ref)
1000 /* For now do not issue prefetches for only first few of the
1001 iterations. */
1002 if (ref->prefetch_before != PREFETCH_ALL)
1004 if (dump_file && (dump_flags & TDF_DETAILS))
1005 fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
1006 (void *) ref);
1007 return false;
1010 /* Do not prefetch nontemporal stores. */
1011 if (ref->storent_p)
1013 if (dump_file && (dump_flags & TDF_DETAILS))
1014 fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1015 return false;
1018 return true;
1021 /* Decide which of the prefetch candidates in GROUPS to prefetch.
1022 AHEAD is the number of iterations to prefetch ahead (which corresponds
1023 to the number of simultaneous instances of one prefetch running at a
1024 time). UNROLL_FACTOR is the factor by that the loop is going to be
1025 unrolled. Returns true if there is anything to prefetch. */
1027 static bool
1028 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1029 unsigned ahead)
1031 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1032 unsigned slots_per_prefetch;
1033 struct mem_ref *ref;
1034 bool any = false;
1036 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
1037 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1039 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1040 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1041 it will need a prefetch slot. */
1042 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1043 if (dump_file && (dump_flags & TDF_DETAILS))
1044 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1045 slots_per_prefetch);
1047 /* For now we just take memory references one by one and issue
1048 prefetches for as many as possible. The groups are sorted
1049 starting with the largest step, since the references with
1050 large step are more likely to cause many cache misses. */
1052 for (; groups; groups = groups->next)
1053 for (ref = groups->refs; ref; ref = ref->next)
1055 if (!should_issue_prefetch_p (ref))
1056 continue;
1058 /* The loop is far from being sufficiently unrolled for this
1059 prefetch. Do not generate prefetch to avoid many redudant
1060 prefetches. */
1061 if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1062 continue;
1064 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1065 and we unroll the loop UNROLL_FACTOR times, we need to insert
1066 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1067 iteration. */
1068 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1069 / ref->prefetch_mod);
1070 prefetch_slots = n_prefetches * slots_per_prefetch;
1072 /* If more than half of the prefetches would be lost anyway, do not
1073 issue the prefetch. */
1074 if (2 * remaining_prefetch_slots < prefetch_slots)
1075 continue;
1077 ref->issue_prefetch_p = true;
1079 if (remaining_prefetch_slots <= prefetch_slots)
1080 return true;
1081 remaining_prefetch_slots -= prefetch_slots;
1082 any = true;
1085 return any;
1088 /* Return TRUE if no prefetch is going to be generated in the given
1089 GROUPS. */
1091 static bool
1092 nothing_to_prefetch_p (struct mem_ref_group *groups)
1094 struct mem_ref *ref;
1096 for (; groups; groups = groups->next)
1097 for (ref = groups->refs; ref; ref = ref->next)
1098 if (should_issue_prefetch_p (ref))
1099 return false;
1101 return true;
1104 /* Estimate the number of prefetches in the given GROUPS.
1105 UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1107 static int
1108 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1110 struct mem_ref *ref;
1111 unsigned n_prefetches;
1112 int prefetch_count = 0;
1114 for (; groups; groups = groups->next)
1115 for (ref = groups->refs; ref; ref = ref->next)
1116 if (should_issue_prefetch_p (ref))
1118 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1119 / ref->prefetch_mod);
1120 prefetch_count += n_prefetches;
1123 return prefetch_count;
1126 /* Issue prefetches for the reference REF into loop as decided before.
1127 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1128 is the factor by which LOOP was unrolled. */
1130 static void
1131 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1133 HOST_WIDE_INT delta;
1134 tree addr, addr_base, write_p, local, forward;
1135 gcall *prefetch;
1136 gimple_stmt_iterator bsi;
1137 unsigned n_prefetches, ap;
1138 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1140 if (dump_file && (dump_flags & TDF_DETAILS))
1141 fprintf (dump_file, "Issued%s prefetch for %p.\n",
1142 nontemporal ? " nontemporal" : "",
1143 (void *) ref);
1145 bsi = gsi_for_stmt (ref->stmt);
1147 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1148 / ref->prefetch_mod);
1149 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1150 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1151 true, NULL, true, GSI_SAME_STMT);
1152 write_p = ref->write_p ? integer_one_node : integer_zero_node;
1153 local = nontemporal ? integer_zero_node : integer_three_node;
1155 for (ap = 0; ap < n_prefetches; ap++)
1157 if (cst_and_fits_in_hwi (ref->group->step))
1159 /* Determine the address to prefetch. */
1160 delta = (ahead + ap * ref->prefetch_mod) *
1161 int_cst_value (ref->group->step);
1162 addr = fold_build_pointer_plus_hwi (addr_base, delta);
1163 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1164 true, GSI_SAME_STMT);
1166 else
1168 /* The step size is non-constant but loop-invariant. We use the
1169 heuristic to simply prefetch ahead iterations ahead. */
1170 forward = fold_build2 (MULT_EXPR, sizetype,
1171 fold_convert (sizetype, ref->group->step),
1172 fold_convert (sizetype, size_int (ahead)));
1173 addr = fold_build_pointer_plus (addr_base, forward);
1174 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1175 NULL, true, GSI_SAME_STMT);
1177 /* Create the prefetch instruction. */
1178 prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1179 3, addr, write_p, local);
1180 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1184 /* Issue prefetches for the references in GROUPS into loop as decided before.
1185 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1186 factor by that LOOP was unrolled. */
1188 static void
1189 issue_prefetches (struct mem_ref_group *groups,
1190 unsigned unroll_factor, unsigned ahead)
1192 struct mem_ref *ref;
1194 for (; groups; groups = groups->next)
1195 for (ref = groups->refs; ref; ref = ref->next)
1196 if (ref->issue_prefetch_p)
1197 issue_prefetch_ref (ref, unroll_factor, ahead);
1200 /* Returns true if REF is a memory write for that a nontemporal store insn
1201 can be used. */
1203 static bool
1204 nontemporal_store_p (struct mem_ref *ref)
1206 machine_mode mode;
1207 enum insn_code code;
1209 /* REF must be a write that is not reused. We require it to be independent
1210 on all other memory references in the loop, as the nontemporal stores may
1211 be reordered with respect to other memory references. */
1212 if (!ref->write_p
1213 || !ref->independent_p
1214 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1215 return false;
1217 /* Check that we have the storent instruction for the mode. */
1218 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1219 if (mode == BLKmode)
1220 return false;
1222 code = optab_handler (storent_optab, mode);
1223 return code != CODE_FOR_nothing;
1226 /* If REF is a nontemporal store, we mark the corresponding modify statement
1227 and return true. Otherwise, we return false. */
1229 static bool
1230 mark_nontemporal_store (struct mem_ref *ref)
1232 if (!nontemporal_store_p (ref))
1233 return false;
1235 if (dump_file && (dump_flags & TDF_DETAILS))
1236 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1237 (void *) ref);
1239 gimple_assign_set_nontemporal_move (ref->stmt, true);
1240 ref->storent_p = true;
1242 return true;
1245 /* Issue a memory fence instruction after LOOP. */
1247 static void
1248 emit_mfence_after_loop (struct loop *loop)
1250 vec<edge> exits = get_loop_exit_edges (loop);
1251 edge exit;
1252 gcall *call;
1253 gimple_stmt_iterator bsi;
1254 unsigned i;
1256 FOR_EACH_VEC_ELT (exits, i, exit)
1258 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1260 if (!single_pred_p (exit->dest)
1261 /* If possible, we prefer not to insert the fence on other paths
1262 in cfg. */
1263 && !(exit->flags & EDGE_ABNORMAL))
1264 split_loop_exit_edge (exit);
1265 bsi = gsi_after_labels (exit->dest);
1267 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1270 exits.release ();
1271 update_ssa (TODO_update_ssa_only_virtuals);
1274 /* Returns true if we can use storent in loop, false otherwise. */
1276 static bool
1277 may_use_storent_in_loop_p (struct loop *loop)
1279 bool ret = true;
1281 if (loop->inner != NULL)
1282 return false;
1284 /* If we must issue a mfence insn after using storent, check that there
1285 is a suitable place for it at each of the loop exits. */
1286 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1288 vec<edge> exits = get_loop_exit_edges (loop);
1289 unsigned i;
1290 edge exit;
1292 FOR_EACH_VEC_ELT (exits, i, exit)
1293 if ((exit->flags & EDGE_ABNORMAL)
1294 && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1295 ret = false;
1297 exits.release ();
1300 return ret;
1303 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1304 references in the loop. */
1306 static void
1307 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1309 struct mem_ref *ref;
1310 bool any = false;
1312 if (!may_use_storent_in_loop_p (loop))
1313 return;
1315 for (; groups; groups = groups->next)
1316 for (ref = groups->refs; ref; ref = ref->next)
1317 any |= mark_nontemporal_store (ref);
1319 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1320 emit_mfence_after_loop (loop);
1323 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1324 this is the case, fill in DESC by the description of number of
1325 iterations. */
1327 static bool
1328 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1329 unsigned factor)
1331 if (!can_unroll_loop_p (loop, factor, desc))
1332 return false;
1334 /* We only consider loops without control flow for unrolling. This is not
1335 a hard restriction -- tree_unroll_loop works with arbitrary loops
1336 as well; but the unrolling/prefetching is usually more profitable for
1337 loops consisting of a single basic block, and we want to limit the
1338 code growth. */
1339 if (loop->num_nodes > 2)
1340 return false;
1342 return true;
1345 /* Determine the coefficient by that unroll LOOP, from the information
1346 contained in the list of memory references REFS. Description of
1347 umber of iterations of LOOP is stored to DESC. NINSNS is the number of
1348 insns of the LOOP. EST_NITER is the estimated number of iterations of
1349 the loop, or -1 if no estimate is available. */
1351 static unsigned
1352 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1353 unsigned ninsns, struct tree_niter_desc *desc,
1354 HOST_WIDE_INT est_niter)
1356 unsigned upper_bound;
1357 unsigned nfactor, factor, mod_constraint;
1358 struct mem_ref_group *agp;
1359 struct mem_ref *ref;
1361 /* First check whether the loop is not too large to unroll. We ignore
1362 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1363 from unrolling them enough to make exactly one cache line covered by each
1364 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1365 us from unrolling the loops too many times in cases where we only expect
1366 gains from better scheduling and decreasing loop overhead, which is not
1367 the case here. */
1368 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1370 /* If we unrolled the loop more times than it iterates, the unrolled version
1371 of the loop would be never entered. */
1372 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1373 upper_bound = est_niter;
1375 if (upper_bound <= 1)
1376 return 1;
1378 /* Choose the factor so that we may prefetch each cache just once,
1379 but bound the unrolling by UPPER_BOUND. */
1380 factor = 1;
1381 for (agp = refs; agp; agp = agp->next)
1382 for (ref = agp->refs; ref; ref = ref->next)
1383 if (should_issue_prefetch_p (ref))
1385 mod_constraint = ref->prefetch_mod;
1386 nfactor = least_common_multiple (mod_constraint, factor);
1387 if (nfactor <= upper_bound)
1388 factor = nfactor;
1391 if (!should_unroll_loop_p (loop, desc, factor))
1392 return 1;
1394 return factor;
1397 /* Returns the total volume of the memory references REFS, taking into account
1398 reuses in the innermost loop and cache line size. TODO -- we should also
1399 take into account reuses across the iterations of the loops in the loop
1400 nest. */
1402 static unsigned
1403 volume_of_references (struct mem_ref_group *refs)
1405 unsigned volume = 0;
1406 struct mem_ref_group *gr;
1407 struct mem_ref *ref;
1409 for (gr = refs; gr; gr = gr->next)
1410 for (ref = gr->refs; ref; ref = ref->next)
1412 /* Almost always reuses another value? */
1413 if (ref->prefetch_before != PREFETCH_ALL)
1414 continue;
1416 /* If several iterations access the same cache line, use the size of
1417 the line divided by this number. Otherwise, a cache line is
1418 accessed in each iteration. TODO -- in the latter case, we should
1419 take the size of the reference into account, rounding it up on cache
1420 line size multiple. */
1421 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1423 return volume;
1426 /* Returns the volume of memory references accessed across VEC iterations of
1427 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1428 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1430 static unsigned
1431 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1433 unsigned i;
1435 for (i = 0; i < n; i++)
1436 if (vec[i] != 0)
1437 break;
1439 if (i == n)
1440 return 0;
1442 gcc_assert (vec[i] > 0);
1444 /* We ignore the parts of the distance vector in subloops, since usually
1445 the numbers of iterations are much smaller. */
1446 return loop_sizes[i] * vec[i];
1449 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1450 at the position corresponding to the loop of the step. N is the depth
1451 of the considered loop nest, and, LOOP is its innermost loop. */
1453 static void
1454 add_subscript_strides (tree access_fn, unsigned stride,
1455 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1457 struct loop *aloop;
1458 tree step;
1459 HOST_WIDE_INT astep;
1460 unsigned min_depth = loop_depth (loop) - n;
1462 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1464 aloop = get_chrec_loop (access_fn);
1465 step = CHREC_RIGHT (access_fn);
1466 access_fn = CHREC_LEFT (access_fn);
1468 if ((unsigned) loop_depth (aloop) <= min_depth)
1469 continue;
1471 if (tree_fits_shwi_p (step))
1472 astep = tree_to_shwi (step);
1473 else
1474 astep = L1_CACHE_LINE_SIZE;
1476 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1481 /* Returns the volume of memory references accessed between two consecutive
1482 self-reuses of the reference DR. We consider the subscripts of DR in N
1483 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1484 loops. LOOP is the innermost loop of the current loop nest. */
1486 static unsigned
1487 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1488 struct loop *loop)
1490 tree stride, access_fn;
1491 HOST_WIDE_INT *strides, astride;
1492 vec<tree> access_fns;
1493 tree ref = DR_REF (dr);
1494 unsigned i, ret = ~0u;
1496 /* In the following example:
1498 for (i = 0; i < N; i++)
1499 for (j = 0; j < N; j++)
1500 use (a[j][i]);
1501 the same cache line is accessed each N steps (except if the change from
1502 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1503 we cannot rely purely on the results of the data dependence analysis.
1505 Instead, we compute the stride of the reference in each loop, and consider
1506 the innermost loop in that the stride is less than cache size. */
1508 strides = XCNEWVEC (HOST_WIDE_INT, n);
1509 access_fns = DR_ACCESS_FNS (dr);
1511 FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1513 /* Keep track of the reference corresponding to the subscript, so that we
1514 know its stride. */
1515 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1516 ref = TREE_OPERAND (ref, 0);
1518 if (TREE_CODE (ref) == ARRAY_REF)
1520 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1521 if (tree_fits_uhwi_p (stride))
1522 astride = tree_to_uhwi (stride);
1523 else
1524 astride = L1_CACHE_LINE_SIZE;
1526 ref = TREE_OPERAND (ref, 0);
1528 else
1529 astride = 1;
1531 add_subscript_strides (access_fn, astride, strides, n, loop);
1534 for (i = n; i-- > 0; )
1536 unsigned HOST_WIDE_INT s;
1538 s = strides[i] < 0 ? -strides[i] : strides[i];
1540 if (s < (unsigned) L1_CACHE_LINE_SIZE
1541 && (loop_sizes[i]
1542 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1544 ret = loop_sizes[i];
1545 break;
1549 free (strides);
1550 return ret;
1553 /* Determines the distance till the first reuse of each reference in REFS
1554 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1555 memory references in the loop. Return false if the analysis fails. */
1557 static bool
1558 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1559 bool no_other_refs)
1561 struct loop *nest, *aloop;
1562 vec<data_reference_p> datarefs = vNULL;
1563 vec<ddr_p> dependences = vNULL;
1564 struct mem_ref_group *gr;
1565 struct mem_ref *ref, *refb;
1566 vec<loop_p> vloops = vNULL;
1567 unsigned *loop_data_size;
1568 unsigned i, j, n;
1569 unsigned volume, dist, adist;
1570 HOST_WIDE_INT vol;
1571 data_reference_p dr;
1572 ddr_p dep;
1574 if (loop->inner)
1575 return true;
1577 /* Find the outermost loop of the loop nest of loop (we require that
1578 there are no sibling loops inside the nest). */
1579 nest = loop;
1580 while (1)
1582 aloop = loop_outer (nest);
1584 if (aloop == current_loops->tree_root
1585 || aloop->inner->next)
1586 break;
1588 nest = aloop;
1591 /* For each loop, determine the amount of data accessed in each iteration.
1592 We use this to estimate whether the reference is evicted from the
1593 cache before its reuse. */
1594 find_loop_nest (nest, &vloops);
1595 n = vloops.length ();
1596 loop_data_size = XNEWVEC (unsigned, n);
1597 volume = volume_of_references (refs);
1598 i = n;
1599 while (i-- != 0)
1601 loop_data_size[i] = volume;
1602 /* Bound the volume by the L2 cache size, since above this bound,
1603 all dependence distances are equivalent. */
1604 if (volume > L2_CACHE_SIZE_BYTES)
1605 continue;
1607 aloop = vloops[i];
1608 vol = estimated_stmt_executions_int (aloop);
1609 if (vol == -1)
1610 vol = expected_loop_iterations (aloop);
1611 volume *= vol;
1614 /* Prepare the references in the form suitable for data dependence
1615 analysis. We ignore unanalyzable data references (the results
1616 are used just as a heuristics to estimate temporality of the
1617 references, hence we do not need to worry about correctness). */
1618 for (gr = refs; gr; gr = gr->next)
1619 for (ref = gr->refs; ref; ref = ref->next)
1621 dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1622 ref->mem, ref->stmt, !ref->write_p);
1624 if (dr)
1626 ref->reuse_distance = volume;
1627 dr->aux = ref;
1628 datarefs.safe_push (dr);
1630 else
1631 no_other_refs = false;
1634 FOR_EACH_VEC_ELT (datarefs, i, dr)
1636 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1637 ref = (struct mem_ref *) dr->aux;
1638 if (ref->reuse_distance > dist)
1639 ref->reuse_distance = dist;
1641 if (no_other_refs)
1642 ref->independent_p = true;
1645 if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1646 return false;
1648 FOR_EACH_VEC_ELT (dependences, i, dep)
1650 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1651 continue;
1653 ref = (struct mem_ref *) DDR_A (dep)->aux;
1654 refb = (struct mem_ref *) DDR_B (dep)->aux;
1656 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1657 || DDR_NUM_DIST_VECTS (dep) == 0)
1659 /* If the dependence cannot be analyzed, assume that there might be
1660 a reuse. */
1661 dist = 0;
1663 ref->independent_p = false;
1664 refb->independent_p = false;
1666 else
1668 /* The distance vectors are normalized to be always lexicographically
1669 positive, hence we cannot tell just from them whether DDR_A comes
1670 before DDR_B or vice versa. However, it is not important,
1671 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1672 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1673 in cache (and marking it as nontemporal would not affect
1674 anything). */
1676 dist = volume;
1677 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1679 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1680 loop_data_size, n);
1682 /* If this is a dependence in the innermost loop (i.e., the
1683 distances in all superloops are zero) and it is not
1684 the trivial self-dependence with distance zero, record that
1685 the references are not completely independent. */
1686 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1687 && (ref != refb
1688 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1690 ref->independent_p = false;
1691 refb->independent_p = false;
1694 /* Ignore accesses closer than
1695 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1696 so that we use nontemporal prefetches e.g. if single memory
1697 location is accessed several times in a single iteration of
1698 the loop. */
1699 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1700 continue;
1702 if (adist < dist)
1703 dist = adist;
1707 if (ref->reuse_distance > dist)
1708 ref->reuse_distance = dist;
1709 if (refb->reuse_distance > dist)
1710 refb->reuse_distance = dist;
1713 free_dependence_relations (dependences);
1714 free_data_refs (datarefs);
1715 free (loop_data_size);
1717 if (dump_file && (dump_flags & TDF_DETAILS))
1719 fprintf (dump_file, "Reuse distances:\n");
1720 for (gr = refs; gr; gr = gr->next)
1721 for (ref = gr->refs; ref; ref = ref->next)
1722 fprintf (dump_file, " ref %p distance %u\n",
1723 (void *) ref, ref->reuse_distance);
1726 return true;
1729 /* Determine whether or not the trip count to ahead ratio is too small based
1730 on prefitablility consideration.
1731 AHEAD: the iteration ahead distance,
1732 EST_NITER: the estimated trip count. */
1734 static bool
1735 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1737 /* Assume trip count to ahead ratio is big enough if the trip count could not
1738 be estimated at compile time. */
1739 if (est_niter < 0)
1740 return false;
1742 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1744 if (dump_file && (dump_flags & TDF_DETAILS))
1745 fprintf (dump_file,
1746 "Not prefetching -- loop estimated to roll only %d times\n",
1747 (int) est_niter);
1748 return true;
1751 return false;
1754 /* Determine whether or not the number of memory references in the loop is
1755 reasonable based on the profitablity and compilation time considerations.
1756 NINSNS: estimated number of instructions in the loop,
1757 MEM_REF_COUNT: total number of memory references in the loop. */
1759 static bool
1760 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1762 int insn_to_mem_ratio;
1764 if (mem_ref_count == 0)
1765 return false;
1767 /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1768 (compute_all_dependences) have high costs based on quadratic complexity.
1769 To avoid huge compilation time, we give up prefetching if mem_ref_count
1770 is too large. */
1771 if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1772 return false;
1774 /* Prefetching improves performance by overlapping cache missing
1775 memory accesses with CPU operations. If the loop does not have
1776 enough CPU operations to overlap with memory operations, prefetching
1777 won't give a significant benefit. One approximate way of checking
1778 this is to require the ratio of instructions to memory references to
1779 be above a certain limit. This approximation works well in practice.
1780 TODO: Implement a more precise computation by estimating the time
1781 for each CPU or memory op in the loop. Time estimates for memory ops
1782 should account for cache misses. */
1783 insn_to_mem_ratio = ninsns / mem_ref_count;
1785 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1787 if (dump_file && (dump_flags & TDF_DETAILS))
1788 fprintf (dump_file,
1789 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1790 insn_to_mem_ratio);
1791 return false;
1794 return true;
1797 /* Determine whether or not the instruction to prefetch ratio in the loop is
1798 too small based on the profitablity consideration.
1799 NINSNS: estimated number of instructions in the loop,
1800 PREFETCH_COUNT: an estimate of the number of prefetches,
1801 UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1803 static bool
1804 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1805 unsigned unroll_factor)
1807 int insn_to_prefetch_ratio;
1809 /* Prefetching most likely causes performance degradation when the instruction
1810 to prefetch ratio is too small. Too many prefetch instructions in a loop
1811 may reduce the I-cache performance.
1812 (unroll_factor * ninsns) is used to estimate the number of instructions in
1813 the unrolled loop. This implementation is a bit simplistic -- the number
1814 of issued prefetch instructions is also affected by unrolling. So,
1815 prefetch_mod and the unroll factor should be taken into account when
1816 determining prefetch_count. Also, the number of insns of the unrolled
1817 loop will usually be significantly smaller than the number of insns of the
1818 original loop * unroll_factor (at least the induction variable increases
1819 and the exit branches will get eliminated), so it might be better to use
1820 tree_estimate_loop_size + estimated_unrolled_size. */
1821 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1822 if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1824 if (dump_file && (dump_flags & TDF_DETAILS))
1825 fprintf (dump_file,
1826 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1827 insn_to_prefetch_ratio);
1828 return true;
1831 return false;
1835 /* Issue prefetch instructions for array references in LOOP. Returns
1836 true if the LOOP was unrolled. */
1838 static bool
1839 loop_prefetch_arrays (struct loop *loop)
1841 struct mem_ref_group *refs;
1842 unsigned ahead, ninsns, time, unroll_factor;
1843 HOST_WIDE_INT est_niter;
1844 struct tree_niter_desc desc;
1845 bool unrolled = false, no_other_refs;
1846 unsigned prefetch_count;
1847 unsigned mem_ref_count;
1849 if (optimize_loop_nest_for_size_p (loop))
1851 if (dump_file && (dump_flags & TDF_DETAILS))
1852 fprintf (dump_file, " ignored (cold area)\n");
1853 return false;
1856 /* FIXME: the time should be weighted by the probabilities of the blocks in
1857 the loop body. */
1858 time = tree_num_loop_insns (loop, &eni_time_weights);
1859 if (time == 0)
1860 return false;
1862 ahead = (PREFETCH_LATENCY + time - 1) / time;
1863 est_niter = estimated_stmt_executions_int (loop);
1864 if (est_niter == -1)
1865 est_niter = max_stmt_executions_int (loop);
1867 /* Prefetching is not likely to be profitable if the trip count to ahead
1868 ratio is too small. */
1869 if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1870 return false;
1872 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1874 /* Step 1: gather the memory references. */
1875 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1877 /* Give up prefetching if the number of memory references in the
1878 loop is not reasonable based on profitablity and compilation time
1879 considerations. */
1880 if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1881 goto fail;
1883 /* Step 2: estimate the reuse effects. */
1884 prune_by_reuse (refs);
1886 if (nothing_to_prefetch_p (refs))
1887 goto fail;
1889 if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1890 goto fail;
1892 /* Step 3: determine unroll factor. */
1893 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1894 est_niter);
1896 /* Estimate prefetch count for the unrolled loop. */
1897 prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1898 if (prefetch_count == 0)
1899 goto fail;
1901 if (dump_file && (dump_flags & TDF_DETAILS))
1902 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1903 HOST_WIDE_INT_PRINT_DEC "\n"
1904 "insn count %d, mem ref count %d, prefetch count %d\n",
1905 ahead, unroll_factor, est_niter,
1906 ninsns, mem_ref_count, prefetch_count);
1908 /* Prefetching is not likely to be profitable if the instruction to prefetch
1909 ratio is too small. */
1910 if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1911 unroll_factor))
1912 goto fail;
1914 mark_nontemporal_stores (loop, refs);
1916 /* Step 4: what to prefetch? */
1917 if (!schedule_prefetches (refs, unroll_factor, ahead))
1918 goto fail;
1920 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1921 iterations so that we do not issue superfluous prefetches. */
1922 if (unroll_factor != 1)
1924 tree_unroll_loop (loop, unroll_factor,
1925 single_dom_exit (loop), &desc);
1926 unrolled = true;
1929 /* Step 6: issue the prefetches. */
1930 issue_prefetches (refs, unroll_factor, ahead);
1932 fail:
1933 release_mem_refs (refs);
1934 return unrolled;
1937 /* Issue prefetch instructions for array references in loops. */
1939 unsigned int
1940 tree_ssa_prefetch_arrays (void)
1942 struct loop *loop;
1943 bool unrolled = false;
1944 int todo_flags = 0;
1946 if (!targetm.have_prefetch ()
1947 /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1948 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1949 of processor costs and i486 does not have prefetch, but
1950 -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */
1951 || PREFETCH_BLOCK == 0)
1952 return 0;
1954 if (dump_file && (dump_flags & TDF_DETAILS))
1956 fprintf (dump_file, "Prefetching parameters:\n");
1957 fprintf (dump_file, " simultaneous prefetches: %d\n",
1958 SIMULTANEOUS_PREFETCHES);
1959 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1960 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1961 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1962 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1963 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1964 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1965 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1966 MIN_INSN_TO_PREFETCH_RATIO);
1967 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1968 PREFETCH_MIN_INSN_TO_MEM_RATIO);
1969 fprintf (dump_file, "\n");
1972 initialize_original_copy_tables ();
1974 if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1976 tree type = build_function_type_list (void_type_node,
1977 const_ptr_type_node, NULL_TREE);
1978 tree decl = add_builtin_function ("__builtin_prefetch", type,
1979 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1980 NULL, NULL_TREE);
1981 DECL_IS_NOVOPS (decl) = true;
1982 set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1985 /* We assume that size of cache line is a power of two, so verify this
1986 here. */
1987 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1989 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1991 if (dump_file && (dump_flags & TDF_DETAILS))
1992 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1994 unrolled |= loop_prefetch_arrays (loop);
1996 if (dump_file && (dump_flags & TDF_DETAILS))
1997 fprintf (dump_file, "\n\n");
2000 if (unrolled)
2002 scev_reset ();
2003 todo_flags |= TODO_cleanup_cfg;
2006 free_original_copy_tables ();
2007 return todo_flags;
2010 /* Prefetching. */
2012 namespace {
2014 const pass_data pass_data_loop_prefetch =
2016 GIMPLE_PASS, /* type */
2017 "aprefetch", /* name */
2018 OPTGROUP_LOOP, /* optinfo_flags */
2019 TV_TREE_PREFETCH, /* tv_id */
2020 ( PROP_cfg | PROP_ssa ), /* properties_required */
2021 0, /* properties_provided */
2022 0, /* properties_destroyed */
2023 0, /* todo_flags_start */
2024 0, /* todo_flags_finish */
2027 class pass_loop_prefetch : public gimple_opt_pass
2029 public:
2030 pass_loop_prefetch (gcc::context *ctxt)
2031 : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2034 /* opt_pass methods: */
2035 virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2036 virtual unsigned int execute (function *);
2038 }; // class pass_loop_prefetch
2040 unsigned int
2041 pass_loop_prefetch::execute (function *fun)
2043 if (number_of_loops (fun) <= 1)
2044 return 0;
2046 return tree_ssa_prefetch_arrays ();
2049 } // anon namespace
2051 gimple_opt_pass *
2052 make_pass_loop_prefetch (gcc::context *ctxt)
2054 return new pass_loop_prefetch (ctxt);