gcc/tree-ssa-loop-prefetch.c
1 /* Array prefetching.
2 Copyright (C) 2005-2016 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "target.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "predict.h"
29 #include "tree-pass.h"
30 #include "gimple-ssa.h"
31 #include "optabs-query.h"
32 #include "tree-pretty-print.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "gimplify.h"
36 #include "gimple-iterator.h"
37 #include "gimplify-me.h"
38 #include "tree-ssa-loop-ivopts.h"
39 #include "tree-ssa-loop-manip.h"
40 #include "tree-ssa-loop-niter.h"
41 #include "tree-ssa-loop.h"
42 #include "ssa.h"
43 #include "tree-into-ssa.h"
44 #include "cfgloop.h"
45 #include "tree-scalar-evolution.h"
46 #include "params.h"
47 #include "langhooks.h"
48 #include "tree-inline.h"
49 #include "tree-data-ref.h"
52 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
53 between the GIMPLE and RTL worlds. */
55 /* This pass inserts prefetch instructions to optimize cache usage during
56 accesses to arrays in loops. It processes loops sequentially and:
58 1) Gathers all memory references in a single loop.
59 2) For each of the references it decides when it is profitable to prefetch
60 it. To do this, we evaluate the reuse among the accesses, and determine
61 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
62 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
63 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
64 iterations of the loop that are zero modulo PREFETCH_MOD). For example
65 (assuming cache line size is 64 bytes, char has size 1 byte and there
66 is no hardware sequential prefetch):
68 char *a;
69 for (i = 0; i < max; i++)
71 a[255] = ...; (0)
72 a[i] = ...; (1)
73 a[i + 64] = ...; (2)
74 a[16*i] = ...; (3)
75 a[187*i] = ...; (4)
76 a[187*i + 50] = ...; (5)
79 (0) obviously has PREFETCH_BEFORE 1
80 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
81 location 64 iterations before it, and PREFETCH_MOD 64 (since
82 it hits the same cache line otherwise).
83 (2) has PREFETCH_MOD 64
84 (3) has PREFETCH_MOD 4
85 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
86 the cache line accessed by (5) is the same with probability only
87 7/32.
88 (5) has PREFETCH_MOD 1 as well.
90 Additionally, we use data dependence analysis to determine for each
91 reference the distance till the first reuse; this information is used
92 to determine the temporality of the issued prefetch instruction.
94 3) We determine how much ahead we need to prefetch. The number of
95 iterations needed is time to fetch / time spent in one iteration of
96 the loop. The problem is that we do not know either of these values,
97 so we just make a heuristic guess based on a magic (possibly)
98 target-specific constant and size of the loop.
100 4) Determine which of the references we prefetch. We take into account
101 that there is a maximum number of simultaneous prefetches (provided
102 by the machine description). We issue as many prefetches as possible
103 while still within this bound (starting with those with lowest
104 prefetch_mod, since they are responsible for most of the cache
105 misses).
107 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
108 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
109 prefetching nonaccessed memory.
110 TODO -- actually implement peeling.
112 6) We actually emit the prefetch instructions. ??? Perhaps emit the
113 prefetch instructions with guards in cases where 5) was not sufficient
114 to satisfy the constraints?
116 A cost model is implemented to determine whether or not prefetching is
117 profitable for a given loop. The cost model has three heuristics:
119 1. Function trip_count_to_ahead_ratio_too_small_p implements a
120 heuristic that determines whether or not the loop has too few
121 iterations (compared to ahead). Prefetching is not likely to be
122 beneficial if the trip count to ahead ratio is below a certain
123 minimum.
125 2. Function mem_ref_count_reasonable_p implements a heuristic that
126 determines whether the given loop has enough CPU ops that can be
127 overlapped with cache missing memory ops. If not, the loop
128 won't benefit from prefetching. In the implementation,
129 prefetching is not considered beneficial if the ratio between
130 the instruction count and the mem ref count is below a certain
131 minimum.
133 3. Function insn_to_prefetch_ratio_too_small_p implements a
134 heuristic that disables prefetching in a loop if the prefetching
135 cost is above a certain limit. The relative prefetching cost is
136 estimated by taking the ratio between the prefetch count and the
137 total instruction count (this models the I-cache cost).
139 The limits used in these heuristics are defined as parameters with
140 reasonable default values. Machine-specific default values will be
141 added later.
143 Some other TODO:
144 -- write and use more general reuse analysis (that could also be used
145 in other cache-aimed loop optimizations)
146 -- make it behave sanely together with the prefetches given by the user
147 (now we just ignore them; at the very least we should avoid
148 optimizing loops in which the user put their own prefetches)
149 -- we assume cache line size alignment of arrays; this could be
150 improved. */
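/* Illustrative sketch (not part of the pass): for a loop in which every
   iteration touches a new cache line, so that PREFETCH_MOD is 1 and no
   unrolling is needed, the transformation performed by this pass roughly
   corresponds to the following source-level rewrite, assuming a prefetch
   ahead distance of 4 iterations and a 64-byte cache line:

     struct line { char pad[64]; } *a;
     for (i = 0; i < max; i++)
       {
	 __builtin_prefetch (&a[i + 4], 1, 3);
	 a[i].pad[0] = ...;
       }

   The pass emits the equivalent GIMPLE call to __builtin_prefetch
   (address, write flag, locality) immediately before the memory reference
   it covers; 1 marks a write prefetch, and locality 3 requests maximal
   temporal locality (0 would be used for a nontemporal reference).  */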
152 /* Magic constants follow. These should be replaced by machine specific
153 numbers. */
155 /* True if write can be prefetched by a read prefetch. */
157 #ifndef WRITE_CAN_USE_READ_PREFETCH
158 #define WRITE_CAN_USE_READ_PREFETCH 1
159 #endif
161 /* True if read can be prefetched by a write prefetch. */
163 #ifndef READ_CAN_USE_WRITE_PREFETCH
164 #define READ_CAN_USE_WRITE_PREFETCH 0
165 #endif
167 /* The size of the block loaded by a single prefetch. Usually, this is
168 the same as cache line size (at the moment, we only consider one level
169 of cache hierarchy). */
171 #ifndef PREFETCH_BLOCK
172 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
173 #endif
175 /* Do we have forward hardware sequential prefetching? */
177 #ifndef HAVE_FORWARD_PREFETCH
178 #define HAVE_FORWARD_PREFETCH 0
179 #endif
181 /* Do we have backward hardware sequential prefetching? */
183 #ifndef HAVE_BACKWARD_PREFETCH
184 #define HAVE_BACKWARD_PREFETCH 0
185 #endif
187 /* In some cases we are only able to determine that there is a certain
188 probability that the two accesses hit the same cache line. In this
189 case, we issue the prefetches for both of them if this probability
190 is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
192 #ifndef ACCEPTABLE_MISS_RATE
193 #define ACCEPTABLE_MISS_RATE 50
194 #endif
196 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
197 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
199 /* We consider a memory access nontemporal if it is not reused sooner than
200 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
201 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
202 so that we use nontemporal prefetches e.g. if a single memory location
203 is accessed several times in a single iteration of the loop. */
204 #define NONTEMPORAL_FRACTION 16
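/* Worked example (the cache sizes below are target parameters and are
   used here only for illustration): assuming an L1 cache of 32 kB and an
   L2 cache of 512 kB, a reference is treated as nontemporal when its
   reuse distance is at least 512 * 1024 bytes, while reuses closer than
   32 * 1024 / 16 = 2048 bytes are ignored by the reuse-distance
   computation.  */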
206 /* In case we have to emit a memory fence instruction after the loop that
207 uses nontemporal stores, this defines the builtin to use. */
209 #ifndef FENCE_FOLLOWING_MOVNT
210 #define FENCE_FOLLOWING_MOVNT NULL_TREE
211 #endif
213 /* It is not profitable to prefetch when the trip count is not at
214 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
215 For example, in a loop with a prefetch ahead distance of 10,
216 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
217 profitable to prefetch when the trip count is greater than or equal to
218 40. In that case, 30 out of the 40 iterations will benefit from
219 prefetching. */
221 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
222 #define TRIP_COUNT_TO_AHEAD_RATIO 4
223 #endif
225 /* The group of references between which reuse may occur. */
227 struct mem_ref_group
229 tree base; /* Base of the reference. */
230 tree step; /* Step of the reference. */
231 struct mem_ref *refs; /* References in the group. */
232 struct mem_ref_group *next; /* Next group of references. */
235 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
237 #define PREFETCH_ALL HOST_WIDE_INT_M1U
239 /* Do not generate a prefetch if the unroll factor is significantly less
240 than what is required by the prefetch. This is to avoid redundant
241 prefetches. For example, when prefetch_mod is 16 and unroll_factor is
242 2, prefetching requires unrolling the loop 16 times, but
243 the loop is actually unrolled twice. In this case (ratio = 8),
244 prefetching is not likely to be beneficial. */
246 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
247 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
248 #endif
250 /* Some of the prefetch computations have quadratic complexity. We want to
251 avoid huge compile times and, therefore, want to limit the amount of
252 memory references per loop where we consider prefetching. */
254 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
255 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
256 #endif
258 /* The memory reference. */
260 struct mem_ref
262 gimple *stmt; /* Statement in which the reference appears. */
263 tree mem; /* The reference. */
264 HOST_WIDE_INT delta; /* Constant offset of the reference. */
265 struct mem_ref_group *group; /* The group of references it belongs to. */
266 unsigned HOST_WIDE_INT prefetch_mod;
267 /* Prefetch only each PREFETCH_MOD-th
268 iteration. */
269 unsigned HOST_WIDE_INT prefetch_before;
270 /* Prefetch only first PREFETCH_BEFORE
271 iterations. */
272 unsigned reuse_distance; /* The amount of data accessed before the first
273 reuse of this value. */
274 struct mem_ref *next; /* The next reference in the group. */
275 unsigned write_p : 1; /* Is it a write? */
276 unsigned independent_p : 1; /* True if the reference is independent of
277 all other references inside the loop. */
278 unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
279 unsigned storent_p : 1; /* True if we changed the store to a
280 nontemporal one. */
283 /* Dumps information about a memory reference to FILE. */
284 static void
285 dump_mem_details (FILE *file, tree base, tree step,
286 HOST_WIDE_INT delta, bool write_p)
288 fprintf (file, "(base ");
289 print_generic_expr (file, base, TDF_SLIM);
290 fprintf (file, ", step ");
291 if (cst_and_fits_in_hwi (step))
292 fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
293 else
294 print_generic_expr (file, step, TDF_TREE);
295 fprintf (file, ")\n");
296 fprintf (file, " delta ");
297 fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
298 fprintf (file, "\n");
299 fprintf (file, " %s\n", write_p ? "write" : "read");
300 fprintf (file, "\n");
303 /* Dumps information about reference REF to FILE. */
305 static void
306 dump_mem_ref (FILE *file, struct mem_ref *ref)
308 fprintf (file, "Reference %p:\n", (void *) ref);
310 fprintf (file, " group %p ", (void *) ref->group);
312 dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
313 ref->write_p);
316 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
317 exist. */
319 static struct mem_ref_group *
320 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
322 struct mem_ref_group *group;
324 for (; *groups; groups = &(*groups)->next)
326 if (operand_equal_p ((*groups)->step, step, 0)
327 && operand_equal_p ((*groups)->base, base, 0))
328 return *groups;
330 /* If step is an integer constant, keep the list of groups sorted
331 by decreasing step. */
332 if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
333 && int_cst_value ((*groups)->step) < int_cst_value (step))
334 break;
337 group = XNEW (struct mem_ref_group);
338 group->base = base;
339 group->step = step;
340 group->refs = NULL;
341 group->next = *groups;
342 *groups = group;
344 return group;
347 /* Records a memory reference MEM in GROUP with offset DELTA and write status
348 WRITE_P. The reference occurs in statement STMT. */
350 static void
351 record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
352 HOST_WIDE_INT delta, bool write_p)
354 struct mem_ref **aref;
356 /* Do not record the same address twice. */
357 for (aref = &group->refs; *aref; aref = &(*aref)->next)
359 /* It need not be possible for a write reference to reuse the read
360 prefetch, or vice versa. */
361 if (!WRITE_CAN_USE_READ_PREFETCH
362 && write_p
363 && !(*aref)->write_p)
364 continue;
365 if (!READ_CAN_USE_WRITE_PREFETCH
366 && !write_p
367 && (*aref)->write_p)
368 continue;
370 if ((*aref)->delta == delta)
371 return;
374 (*aref) = XNEW (struct mem_ref);
375 (*aref)->stmt = stmt;
376 (*aref)->mem = mem;
377 (*aref)->delta = delta;
378 (*aref)->write_p = write_p;
379 (*aref)->prefetch_before = PREFETCH_ALL;
380 (*aref)->prefetch_mod = 1;
381 (*aref)->reuse_distance = 0;
382 (*aref)->issue_prefetch_p = false;
383 (*aref)->group = group;
384 (*aref)->next = NULL;
385 (*aref)->independent_p = false;
386 (*aref)->storent_p = false;
388 if (dump_file && (dump_flags & TDF_DETAILS))
389 dump_mem_ref (dump_file, *aref);
392 /* Release memory references in GROUPS. */
394 static void
395 release_mem_refs (struct mem_ref_group *groups)
397 struct mem_ref_group *next_g;
398 struct mem_ref *ref, *next_r;
400 for (; groups; groups = next_g)
402 next_g = groups->next;
403 for (ref = groups->refs; ref; ref = next_r)
405 next_r = ref->next;
406 free (ref);
408 free (groups);
412 /* A structure used to pass arguments to idx_analyze_ref. */
414 struct ar_data
416 struct loop *loop; /* Loop of the reference. */
417 gimple *stmt; /* Statement of the reference. */
418 tree *step; /* Step of the memory reference. */
419 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
422 /* Analyzes a single INDEX of a memory reference to obtain information
423 described at analyze_ref. Callback for for_each_index. */
425 static bool
426 idx_analyze_ref (tree base, tree *index, void *data)
428 struct ar_data *ar_data = (struct ar_data *) data;
429 tree ibase, step, stepsize;
430 HOST_WIDE_INT idelta = 0, imult = 1;
431 affine_iv iv;
433 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
434 *index, &iv, true))
435 return false;
436 ibase = iv.base;
437 step = iv.step;
439 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
440 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
442 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
443 ibase = TREE_OPERAND (ibase, 0);
445 if (cst_and_fits_in_hwi (ibase))
447 idelta += int_cst_value (ibase);
448 ibase = build_int_cst (TREE_TYPE (ibase), 0);
451 if (TREE_CODE (base) == ARRAY_REF)
453 stepsize = array_ref_element_size (base);
454 if (!cst_and_fits_in_hwi (stepsize))
455 return false;
456 imult = int_cst_value (stepsize);
457 step = fold_build2 (MULT_EXPR, sizetype,
458 fold_convert (sizetype, step),
459 fold_convert (sizetype, stepsize));
460 idelta *= imult;
463 if (*ar_data->step == NULL_TREE)
464 *ar_data->step = step;
465 else
466 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
467 fold_convert (sizetype, *ar_data->step),
468 fold_convert (sizetype, step));
469 *ar_data->delta += idelta;
470 *index = ibase;
472 return true;
475 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
476 STEP are integer constants and iter is the number of iterations of LOOP. The
477 reference occurs in statement STMT. Strips nonaddressable component
478 references from REF_P. */
480 static bool
481 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
482 tree *step, HOST_WIDE_INT *delta,
483 gimple *stmt)
485 struct ar_data ar_data;
486 tree off;
487 HOST_WIDE_INT bit_offset;
488 tree ref = *ref_p;
490 *step = NULL_TREE;
491 *delta = 0;
493 /* First strip off the component references. Ignore bitfields.
494 Also strip off the real and imaginary parts of a complex, so that
495 they can have the same base. */
496 if (TREE_CODE (ref) == REALPART_EXPR
497 || TREE_CODE (ref) == IMAGPART_EXPR
498 || (TREE_CODE (ref) == COMPONENT_REF
499 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
501 if (TREE_CODE (ref) == IMAGPART_EXPR)
502 *delta += int_size_in_bytes (TREE_TYPE (ref));
503 ref = TREE_OPERAND (ref, 0);
506 *ref_p = ref;
508 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
510 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
511 bit_offset = TREE_INT_CST_LOW (off);
512 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
514 *delta += bit_offset / BITS_PER_UNIT;
517 *base = unshare_expr (ref);
518 ar_data.loop = loop;
519 ar_data.stmt = stmt;
520 ar_data.step = step;
521 ar_data.delta = delta;
522 return for_each_index (base, idx_analyze_ref, &ar_data);
525 /* Record a memory reference REF to the list REFS. The reference occurs in
526 LOOP in statement STMT and it is write if WRITE_P. Returns true if the
527 reference was recorded, false otherwise. */
529 static bool
530 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
531 tree ref, bool write_p, gimple *stmt)
533 tree base, step;
534 HOST_WIDE_INT delta;
535 struct mem_ref_group *agrp;
537 if (get_base_address (ref) == NULL)
538 return false;
540 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
541 return false;
542 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
543 if (step == NULL_TREE)
544 return false;
546 /* Stop if the address of BASE could not be taken. */
547 if (may_be_nonaddressable_p (base))
548 return false;
550 /* Limit non-constant step prefetching to the innermost loops, and only
551 when the step is loop invariant in the entire loop nest. */
552 if (!cst_and_fits_in_hwi (step))
554 if (loop->inner != NULL)
556 if (dump_file && (dump_flags & TDF_DETAILS))
558 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
559 print_generic_expr (dump_file, ref, TDF_TREE);
560 fprintf (dump_file,":");
561 dump_mem_details (dump_file, base, step, delta, write_p);
562 fprintf (dump_file,
563 "Ignoring %p, non-constant step prefetching is "
564 "limited to innermost loops\n",
565 (void *) ref);
567 return false;
569 else
571 if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
573 if (dump_file && (dump_flags & TDF_DETAILS))
575 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
576 print_generic_expr (dump_file, ref, TDF_TREE);
577 fprintf (dump_file,":");
578 dump_mem_details (dump_file, base, step, delta, write_p);
579 fprintf (dump_file,
580 "Not prefetching, ignoring %p due to "
581 "loop variant step\n",
582 (void *) ref);
584 return false;
589 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
590 are integer constants. */
591 agrp = find_or_create_group (refs, base, step);
592 record_ref (agrp, stmt, ref, delta, write_p);
594 return true;
597 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
598 true if there are no other memory references inside the loop. */
600 static struct mem_ref_group *
601 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
603 basic_block *body = get_loop_body_in_dom_order (loop);
604 basic_block bb;
605 unsigned i;
606 gimple_stmt_iterator bsi;
607 gimple *stmt;
608 tree lhs, rhs;
609 struct mem_ref_group *refs = NULL;
611 *no_other_refs = true;
612 *ref_count = 0;
614 /* Scan the loop body in order, so that earlier references precede the
615 later ones. */
616 for (i = 0; i < loop->num_nodes; i++)
618 bb = body[i];
619 if (bb->loop_father != loop)
620 continue;
622 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
624 stmt = gsi_stmt (bsi);
626 if (gimple_code (stmt) != GIMPLE_ASSIGN)
628 if (gimple_vuse (stmt)
629 || (is_gimple_call (stmt)
630 && !(gimple_call_flags (stmt) & ECF_CONST)))
631 *no_other_refs = false;
632 continue;
635 if (! gimple_vuse (stmt))
636 continue;
638 lhs = gimple_assign_lhs (stmt);
639 rhs = gimple_assign_rhs1 (stmt);
641 if (REFERENCE_CLASS_P (rhs))
643 *no_other_refs &= gather_memory_references_ref (loop, &refs,
644 rhs, false, stmt);
645 *ref_count += 1;
647 if (REFERENCE_CLASS_P (lhs))
649 *no_other_refs &= gather_memory_references_ref (loop, &refs,
650 lhs, true, stmt);
651 *ref_count += 1;
655 free (body);
657 return refs;
660 /* Prune the prefetch candidate REF using the self-reuse. */
662 static void
663 prune_ref_by_self_reuse (struct mem_ref *ref)
665 HOST_WIDE_INT step;
666 bool backward;
668 /* If the step size is non-constant, we cannot calculate prefetch_mod. */
669 if (!cst_and_fits_in_hwi (ref->group->step))
670 return;
672 step = int_cst_value (ref->group->step);
674 backward = step < 0;
676 if (step == 0)
678 /* Prefetch references to an invariant address just once. */
679 ref->prefetch_before = 1;
680 return;
683 if (backward)
684 step = -step;
686 if (step > PREFETCH_BLOCK)
687 return;
689 if ((backward && HAVE_BACKWARD_PREFETCH)
690 || (!backward && HAVE_FORWARD_PREFETCH))
692 ref->prefetch_before = 1;
693 return;
696 ref->prefetch_mod = PREFETCH_BLOCK / step;
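/* Worked example for the function above: with PREFETCH_BLOCK of 64 bytes
   and a constant step of 16 bytes (reference (3) in the file header
   comment), no hardware sequential prefetch applies and prefetch_mod
   becomes 64 / 16 = 4, i.e. one prefetch every fourth iteration.  */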
699 /* Divides X by BY, rounding down. */
701 static HOST_WIDE_INT
702 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
704 gcc_assert (by > 0);
706 if (x >= 0)
707 return x / by;
708 else
709 return (x + by - 1) / by;
712 /* Given a CACHE_LINE_SIZE and two inductive memory references
713 with a common STEP greater than CACHE_LINE_SIZE and an address
714 difference DELTA, compute the probability that they will fall
715 in different cache lines. Return true if the computed miss rate
716 is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
717 number of distinct iterations after which the pattern repeats itself.
718 ALIGN_UNIT is the unit of alignment in bytes. */
720 static bool
721 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
722 HOST_WIDE_INT step, HOST_WIDE_INT delta,
723 unsigned HOST_WIDE_INT distinct_iters,
724 int align_unit)
726 unsigned align, iter;
727 int total_positions, miss_positions, max_allowed_miss_positions;
728 int address1, address2, cache_line1, cache_line2;
730 /* It always misses if delta is greater than or equal to the cache
731 line size. */
732 if (delta >= (HOST_WIDE_INT) cache_line_size)
733 return false;
735 miss_positions = 0;
736 total_positions = (cache_line_size / align_unit) * distinct_iters;
737 max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
739 /* Iterate through all possible alignments of the first
740 memory reference within its cache line. */
741 for (align = 0; align < cache_line_size; align += align_unit)
743 /* Iterate through all distinct iterations. */
744 for (iter = 0; iter < distinct_iters; iter++)
746 address1 = align + step * iter;
747 address2 = address1 + delta;
748 cache_line1 = address1 / cache_line_size;
749 cache_line2 = address2 / cache_line_size;
750 if (cache_line1 != cache_line2)
752 miss_positions += 1;
753 if (miss_positions > max_allowed_miss_positions)
754 return false;
757 return true;
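/* Worked example for the function above, tied to references (4) and (5)
   from the file header comment: cache_line_size = 64, step = 187,
   delta = 50, align_unit = 1, distinct_iters = 64 (187 is odd, so the
   step/line ratio cannot be reduced).  address1 mod 64 takes every value
   in 0..63 equally often, and the two addresses share a cache line only
   when address1 mod 64 < 64 - 50 = 14, i.e. with probability
   14/64 = 7/32.  The resulting miss rate of 50/64 (about 781 per
   thousand) far exceeds the allowed fraction derived from
   ACCEPTABLE_MISS_RATE, so the function returns false and the caller does
   not set PREFETCH_BEFORE for reference (4).  */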
760 /* Prune the prefetch candidate REF using the reuse with BY.
761 If BY_IS_BEFORE is true, BY is before REF in the loop. */
763 static void
764 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
765 bool by_is_before)
767 HOST_WIDE_INT step;
768 bool backward;
769 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
770 HOST_WIDE_INT delta = delta_b - delta_r;
771 HOST_WIDE_INT hit_from;
772 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
773 HOST_WIDE_INT reduced_step;
774 unsigned HOST_WIDE_INT reduced_prefetch_block;
775 tree ref_type;
776 int align_unit;
778 /* If the step is non-constant, we cannot calculate prefetch_before. */
779 if (!cst_and_fits_in_hwi (ref->group->step)) {
780 return;
783 step = int_cst_value (ref->group->step);
785 backward = step < 0;
788 if (delta == 0)
790 /* If the references have the same address, only prefetch the
791 former. */
792 if (by_is_before)
793 ref->prefetch_before = 0;
795 return;
798 if (!step)
800 /* If the reference addresses are invariant and fall into the
801 same cache line, prefetch just the first one. */
802 if (!by_is_before)
803 return;
805 if (ddown (ref->delta, PREFETCH_BLOCK)
806 != ddown (by->delta, PREFETCH_BLOCK))
807 return;
809 ref->prefetch_before = 0;
810 return;
813 /* Only prune the reference that is behind in the array. */
814 if (backward)
816 if (delta > 0)
817 return;
819 /* Transform the data so that we may assume that the accesses
820 are forward. */
821 delta = - delta;
822 step = -step;
823 delta_r = PREFETCH_BLOCK - 1 - delta_r;
824 delta_b = PREFETCH_BLOCK - 1 - delta_b;
826 else
828 if (delta < 0)
829 return;
832 /* Check whether the two references are likely to hit the same cache
833 line, and how distant the iterations in which this occurs are from
834 each other. */
836 if (step <= PREFETCH_BLOCK)
838 /* The accesses are sure to meet. Let us check when. */
839 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
840 prefetch_before = (hit_from - delta_r + step - 1) / step;
842 /* Do not reduce prefetch_before if we meet beyond cache size. */
843 if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
844 prefetch_before = PREFETCH_ALL;
845 if (prefetch_before < ref->prefetch_before)
846 ref->prefetch_before = prefetch_before;
848 return;
851 /* A more complicated case with step > prefetch_block. First reduce
852 the ratio between the step and the cache line size to its simplest
853 terms. The resulting denominator will then represent the number of
854 distinct iterations after which each address will go back to its
855 initial location within the cache line. This computation assumes
856 that PREFETCH_BLOCK is a power of two. */
857 prefetch_block = PREFETCH_BLOCK;
858 reduced_prefetch_block = prefetch_block;
859 reduced_step = step;
860 while ((reduced_step & 1) == 0
861 && reduced_prefetch_block > 1)
863 reduced_step >>= 1;
864 reduced_prefetch_block >>= 1;
867 prefetch_before = delta / step;
868 delta %= step;
869 ref_type = TREE_TYPE (ref->mem);
870 align_unit = TYPE_ALIGN (ref_type) / 8;
871 if (is_miss_rate_acceptable (prefetch_block, step, delta,
872 reduced_prefetch_block, align_unit))
874 /* Do not reduce prefetch_before if we meet beyond cache size. */
875 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
876 prefetch_before = PREFETCH_ALL;
877 if (prefetch_before < ref->prefetch_before)
878 ref->prefetch_before = prefetch_before;
880 return;
883 /* Try also the following iteration. */
884 prefetch_before++;
885 delta = step - delta;
886 if (is_miss_rate_acceptable (prefetch_block, step, delta,
887 reduced_prefetch_block, align_unit))
889 if (prefetch_before < ref->prefetch_before)
890 ref->prefetch_before = prefetch_before;
892 return;
895 /* The ref probably does not reuse BY. */
896 return;
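/* Worked example for the function above, using references (1) and (2)
   from the file header comment (a[i] and a[i + 64], char elements,
   PREFETCH_BLOCK 64): pruning (1) against (2) gives step = 1,
   delta_r = 0 and delta_b = 64, so hit_from = 64 and
   prefetch_before = (64 - 0 + 1 - 1) / 1 = 64, matching the
   PREFETCH_BEFORE of 64 quoted in the header comment.  */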
899 /* Prune the prefetch candidate REF using the reuses with other references
900 in REFS. */
902 static void
903 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
905 struct mem_ref *prune_by;
906 bool before = true;
908 prune_ref_by_self_reuse (ref);
910 for (prune_by = refs; prune_by; prune_by = prune_by->next)
912 if (prune_by == ref)
914 before = false;
915 continue;
918 if (!WRITE_CAN_USE_READ_PREFETCH
919 && ref->write_p
920 && !prune_by->write_p)
921 continue;
922 if (!READ_CAN_USE_WRITE_PREFETCH
923 && !ref->write_p
924 && prune_by->write_p)
925 continue;
927 prune_ref_by_group_reuse (ref, prune_by, before);
931 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
933 static void
934 prune_group_by_reuse (struct mem_ref_group *group)
936 struct mem_ref *ref_pruned;
938 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
940 prune_ref_by_reuse (ref_pruned, group->refs);
942 if (dump_file && (dump_flags & TDF_DETAILS))
944 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
946 if (ref_pruned->prefetch_before == PREFETCH_ALL
947 && ref_pruned->prefetch_mod == 1)
948 fprintf (dump_file, " no restrictions");
949 else if (ref_pruned->prefetch_before == 0)
950 fprintf (dump_file, " do not prefetch");
951 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
952 fprintf (dump_file, " prefetch once");
953 else
955 if (ref_pruned->prefetch_before != PREFETCH_ALL)
957 fprintf (dump_file, " prefetch before ");
958 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
959 ref_pruned->prefetch_before);
961 if (ref_pruned->prefetch_mod != 1)
963 fprintf (dump_file, " prefetch mod ");
964 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
965 ref_pruned->prefetch_mod);
968 fprintf (dump_file, "\n");
973 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
975 static void
976 prune_by_reuse (struct mem_ref_group *groups)
978 for (; groups; groups = groups->next)
979 prune_group_by_reuse (groups);
982 /* Returns true if we should issue prefetch for REF. */
984 static bool
985 should_issue_prefetch_p (struct mem_ref *ref)
987 /* For now do not issue prefetches that would be useful for only the
988 first few iterations. */
989 if (ref->prefetch_before != PREFETCH_ALL)
991 if (dump_file && (dump_flags & TDF_DETAILS))
992 fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
993 (void *) ref);
994 return false;
997 /* Do not prefetch nontemporal stores. */
998 if (ref->storent_p)
1000 if (dump_file && (dump_flags & TDF_DETAILS))
1001 fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1002 return false;
1005 return true;
1008 /* Decide which of the prefetch candidates in GROUPS to prefetch.
1009 AHEAD is the number of iterations to prefetch ahead (which corresponds
1010 to the number of simultaneous instances of one prefetch running at a
1011 time). UNROLL_FACTOR is the factor by which the loop is going to be
1012 unrolled. Returns true if there is anything to prefetch. */
1014 static bool
1015 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1016 unsigned ahead)
1018 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1019 unsigned slots_per_prefetch;
1020 struct mem_ref *ref;
1021 bool any = false;
1023 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
1024 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1026 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1027 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1028 it will need a prefetch slot. */
1029 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1030 if (dump_file && (dump_flags & TDF_DETAILS))
1031 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1032 slots_per_prefetch);
1034 /* For now we just take memory references one by one and issue
1035 prefetches for as many as possible. The groups are sorted
1036 starting with the largest step, since the references with
1037 large step are more likely to cause many cache misses. */
1039 for (; groups; groups = groups->next)
1040 for (ref = groups->refs; ref; ref = ref->next)
1042 if (!should_issue_prefetch_p (ref))
1043 continue;
1045 /* The loop is far from being sufficiently unrolled for this
1046 prefetch. Do not generate the prefetch, to avoid many redundant
1047 prefetches. */
1048 if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1049 continue;
1051 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1052 and we unroll the loop UNROLL_FACTOR times, we need to insert
1053 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1054 iteration. */
1055 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1056 / ref->prefetch_mod);
1057 prefetch_slots = n_prefetches * slots_per_prefetch;
1059 /* If more than half of the prefetches would be lost anyway, do not
1060 issue the prefetch. */
1061 if (2 * remaining_prefetch_slots < prefetch_slots)
1062 continue;
1064 ref->issue_prefetch_p = true;
1066 if (remaining_prefetch_slots <= prefetch_slots)
1067 return true;
1068 remaining_prefetch_slots -= prefetch_slots;
1069 any = true;
1072 return any;
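/* Worked example for the function above (the concrete numbers are only
   illustrative; SIMULTANEOUS_PREFETCHES is a target parameter): with
   SIMULTANEOUS_PREFETCHES = 6, ahead = 4 and unroll_factor = 2, each
   prefetch occupies slots_per_prefetch = (4 + 1) / 2 = 2 slots.  A
   reference with prefetch_mod = 1 then needs
   n_prefetches = ceil (2 / 1) = 2 prefetches, i.e. 4 slots, which fits
   in the 6 available slots; 2 slots remain for further references.  */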
1075 /* Return TRUE if no prefetch is going to be generated in the given
1076 GROUPS. */
1078 static bool
1079 nothing_to_prefetch_p (struct mem_ref_group *groups)
1081 struct mem_ref *ref;
1083 for (; groups; groups = groups->next)
1084 for (ref = groups->refs; ref; ref = ref->next)
1085 if (should_issue_prefetch_p (ref))
1086 return false;
1088 return true;
1091 /* Estimate the number of prefetches in the given GROUPS.
1092 UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1094 static int
1095 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1097 struct mem_ref *ref;
1098 unsigned n_prefetches;
1099 int prefetch_count = 0;
1101 for (; groups; groups = groups->next)
1102 for (ref = groups->refs; ref; ref = ref->next)
1103 if (should_issue_prefetch_p (ref))
1105 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1106 / ref->prefetch_mod);
1107 prefetch_count += n_prefetches;
1110 return prefetch_count;
1113 /* Issue prefetches for the reference REF into loop as decided before.
1114 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1115 is the factor by which LOOP was unrolled. */
1117 static void
1118 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1120 HOST_WIDE_INT delta;
1121 tree addr, addr_base, write_p, local, forward;
1122 gcall *prefetch;
1123 gimple_stmt_iterator bsi;
1124 unsigned n_prefetches, ap;
1125 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1127 if (dump_file && (dump_flags & TDF_DETAILS))
1128 fprintf (dump_file, "Issued%s prefetch for %p.\n",
1129 nontemporal ? " nontemporal" : "",
1130 (void *) ref);
1132 bsi = gsi_for_stmt (ref->stmt);
1134 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1135 / ref->prefetch_mod);
1136 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1137 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1138 true, NULL, true, GSI_SAME_STMT);
1139 write_p = ref->write_p ? integer_one_node : integer_zero_node;
1140 local = nontemporal ? integer_zero_node : integer_three_node;
1142 for (ap = 0; ap < n_prefetches; ap++)
1144 if (cst_and_fits_in_hwi (ref->group->step))
1146 /* Determine the address to prefetch. */
1147 delta = (ahead + ap * ref->prefetch_mod) *
1148 int_cst_value (ref->group->step);
1149 addr = fold_build_pointer_plus_hwi (addr_base, delta);
1150 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1151 true, GSI_SAME_STMT);
1153 else
1155 /* The step size is non-constant but loop-invariant. We use the
1156 heuristic of simply prefetching AHEAD iterations ahead. */
1157 forward = fold_build2 (MULT_EXPR, sizetype,
1158 fold_convert (sizetype, ref->group->step),
1159 fold_convert (sizetype, size_int (ahead)));
1160 addr = fold_build_pointer_plus (addr_base, forward);
1161 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1162 NULL, true, GSI_SAME_STMT);
1165 if (addr_base != addr
1166 && TREE_CODE (addr_base) == SSA_NAME
1167 && TREE_CODE (addr) == SSA_NAME)
1169 duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
1170 /* As this isn't a plain copy we have to reset alignment
1171 information. */
1172 if (SSA_NAME_PTR_INFO (addr))
1173 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
1176 /* Create the prefetch instruction. */
1177 prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1178 3, addr, write_p, local);
1179 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
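/* Sketch of the emitted code (illustrative only): for a write to an int
   array with a constant step of 4 bytes, ahead = 16 and
   prefetch_mod = 1, the first prefetch uses
   delta = (16 + 0 * 1) * 4 = 64, so the inserted statement is roughly
   equivalent to the source-level call

     __builtin_prefetch ((char *) &a[i] + 64, 1, 3);

   where 1 means a write prefetch and 3 requests maximal temporal
   locality (0 is used instead for a nontemporal reference).  */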
1183 /* Issue prefetches for the references in GROUPS into loop as decided before.
1184 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1185 factor by which LOOP was unrolled. */
1187 static void
1188 issue_prefetches (struct mem_ref_group *groups,
1189 unsigned unroll_factor, unsigned ahead)
1191 struct mem_ref *ref;
1193 for (; groups; groups = groups->next)
1194 for (ref = groups->refs; ref; ref = ref->next)
1195 if (ref->issue_prefetch_p)
1196 issue_prefetch_ref (ref, unroll_factor, ahead);
1199 /* Returns true if REF is a memory write for which a nontemporal store insn
1200 can be used. */
1202 static bool
1203 nontemporal_store_p (struct mem_ref *ref)
1205 machine_mode mode;
1206 enum insn_code code;
1208 /* REF must be a write that is not reused. We require it to be independent
1209 of all other memory references in the loop, as the nontemporal stores may
1210 be reordered with respect to other memory references. */
1211 if (!ref->write_p
1212 || !ref->independent_p
1213 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1214 return false;
1216 /* Check that we have the storent instruction for the mode. */
1217 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1218 if (mode == BLKmode)
1219 return false;
1221 code = optab_handler (storent_optab, mode);
1222 return code != CODE_FOR_nothing;
1225 /* If REF is a nontemporal store, we mark the corresponding modify statement
1226 and return true. Otherwise, we return false. */
1228 static bool
1229 mark_nontemporal_store (struct mem_ref *ref)
1231 if (!nontemporal_store_p (ref))
1232 return false;
1234 if (dump_file && (dump_flags & TDF_DETAILS))
1235 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1236 (void *) ref);
1238 gimple_assign_set_nontemporal_move (ref->stmt, true);
1239 ref->storent_p = true;
1241 return true;
1244 /* Issue a memory fence instruction after LOOP. */
1246 static void
1247 emit_mfence_after_loop (struct loop *loop)
1249 vec<edge> exits = get_loop_exit_edges (loop);
1250 edge exit;
1251 gcall *call;
1252 gimple_stmt_iterator bsi;
1253 unsigned i;
1255 FOR_EACH_VEC_ELT (exits, i, exit)
1257 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1259 if (!single_pred_p (exit->dest)
1260 /* If possible, we prefer not to insert the fence on other paths
1261 in the CFG. */
1262 && !(exit->flags & EDGE_ABNORMAL))
1263 split_loop_exit_edge (exit);
1264 bsi = gsi_after_labels (exit->dest);
1266 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1269 exits.release ();
1270 update_ssa (TODO_update_ssa_only_virtuals);
1273 /* Returns true if we can use storent in loop, false otherwise. */
1275 static bool
1276 may_use_storent_in_loop_p (struct loop *loop)
1278 bool ret = true;
1280 if (loop->inner != NULL)
1281 return false;
1283 /* If we must issue an mfence insn after using storent, check that there
1284 is a suitable place for it at each of the loop exits. */
1285 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1287 vec<edge> exits = get_loop_exit_edges (loop);
1288 unsigned i;
1289 edge exit;
1291 FOR_EACH_VEC_ELT (exits, i, exit)
1292 if ((exit->flags & EDGE_ABNORMAL)
1293 && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1294 ret = false;
1296 exits.release ();
1299 return ret;
1302 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1303 references in the loop. */
1305 static void
1306 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1308 struct mem_ref *ref;
1309 bool any = false;
1311 if (!may_use_storent_in_loop_p (loop))
1312 return;
1314 for (; groups; groups = groups->next)
1315 for (ref = groups->refs; ref; ref = ref->next)
1316 any |= mark_nontemporal_store (ref);
1318 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1319 emit_mfence_after_loop (loop);
1322 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1323 this is the case, fill in DESC with the description of the number of
1324 iterations. */
1326 static bool
1327 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1328 unsigned factor)
1330 if (!can_unroll_loop_p (loop, factor, desc))
1331 return false;
1333 /* We only consider loops without control flow for unrolling. This is not
1334 a hard restriction -- tree_unroll_loop works with arbitrary loops
1335 as well; but the unrolling/prefetching is usually more profitable for
1336 loops consisting of a single basic block, and we want to limit the
1337 code growth. */
1338 if (loop->num_nodes > 2)
1339 return false;
1341 return true;
1344 /* Determine the factor by which to unroll LOOP, from the information
1345 contained in the list of memory references REFS. The description of the
1346 number of iterations of LOOP is stored to DESC. NINSNS is the number of
1347 insns of the LOOP. EST_NITER is the estimated number of iterations of
1348 the loop, or -1 if no estimate is available. */
1350 static unsigned
1351 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1352 unsigned ninsns, struct tree_niter_desc *desc,
1353 HOST_WIDE_INT est_niter)
1355 unsigned upper_bound;
1356 unsigned nfactor, factor, mod_constraint;
1357 struct mem_ref_group *agp;
1358 struct mem_ref *ref;
1360 /* First check whether the loop is not too large to unroll. We ignore
1361 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1362 from unrolling them enough to make exactly one cache line covered by each
1363 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1364 us from unrolling the loops too many times in cases where we only expect
1365 gains from better scheduling and decreasing loop overhead, which is not
1366 the case here. */
1367 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1369 /* If we unrolled the loop more times than it iterates, the unrolled version
1370 of the loop would never be entered. */
1371 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1372 upper_bound = est_niter;
1374 if (upper_bound <= 1)
1375 return 1;
1377 /* Choose the factor so that we may prefetch each cache line just once,
1378 but bound the unrolling by UPPER_BOUND. */
1379 factor = 1;
1380 for (agp = refs; agp; agp = agp->next)
1381 for (ref = agp->refs; ref; ref = ref->next)
1382 if (should_issue_prefetch_p (ref))
1384 mod_constraint = ref->prefetch_mod;
1385 nfactor = least_common_multiple (mod_constraint, factor);
1386 if (nfactor <= upper_bound)
1387 factor = nfactor;
1390 if (!should_unroll_loop_p (loop, desc, factor))
1391 return 1;
1393 return factor;
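/* Worked example for the function above (the parameter value is only
   illustrative): with ninsns = 8 and PARAM_MAX_UNROLLED_INSNS = 200 the
   upper bound on the factor is 200 / 8 = 25.  For two references with
   prefetch_mod 4 and 8, the factor grows as lcm (1, 4) = 4 and then
   lcm (4, 8) = 8, which still fits under the bound, so the loop is
   unrolled 8 times (provided should_unroll_loop_p agrees).  */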
1396 /* Returns the total volume of the memory references REFS, taking into account
1397 reuses in the innermost loop and cache line size. TODO -- we should also
1398 take into account reuses across the iterations of the loops in the loop
1399 nest. */
1401 static unsigned
1402 volume_of_references (struct mem_ref_group *refs)
1404 unsigned volume = 0;
1405 struct mem_ref_group *gr;
1406 struct mem_ref *ref;
1408 for (gr = refs; gr; gr = gr->next)
1409 for (ref = gr->refs; ref; ref = ref->next)
1411 /* Almost always reuses another value? */
1412 if (ref->prefetch_before != PREFETCH_ALL)
1413 continue;
1415 /* If several iterations access the same cache line, use the size of
1416 the line divided by this number. Otherwise, a cache line is
1417 accessed in each iteration. TODO -- in the latter case, we should
1418 take the size of the reference into account, rounding it up to a
1419 multiple of the cache line size. */
1420 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1422 return volume;
1425 /* Returns the volume of memory references accessed across VEC iterations of
1426 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1427 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1429 static unsigned
1430 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1432 unsigned i;
1434 for (i = 0; i < n; i++)
1435 if (vec[i] != 0)
1436 break;
1438 if (i == n)
1439 return 0;
1441 gcc_assert (vec[i] > 0);
1443 /* We ignore the parts of the distance vector in subloops, since usually
1444 the numbers of iterations are much smaller. */
1445 return loop_sizes[i] * vec[i];
1448 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1449 at the position corresponding to the loop of the step. N is the depth
1450 of the considered loop nest, and LOOP is its innermost loop. */
1452 static void
1453 add_subscript_strides (tree access_fn, unsigned stride,
1454 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1456 struct loop *aloop;
1457 tree step;
1458 HOST_WIDE_INT astep;
1459 unsigned min_depth = loop_depth (loop) - n;
1461 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1463 aloop = get_chrec_loop (access_fn);
1464 step = CHREC_RIGHT (access_fn);
1465 access_fn = CHREC_LEFT (access_fn);
1467 if ((unsigned) loop_depth (aloop) <= min_depth)
1468 continue;
1470 if (tree_fits_shwi_p (step))
1471 astep = tree_to_shwi (step);
1472 else
1473 astep = L1_CACHE_LINE_SIZE;
1475 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1480 /* Returns the volume of memory references accessed between two consecutive
1481 self-reuses of the reference DR. We consider the subscripts of DR in N
1482 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1483 loops. LOOP is the innermost loop of the current loop nest. */
1485 static unsigned
1486 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1487 struct loop *loop)
1489 tree stride, access_fn;
1490 HOST_WIDE_INT *strides, astride;
1491 vec<tree> access_fns;
1492 tree ref = DR_REF (dr);
1493 unsigned i, ret = ~0u;
1495 /* In the following example:
1497 for (i = 0; i < N; i++)
1498 for (j = 0; j < N; j++)
1499 use (a[j][i]);
1500 the same cache line is accessed each N steps (except if the change from
1501 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1502 we cannot rely purely on the results of the data dependence analysis.
1504 Instead, we compute the stride of the reference in each loop, and consider
1505 the innermost loop in which the stride is less than the cache size. */
1507 strides = XCNEWVEC (HOST_WIDE_INT, n);
1508 access_fns = DR_ACCESS_FNS (dr);
1510 FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1512 /* Keep track of the reference corresponding to the subscript, so that we
1513 know its stride. */
1514 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1515 ref = TREE_OPERAND (ref, 0);
1517 if (TREE_CODE (ref) == ARRAY_REF)
1519 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1520 if (tree_fits_uhwi_p (stride))
1521 astride = tree_to_uhwi (stride);
1522 else
1523 astride = L1_CACHE_LINE_SIZE;
1525 ref = TREE_OPERAND (ref, 0);
1527 else
1528 astride = 1;
1530 add_subscript_strides (access_fn, astride, strides, n, loop);
1533 for (i = n; i-- > 0; )
1535 unsigned HOST_WIDE_INT s;
1537 s = strides[i] < 0 ? -strides[i] : strides[i];
1539 if (s < (unsigned) L1_CACHE_LINE_SIZE
1540 && (loop_sizes[i]
1541 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1543 ret = loop_sizes[i];
1544 break;
1548 free (strides);
1549 return ret;
1552 /* Determines the distance till the first reuse of each reference in REFS
1553 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1554 memory references in the loop. Return false if the analysis fails. */
1556 static bool
1557 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1558 bool no_other_refs)
1560 struct loop *nest, *aloop;
1561 vec<data_reference_p> datarefs = vNULL;
1562 vec<ddr_p> dependences = vNULL;
1563 struct mem_ref_group *gr;
1564 struct mem_ref *ref, *refb;
1565 auto_vec<loop_p> vloops;
1566 unsigned *loop_data_size;
1567 unsigned i, j, n;
1568 unsigned volume, dist, adist;
1569 HOST_WIDE_INT vol;
1570 data_reference_p dr;
1571 ddr_p dep;
1573 if (loop->inner)
1574 return true;
1576 /* Find the outermost loop of the loop nest of loop (we require that
1577 there are no sibling loops inside the nest). */
1578 nest = loop;
1579 while (1)
1581 aloop = loop_outer (nest);
1583 if (aloop == current_loops->tree_root
1584 || aloop->inner->next)
1585 break;
1587 nest = aloop;
1590 /* For each loop, determine the amount of data accessed in each iteration.
1591 We use this to estimate whether the reference is evicted from the
1592 cache before its reuse. */
1593 find_loop_nest (nest, &vloops);
1594 n = vloops.length ();
1595 loop_data_size = XNEWVEC (unsigned, n);
1596 volume = volume_of_references (refs);
1597 i = n;
1598 while (i-- != 0)
1600 loop_data_size[i] = volume;
1601 /* Bound the volume by the L2 cache size, since above this bound,
1602 all dependence distances are equivalent. */
1603 if (volume > L2_CACHE_SIZE_BYTES)
1604 continue;
1606 aloop = vloops[i];
1607 vol = estimated_stmt_executions_int (aloop);
1608 if (vol == -1)
1609 vol = expected_loop_iterations (aloop);
1610 volume *= vol;
1613 /* Prepare the references in the form suitable for data dependence
1614 analysis. We ignore unanalyzable data references (the results
1615 are used just as a heuristic to estimate the temporality of the
1616 references, hence we do not need to worry about correctness). */
1617 for (gr = refs; gr; gr = gr->next)
1618 for (ref = gr->refs; ref; ref = ref->next)
1620 dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1621 ref->mem, ref->stmt, !ref->write_p);
1623 if (dr)
1625 ref->reuse_distance = volume;
1626 dr->aux = ref;
1627 datarefs.safe_push (dr);
1629 else
1630 no_other_refs = false;
1633 FOR_EACH_VEC_ELT (datarefs, i, dr)
1635 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1636 ref = (struct mem_ref *) dr->aux;
1637 if (ref->reuse_distance > dist)
1638 ref->reuse_distance = dist;
1640 if (no_other_refs)
1641 ref->independent_p = true;
1644 if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1645 return false;
1647 FOR_EACH_VEC_ELT (dependences, i, dep)
1649 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1650 continue;
1652 ref = (struct mem_ref *) DDR_A (dep)->aux;
1653 refb = (struct mem_ref *) DDR_B (dep)->aux;
1655 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1656 || DDR_NUM_DIST_VECTS (dep) == 0)
1658 /* If the dependence cannot be analyzed, assume that there might be
1659 a reuse. */
1660 dist = 0;
1662 ref->independent_p = false;
1663 refb->independent_p = false;
1665 else
1667 /* The distance vectors are normalized to be always lexicographically
1668 positive, hence we cannot tell just from them whether DDR_A comes
1669 before DDR_B or vice versa. However, it is not important,
1670 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1671 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1672 in cache (and marking it as nontemporal would not affect
1673 anything). */
1675 dist = volume;
1676 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1678 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1679 loop_data_size, n);
1681 /* If this is a dependence in the innermost loop (i.e., the
1682 distances in all superloops are zero) and it is not
1683 the trivial self-dependence with distance zero, record that
1684 the references are not completely independent. */
1685 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1686 && (ref != refb
1687 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1689 ref->independent_p = false;
1690 refb->independent_p = false;
1693 /* Ignore accesses closer than
1694 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1695 so that we use nontemporal prefetches e.g. if a single memory
1696 location is accessed several times in a single iteration of
1697 the loop. */
1698 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1699 continue;
1701 if (adist < dist)
1702 dist = adist;
1706 if (ref->reuse_distance > dist)
1707 ref->reuse_distance = dist;
1708 if (refb->reuse_distance > dist)
1709 refb->reuse_distance = dist;
1712 free_dependence_relations (dependences);
1713 free_data_refs (datarefs);
1714 free (loop_data_size);
1716 if (dump_file && (dump_flags & TDF_DETAILS))
1718 fprintf (dump_file, "Reuse distances:\n");
1719 for (gr = refs; gr; gr = gr->next)
1720 for (ref = gr->refs; ref; ref = ref->next)
1721 fprintf (dump_file, " ref %p distance %u\n",
1722 (void *) ref, ref->reuse_distance);
1725 return true;
1728 /* Determine whether or not the trip count to ahead ratio is too small based
1729 on profitability considerations.
1730 AHEAD: the iteration ahead distance,
1731 EST_NITER: the estimated trip count. */
1733 static bool
1734 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1736 /* Assume trip count to ahead ratio is big enough if the trip count could not
1737 be estimated at compile time. */
1738 if (est_niter < 0)
1739 return false;
1741 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1743 if (dump_file && (dump_flags & TDF_DETAILS))
1744 fprintf (dump_file,
1745 "Not prefetching -- loop estimated to roll only %d times\n",
1746 (int) est_niter);
1747 return true;
1750 return false;
1753 /* Determine whether or not the number of memory references in the loop is
1754 reasonable based on profitability and compilation time considerations.
1755 NINSNS: estimated number of instructions in the loop,
1756 MEM_REF_COUNT: total number of memory references in the loop. */
1758 static bool
1759 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1761 int insn_to_mem_ratio;
1763 if (mem_ref_count == 0)
1764 return false;
1766 /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1767 (compute_all_dependences) have high costs based on quadratic complexity.
1768 To avoid huge compilation time, we give up prefetching if mem_ref_count
1769 is too large. */
1770 if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1771 return false;
1773 /* Prefetching improves performance by overlapping cache missing
1774 memory accesses with CPU operations. If the loop does not have
1775 enough CPU operations to overlap with memory operations, prefetching
1776 won't give a significant benefit. One approximate way of checking
1777 this is to require the ratio of instructions to memory references to
1778 be above a certain limit. This approximation works well in practice.
1779 TODO: Implement a more precise computation by estimating the time
1780 for each CPU or memory op in the loop. Time estimates for memory ops
1781 should account for cache misses. */
1782 insn_to_mem_ratio = ninsns / mem_ref_count;
1784 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1786 if (dump_file && (dump_flags & TDF_DETAILS))
1787 fprintf (dump_file,
1788 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1789 insn_to_mem_ratio);
1790 return false;
1793 return true;
1796 /* Determine whether or not the instruction to prefetch ratio in the loop is
1797 too small based on profitability considerations.
1798 NINSNS: estimated number of instructions in the loop,
1799 PREFETCH_COUNT: an estimate of the number of prefetches,
1800 UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1802 static bool
1803 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1804 unsigned unroll_factor)
1806 int insn_to_prefetch_ratio;
1808 /* Prefetching most likely causes performance degradation when the instruction
1809 to prefetch ratio is too small. Too many prefetch instructions in a loop
1810 may reduce the I-cache performance.
1811 (unroll_factor * ninsns) is used to estimate the number of instructions in
1812 the unrolled loop. This implementation is a bit simplistic -- the number
1813 of issued prefetch instructions is also affected by unrolling. So,
1814 prefetch_mod and the unroll factor should be taken into account when
1815 determining prefetch_count. Also, the number of insns of the unrolled
1816 loop will usually be significantly smaller than the number of insns of the
1817 original loop * unroll_factor (at least the induction variable increases
1818 and the exit branches will get eliminated), so it might be better to use
1819 tree_estimate_loop_size + estimated_unrolled_size. */
1820 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1821 if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1823 if (dump_file && (dump_flags & TDF_DETAILS))
1824 fprintf (dump_file,
1825 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1826 insn_to_prefetch_ratio);
1827 return true;
1830 return false;
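/* Worked example for the function above (MIN_INSN_TO_PREFETCH_RATIO is a
   target parameter; 9 is used here only for illustration): with
   unroll_factor = 4, ninsns = 10 and prefetch_count = 8, the ratio is
   (4 * 10) / 8 = 5.  With a minimum ratio of 9 this is considered too
   small, so prefetching would be rejected for the loop.  */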
1834 /* Issue prefetch instructions for array references in LOOP. Returns
1835 true if the LOOP was unrolled. */
1837 static bool
1838 loop_prefetch_arrays (struct loop *loop)
1840 struct mem_ref_group *refs;
1841 unsigned ahead, ninsns, time, unroll_factor;
1842 HOST_WIDE_INT est_niter;
1843 struct tree_niter_desc desc;
1844 bool unrolled = false, no_other_refs;
1845 unsigned prefetch_count;
1846 unsigned mem_ref_count;
1848 if (optimize_loop_nest_for_size_p (loop))
1850 if (dump_file && (dump_flags & TDF_DETAILS))
1851 fprintf (dump_file, " ignored (cold area)\n");
1852 return false;
1855 /* FIXME: the time should be weighted by the probabilities of the blocks in
1856 the loop body. */
1857 time = tree_num_loop_insns (loop, &eni_time_weights);
1858 if (time == 0)
1859 return false;
1861 ahead = (PREFETCH_LATENCY + time - 1) / time;
1862 est_niter = estimated_stmt_executions_int (loop);
1863 if (est_niter == -1)
1864 est_niter = likely_max_stmt_executions_int (loop);
1866 /* Prefetching is not likely to be profitable if the trip count to ahead
1867 ratio is too small. */
1868 if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1869 return false;
1871 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1873 /* Step 1: gather the memory references. */
1874 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1876 /* Give up prefetching if the number of memory references in the
1877 loop is not reasonable based on profitability and compilation time
1878 considerations. */
1879 if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1880 goto fail;
1882 /* Step 2: estimate the reuse effects. */
1883 prune_by_reuse (refs);
1885 if (nothing_to_prefetch_p (refs))
1886 goto fail;
1888 if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1889 goto fail;
1891 /* Step 3: determine unroll factor. */
1892 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1893 est_niter);
1895 /* Estimate prefetch count for the unrolled loop. */
1896 prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1897 if (prefetch_count == 0)
1898 goto fail;
1900 if (dump_file && (dump_flags & TDF_DETAILS))
1901 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1902 HOST_WIDE_INT_PRINT_DEC "\n"
1903 "insn count %d, mem ref count %d, prefetch count %d\n",
1904 ahead, unroll_factor, est_niter,
1905 ninsns, mem_ref_count, prefetch_count);
1907 /* Prefetching is not likely to be profitable if the instruction to prefetch
1908 ratio is too small. */
1909 if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1910 unroll_factor))
1911 goto fail;
1913 mark_nontemporal_stores (loop, refs);
1915 /* Step 4: what to prefetch? */
1916 if (!schedule_prefetches (refs, unroll_factor, ahead))
1917 goto fail;
1919 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1920 iterations so that we do not issue superfluous prefetches. */
1921 if (unroll_factor != 1)
1923 tree_unroll_loop (loop, unroll_factor,
1924 single_dom_exit (loop), &desc);
1925 unrolled = true;
1928 /* Step 6: issue the prefetches. */
1929 issue_prefetches (refs, unroll_factor, ahead);
1931 fail:
1932 release_mem_refs (refs);
1933 return unrolled;
1936 /* Issue prefetch instructions for array references in loops. */
1938 unsigned int
1939 tree_ssa_prefetch_arrays (void)
1941 struct loop *loop;
1942 bool unrolled = false;
1943 int todo_flags = 0;
1945 if (!targetm.have_prefetch ()
1946 /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1947 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1948 of processor costs and i486 does not have prefetch, but
1949 -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */
1950 || PREFETCH_BLOCK == 0)
1951 return 0;
1953 if (dump_file && (dump_flags & TDF_DETAILS))
1955 fprintf (dump_file, "Prefetching parameters:\n");
1956 fprintf (dump_file, " simultaneous prefetches: %d\n",
1957 SIMULTANEOUS_PREFETCHES);
1958 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1959 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1960 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1961 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1962 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1963 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1964 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1965 MIN_INSN_TO_PREFETCH_RATIO);
1966 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1967 PREFETCH_MIN_INSN_TO_MEM_RATIO);
1968 fprintf (dump_file, "\n");
1971 initialize_original_copy_tables ();
1973 if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1975 tree type = build_function_type_list (void_type_node,
1976 const_ptr_type_node, NULL_TREE);
1977 tree decl = add_builtin_function ("__builtin_prefetch", type,
1978 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1979 NULL, NULL_TREE);
1980 DECL_IS_NOVOPS (decl) = true;
1981 set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1984 /* We assume that the size of a cache line is a power of two, so verify this
1985 here. */
1986 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1988 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1990 if (dump_file && (dump_flags & TDF_DETAILS))
1991 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1993 unrolled |= loop_prefetch_arrays (loop);
1995 if (dump_file && (dump_flags & TDF_DETAILS))
1996 fprintf (dump_file, "\n\n");
1999 if (unrolled)
2001 scev_reset ();
2002 todo_flags |= TODO_cleanup_cfg;
2005 free_original_copy_tables ();
2006 return todo_flags;
2009 /* Prefetching. */
2011 namespace {
2013 const pass_data pass_data_loop_prefetch =
2015 GIMPLE_PASS, /* type */
2016 "aprefetch", /* name */
2017 OPTGROUP_LOOP, /* optinfo_flags */
2018 TV_TREE_PREFETCH, /* tv_id */
2019 ( PROP_cfg | PROP_ssa ), /* properties_required */
2020 0, /* properties_provided */
2021 0, /* properties_destroyed */
2022 0, /* todo_flags_start */
2023 0, /* todo_flags_finish */
2026 class pass_loop_prefetch : public gimple_opt_pass
2028 public:
2029 pass_loop_prefetch (gcc::context *ctxt)
2030 : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2033 /* opt_pass methods: */
2034 virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2035 virtual unsigned int execute (function *);
2037 }; // class pass_loop_prefetch
2039 unsigned int
2040 pass_loop_prefetch::execute (function *fun)
2042 if (number_of_loops (fun) <= 1)
2043 return 0;
2045 return tree_ssa_prefetch_arrays ();
2048 } // anon namespace
2050 gimple_opt_pass *
2051 make_pass_loop_prefetch (gcc::context *ctxt)
2053 return new pass_loop_prefetch (ctxt);