2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
22 #include "coretypes.h"
29 #include "fold-const.h"
30 #include "stor-layout.h"
32 #include "tree-pretty-print.h"
33 #include "internal-fn.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-ssa.h"
38 #include "tree-ssa-loop-ivopts.h"
39 #include "tree-ssa-loop-manip.h"
40 #include "tree-ssa-loop-niter.h"
41 #include "tree-ssa-loop.h"
42 #include "tree-into-ssa.h"
44 #include "tree-pass.h"
45 #include "insn-config.h"
46 #include "tree-chrec.h"
47 #include "tree-scalar-evolution.h"
48 #include "diagnostic-core.h"
50 #include "langhooks.h"
51 #include "tree-inline.h"
52 #include "tree-data-ref.h"
56 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
57 between the GIMPLE and RTL worlds. */
67 #include "insn-codes.h"
68 #include "optabs-query.h"
71 /* This pass inserts prefetch instructions to optimize cache usage during
72 accesses to arrays in loops. It processes loops sequentially and:
74 1) Gathers all memory references in the single loop.
75 2) For each of the references it decides when it is profitable to prefetch
76 it. To do it, we evaluate the reuse among the accesses, and determines
77 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
78 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
79 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
80 iterations of the loop that are zero modulo PREFETCH_MOD). For example
81 (assuming cache line size is 64 bytes, char has size 1 byte and there
82 is no hardware sequential prefetch):
85 for (i = 0; i < max; i++)
92 a[187*i + 50] = ...; (5)
95 (0) obviously has PREFETCH_BEFORE 1
96 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
97 location 64 iterations before it, and PREFETCH_MOD 64 (since
98 it hits the same cache line otherwise).
99 (2) has PREFETCH_MOD 64
100 (3) has PREFETCH_MOD 4
101 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
102 the cache line accessed by (5) is the same with probability only
104 (5) has PREFETCH_MOD 1 as well.
106 Additionally, we use data dependence analysis to determine for each
107 reference the distance till the first reuse; this information is used
108 to determine the temporality of the issued prefetch instruction.
110 3) We determine how much ahead we need to prefetch. The number of
111 iterations needed is time to fetch / time spent in one iteration of
112 the loop. The problem is that we do not know either of these values,
113 so we just make a heuristic guess based on a magic (possibly)
114 target-specific constant and size of the loop.
116 4) Determine which of the references we prefetch. We take into account
117 that there is a maximum number of simultaneous prefetches (provided
118 by machine description). We prefetch as many prefetches as possible
119 while still within this bound (starting with those with lowest
120 prefetch_mod, since they are responsible for most of the cache
123 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
124 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
125 prefetching nonaccessed memory.
126 TODO -- actually implement peeling.
128 6) We actually emit the prefetch instructions. ??? Perhaps emit the
129 prefetch instructions with guards in cases where 5) was not sufficient
130 to satisfy the constraints?
132 A cost model is implemented to determine whether or not prefetching is
133 profitable for a given loop. The cost model has three heuristics:
135 1. Function trip_count_to_ahead_ratio_too_small_p implements a
136 heuristic that determines whether or not the loop has too few
137 iterations (compared to ahead). Prefetching is not likely to be
138 beneficial if the trip count to ahead ratio is below a certain
141 2. Function mem_ref_count_reasonable_p implements a heuristic that
142 determines whether the given loop has enough CPU ops that can be
143 overlapped with cache missing memory ops. If not, the loop
144 won't benefit from prefetching. In the implementation,
145 prefetching is not considered beneficial if the ratio between
146 the instruction count and the mem ref count is below a certain
149 3. Function insn_to_prefetch_ratio_too_small_p implements a
150 heuristic that disables prefetching in a loop if the prefetching
151 cost is above a certain limit. The relative prefetching cost is
152 estimated by taking the ratio between the prefetch count and the
153 total intruction count (this models the I-cache cost).
155 The limits used in these heuristics are defined as parameters with
156 reasonable default values. Machine-specific default values will be
160 -- write and use more general reuse analysis (that could be also used
161 in other cache aimed loop optimizations)
162 -- make it behave sanely together with the prefetches given by user
163 (now we just ignore them; at the very least we should avoid
164 optimizing loops in that user put his own prefetches)
165 -- we assume cache line size alignment of arrays; this could be
168 /* Magic constants follow. These should be replaced by machine specific
171 /* True if write can be prefetched by a read prefetch. */
173 #ifndef WRITE_CAN_USE_READ_PREFETCH
174 #define WRITE_CAN_USE_READ_PREFETCH 1
177 /* True if read can be prefetched by a write prefetch. */
179 #ifndef READ_CAN_USE_WRITE_PREFETCH
180 #define READ_CAN_USE_WRITE_PREFETCH 0
183 /* The size of the block loaded by a single prefetch. Usually, this is
184 the same as cache line size (at the moment, we only consider one level
185 of cache hierarchy). */
187 #ifndef PREFETCH_BLOCK
188 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
191 /* Do we have a forward hardware sequential prefetching? */
193 #ifndef HAVE_FORWARD_PREFETCH
194 #define HAVE_FORWARD_PREFETCH 0
197 /* Do we have a backward hardware sequential prefetching? */
199 #ifndef HAVE_BACKWARD_PREFETCH
200 #define HAVE_BACKWARD_PREFETCH 0
203 /* In some cases we are only able to determine that there is a certain
204 probability that the two accesses hit the same cache line. In this
205 case, we issue the prefetches for both of them if this probability
206 is less then (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
208 #ifndef ACCEPTABLE_MISS_RATE
209 #define ACCEPTABLE_MISS_RATE 50
212 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
213 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
215 /* We consider a memory access nontemporal if it is not reused sooner than
216 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
217 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
218 so that we use nontemporal prefetches e.g. if single memory location
219 is accessed several times in a single iteration of the loop. */
220 #define NONTEMPORAL_FRACTION 16
222 /* In case we have to emit a memory fence instruction after the loop that
223 uses nontemporal stores, this defines the builtin to use. */
225 #ifndef FENCE_FOLLOWING_MOVNT
226 #define FENCE_FOLLOWING_MOVNT NULL_TREE
229 /* It is not profitable to prefetch when the trip count is not at
230 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
231 For example, in a loop with a prefetch ahead distance of 10,
232 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
233 profitable to prefetch when the trip count is greater or equal to
234 40. In that case, 30 out of the 40 iterations will benefit from
237 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
238 #define TRIP_COUNT_TO_AHEAD_RATIO 4
241 /* The group of references between that reuse may occur. */
245 tree base
; /* Base of the reference. */
246 tree step
; /* Step of the reference. */
247 struct mem_ref
*refs
; /* References in the group. */
248 struct mem_ref_group
*next
; /* Next group of references. */
251 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
253 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
255 /* Do not generate a prefetch if the unroll factor is significantly less
256 than what is required by the prefetch. This is to avoid redundant
257 prefetches. For example, when prefetch_mod is 16 and unroll_factor is
258 2, prefetching requires unrolling the loop 16 times, but
259 the loop is actually unrolled twice. In this case (ratio = 8),
260 prefetching is not likely to be beneficial. */
262 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
263 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
266 /* Some of the prefetch computations have quadratic complexity. We want to
267 avoid huge compile times and, therefore, want to limit the amount of
268 memory references per loop where we consider prefetching. */
270 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
271 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
274 /* The memory reference. */
278 gimple
*stmt
; /* Statement in that the reference appears. */
279 tree mem
; /* The reference. */
280 HOST_WIDE_INT delta
; /* Constant offset of the reference. */
281 struct mem_ref_group
*group
; /* The group of references it belongs to. */
282 unsigned HOST_WIDE_INT prefetch_mod
;
283 /* Prefetch only each PREFETCH_MOD-th
285 unsigned HOST_WIDE_INT prefetch_before
;
286 /* Prefetch only first PREFETCH_BEFORE
288 unsigned reuse_distance
; /* The amount of data accessed before the first
289 reuse of this value. */
290 struct mem_ref
*next
; /* The next reference in the group. */
291 unsigned write_p
: 1; /* Is it a write? */
292 unsigned independent_p
: 1; /* True if the reference is independent on
293 all other references inside the loop. */
294 unsigned issue_prefetch_p
: 1; /* Should we really issue the prefetch? */
295 unsigned storent_p
: 1; /* True if we changed the store to a
299 /* Dumps information about memory reference */
301 dump_mem_details (FILE *file
, tree base
, tree step
,
302 HOST_WIDE_INT delta
, bool write_p
)
304 fprintf (file
, "(base ");
305 print_generic_expr (file
, base
, TDF_SLIM
);
306 fprintf (file
, ", step ");
307 if (cst_and_fits_in_hwi (step
))
308 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
, int_cst_value (step
));
310 print_generic_expr (file
, step
, TDF_TREE
);
311 fprintf (file
, ")\n");
312 fprintf (file
, " delta ");
313 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
, delta
);
314 fprintf (file
, "\n");
315 fprintf (file
, " %s\n", write_p
? "write" : "read");
316 fprintf (file
, "\n");
319 /* Dumps information about reference REF to FILE. */
322 dump_mem_ref (FILE *file
, struct mem_ref
*ref
)
324 fprintf (file
, "Reference %p:\n", (void *) ref
);
326 fprintf (file
, " group %p ", (void *) ref
->group
);
328 dump_mem_details (file
, ref
->group
->base
, ref
->group
->step
, ref
->delta
,
332 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
335 static struct mem_ref_group
*
336 find_or_create_group (struct mem_ref_group
**groups
, tree base
, tree step
)
338 struct mem_ref_group
*group
;
340 for (; *groups
; groups
= &(*groups
)->next
)
342 if (operand_equal_p ((*groups
)->step
, step
, 0)
343 && operand_equal_p ((*groups
)->base
, base
, 0))
346 /* If step is an integer constant, keep the list of groups sorted
347 by decreasing step. */
348 if (cst_and_fits_in_hwi ((*groups
)->step
) && cst_and_fits_in_hwi (step
)
349 && int_cst_value ((*groups
)->step
) < int_cst_value (step
))
353 group
= XNEW (struct mem_ref_group
);
357 group
->next
= *groups
;
363 /* Records a memory reference MEM in GROUP with offset DELTA and write status
364 WRITE_P. The reference occurs in statement STMT. */
367 record_ref (struct mem_ref_group
*group
, gimple
*stmt
, tree mem
,
368 HOST_WIDE_INT delta
, bool write_p
)
370 struct mem_ref
**aref
;
372 /* Do not record the same address twice. */
373 for (aref
= &group
->refs
; *aref
; aref
= &(*aref
)->next
)
375 /* It does not have to be possible for write reference to reuse the read
376 prefetch, or vice versa. */
377 if (!WRITE_CAN_USE_READ_PREFETCH
379 && !(*aref
)->write_p
)
381 if (!READ_CAN_USE_WRITE_PREFETCH
386 if ((*aref
)->delta
== delta
)
390 (*aref
) = XNEW (struct mem_ref
);
391 (*aref
)->stmt
= stmt
;
393 (*aref
)->delta
= delta
;
394 (*aref
)->write_p
= write_p
;
395 (*aref
)->prefetch_before
= PREFETCH_ALL
;
396 (*aref
)->prefetch_mod
= 1;
397 (*aref
)->reuse_distance
= 0;
398 (*aref
)->issue_prefetch_p
= false;
399 (*aref
)->group
= group
;
400 (*aref
)->next
= NULL
;
401 (*aref
)->independent_p
= false;
402 (*aref
)->storent_p
= false;
404 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
405 dump_mem_ref (dump_file
, *aref
);
408 /* Release memory references in GROUPS. */
411 release_mem_refs (struct mem_ref_group
*groups
)
413 struct mem_ref_group
*next_g
;
414 struct mem_ref
*ref
, *next_r
;
416 for (; groups
; groups
= next_g
)
418 next_g
= groups
->next
;
419 for (ref
= groups
->refs
; ref
; ref
= next_r
)
428 /* A structure used to pass arguments to idx_analyze_ref. */
432 struct loop
*loop
; /* Loop of the reference. */
433 gimple
*stmt
; /* Statement of the reference. */
434 tree
*step
; /* Step of the memory reference. */
435 HOST_WIDE_INT
*delta
; /* Offset of the memory reference. */
438 /* Analyzes a single INDEX of a memory reference to obtain information
439 described at analyze_ref. Callback for for_each_index. */
442 idx_analyze_ref (tree base
, tree
*index
, void *data
)
444 struct ar_data
*ar_data
= (struct ar_data
*) data
;
445 tree ibase
, step
, stepsize
;
446 HOST_WIDE_INT idelta
= 0, imult
= 1;
449 if (!simple_iv (ar_data
->loop
, loop_containing_stmt (ar_data
->stmt
),
455 if (TREE_CODE (ibase
) == POINTER_PLUS_EXPR
456 && cst_and_fits_in_hwi (TREE_OPERAND (ibase
, 1)))
458 idelta
= int_cst_value (TREE_OPERAND (ibase
, 1));
459 ibase
= TREE_OPERAND (ibase
, 0);
461 if (cst_and_fits_in_hwi (ibase
))
463 idelta
+= int_cst_value (ibase
);
464 ibase
= build_int_cst (TREE_TYPE (ibase
), 0);
467 if (TREE_CODE (base
) == ARRAY_REF
)
469 stepsize
= array_ref_element_size (base
);
470 if (!cst_and_fits_in_hwi (stepsize
))
472 imult
= int_cst_value (stepsize
);
473 step
= fold_build2 (MULT_EXPR
, sizetype
,
474 fold_convert (sizetype
, step
),
475 fold_convert (sizetype
, stepsize
));
479 if (*ar_data
->step
== NULL_TREE
)
480 *ar_data
->step
= step
;
482 *ar_data
->step
= fold_build2 (PLUS_EXPR
, sizetype
,
483 fold_convert (sizetype
, *ar_data
->step
),
484 fold_convert (sizetype
, step
));
485 *ar_data
->delta
+= idelta
;
491 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
492 STEP are integer constants and iter is number of iterations of LOOP. The
493 reference occurs in statement STMT. Strips nonaddressable component
494 references from REF_P. */
497 analyze_ref (struct loop
*loop
, tree
*ref_p
, tree
*base
,
498 tree
*step
, HOST_WIDE_INT
*delta
,
501 struct ar_data ar_data
;
503 HOST_WIDE_INT bit_offset
;
509 /* First strip off the component references. Ignore bitfields.
510 Also strip off the real and imagine parts of a complex, so that
511 they can have the same base. */
512 if (TREE_CODE (ref
) == REALPART_EXPR
513 || TREE_CODE (ref
) == IMAGPART_EXPR
514 || (TREE_CODE (ref
) == COMPONENT_REF
515 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref
, 1))))
517 if (TREE_CODE (ref
) == IMAGPART_EXPR
)
518 *delta
+= int_size_in_bytes (TREE_TYPE (ref
));
519 ref
= TREE_OPERAND (ref
, 0);
524 for (; TREE_CODE (ref
) == COMPONENT_REF
; ref
= TREE_OPERAND (ref
, 0))
526 off
= DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref
, 1));
527 bit_offset
= TREE_INT_CST_LOW (off
);
528 gcc_assert (bit_offset
% BITS_PER_UNIT
== 0);
530 *delta
+= bit_offset
/ BITS_PER_UNIT
;
533 *base
= unshare_expr (ref
);
537 ar_data
.delta
= delta
;
538 return for_each_index (base
, idx_analyze_ref
, &ar_data
);
541 /* Record a memory reference REF to the list REFS. The reference occurs in
542 LOOP in statement STMT and it is write if WRITE_P. Returns true if the
543 reference was recorded, false otherwise. */
546 gather_memory_references_ref (struct loop
*loop
, struct mem_ref_group
**refs
,
547 tree ref
, bool write_p
, gimple
*stmt
)
551 struct mem_ref_group
*agrp
;
553 if (get_base_address (ref
) == NULL
)
556 if (!analyze_ref (loop
, &ref
, &base
, &step
, &delta
, stmt
))
558 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
559 if (step
== NULL_TREE
)
562 /* Stop if the address of BASE could not be taken. */
563 if (may_be_nonaddressable_p (base
))
566 /* Limit non-constant step prefetching only to the innermost loops and
567 only when the step is loop invariant in the entire loop nest. */
568 if (!cst_and_fits_in_hwi (step
))
570 if (loop
->inner
!= NULL
)
572 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
574 fprintf (dump_file
, "Memory expression %p\n",(void *) ref
);
575 print_generic_expr (dump_file
, ref
, TDF_TREE
);
576 fprintf (dump_file
,":");
577 dump_mem_details (dump_file
, base
, step
, delta
, write_p
);
579 "Ignoring %p, non-constant step prefetching is "
580 "limited to inner most loops \n",
587 if (!expr_invariant_in_loop_p (loop_outermost (loop
), step
))
589 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
591 fprintf (dump_file
, "Memory expression %p\n",(void *) ref
);
592 print_generic_expr (dump_file
, ref
, TDF_TREE
);
593 fprintf (dump_file
,":");
594 dump_mem_details (dump_file
, base
, step
, delta
, write_p
);
596 "Not prefetching, ignoring %p due to "
597 "loop variant step\n",
605 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
606 are integer constants. */
607 agrp
= find_or_create_group (refs
, base
, step
);
608 record_ref (agrp
, stmt
, ref
, delta
, write_p
);
613 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
614 true if there are no other memory references inside the loop. */
616 static struct mem_ref_group
*
617 gather_memory_references (struct loop
*loop
, bool *no_other_refs
, unsigned *ref_count
)
619 basic_block
*body
= get_loop_body_in_dom_order (loop
);
622 gimple_stmt_iterator bsi
;
625 struct mem_ref_group
*refs
= NULL
;
627 *no_other_refs
= true;
630 /* Scan the loop body in order, so that the former references precede the
632 for (i
= 0; i
< loop
->num_nodes
; i
++)
635 if (bb
->loop_father
!= loop
)
638 for (bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
); gsi_next (&bsi
))
640 stmt
= gsi_stmt (bsi
);
642 if (gimple_code (stmt
) != GIMPLE_ASSIGN
)
644 if (gimple_vuse (stmt
)
645 || (is_gimple_call (stmt
)
646 && !(gimple_call_flags (stmt
) & ECF_CONST
)))
647 *no_other_refs
= false;
651 lhs
= gimple_assign_lhs (stmt
);
652 rhs
= gimple_assign_rhs1 (stmt
);
654 if (REFERENCE_CLASS_P (rhs
))
656 *no_other_refs
&= gather_memory_references_ref (loop
, &refs
,
660 if (REFERENCE_CLASS_P (lhs
))
662 *no_other_refs
&= gather_memory_references_ref (loop
, &refs
,
673 /* Prune the prefetch candidate REF using the self-reuse. */
676 prune_ref_by_self_reuse (struct mem_ref
*ref
)
681 /* If the step size is non constant, we cannot calculate prefetch_mod. */
682 if (!cst_and_fits_in_hwi (ref
->group
->step
))
685 step
= int_cst_value (ref
->group
->step
);
691 /* Prefetch references to invariant address just once. */
692 ref
->prefetch_before
= 1;
699 if (step
> PREFETCH_BLOCK
)
702 if ((backward
&& HAVE_BACKWARD_PREFETCH
)
703 || (!backward
&& HAVE_FORWARD_PREFETCH
))
705 ref
->prefetch_before
= 1;
709 ref
->prefetch_mod
= PREFETCH_BLOCK
/ step
;
712 /* Divides X by BY, rounding down. */
715 ddown (HOST_WIDE_INT x
, unsigned HOST_WIDE_INT by
)
722 return (x
+ by
- 1) / by
;
725 /* Given a CACHE_LINE_SIZE and two inductive memory references
726 with a common STEP greater than CACHE_LINE_SIZE and an address
727 difference DELTA, compute the probability that they will fall
728 in different cache lines. Return true if the computed miss rate
729 is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
730 number of distinct iterations after which the pattern repeats itself.
731 ALIGN_UNIT is the unit of alignment in bytes. */
734 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size
,
735 HOST_WIDE_INT step
, HOST_WIDE_INT delta
,
736 unsigned HOST_WIDE_INT distinct_iters
,
739 unsigned align
, iter
;
740 int total_positions
, miss_positions
, max_allowed_miss_positions
;
741 int address1
, address2
, cache_line1
, cache_line2
;
743 /* It always misses if delta is greater than or equal to the cache
745 if (delta
>= (HOST_WIDE_INT
) cache_line_size
)
749 total_positions
= (cache_line_size
/ align_unit
) * distinct_iters
;
750 max_allowed_miss_positions
= (ACCEPTABLE_MISS_RATE
* total_positions
) / 1000;
752 /* Iterate through all possible alignments of the first
753 memory reference within its cache line. */
754 for (align
= 0; align
< cache_line_size
; align
+= align_unit
)
756 /* Iterate through all distinct iterations. */
757 for (iter
= 0; iter
< distinct_iters
; iter
++)
759 address1
= align
+ step
* iter
;
760 address2
= address1
+ delta
;
761 cache_line1
= address1
/ cache_line_size
;
762 cache_line2
= address2
/ cache_line_size
;
763 if (cache_line1
!= cache_line2
)
766 if (miss_positions
> max_allowed_miss_positions
)
773 /* Prune the prefetch candidate REF using the reuse with BY.
774 If BY_IS_BEFORE is true, BY is before REF in the loop. */
777 prune_ref_by_group_reuse (struct mem_ref
*ref
, struct mem_ref
*by
,
782 HOST_WIDE_INT delta_r
= ref
->delta
, delta_b
= by
->delta
;
783 HOST_WIDE_INT delta
= delta_b
- delta_r
;
784 HOST_WIDE_INT hit_from
;
785 unsigned HOST_WIDE_INT prefetch_before
, prefetch_block
;
786 HOST_WIDE_INT reduced_step
;
787 unsigned HOST_WIDE_INT reduced_prefetch_block
;
791 /* If the step is non constant we cannot calculate prefetch_before. */
792 if (!cst_and_fits_in_hwi (ref
->group
->step
)) {
796 step
= int_cst_value (ref
->group
->step
);
803 /* If the references has the same address, only prefetch the
806 ref
->prefetch_before
= 0;
813 /* If the reference addresses are invariant and fall into the
814 same cache line, prefetch just the first one. */
818 if (ddown (ref
->delta
, PREFETCH_BLOCK
)
819 != ddown (by
->delta
, PREFETCH_BLOCK
))
822 ref
->prefetch_before
= 0;
826 /* Only prune the reference that is behind in the array. */
832 /* Transform the data so that we may assume that the accesses
836 delta_r
= PREFETCH_BLOCK
- 1 - delta_r
;
837 delta_b
= PREFETCH_BLOCK
- 1 - delta_b
;
845 /* Check whether the two references are likely to hit the same cache
846 line, and how distant the iterations in that it occurs are from
849 if (step
<= PREFETCH_BLOCK
)
851 /* The accesses are sure to meet. Let us check when. */
852 hit_from
= ddown (delta_b
, PREFETCH_BLOCK
) * PREFETCH_BLOCK
;
853 prefetch_before
= (hit_from
- delta_r
+ step
- 1) / step
;
855 /* Do not reduce prefetch_before if we meet beyond cache size. */
856 if (prefetch_before
> absu_hwi (L2_CACHE_SIZE_BYTES
/ step
))
857 prefetch_before
= PREFETCH_ALL
;
858 if (prefetch_before
< ref
->prefetch_before
)
859 ref
->prefetch_before
= prefetch_before
;
864 /* A more complicated case with step > prefetch_block. First reduce
865 the ratio between the step and the cache line size to its simplest
866 terms. The resulting denominator will then represent the number of
867 distinct iterations after which each address will go back to its
868 initial location within the cache line. This computation assumes
869 that PREFETCH_BLOCK is a power of two. */
870 prefetch_block
= PREFETCH_BLOCK
;
871 reduced_prefetch_block
= prefetch_block
;
873 while ((reduced_step
& 1) == 0
874 && reduced_prefetch_block
> 1)
877 reduced_prefetch_block
>>= 1;
880 prefetch_before
= delta
/ step
;
882 ref_type
= TREE_TYPE (ref
->mem
);
883 align_unit
= TYPE_ALIGN (ref_type
) / 8;
884 if (is_miss_rate_acceptable (prefetch_block
, step
, delta
,
885 reduced_prefetch_block
, align_unit
))
887 /* Do not reduce prefetch_before if we meet beyond cache size. */
888 if (prefetch_before
> L2_CACHE_SIZE_BYTES
/ PREFETCH_BLOCK
)
889 prefetch_before
= PREFETCH_ALL
;
890 if (prefetch_before
< ref
->prefetch_before
)
891 ref
->prefetch_before
= prefetch_before
;
896 /* Try also the following iteration. */
898 delta
= step
- delta
;
899 if (is_miss_rate_acceptable (prefetch_block
, step
, delta
,
900 reduced_prefetch_block
, align_unit
))
902 if (prefetch_before
< ref
->prefetch_before
)
903 ref
->prefetch_before
= prefetch_before
;
908 /* The ref probably does not reuse by. */
912 /* Prune the prefetch candidate REF using the reuses with other references
916 prune_ref_by_reuse (struct mem_ref
*ref
, struct mem_ref
*refs
)
918 struct mem_ref
*prune_by
;
921 prune_ref_by_self_reuse (ref
);
923 for (prune_by
= refs
; prune_by
; prune_by
= prune_by
->next
)
931 if (!WRITE_CAN_USE_READ_PREFETCH
933 && !prune_by
->write_p
)
935 if (!READ_CAN_USE_WRITE_PREFETCH
937 && prune_by
->write_p
)
940 prune_ref_by_group_reuse (ref
, prune_by
, before
);
944 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
947 prune_group_by_reuse (struct mem_ref_group
*group
)
949 struct mem_ref
*ref_pruned
;
951 for (ref_pruned
= group
->refs
; ref_pruned
; ref_pruned
= ref_pruned
->next
)
953 prune_ref_by_reuse (ref_pruned
, group
->refs
);
955 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
957 fprintf (dump_file
, "Reference %p:", (void *) ref_pruned
);
959 if (ref_pruned
->prefetch_before
== PREFETCH_ALL
960 && ref_pruned
->prefetch_mod
== 1)
961 fprintf (dump_file
, " no restrictions");
962 else if (ref_pruned
->prefetch_before
== 0)
963 fprintf (dump_file
, " do not prefetch");
964 else if (ref_pruned
->prefetch_before
<= ref_pruned
->prefetch_mod
)
965 fprintf (dump_file
, " prefetch once");
968 if (ref_pruned
->prefetch_before
!= PREFETCH_ALL
)
970 fprintf (dump_file
, " prefetch before ");
971 fprintf (dump_file
, HOST_WIDE_INT_PRINT_DEC
,
972 ref_pruned
->prefetch_before
);
974 if (ref_pruned
->prefetch_mod
!= 1)
976 fprintf (dump_file
, " prefetch mod ");
977 fprintf (dump_file
, HOST_WIDE_INT_PRINT_DEC
,
978 ref_pruned
->prefetch_mod
);
981 fprintf (dump_file
, "\n");
986 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
989 prune_by_reuse (struct mem_ref_group
*groups
)
991 for (; groups
; groups
= groups
->next
)
992 prune_group_by_reuse (groups
);
995 /* Returns true if we should issue prefetch for REF. */
998 should_issue_prefetch_p (struct mem_ref
*ref
)
1000 /* For now do not issue prefetches for only first few of the
1002 if (ref
->prefetch_before
!= PREFETCH_ALL
)
1004 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1005 fprintf (dump_file
, "Ignoring %p due to prefetch_before\n",
1010 /* Do not prefetch nontemporal stores. */
1013 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1014 fprintf (dump_file
, "Ignoring nontemporal store %p\n", (void *) ref
);
1021 /* Decide which of the prefetch candidates in GROUPS to prefetch.
1022 AHEAD is the number of iterations to prefetch ahead (which corresponds
1023 to the number of simultaneous instances of one prefetch running at a
1024 time). UNROLL_FACTOR is the factor by that the loop is going to be
1025 unrolled. Returns true if there is anything to prefetch. */
1028 schedule_prefetches (struct mem_ref_group
*groups
, unsigned unroll_factor
,
1031 unsigned remaining_prefetch_slots
, n_prefetches
, prefetch_slots
;
1032 unsigned slots_per_prefetch
;
1033 struct mem_ref
*ref
;
1036 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
1037 remaining_prefetch_slots
= SIMULTANEOUS_PREFETCHES
;
1039 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1040 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1041 it will need a prefetch slot. */
1042 slots_per_prefetch
= (ahead
+ unroll_factor
/ 2) / unroll_factor
;
1043 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1044 fprintf (dump_file
, "Each prefetch instruction takes %u prefetch slots.\n",
1045 slots_per_prefetch
);
1047 /* For now we just take memory references one by one and issue
1048 prefetches for as many as possible. The groups are sorted
1049 starting with the largest step, since the references with
1050 large step are more likely to cause many cache misses. */
1052 for (; groups
; groups
= groups
->next
)
1053 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
1055 if (!should_issue_prefetch_p (ref
))
1058 /* The loop is far from being sufficiently unrolled for this
1059 prefetch. Do not generate prefetch to avoid many redudant
1061 if (ref
->prefetch_mod
/ unroll_factor
> PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
)
1064 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1065 and we unroll the loop UNROLL_FACTOR times, we need to insert
1066 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1068 n_prefetches
= ((unroll_factor
+ ref
->prefetch_mod
- 1)
1069 / ref
->prefetch_mod
);
1070 prefetch_slots
= n_prefetches
* slots_per_prefetch
;
1072 /* If more than half of the prefetches would be lost anyway, do not
1073 issue the prefetch. */
1074 if (2 * remaining_prefetch_slots
< prefetch_slots
)
1077 ref
->issue_prefetch_p
= true;
1079 if (remaining_prefetch_slots
<= prefetch_slots
)
1081 remaining_prefetch_slots
-= prefetch_slots
;
1088 /* Return TRUE if no prefetch is going to be generated in the given
1092 nothing_to_prefetch_p (struct mem_ref_group
*groups
)
1094 struct mem_ref
*ref
;
1096 for (; groups
; groups
= groups
->next
)
1097 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
1098 if (should_issue_prefetch_p (ref
))
1104 /* Estimate the number of prefetches in the given GROUPS.
1105 UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1108 estimate_prefetch_count (struct mem_ref_group
*groups
, unsigned unroll_factor
)
1110 struct mem_ref
*ref
;
1111 unsigned n_prefetches
;
1112 int prefetch_count
= 0;
1114 for (; groups
; groups
= groups
->next
)
1115 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
1116 if (should_issue_prefetch_p (ref
))
1118 n_prefetches
= ((unroll_factor
+ ref
->prefetch_mod
- 1)
1119 / ref
->prefetch_mod
);
1120 prefetch_count
+= n_prefetches
;
1123 return prefetch_count
;
1126 /* Issue prefetches for the reference REF into loop as decided before.
1127 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1128 is the factor by which LOOP was unrolled. */
1131 issue_prefetch_ref (struct mem_ref
*ref
, unsigned unroll_factor
, unsigned ahead
)
1133 HOST_WIDE_INT delta
;
1134 tree addr
, addr_base
, write_p
, local
, forward
;
1136 gimple_stmt_iterator bsi
;
1137 unsigned n_prefetches
, ap
;
1138 bool nontemporal
= ref
->reuse_distance
>= L2_CACHE_SIZE_BYTES
;
1140 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1141 fprintf (dump_file
, "Issued%s prefetch for %p.\n",
1142 nontemporal
? " nontemporal" : "",
1145 bsi
= gsi_for_stmt (ref
->stmt
);
1147 n_prefetches
= ((unroll_factor
+ ref
->prefetch_mod
- 1)
1148 / ref
->prefetch_mod
);
1149 addr_base
= build_fold_addr_expr_with_type (ref
->mem
, ptr_type_node
);
1150 addr_base
= force_gimple_operand_gsi (&bsi
, unshare_expr (addr_base
),
1151 true, NULL
, true, GSI_SAME_STMT
);
1152 write_p
= ref
->write_p
? integer_one_node
: integer_zero_node
;
1153 local
= nontemporal
? integer_zero_node
: integer_three_node
;
1155 for (ap
= 0; ap
< n_prefetches
; ap
++)
1157 if (cst_and_fits_in_hwi (ref
->group
->step
))
1159 /* Determine the address to prefetch. */
1160 delta
= (ahead
+ ap
* ref
->prefetch_mod
) *
1161 int_cst_value (ref
->group
->step
);
1162 addr
= fold_build_pointer_plus_hwi (addr_base
, delta
);
1163 addr
= force_gimple_operand_gsi (&bsi
, unshare_expr (addr
), true, NULL
,
1164 true, GSI_SAME_STMT
);
1168 /* The step size is non-constant but loop-invariant. We use the
1169 heuristic to simply prefetch ahead iterations ahead. */
1170 forward
= fold_build2 (MULT_EXPR
, sizetype
,
1171 fold_convert (sizetype
, ref
->group
->step
),
1172 fold_convert (sizetype
, size_int (ahead
)));
1173 addr
= fold_build_pointer_plus (addr_base
, forward
);
1174 addr
= force_gimple_operand_gsi (&bsi
, unshare_expr (addr
), true,
1175 NULL
, true, GSI_SAME_STMT
);
1177 /* Create the prefetch instruction. */
1178 prefetch
= gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH
),
1179 3, addr
, write_p
, local
);
1180 gsi_insert_before (&bsi
, prefetch
, GSI_SAME_STMT
);
1184 /* Issue prefetches for the references in GROUPS into loop as decided before.
1185 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1186 factor by that LOOP was unrolled. */
1189 issue_prefetches (struct mem_ref_group
*groups
,
1190 unsigned unroll_factor
, unsigned ahead
)
1192 struct mem_ref
*ref
;
1194 for (; groups
; groups
= groups
->next
)
1195 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
1196 if (ref
->issue_prefetch_p
)
1197 issue_prefetch_ref (ref
, unroll_factor
, ahead
);
1200 /* Returns true if REF is a memory write for that a nontemporal store insn
1204 nontemporal_store_p (struct mem_ref
*ref
)
1207 enum insn_code code
;
1209 /* REF must be a write that is not reused. We require it to be independent
1210 on all other memory references in the loop, as the nontemporal stores may
1211 be reordered with respect to other memory references. */
1213 || !ref
->independent_p
1214 || ref
->reuse_distance
< L2_CACHE_SIZE_BYTES
)
1217 /* Check that we have the storent instruction for the mode. */
1218 mode
= TYPE_MODE (TREE_TYPE (ref
->mem
));
1219 if (mode
== BLKmode
)
1222 code
= optab_handler (storent_optab
, mode
);
1223 return code
!= CODE_FOR_nothing
;
1226 /* If REF is a nontemporal store, we mark the corresponding modify statement
1227 and return true. Otherwise, we return false. */
1230 mark_nontemporal_store (struct mem_ref
*ref
)
1232 if (!nontemporal_store_p (ref
))
1235 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1236 fprintf (dump_file
, "Marked reference %p as a nontemporal store.\n",
1239 gimple_assign_set_nontemporal_move (ref
->stmt
, true);
1240 ref
->storent_p
= true;
1245 /* Issue a memory fence instruction after LOOP. */
1248 emit_mfence_after_loop (struct loop
*loop
)
1250 vec
<edge
> exits
= get_loop_exit_edges (loop
);
1253 gimple_stmt_iterator bsi
;
1256 FOR_EACH_VEC_ELT (exits
, i
, exit
)
1258 call
= gimple_build_call (FENCE_FOLLOWING_MOVNT
, 0);
1260 if (!single_pred_p (exit
->dest
)
1261 /* If possible, we prefer not to insert the fence on other paths
1263 && !(exit
->flags
& EDGE_ABNORMAL
))
1264 split_loop_exit_edge (exit
);
1265 bsi
= gsi_after_labels (exit
->dest
);
1267 gsi_insert_before (&bsi
, call
, GSI_NEW_STMT
);
1271 update_ssa (TODO_update_ssa_only_virtuals
);
1274 /* Returns true if we can use storent in loop, false otherwise. */
1277 may_use_storent_in_loop_p (struct loop
*loop
)
1281 if (loop
->inner
!= NULL
)
1284 /* If we must issue a mfence insn after using storent, check that there
1285 is a suitable place for it at each of the loop exits. */
1286 if (FENCE_FOLLOWING_MOVNT
!= NULL_TREE
)
1288 vec
<edge
> exits
= get_loop_exit_edges (loop
);
1292 FOR_EACH_VEC_ELT (exits
, i
, exit
)
1293 if ((exit
->flags
& EDGE_ABNORMAL
)
1294 && exit
->dest
== EXIT_BLOCK_PTR_FOR_FN (cfun
))
1303 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1304 references in the loop. */
1307 mark_nontemporal_stores (struct loop
*loop
, struct mem_ref_group
*groups
)
1309 struct mem_ref
*ref
;
1312 if (!may_use_storent_in_loop_p (loop
))
1315 for (; groups
; groups
= groups
->next
)
1316 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
1317 any
|= mark_nontemporal_store (ref
);
1319 if (any
&& FENCE_FOLLOWING_MOVNT
!= NULL_TREE
)
1320 emit_mfence_after_loop (loop
);
1323 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1324 this is the case, fill in DESC by the description of number of
1328 should_unroll_loop_p (struct loop
*loop
, struct tree_niter_desc
*desc
,
1331 if (!can_unroll_loop_p (loop
, factor
, desc
))
1334 /* We only consider loops without control flow for unrolling. This is not
1335 a hard restriction -- tree_unroll_loop works with arbitrary loops
1336 as well; but the unrolling/prefetching is usually more profitable for
1337 loops consisting of a single basic block, and we want to limit the
1339 if (loop
->num_nodes
> 2)
1345 /* Determine the coefficient by that unroll LOOP, from the information
1346 contained in the list of memory references REFS. Description of
1347 umber of iterations of LOOP is stored to DESC. NINSNS is the number of
1348 insns of the LOOP. EST_NITER is the estimated number of iterations of
1349 the loop, or -1 if no estimate is available. */
1352 determine_unroll_factor (struct loop
*loop
, struct mem_ref_group
*refs
,
1353 unsigned ninsns
, struct tree_niter_desc
*desc
,
1354 HOST_WIDE_INT est_niter
)
1356 unsigned upper_bound
;
1357 unsigned nfactor
, factor
, mod_constraint
;
1358 struct mem_ref_group
*agp
;
1359 struct mem_ref
*ref
;
1361 /* First check whether the loop is not too large to unroll. We ignore
1362 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1363 from unrolling them enough to make exactly one cache line covered by each
1364 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1365 us from unrolling the loops too many times in cases where we only expect
1366 gains from better scheduling and decreasing loop overhead, which is not
1368 upper_bound
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / ninsns
;
1370 /* If we unrolled the loop more times than it iterates, the unrolled version
1371 of the loop would be never entered. */
1372 if (est_niter
>= 0 && est_niter
< (HOST_WIDE_INT
) upper_bound
)
1373 upper_bound
= est_niter
;
1375 if (upper_bound
<= 1)
1378 /* Choose the factor so that we may prefetch each cache just once,
1379 but bound the unrolling by UPPER_BOUND. */
1381 for (agp
= refs
; agp
; agp
= agp
->next
)
1382 for (ref
= agp
->refs
; ref
; ref
= ref
->next
)
1383 if (should_issue_prefetch_p (ref
))
1385 mod_constraint
= ref
->prefetch_mod
;
1386 nfactor
= least_common_multiple (mod_constraint
, factor
);
1387 if (nfactor
<= upper_bound
)
1391 if (!should_unroll_loop_p (loop
, desc
, factor
))
1397 /* Returns the total volume of the memory references REFS, taking into account
1398 reuses in the innermost loop and cache line size. TODO -- we should also
1399 take into account reuses across the iterations of the loops in the loop
1403 volume_of_references (struct mem_ref_group
*refs
)
1405 unsigned volume
= 0;
1406 struct mem_ref_group
*gr
;
1407 struct mem_ref
*ref
;
1409 for (gr
= refs
; gr
; gr
= gr
->next
)
1410 for (ref
= gr
->refs
; ref
; ref
= ref
->next
)
1412 /* Almost always reuses another value? */
1413 if (ref
->prefetch_before
!= PREFETCH_ALL
)
1416 /* If several iterations access the same cache line, use the size of
1417 the line divided by this number. Otherwise, a cache line is
1418 accessed in each iteration. TODO -- in the latter case, we should
1419 take the size of the reference into account, rounding it up on cache
1420 line size multiple. */
1421 volume
+= L1_CACHE_LINE_SIZE
/ ref
->prefetch_mod
;
1426 /* Returns the volume of memory references accessed across VEC iterations of
1427 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1428 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1431 volume_of_dist_vector (lambda_vector vec
, unsigned *loop_sizes
, unsigned n
)
1435 for (i
= 0; i
< n
; i
++)
1442 gcc_assert (vec
[i
] > 0);
1444 /* We ignore the parts of the distance vector in subloops, since usually
1445 the numbers of iterations are much smaller. */
1446 return loop_sizes
[i
] * vec
[i
];
1449 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1450 at the position corresponding to the loop of the step. N is the depth
1451 of the considered loop nest, and, LOOP is its innermost loop. */
1454 add_subscript_strides (tree access_fn
, unsigned stride
,
1455 HOST_WIDE_INT
*strides
, unsigned n
, struct loop
*loop
)
1459 HOST_WIDE_INT astep
;
1460 unsigned min_depth
= loop_depth (loop
) - n
;
1462 while (TREE_CODE (access_fn
) == POLYNOMIAL_CHREC
)
1464 aloop
= get_chrec_loop (access_fn
);
1465 step
= CHREC_RIGHT (access_fn
);
1466 access_fn
= CHREC_LEFT (access_fn
);
1468 if ((unsigned) loop_depth (aloop
) <= min_depth
)
1471 if (tree_fits_shwi_p (step
))
1472 astep
= tree_to_shwi (step
);
1474 astep
= L1_CACHE_LINE_SIZE
;
1476 strides
[n
- 1 - loop_depth (loop
) + loop_depth (aloop
)] += astep
* stride
;
1481 /* Returns the volume of memory references accessed between two consecutive
1482 self-reuses of the reference DR. We consider the subscripts of DR in N
1483 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1484 loops. LOOP is the innermost loop of the current loop nest. */
1487 self_reuse_distance (data_reference_p dr
, unsigned *loop_sizes
, unsigned n
,
1490 tree stride
, access_fn
;
1491 HOST_WIDE_INT
*strides
, astride
;
1492 vec
<tree
> access_fns
;
1493 tree ref
= DR_REF (dr
);
1494 unsigned i
, ret
= ~0u;
1496 /* In the following example:
1498 for (i = 0; i < N; i++)
1499 for (j = 0; j < N; j++)
1501 the same cache line is accessed each N steps (except if the change from
1502 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1503 we cannot rely purely on the results of the data dependence analysis.
1505 Instead, we compute the stride of the reference in each loop, and consider
1506 the innermost loop in that the stride is less than cache size. */
1508 strides
= XCNEWVEC (HOST_WIDE_INT
, n
);
1509 access_fns
= DR_ACCESS_FNS (dr
);
1511 FOR_EACH_VEC_ELT (access_fns
, i
, access_fn
)
1513 /* Keep track of the reference corresponding to the subscript, so that we
1515 while (handled_component_p (ref
) && TREE_CODE (ref
) != ARRAY_REF
)
1516 ref
= TREE_OPERAND (ref
, 0);
1518 if (TREE_CODE (ref
) == ARRAY_REF
)
1520 stride
= TYPE_SIZE_UNIT (TREE_TYPE (ref
));
1521 if (tree_fits_uhwi_p (stride
))
1522 astride
= tree_to_uhwi (stride
);
1524 astride
= L1_CACHE_LINE_SIZE
;
1526 ref
= TREE_OPERAND (ref
, 0);
1531 add_subscript_strides (access_fn
, astride
, strides
, n
, loop
);
1534 for (i
= n
; i
-- > 0; )
1536 unsigned HOST_WIDE_INT s
;
1538 s
= strides
[i
] < 0 ? -strides
[i
] : strides
[i
];
1540 if (s
< (unsigned) L1_CACHE_LINE_SIZE
1542 > (unsigned) (L1_CACHE_SIZE_BYTES
/ NONTEMPORAL_FRACTION
)))
1544 ret
= loop_sizes
[i
];
1553 /* Determines the distance till the first reuse of each reference in REFS
1554 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1555 memory references in the loop. Return false if the analysis fails. */
1558 determine_loop_nest_reuse (struct loop
*loop
, struct mem_ref_group
*refs
,
1561 struct loop
*nest
, *aloop
;
1562 vec
<data_reference_p
> datarefs
= vNULL
;
1563 vec
<ddr_p
> dependences
= vNULL
;
1564 struct mem_ref_group
*gr
;
1565 struct mem_ref
*ref
, *refb
;
1566 vec
<loop_p
> vloops
= vNULL
;
1567 unsigned *loop_data_size
;
1569 unsigned volume
, dist
, adist
;
1571 data_reference_p dr
;
1577 /* Find the outermost loop of the loop nest of loop (we require that
1578 there are no sibling loops inside the nest). */
1582 aloop
= loop_outer (nest
);
1584 if (aloop
== current_loops
->tree_root
1585 || aloop
->inner
->next
)
1591 /* For each loop, determine the amount of data accessed in each iteration.
1592 We use this to estimate whether the reference is evicted from the
1593 cache before its reuse. */
1594 find_loop_nest (nest
, &vloops
);
1595 n
= vloops
.length ();
1596 loop_data_size
= XNEWVEC (unsigned, n
);
1597 volume
= volume_of_references (refs
);
1601 loop_data_size
[i
] = volume
;
1602 /* Bound the volume by the L2 cache size, since above this bound,
1603 all dependence distances are equivalent. */
1604 if (volume
> L2_CACHE_SIZE_BYTES
)
1608 vol
= estimated_stmt_executions_int (aloop
);
1610 vol
= expected_loop_iterations (aloop
);
1614 /* Prepare the references in the form suitable for data dependence
1615 analysis. We ignore unanalyzable data references (the results
1616 are used just as a heuristics to estimate temporality of the
1617 references, hence we do not need to worry about correctness). */
1618 for (gr
= refs
; gr
; gr
= gr
->next
)
1619 for (ref
= gr
->refs
; ref
; ref
= ref
->next
)
1621 dr
= create_data_ref (nest
, loop_containing_stmt (ref
->stmt
),
1622 ref
->mem
, ref
->stmt
, !ref
->write_p
);
1626 ref
->reuse_distance
= volume
;
1628 datarefs
.safe_push (dr
);
1631 no_other_refs
= false;
1634 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1636 dist
= self_reuse_distance (dr
, loop_data_size
, n
, loop
);
1637 ref
= (struct mem_ref
*) dr
->aux
;
1638 if (ref
->reuse_distance
> dist
)
1639 ref
->reuse_distance
= dist
;
1642 ref
->independent_p
= true;
1645 if (!compute_all_dependences (datarefs
, &dependences
, vloops
, true))
1648 FOR_EACH_VEC_ELT (dependences
, i
, dep
)
1650 if (DDR_ARE_DEPENDENT (dep
) == chrec_known
)
1653 ref
= (struct mem_ref
*) DDR_A (dep
)->aux
;
1654 refb
= (struct mem_ref
*) DDR_B (dep
)->aux
;
1656 if (DDR_ARE_DEPENDENT (dep
) == chrec_dont_know
1657 || DDR_NUM_DIST_VECTS (dep
) == 0)
1659 /* If the dependence cannot be analyzed, assume that there might be
1663 ref
->independent_p
= false;
1664 refb
->independent_p
= false;
1668 /* The distance vectors are normalized to be always lexicographically
1669 positive, hence we cannot tell just from them whether DDR_A comes
1670 before DDR_B or vice versa. However, it is not important,
1671 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1672 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1673 in cache (and marking it as nontemporal would not affect
1677 for (j
= 0; j
< DDR_NUM_DIST_VECTS (dep
); j
++)
1679 adist
= volume_of_dist_vector (DDR_DIST_VECT (dep
, j
),
1682 /* If this is a dependence in the innermost loop (i.e., the
1683 distances in all superloops are zero) and it is not
1684 the trivial self-dependence with distance zero, record that
1685 the references are not completely independent. */
1686 if (lambda_vector_zerop (DDR_DIST_VECT (dep
, j
), n
- 1)
1688 || DDR_DIST_VECT (dep
, j
)[n
-1] != 0))
1690 ref
->independent_p
= false;
1691 refb
->independent_p
= false;
1694 /* Ignore accesses closer than
1695 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1696 so that we use nontemporal prefetches e.g. if single memory
1697 location is accessed several times in a single iteration of
1699 if (adist
< L1_CACHE_SIZE_BYTES
/ NONTEMPORAL_FRACTION
)
1707 if (ref
->reuse_distance
> dist
)
1708 ref
->reuse_distance
= dist
;
1709 if (refb
->reuse_distance
> dist
)
1710 refb
->reuse_distance
= dist
;
1713 free_dependence_relations (dependences
);
1714 free_data_refs (datarefs
);
1715 free (loop_data_size
);
1717 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1719 fprintf (dump_file
, "Reuse distances:\n");
1720 for (gr
= refs
; gr
; gr
= gr
->next
)
1721 for (ref
= gr
->refs
; ref
; ref
= ref
->next
)
1722 fprintf (dump_file
, " ref %p distance %u\n",
1723 (void *) ref
, ref
->reuse_distance
);
1729 /* Determine whether or not the trip count to ahead ratio is too small based
1730 on prefitablility consideration.
1731 AHEAD: the iteration ahead distance,
1732 EST_NITER: the estimated trip count. */
1735 trip_count_to_ahead_ratio_too_small_p (unsigned ahead
, HOST_WIDE_INT est_niter
)
1737 /* Assume trip count to ahead ratio is big enough if the trip count could not
1738 be estimated at compile time. */
1742 if (est_niter
< (HOST_WIDE_INT
) (TRIP_COUNT_TO_AHEAD_RATIO
* ahead
))
1744 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1746 "Not prefetching -- loop estimated to roll only %d times\n",
1754 /* Determine whether or not the number of memory references in the loop is
1755 reasonable based on the profitablity and compilation time considerations.
1756 NINSNS: estimated number of instructions in the loop,
1757 MEM_REF_COUNT: total number of memory references in the loop. */
1760 mem_ref_count_reasonable_p (unsigned ninsns
, unsigned mem_ref_count
)
1762 int insn_to_mem_ratio
;
1764 if (mem_ref_count
== 0)
1767 /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1768 (compute_all_dependences) have high costs based on quadratic complexity.
1769 To avoid huge compilation time, we give up prefetching if mem_ref_count
1771 if (mem_ref_count
> PREFETCH_MAX_MEM_REFS_PER_LOOP
)
1774 /* Prefetching improves performance by overlapping cache missing
1775 memory accesses with CPU operations. If the loop does not have
1776 enough CPU operations to overlap with memory operations, prefetching
1777 won't give a significant benefit. One approximate way of checking
1778 this is to require the ratio of instructions to memory references to
1779 be above a certain limit. This approximation works well in practice.
1780 TODO: Implement a more precise computation by estimating the time
1781 for each CPU or memory op in the loop. Time estimates for memory ops
1782 should account for cache misses. */
1783 insn_to_mem_ratio
= ninsns
/ mem_ref_count
;
1785 if (insn_to_mem_ratio
< PREFETCH_MIN_INSN_TO_MEM_RATIO
)
1787 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1789 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1797 /* Determine whether or not the instruction to prefetch ratio in the loop is
1798 too small based on the profitablity consideration.
1799 NINSNS: estimated number of instructions in the loop,
1800 PREFETCH_COUNT: an estimate of the number of prefetches,
1801 UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1804 insn_to_prefetch_ratio_too_small_p (unsigned ninsns
, unsigned prefetch_count
,
1805 unsigned unroll_factor
)
1807 int insn_to_prefetch_ratio
;
1809 /* Prefetching most likely causes performance degradation when the instruction
1810 to prefetch ratio is too small. Too many prefetch instructions in a loop
1811 may reduce the I-cache performance.
1812 (unroll_factor * ninsns) is used to estimate the number of instructions in
1813 the unrolled loop. This implementation is a bit simplistic -- the number
1814 of issued prefetch instructions is also affected by unrolling. So,
1815 prefetch_mod and the unroll factor should be taken into account when
1816 determining prefetch_count. Also, the number of insns of the unrolled
1817 loop will usually be significantly smaller than the number of insns of the
1818 original loop * unroll_factor (at least the induction variable increases
1819 and the exit branches will get eliminated), so it might be better to use
1820 tree_estimate_loop_size + estimated_unrolled_size. */
1821 insn_to_prefetch_ratio
= (unroll_factor
* ninsns
) / prefetch_count
;
1822 if (insn_to_prefetch_ratio
< MIN_INSN_TO_PREFETCH_RATIO
)
1824 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1826 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1827 insn_to_prefetch_ratio
);
1835 /* Issue prefetch instructions for array references in LOOP. Returns
1836 true if the LOOP was unrolled. */
1839 loop_prefetch_arrays (struct loop
*loop
)
1841 struct mem_ref_group
*refs
;
1842 unsigned ahead
, ninsns
, time
, unroll_factor
;
1843 HOST_WIDE_INT est_niter
;
1844 struct tree_niter_desc desc
;
1845 bool unrolled
= false, no_other_refs
;
1846 unsigned prefetch_count
;
1847 unsigned mem_ref_count
;
1849 if (optimize_loop_nest_for_size_p (loop
))
1851 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1852 fprintf (dump_file
, " ignored (cold area)\n");
1856 /* FIXME: the time should be weighted by the probabilities of the blocks in
1858 time
= tree_num_loop_insns (loop
, &eni_time_weights
);
1862 ahead
= (PREFETCH_LATENCY
+ time
- 1) / time
;
1863 est_niter
= estimated_stmt_executions_int (loop
);
1864 if (est_niter
== -1)
1865 est_niter
= max_stmt_executions_int (loop
);
1867 /* Prefetching is not likely to be profitable if the trip count to ahead
1868 ratio is too small. */
1869 if (trip_count_to_ahead_ratio_too_small_p (ahead
, est_niter
))
1872 ninsns
= tree_num_loop_insns (loop
, &eni_size_weights
);
1874 /* Step 1: gather the memory references. */
1875 refs
= gather_memory_references (loop
, &no_other_refs
, &mem_ref_count
);
1877 /* Give up prefetching if the number of memory references in the
1878 loop is not reasonable based on profitablity and compilation time
1880 if (!mem_ref_count_reasonable_p (ninsns
, mem_ref_count
))
1883 /* Step 2: estimate the reuse effects. */
1884 prune_by_reuse (refs
);
1886 if (nothing_to_prefetch_p (refs
))
1889 if (!determine_loop_nest_reuse (loop
, refs
, no_other_refs
))
1892 /* Step 3: determine unroll factor. */
1893 unroll_factor
= determine_unroll_factor (loop
, refs
, ninsns
, &desc
,
1896 /* Estimate prefetch count for the unrolled loop. */
1897 prefetch_count
= estimate_prefetch_count (refs
, unroll_factor
);
1898 if (prefetch_count
== 0)
1901 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1902 fprintf (dump_file
, "Ahead %d, unroll factor %d, trip count "
1903 HOST_WIDE_INT_PRINT_DEC
"\n"
1904 "insn count %d, mem ref count %d, prefetch count %d\n",
1905 ahead
, unroll_factor
, est_niter
,
1906 ninsns
, mem_ref_count
, prefetch_count
);
1908 /* Prefetching is not likely to be profitable if the instruction to prefetch
1909 ratio is too small. */
1910 if (insn_to_prefetch_ratio_too_small_p (ninsns
, prefetch_count
,
1914 mark_nontemporal_stores (loop
, refs
);
1916 /* Step 4: what to prefetch? */
1917 if (!schedule_prefetches (refs
, unroll_factor
, ahead
))
1920 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1921 iterations so that we do not issue superfluous prefetches. */
1922 if (unroll_factor
!= 1)
1924 tree_unroll_loop (loop
, unroll_factor
,
1925 single_dom_exit (loop
), &desc
);
1929 /* Step 6: issue the prefetches. */
1930 issue_prefetches (refs
, unroll_factor
, ahead
);
1933 release_mem_refs (refs
);
1937 /* Issue prefetch instructions for array references in loops. */
1940 tree_ssa_prefetch_arrays (void)
1943 bool unrolled
= false;
1946 if (!targetm
.have_prefetch ()
1947 /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1948 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1949 of processor costs and i486 does not have prefetch, but
1950 -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */
1951 || PREFETCH_BLOCK
== 0)
1954 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1956 fprintf (dump_file
, "Prefetching parameters:\n");
1957 fprintf (dump_file
, " simultaneous prefetches: %d\n",
1958 SIMULTANEOUS_PREFETCHES
);
1959 fprintf (dump_file
, " prefetch latency: %d\n", PREFETCH_LATENCY
);
1960 fprintf (dump_file
, " prefetch block size: %d\n", PREFETCH_BLOCK
);
1961 fprintf (dump_file
, " L1 cache size: %d lines, %d kB\n",
1962 L1_CACHE_SIZE_BYTES
/ L1_CACHE_LINE_SIZE
, L1_CACHE_SIZE
);
1963 fprintf (dump_file
, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE
);
1964 fprintf (dump_file
, " L2 cache size: %d kB\n", L2_CACHE_SIZE
);
1965 fprintf (dump_file
, " min insn-to-prefetch ratio: %d \n",
1966 MIN_INSN_TO_PREFETCH_RATIO
);
1967 fprintf (dump_file
, " min insn-to-mem ratio: %d \n",
1968 PREFETCH_MIN_INSN_TO_MEM_RATIO
);
1969 fprintf (dump_file
, "\n");
1972 initialize_original_copy_tables ();
1974 if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH
))
1976 tree type
= build_function_type_list (void_type_node
,
1977 const_ptr_type_node
, NULL_TREE
);
1978 tree decl
= add_builtin_function ("__builtin_prefetch", type
,
1979 BUILT_IN_PREFETCH
, BUILT_IN_NORMAL
,
1981 DECL_IS_NOVOPS (decl
) = true;
1982 set_builtin_decl (BUILT_IN_PREFETCH
, decl
, false);
1985 /* We assume that size of cache line is a power of two, so verify this
1987 gcc_assert ((PREFETCH_BLOCK
& (PREFETCH_BLOCK
- 1)) == 0);
1989 FOR_EACH_LOOP (loop
, LI_FROM_INNERMOST
)
1991 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1992 fprintf (dump_file
, "Processing loop %d:\n", loop
->num
);
1994 unrolled
|= loop_prefetch_arrays (loop
);
1996 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1997 fprintf (dump_file
, "\n\n");
2003 todo_flags
|= TODO_cleanup_cfg
;
2006 free_original_copy_tables ();
2014 const pass_data pass_data_loop_prefetch
=
2016 GIMPLE_PASS
, /* type */
2017 "aprefetch", /* name */
2018 OPTGROUP_LOOP
, /* optinfo_flags */
2019 TV_TREE_PREFETCH
, /* tv_id */
2020 ( PROP_cfg
| PROP_ssa
), /* properties_required */
2021 0, /* properties_provided */
2022 0, /* properties_destroyed */
2023 0, /* todo_flags_start */
2024 0, /* todo_flags_finish */
2027 class pass_loop_prefetch
: public gimple_opt_pass
2030 pass_loop_prefetch (gcc::context
*ctxt
)
2031 : gimple_opt_pass (pass_data_loop_prefetch
, ctxt
)
2034 /* opt_pass methods: */
2035 virtual bool gate (function
*) { return flag_prefetch_loop_arrays
> 0; }
2036 virtual unsigned int execute (function
*);
2038 }; // class pass_loop_prefetch
2041 pass_loop_prefetch::execute (function
*fun
)
2043 if (number_of_loops (fun
) <= 1)
2046 return tree_ssa_prefetch_arrays ();
2052 make_pass_loop_prefetch (gcc::context
*ctxt
)
2054 return new pass_loop_prefetch (ctxt
);