/* Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "coretypes.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "tree-pass.h"
#include "insn-config.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in a single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

       char *a;
       for (i = 0; i < max; i++)
	 {
	   a[255] = ...;		(0)
	   a[i] = ...;			(1)
	   a[i + 64] = ...;		(2)
	   a[16*i] = ...;		(3)
	   a[187*i] = ...;		(4)
	   a[187*i + 50] = ...;		(5)
	 }

     (0) obviously has PREFETCH_BEFORE 1
     (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
	 location 64 iterations before it, and PREFETCH_MOD 64 (since
	 it hits the same cache line otherwise).
     (2) has PREFETCH_MOD 64
     (3) has PREFETCH_MOD 4
     (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
	 the cache line accessed by (4) is the same with probability only
	 7/32.
     (5) has PREFETCH_MOD 1 as well.

     Additionally, we use data dependence analysis to determine for each
     reference the distance until the first reuse; this information is used
     to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   The function is_loop_prefetching_profitable() implements a cost model
   to determine if prefetching is profitable for a given loop.  The cost
   model has two heuristics:
   1. A heuristic that determines whether the given loop has enough CPU
      ops that can be overlapped with cache missing memory ops.
      If not, the loop won't benefit from prefetching.  This is implemented
      by requiring the ratio between the instruction count and the mem ref
      count to be above a certain minimum.
   2. A heuristic that disables prefetching in a loop with an unknown trip
      count if the prefetching cost is above a certain limit.  The relative
      prefetching cost is estimated by taking the ratio between the
      prefetch count and the total instruction count (this models the
      I-cache cost).

   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   -- write and use more general reuse analysis (that could be also used
      in other cache-aimed loop optimizations)
   -- make it behave sanely together with the prefetches given by user
      (now we just ignore them; at the very least we should avoid
      optimizing loops in which the user put their own prefetches)
   -- we assume cache line size alignment of arrays; this could be
      improved.  */

/* Magic constants follow.  These should be replaced by machine specific
   numbers whenever possible.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have a forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have a backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  gimple stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  unsigned reuse_distance;	/* The amount of data accessed before the first
				   reuse of this value.  */
  struct mem_ref *next;		/* The next reference in the group.  */
  unsigned write_p : 1;		/* Is it a write?  */
  unsigned independent_p : 1;	/* True if the reference is independent of
				   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;/* Should we really issue the prefetch?  */
  unsigned storent_p : 1;	/* True if we changed the store to a
				   nontemporal one.  */
};

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = XNEW (struct mem_ref_group);
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It does not have to be possible for write reference to reuse the read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = XNEW (struct mem_ref);
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->reuse_distance = 0;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;
  (*aref)->independent_p = false;
  (*aref)->storent_p = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;		/* Loop of the reference.  */
  gimple stmt;			/* Statement of the reference.  */
  HOST_WIDE_INT *step;		/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;		/* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
		  *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (!cst_and_fits_in_hwi (step))
    return false;
  istep = int_cst_value (step);

  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     gimple stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */

static bool
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, gimple stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (get_base_address (ref) == NULL)
    return false;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return false;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);

  return true;
}

/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
   true if there are no other memory references inside the loop.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop, bool *no_other_refs,
			  unsigned *ref_count)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  gimple_stmt_iterator bsi;
  gimple stmt;
  tree lhs, rhs;
  struct mem_ref_group *refs = NULL;

  *no_other_refs = true;
  *ref_count = 0;

  /* Scan the loop body in order, so that the former references precede the
     latter ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	{
	  stmt = gsi_stmt (bsi);

	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
	    {
	      if (gimple_vuse (stmt)
		  || (is_gimple_call (stmt)
		      && !(gimple_call_flags (stmt) & ECF_CONST)))
		*no_other_refs = false;
	      continue;
	    }

	  lhs = gimple_assign_lhs (stmt);
	  rhs = gimple_assign_rhs1 (stmt);

	  if (REFERENCE_CLASS_P (rhs))
	    {
	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
							      rhs, false, stmt);
	      (*ref_count)++;
	    }
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
							      lhs, true, stmt);
	      (*ref_count)++;
	    }
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Given a CACHE_LINE_SIZE and two inductive memory references
   with a common STEP greater than CACHE_LINE_SIZE and an address
   difference DELTA, compute the probability that they will fall
   in different cache lines.  DISTINCT_ITERS is the number of
   distinct iterations after which the pattern repeats itself.
   ALIGN_UNIT is the unit of alignment in bytes.  */

static int
compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
		   unsigned HOST_WIDE_INT distinct_iters,
		   int align_unit)
{
  unsigned align, iter;
  int total_positions, miss_positions, miss_rate;
  int address1, address2, cache_line1, cache_line2;

  total_positions = 0;
  miss_positions = 0;

  /* Iterate through all possible alignments of the first
     memory reference within its cache line.  */
  for (align = 0; align < cache_line_size; align += align_unit)

    /* Iterate through all distinct iterations.  */
    for (iter = 0; iter < distinct_iters; iter++)
      {
	address1 = align + step * iter;
	address2 = address1 + delta;
	cache_line1 = address1 / cache_line_size;
	cache_line2 = address2 / cache_line_size;
	total_positions += 1;
	if (cache_line1 != cache_line2)
	  miss_positions += 1;
      }
  miss_rate = 1000 * miss_positions / total_positions;
  return miss_rate;
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
  int miss_rate;
  HOST_WIDE_INT reduced_step;
  unsigned HOST_WIDE_INT reduced_prefetch_block;
  tree ref_type;
  int align_unit;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 first one.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which it occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case with step > prefetch_block.  First reduce
     the ratio between the step and the cache line size to its simplest
     terms.  The resulting denominator will then represent the number of
     distinct iterations after which each address will go back to its
     initial location within the cache line.  This computation assumes
     that PREFETCH_BLOCK is a power of two.  */
  prefetch_block = PREFETCH_BLOCK;
  reduced_prefetch_block = prefetch_block;
  reduced_step = step;
  while ((reduced_step & 1) == 0
	 && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }

  prefetch_before = delta / step;
  delta %= step;
  ref_type = TREE_TYPE (ref->mem);
  align_unit = TYPE_ALIGN (ref_type) / 8;
  miss_rate = compute_miss_rate (prefetch_block, step, delta,
				 reduced_prefetch_block, align_unit);
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  miss_rate = compute_miss_rate (prefetch_block, step, delta,
				 reduced_prefetch_block, align_unit);
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* The ref probably does not reuse by.  */
  ref->prefetch_mod = 1;
}

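/* Illustrative sketch, not used by the pass: the reduction loop above strips
   common factors of two from the step and PREFETCH_BLOCK; the reduced block
   size is the number of distinct iterations after which an address returns
   to its initial position within a cache line.  E.g. with a 64-byte block, a
   step of 160 = 32 * 5 reduces to 2 distinct iterations, while an odd step
   such as 187 keeps all 64.  */

static unsigned HOST_WIDE_INT
example_distinct_iterations (HOST_WIDE_INT step,
			     unsigned HOST_WIDE_INT prefetch_block)
{
  HOST_WIDE_INT reduced_step = step;
  unsigned HOST_WIDE_INT reduced_prefetch_block = prefetch_block;

  while ((reduced_step & 1) == 0 && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }

  return reduced_prefetch_block;
}
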
/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for only first few of the
     iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  /* Do not prefetch nontemporal stores.  */
  if (ref->storent_p)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
	     slots_per_prefetch);

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
	   and we unroll the loop UNROLL_FACTOR times, we need to insert
	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
	   iteration.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
	prefetch_slots = n_prefetches * slots_per_prefetch;

	/* If more than half of the prefetches would be lost anyway, do not
	   issue the prefetch.  */
	if (2 * remaining_prefetch_slots < prefetch_slots)
	  continue;

	ref->issue_prefetch_p = true;

	if (remaining_prefetch_slots <= prefetch_slots)
	  return true;
	remaining_prefetch_slots -= prefetch_slots;
	any = true;
      }

  return any;
}

/* Estimate the number of prefetches in the given GROUPS.  */

static int
estimate_prefetch_count (struct mem_ref_group *groups)
{
  struct mem_ref *ref;
  int prefetch_count = 0;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	prefetch_count++;

  return prefetch_count;
}

/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, write_p, local;
  gimple prefetch;
  gimple_stmt_iterator bsi;
  unsigned n_prefetches, ap;
  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued%s prefetch for %p.\n",
	     nontemporal ? " nontemporal" : "",
	     (void *) ref);

  bsi = gsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
					true, NULL, true, GSI_SAME_STMT);
  write_p = ref->write_p ? integer_one_node : integer_zero_node;
  local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
			  addr_base, size_int (delta));
      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
				       true, GSI_SAME_STMT);

      /* Create the prefetch instruction.  */
      prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
				    3, addr, write_p, local);
      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
    }
}

/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */

static bool
nontemporal_store_p (struct mem_ref *ref)
{
  enum machine_mode mode;
  enum insn_code code;

  /* REF must be a write that is not reused.  We require it to be independent
     of all other memory references in the loop, as the nontemporal stores may
     be reordered with respect to other memory references.  */
  if (!ref->write_p
      || !ref->independent_p
      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
    return false;

  /* Check that we have the storent instruction for the mode.  */
  mode = TYPE_MODE (TREE_TYPE (ref->mem));
  if (mode == BLKmode)
    return false;

  code = optab_handler (storent_optab, mode)->insn_code;
  return code != CODE_FOR_nothing;
}

/* If REF is a nontemporal store, we mark the corresponding modify statement
   and return true.  Otherwise, we return false.  */

static bool
mark_nontemporal_store (struct mem_ref *ref)
{
  if (!nontemporal_store_p (ref))
    return false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
	     (void *) ref);

  gimple_assign_set_nontemporal_move (ref->stmt, true);
  ref->storent_p = true;

  return true;
}

/* Issue a memory fence instruction after LOOP.  */

static void
emit_mfence_after_loop (struct loop *loop)
{
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);
  edge exit;
  gimple call;
  gimple_stmt_iterator bsi;
  unsigned i;

  for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
    {
      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);

      if (!single_pred_p (exit->dest)
	  /* If possible, we prefer not to insert the fence on other paths
	     in cfg.  */
	  && !(exit->flags & EDGE_ABNORMAL))
	split_loop_exit_edge (exit);
      bsi = gsi_after_labels (exit->dest);

      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
      mark_virtual_ops_for_renaming (call);
    }

  VEC_free (edge, heap, exits);
  update_ssa (TODO_update_ssa_only_virtuals);
}

/* Returns true if we can use storent in loop, false otherwise.  */

static bool
may_use_storent_in_loop_p (struct loop *loop)
{
  bool ret = true;

  if (loop->inner != NULL)
    return false;

  /* If we must issue a mfence insn after using storent, check that there
     is a suitable place for it at each of the loop exits.  */
  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
    {
      VEC (edge, heap) *exits = get_loop_exit_edges (loop);
      unsigned i;
      edge exit;

      for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
	if ((exit->flags & EDGE_ABNORMAL)
	    && exit->dest == EXIT_BLOCK_PTR)
	  ret = false;

      VEC_free (edge, heap, exits);
    }

  return ret;
}

/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
   references in the loop.  */

static void
mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
{
  struct mem_ref *ref;
  bool any = false;

  if (!may_use_storent_in_loop_p (loop))
    return;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      any |= mark_nontemporal_store (ref);

  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
    emit_mfence_after_loop (loop);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC by the description of number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  Description of
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
   the loop, or -1 if no estimate is available.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ninsns, struct tree_niter_desc *desc,
			 HOST_WIDE_INT est_niter)
{
  unsigned upper_bound;
  unsigned nfactor, factor, mod_constraint;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  /* First check whether the loop is not too large to unroll.  We ignore
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
     from unrolling them enough to make exactly one cache line covered by each
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
     us from unrolling the loops too many times in cases where we only expect
     gains from better scheduling and decreasing loop overhead, which is not
     the case here.  */
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;

  /* If we unrolled the loop more times than it iterates, the unrolled version
     of the loop would be never entered.  */
  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
    upper_bound = est_niter;

  if (upper_bound <= 1)
    return 1;

  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
  factor = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	{
	  mod_constraint = ref->prefetch_mod;
	  nfactor = least_common_multiple (mod_constraint, factor);
	  if (nfactor <= upper_bound)
	    factor = nfactor;
	}

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}

/* Returns the total volume of the memory references REFS, taking into account
   reuses in the innermost loop and cache line size.  TODO -- we should also
   take into account reuses across the iterations of the loops in the loop
   nest.  */

static unsigned
volume_of_references (struct mem_ref_group *refs)
{
  unsigned volume = 0;
  struct mem_ref_group *gr;
  struct mem_ref *ref;

  for (gr = refs; gr; gr = gr->next)
    for (ref = gr->refs; ref; ref = ref->next)
      {
	/* Almost always reuses another value?  */
	if (ref->prefetch_before != PREFETCH_ALL)
	  continue;

	/* If several iterations access the same cache line, use the size of
	   the line divided by this number.  Otherwise, a cache line is
	   accessed in each iteration.  TODO -- in the latter case, we should
	   take the size of the reference into account, rounding it up on cache
	   line size multiple.  */
	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
      }
  return volume;
}

/* Returns the volume of memory references accessed across VEC iterations of
   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */

static unsigned
volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
    if (vec[i] != 0)
      break;

  if (i == n)
    return 0;

  gcc_assert (vec[i] > 0);

  /* We ignore the parts of the distance vector in subloops, since usually
     the numbers of iterations are much smaller.  */
  return loop_sizes[i] * vec[i];
}

/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
   at the position corresponding to the loop of the step.  N is the depth
   of the considered loop nest, and LOOP is its innermost loop.  */

static void
add_subscript_strides (tree access_fn, unsigned stride,
		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
{
  struct loop *aloop;
  tree step;
  HOST_WIDE_INT astep;
  unsigned min_depth = loop_depth (loop) - n;

  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
    {
      aloop = get_chrec_loop (access_fn);
      step = CHREC_RIGHT (access_fn);
      access_fn = CHREC_LEFT (access_fn);

      if ((unsigned) loop_depth (aloop) <= min_depth)
	continue;

      if (host_integerp (step, 0))
	astep = tree_low_cst (step, 0);
      else
	astep = L1_CACHE_LINE_SIZE;

      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
    }
}

/* Returns the volume of memory references accessed between two consecutive
   self-reuses of the reference DR.  We consider the subscripts of DR in N
   loops, and LOOP_SIZES contains the volumes of accesses in each of the
   loops.  LOOP is the innermost loop of the current loop nest.  */

static unsigned
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
		     struct loop *loop)
{
  tree stride, access_fn;
  HOST_WIDE_INT *strides, astride;
  VEC (tree, heap) *access_fns;
  tree ref = DR_REF (dr);
  unsigned i, ret = ~0u;

  /* In the following example:

     for (i = 0; i < N; i++)
       for (j = 0; j < N; j++)
         use (a[j][i]);

     the same cache line is accessed each N steps (except if the change from
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
     we cannot rely purely on the results of the data dependence analysis.

     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in which the stride is less than cache size.  */

  strides = XCNEWVEC (HOST_WIDE_INT, n);
  access_fns = DR_ACCESS_FNS (dr);

  for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
    {
      /* Keep track of the reference corresponding to the subscript, so that we
	 know its stride.  */
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
	ref = TREE_OPERAND (ref, 0);

      if (TREE_CODE (ref) == ARRAY_REF)
	{
	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
	  if (host_integerp (stride, 1))
	    astride = tree_low_cst (stride, 1);
	  else
	    astride = L1_CACHE_LINE_SIZE;

	  ref = TREE_OPERAND (ref, 0);
	}
      else
	astride = 1;

      add_subscript_strides (access_fn, astride, strides, n, loop);
    }

  for (i = n; i-- > 0; )
    {
      unsigned HOST_WIDE_INT s;

      s = strides[i] < 0 ? -strides[i] : strides[i];

      if (s < (unsigned) L1_CACHE_LINE_SIZE
	  && (loop_sizes[i]
	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
	{
	  ret = loop_sizes[i];
	  break;
	}
    }

  free (strides);
  return ret;
}

/* Determines the distance until the first reuse of each reference in REFS
   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
   memory references in the loop.  */

static void
determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
			   bool no_other_refs)
{
  struct loop *nest, *aloop;
  VEC (data_reference_p, heap) *datarefs = NULL;
  VEC (ddr_p, heap) *dependences = NULL;
  struct mem_ref_group *gr;
  struct mem_ref *ref, *refb;
  VEC (loop_p, heap) *vloops = NULL;
  unsigned *loop_data_size;
  unsigned i, j, n;
  unsigned volume, dist, adist;
  HOST_WIDE_INT vol;
  data_reference_p dr;
  ddr_p dep;

  if (loop->inner)
    return;

  /* Find the outermost loop of the loop nest of loop (we require that
     there are no sibling loops inside the nest).  */
  nest = loop;
  while (1)
    {
      aloop = loop_outer (nest);

      if (aloop == current_loops->tree_root
	  || aloop->inner->next)
	break;

      nest = aloop;
    }

  /* For each loop, determine the amount of data accessed in each iteration.
     We use this to estimate whether the reference is evicted from the
     cache before its reuse.  */
  find_loop_nest (nest, &vloops);
  n = VEC_length (loop_p, vloops);
  loop_data_size = XNEWVEC (unsigned, n);
  volume = volume_of_references (refs);
  i = n;
  while (i-- != 0)
    {
      loop_data_size[i] = volume;
      /* Bound the volume by the L2 cache size, since above this bound,
	 all dependence distances are equivalent.  */
      if (volume > L2_CACHE_SIZE_BYTES)
	continue;

      aloop = VEC_index (loop_p, vloops, i);
      vol = estimated_loop_iterations_int (aloop, false);
      if (vol < 0)
	vol = expected_loop_iterations (aloop);
      volume *= vol;
    }

  /* Prepare the references in the form suitable for data dependence
     analysis.  We ignore unanalyzable data references (the results
     are used just as a heuristics to estimate temporality of the
     references, hence we do not need to worry about correctness).  */
  for (gr = refs; gr; gr = gr->next)
    for (ref = gr->refs; ref; ref = ref->next)
      {
	dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);

	if (dr)
	  {
	    ref->reuse_distance = volume;
	    dr->aux = ref;
	    VEC_safe_push (data_reference_p, heap, datarefs, dr);
	  }
	else
	  no_other_refs = false;
      }

  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
    {
      dist = self_reuse_distance (dr, loop_data_size, n, loop);
      ref = (struct mem_ref *) dr->aux;
      if (ref->reuse_distance > dist)
	ref->reuse_distance = dist;

      if (no_other_refs)
	ref->independent_p = true;
    }

  compute_all_dependences (datarefs, &dependences, vloops, true);

  for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
    {
      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
	continue;

      ref = (struct mem_ref *) DDR_A (dep)->aux;
      refb = (struct mem_ref *) DDR_B (dep)->aux;

      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
	  || DDR_NUM_DIST_VECTS (dep) == 0)
	{
	  /* If the dependence cannot be analyzed, assume that there might be
	     a reuse.  */
	  dist = 0;

	  ref->independent_p = false;
	  refb->independent_p = false;
	}
      else
	{
	  /* The distance vectors are normalized to be always lexicographically
	     positive, hence we cannot tell just from them whether DDR_A comes
	     before DDR_B or vice versa.  However, it is not important,
	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
	     in cache (and marking it as nontemporal would not affect
	     anything).  */
	  dist = volume;
	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
	    {
	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
					     loop_data_size, n);

	      /* If this is a dependence in the innermost loop (i.e., the
		 distances in all superloops are zero) and it is not
		 the trivial self-dependence with distance zero, record that
		 the references are not completely independent.  */
	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
		  && (ref != refb
		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
		{
		  ref->independent_p = false;
		  refb->independent_p = false;
		}

	      /* Ignore accesses closer than
		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
		 so that we use nontemporal prefetches e.g. if a single memory
		 location is accessed several times in a single iteration of
		 the loop.  */
	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
		continue;

	      if (adist < dist)
		dist = adist;
	    }
	}

      if (ref->reuse_distance > dist)
	ref->reuse_distance = dist;
      if (refb->reuse_distance > dist)
	refb->reuse_distance = dist;
    }

  free_dependence_relations (dependences);
  free_data_refs (datarefs);
  free (loop_data_size);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Reuse distances:\n");
      for (gr = refs; gr; gr = gr->next)
	for (ref = gr->refs; ref; ref = ref->next)
	  fprintf (dump_file, " ref %p distance %u\n",
		   (void *) ref, ref->reuse_distance);
    }
}

/* Do a cost-benefit analysis to determine if prefetching is profitable
   for the current loop given the following parameters:
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count,
   NINSNS: estimated number of instructions in the loop,
   PREFETCH_COUNT: an estimate of the number of prefetches,
   MEM_REF_COUNT: total number of memory references in the loop.  */

static bool
is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
				unsigned ninsns, unsigned prefetch_count,
				unsigned mem_ref_count)
{
  int insn_to_mem_ratio, insn_to_prefetch_ratio;

  if (mem_ref_count == 0)
    return false;

  /* Prefetching improves performance by overlapping cache missing
     memory accesses with CPU operations.  If the loop does not have
     enough CPU operations to overlap with memory operations, prefetching
     won't give a significant benefit.  One approximate way of checking
     this is to require the ratio of instructions to memory references to
     be above a certain limit.  This approximation works well in practice.
     TODO: Implement a more precise computation by estimating the time
     for each CPU or memory op in the loop.  Time estimates for memory ops
     should account for cache misses.  */
  insn_to_mem_ratio = ninsns / mem_ref_count;

  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
    return false;

  /* Profitability of prefetching is highly dependent on the trip count.
     For a given AHEAD distance, the first AHEAD iterations do not benefit
     from prefetching, and the last AHEAD iterations execute useless
     prefetches.  So, if the trip count is not large enough relative to AHEAD,
     prefetching may cause serious performance degradation.  To avoid this
     problem when the trip count is not known at compile time, we
     conservatively skip loops with high prefetching costs.  For now, only
     the I-cache cost is considered.  The relative I-cache cost is estimated
     by taking the ratio between the number of prefetches and the total
     number of instructions.  Since we are using integer arithmetic, we
     compute the reciprocal of this ratio.
     TODO: Account for loop unrolling, which may reduce the costs of
     shorter stride prefetches.  Note that not accounting for loop
     unrolling over-estimates the cost and hence gives more conservative
     results.  */
  if (est_niter < 0)
    {
      insn_to_prefetch_ratio = ninsns / prefetch_count;
      return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
    }

  if (est_niter <= (HOST_WIDE_INT) ahead)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Not prefetching -- loop estimated to roll only %d times\n",
		 (int) est_niter);
      return false;
    }

  return true;
}

/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, time, unroll_factor;
  HOST_WIDE_INT est_niter;
  struct tree_niter_desc desc;
  bool unrolled = false, no_other_refs;
  unsigned prefetch_count;
  unsigned mem_ref_count;

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  ignored (cold area)\n");
      return false;
    }

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  prefetch_count = estimate_prefetch_count (refs);
  if (prefetch_count == 0)
    goto fail;

  determine_loop_nest_reuse (loop, refs, no_other_refs);

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: the time should be weighted by the probabilities of the blocks in
     the loop body.  */
  time = tree_num_loop_insns (loop, &eni_time_weights);
  ahead = (PREFETCH_LATENCY + time - 1) / time;
  est_niter = estimated_loop_iterations_int (loop, false);

  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
					   est_niter);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
	     HOST_WIDE_INT_PRINT_DEC "\n"
	     "insn count %d, mem ref count %d, prefetch count %d\n",
	     ahead, unroll_factor, est_niter,
	     ninsns, mem_ref_count, prefetch_count);

  if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns,
				       prefetch_count, mem_ref_count))
    goto fail;

  mark_nontemporal_stores (loop, refs);

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
	 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
	 of processor costs and i486 does not have prefetch, but
	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
	       SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
      fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
	       MIN_INSN_TO_PREFETCH_RATIO);
      fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
	       PREFETCH_MIN_INSN_TO_MEM_RATIO);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
					NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that size of cache line is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    todo_flags |= TODO_cleanup_cfg;

  free_original_copy_tables ();
  return todo_flags;
}