/* Array prefetching.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "tree-pass.h"
#include "insn-config.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "params.h"
#include "langhooks.h"
/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
           location 64 iterations before it, and PREFETCH_MOD 64 (since
	   it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
           the cache line accessed by (4) is the same with probability only
	   7/32.
       (5) has PREFETCH_MOD 1 as well.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop (see the
      illustrative note following this comment).

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with the lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODOs:
      -- write and use more general reuse analysis (that could also be used
	 in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */
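
/* Illustrative note (editorial, not part of the original comment): for step
   3) above, the ahead distance is computed in loop_prefetch_arrays () as

     ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;

   so with, say, a prefetch latency worth 200 insns and a loop body of
   10 insns, prefetches are issued roughly 20 iterations ahead of the use.
   The numbers are made up for illustration; the real values come from the
   target parameters and from tree_num_loop_insns ().  */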
/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif
/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)
/* The memory reference.  */

struct mem_ref
{
  tree stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  bool write_p;			/* Is it a write?  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  bool issue_prefetch_p;	/* Should we really issue the prefetch?  */
  struct mem_ref *next;		/* The next reference in the group.  */
};
/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}
/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}
/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It does not have to be possible for a write reference to reuse a read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}
/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}
/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;		/* Loop of the reference.  */
  tree stmt;			/* Statement of the reference.  */
  HOST_WIDE_INT *step;		/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;		/* Offset of the memory reference.  */
};
/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (!cst_and_fits_in_hwi (step))
    return false;
  istep = int_cst_value (step);

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}
/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is the number of iterations of LOOP.
   The reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
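
/* Illustrative note (editorial): for a store such as

     for (i = 0; i < n; i++)
       x[3 * i + 7] = ...;

   with 4-byte array elements, analyze_ref describes the reference as
   BASE = x[0], STEP = 12 and DELTA = 28, i.e. its address in iteration
   `iter' is &x[0] + 12 * iter + 28.  The array name and element size are
   made up for the example.  */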
/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}
/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that the former references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  stmt = bsi_stmt (bsi);
	  if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
	    continue;

	  lhs = GIMPLE_STMT_OPERAND (stmt, 0);
	  rhs = GIMPLE_STMT_OPERAND (stmt, 1);

	  if (REFERENCE_CLASS_P (rhs))
	    gather_memory_references_ref (loop, &refs, rhs, false, stmt);
	  if (REFERENCE_CLASS_P (lhs))
	    gather_memory_references_ref (loop, &refs, lhs, true, stmt);
	}
    }
  free (body);

  return refs;
}
/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}
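
/* Illustrative note (editorial): ddown (delta, PREFETCH_BLOCK) is used below
   as the index of the cache line containing offset DELTA, assuming the base
   is cache-line aligned; e.g. ddown (100, 64) == 1 and ddown (63, 64) == 0.  */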
/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which that occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that size of cache line
     and step are coprime (here we assume that PREFETCH_BLOCK is a power
     of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
	 && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* The ref probably does not reuse by.  */
  return;
}
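
/* Illustrative note (editorial): for cases (1) and (2) of the header comment
   (REF = a[i], BY = a[i + 64], step 1, 64-byte lines), delta_r = 0 and
   delta_b = 64, so hit_from = ddown (64, 64) * 64 = 64 and
   prefetch_before = (64 - 0 + 1 - 1) / 1 = 64, which is how (1) obtains
   PREFETCH_BEFORE 64.  */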
/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}
/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}
/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches that would be useful only in the first
     few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}
/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
	     slots_per_prefetch);

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
	   and we unroll the loop UNROLL_FACTOR times, we need to insert
	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
	   iteration.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
	prefetch_slots = n_prefetches * slots_per_prefetch;

	/* If more than half of the prefetches would be lost anyway, do not
	   issue the prefetch.  */
	if (2 * remaining_prefetch_slots < prefetch_slots)
	  continue;

	ref->issue_prefetch_p = true;

	if (remaining_prefetch_slots <= prefetch_slots)
	  return true;
	remaining_prefetch_slots -= prefetch_slots;
	any = true;
      }

  return any;
}
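
/* Illustrative note (editorial): with AHEAD = 8 and UNROLL_FACTOR = 4, each
   prefetch occupies (8 + 4 / 2) / 4 = 2 slots; a reference with
   PREFETCH_MOD = 2 then needs ceil (4 / 2) = 2 prefetch instructions, i.e.
   4 of the SIMULTANEOUS_PREFETCHES slots.  The numbers are made up for the
   example.  */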
/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	return true;

  return false;
}
/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
  write_p = ref->write_p ? integer_one_node : integer_zero_node;

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
			  addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}
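
/* Illustrative note (editorial): for a read reference a[i] over 4-byte ints
   (group step 4, prefetch_mod 1), UNROLL_FACTOR = 1 and AHEAD = 16, a single
   call equivalent to

     __builtin_prefetch (&a[i] + 64, 0);

   is inserted before the statement containing the reference.  The array and
   the numbers are made up for the example.  */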
/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}
/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC by the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}
/* Determine the coefficient by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  Description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ninsns, struct tree_niter_desc *desc)
{
  unsigned upper_bound;
  unsigned nfactor, factor, mod_constraint;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  /* First check whether the loop is not too large to unroll.  We ignore
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
     from unrolling them enough to make exactly one cache line covered by each
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
     us from unrolling the loops too many times in cases where we only expect
     gains from better scheduling and decreasing loop overhead, which is not
     the case here.  */
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (upper_bound <= 1)
    return 1;

  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
  factor = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	{
	  mod_constraint = ref->prefetch_mod;
	  nfactor = least_common_multiple (mod_constraint, factor);
	  if (nfactor <= upper_bound)
	    factor = nfactor;
	}

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}
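
/* Illustrative note (editorial): if the prefetched references have
   PREFETCH_MOD 4 and 6, the factor grows as lcm (4, 1) = 4 and then
   lcm (6, 4) = 12, provided 12 still fits under UPPER_BOUND; otherwise the
   smaller factor is kept.  The values are made up for the example.  */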
/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number of
     instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}
/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  unsigned todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
	 of the processor costs and i486 does not have prefetch, but
	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
	       SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    L1 cache size: %d (%d bytes)\n",
	       L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
					NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the size of a cache line is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();

  return todo_flags;
}