2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 #include "coretypes.h"
28 #include "hard-reg-set.h"
29 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
38 #include "tree-pass.h"
40 #include "insn-config.h"
43 #include "tree-chrec.h"
44 #include "tree-scalar-evolution.h"
47 #include "langhooks.h"
49 /* This pass inserts prefetch instructions to optimize cache usage during
50 accesses to arrays in loops. It processes loops sequentially and:
52 1) Gathers all memory references in the single loop.
53 2) For each of the references it decides when it is profitable to prefetch
54 it. To do it, we evaluate the reuse among the accesses, and determine
55 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
56 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
57 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
58 iterations of the loop that are zero modulo PREFETCH_MOD). For example
59 (assuming cache line size is 64 bytes, char has size 1 byte and there
60 is no hardware sequential prefetch):
63 for (i = 0; i < max; i++)
70 a[187*i + 50] = ...; (5)
73 (0) obviously has PREFETCH_BEFORE 1
74 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
75 location 64 iterations before it, and PREFETCH_MOD 64 (since
76 it hits the same cache line otherwise).
77 (2) has PREFETCH_MOD 64
78 (3) has PREFETCH_MOD 4
79 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
80 the cache line accessed by (4) is the same with probability only
82 (5) has PREFETCH_MOD 1 as well.
84 3) We determine how much ahead we need to prefetch. The number of
85 iterations needed is time to fetch / time spent in one iteration of
86 the loop. The problem is that we do not know either of these values,
87 so we just make a heuristic guess based on a magic (possibly)
88 target-specific constant and size of the loop.
90 4) Determine which of the references we prefetch. We take into account
91 that there is a maximum number of simultaneous prefetches (provided
92 by machine description). We prefetch as many prefetches as possible
93 while still within this bound (starting with those with lowest
94 prefetch_mod, since they are responsible for most of the cache
97 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
98 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
99 prefetching nonaccessed memory.
100 TODO -- actually implement peeling.
102 6) We actually emit the prefetch instructions. ??? Perhaps emit the
103 prefetch instructions with guards in cases where 5) was not sufficient
104 to satisfy the constraints?
107 -- write and use more general reuse analysis (that could be also used
108 in other cache aimed loop optimizations)
109 -- make it behave sanely together with the prefetches given by user
110 (now we just ignore them; at the very least we should avoid
111 optimizing loops in which the user put his own prefetches)
112 -- we assume cache line size alignment of arrays; this could be
/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have a forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have a backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif
174 /* The group of references between that reuse may occur. */
178 tree base
; /* Base of the reference. */
179 HOST_WIDE_INT step
; /* Step of the reference. */
180 struct mem_ref
*refs
; /* References in the group. */
181 struct mem_ref_group
*next
; /* Next group of references. */
184 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
186 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
188 /* The memory reference. */
192 tree stmt
; /* Statement in that the reference appears. */
193 tree mem
; /* The reference. */
194 HOST_WIDE_INT delta
; /* Constant offset of the reference. */
195 bool write_p
; /* Is it a write? */
196 struct mem_ref_group
*group
; /* The group of references it belongs to. */
197 unsigned HOST_WIDE_INT prefetch_mod
;
198 /* Prefetch only each PREFETCH_MOD-th
200 unsigned HOST_WIDE_INT prefetch_before
;
201 /* Prefetch only first PREFETCH_BEFORE
203 bool issue_prefetch_p
; /* Should we really issue the prefetch? */
204 struct mem_ref
*next
; /* The next reference in the group. */
207 /* Dumps information about reference REF to FILE. */
210 dump_mem_ref (FILE *file
, struct mem_ref
*ref
)
212 fprintf (file
, "Reference %p:\n", (void *) ref
);
214 fprintf (file
, " group %p (base ", (void *) ref
->group
);
215 print_generic_expr (file
, ref
->group
->base
, TDF_SLIM
);
216 fprintf (file
, ", step ");
217 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
, ref
->group
->step
);
218 fprintf (file
, ")\n");
220 fprintf (dump_file
, " delta ");
221 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
, ref
->delta
);
222 fprintf (file
, "\n");
224 fprintf (file
, " %s\n", ref
->write_p
? "write" : "read");
226 fprintf (file
, "\n");
229 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
232 static struct mem_ref_group
*
233 find_or_create_group (struct mem_ref_group
**groups
, tree base
,
236 struct mem_ref_group
*group
;
238 for (; *groups
; groups
= &(*groups
)->next
)
240 if ((*groups
)->step
== step
241 && operand_equal_p ((*groups
)->base
, base
, 0))
244 /* Keep the list of groups sorted by decreasing step. */
245 if ((*groups
)->step
< step
)
249 group
= xcalloc (1, sizeof (struct mem_ref_group
));
253 group
->next
= *groups
;
259 /* Records a memory reference MEM in GROUP with offset DELTA and write status
260 WRITE_P. The reference occurs in statement STMT. */
263 record_ref (struct mem_ref_group
*group
, tree stmt
, tree mem
,
264 HOST_WIDE_INT delta
, bool write_p
)
266 struct mem_ref
**aref
;
268 /* Do not record the same address twice. */
269 for (aref
= &group
->refs
; *aref
; aref
= &(*aref
)->next
)
271 /* It does not have to be possible for write reference to reuse the read
272 prefetch, or vice versa. */
273 if (!WRITE_CAN_USE_READ_PREFETCH
275 && !(*aref
)->write_p
)
277 if (!READ_CAN_USE_WRITE_PREFETCH
282 if ((*aref
)->delta
== delta
)
286 (*aref
) = xcalloc (1, sizeof (struct mem_ref
));
287 (*aref
)->stmt
= stmt
;
289 (*aref
)->delta
= delta
;
290 (*aref
)->write_p
= write_p
;
291 (*aref
)->prefetch_before
= PREFETCH_ALL
;
292 (*aref
)->prefetch_mod
= 1;
293 (*aref
)->issue_prefetch_p
= false;
294 (*aref
)->group
= group
;
295 (*aref
)->next
= NULL
;
297 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
298 dump_mem_ref (dump_file
, *aref
);
301 /* Release memory references in GROUPS. */
304 release_mem_refs (struct mem_ref_group
*groups
)
306 struct mem_ref_group
*next_g
;
307 struct mem_ref
*ref
, *next_r
;
309 for (; groups
; groups
= next_g
)
311 next_g
= groups
->next
;
312 for (ref
= groups
->refs
; ref
; ref
= next_r
)
321 /* A structure used to pass arguments to idx_analyze_ref. */
325 struct loop
*loop
; /* Loop of the reference. */
326 tree stmt
; /* Statement of the reference. */
327 HOST_WIDE_INT
*step
; /* Step of the memory reference. */
328 HOST_WIDE_INT
*delta
; /* Offset of the memory reference. */
331 /* Analyzes a single INDEX of a memory reference to obtain information
332 described at analyze_ref. Callback for for_each_index. */
335 idx_analyze_ref (tree base
, tree
*index
, void *data
)
337 struct ar_data
*ar_data
= data
;
338 tree ibase
, step
, stepsize
;
339 HOST_WIDE_INT istep
, idelta
= 0, imult
= 1;
342 if (TREE_CODE (base
) == MISALIGNED_INDIRECT_REF
343 || TREE_CODE (base
) == ALIGN_INDIRECT_REF
)
346 if (!simple_iv (ar_data
->loop
, ar_data
->stmt
, *index
, &iv
, false))
355 if (!cst_and_fits_in_hwi (step
))
357 istep
= int_cst_value (step
);
360 if (TREE_CODE (ibase
) == PLUS_EXPR
361 && cst_and_fits_in_hwi (TREE_OPERAND (ibase
, 1)))
363 idelta
= int_cst_value (TREE_OPERAND (ibase
, 1));
364 ibase
= TREE_OPERAND (ibase
, 0);
366 if (cst_and_fits_in_hwi (ibase
))
368 idelta
+= int_cst_value (ibase
);
369 ibase
= build_int_cst (TREE_TYPE (ibase
), 0);
372 if (TREE_CODE (base
) == ARRAY_REF
)
374 stepsize
= array_ref_element_size (base
);
375 if (!cst_and_fits_in_hwi (stepsize
))
377 imult
= int_cst_value (stepsize
);
383 *ar_data
->step
+= istep
;
384 *ar_data
->delta
+= idelta
;
390 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
391 STEP are integer constants and iter is number of iterations of LOOP. The
392 reference occurs in statement STMT. Strips nonaddressable component
393 references from REF_P. */
396 analyze_ref (struct loop
*loop
, tree
*ref_p
, tree
*base
,
397 HOST_WIDE_INT
*step
, HOST_WIDE_INT
*delta
,
400 struct ar_data ar_data
;
402 HOST_WIDE_INT bit_offset
;
408 /* First strip off the component references. Ignore bitfields. */
409 if (TREE_CODE (ref
) == COMPONENT_REF
410 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref
, 1)))
411 ref
= TREE_OPERAND (ref
, 0);
415 for (; TREE_CODE (ref
) == COMPONENT_REF
; ref
= TREE_OPERAND (ref
, 0))
417 off
= DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref
, 1));
418 bit_offset
= TREE_INT_CST_LOW (off
);
419 gcc_assert (bit_offset
% BITS_PER_UNIT
== 0);
421 *delta
+= bit_offset
/ BITS_PER_UNIT
;
424 *base
= unshare_expr (ref
);
428 ar_data
.delta
= delta
;
429 return for_each_index (base
, idx_analyze_ref
, &ar_data
);
432 /* Record a memory reference REF to the list REFS. The reference occurs in
433 LOOP in statement STMT and it is write if WRITE_P. */
436 gather_memory_references_ref (struct loop
*loop
, struct mem_ref_group
**refs
,
437 tree ref
, bool write_p
, tree stmt
)
440 HOST_WIDE_INT step
, delta
;
441 struct mem_ref_group
*agrp
;
443 if (!analyze_ref (loop
, &ref
, &base
, &step
, &delta
, stmt
))
446 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
447 are integer constants. */
448 agrp
= find_or_create_group (refs
, base
, step
);
449 record_ref (agrp
, stmt
, ref
, delta
, write_p
);
452 /* Record the suitable memory references in LOOP. */
454 static struct mem_ref_group
*
455 gather_memory_references (struct loop
*loop
)
457 basic_block
*body
= get_loop_body_in_dom_order (loop
);
460 block_stmt_iterator bsi
;
462 struct mem_ref_group
*refs
= NULL
;
464 /* Scan the loop body in order, so that the former references precede the
466 for (i
= 0; i
< loop
->num_nodes
; i
++)
469 if (bb
->loop_father
!= loop
)
472 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
474 stmt
= bsi_stmt (bsi
);
475 if (TREE_CODE (stmt
) != MODIFY_EXPR
)
478 lhs
= TREE_OPERAND (stmt
, 0);
479 rhs
= TREE_OPERAND (stmt
, 1);
481 if (REFERENCE_CLASS_P (rhs
))
482 gather_memory_references_ref (loop
, &refs
, rhs
, false, stmt
);
483 if (REFERENCE_CLASS_P (lhs
))
484 gather_memory_references_ref (loop
, &refs
, lhs
, true, stmt
);
492 /* Prune the prefetch candidate REF using the self-reuse. */
495 prune_ref_by_self_reuse (struct mem_ref
*ref
)
497 HOST_WIDE_INT step
= ref
->group
->step
;
498 bool backward
= step
< 0;
502 /* Prefetch references to invariant address just once. */
503 ref
->prefetch_before
= 1;
510 if (step
> PREFETCH_BLOCK
)
513 if ((backward
&& HAVE_BACKWARD_PREFETCH
)
514 || (!backward
&& HAVE_FORWARD_PREFETCH
))
516 ref
->prefetch_before
= 1;
520 ref
->prefetch_mod
= PREFETCH_BLOCK
/ step
;
523 /* Divides X by BY, rounding down. */
526 ddown (HOST_WIDE_INT x
, unsigned HOST_WIDE_INT by
)
533 return (x
+ by
- 1) / by
;
536 /* Prune the prefetch candidate REF using the reuse with BY.
537 If BY_IS_BEFORE is true, BY is before REF in the loop. */
540 prune_ref_by_group_reuse (struct mem_ref
*ref
, struct mem_ref
*by
,
543 HOST_WIDE_INT step
= ref
->group
->step
;
544 bool backward
= step
< 0;
545 HOST_WIDE_INT delta_r
= ref
->delta
, delta_b
= by
->delta
;
546 HOST_WIDE_INT delta
= delta_b
- delta_r
;
547 HOST_WIDE_INT hit_from
;
548 unsigned HOST_WIDE_INT prefetch_before
, prefetch_block
;
552 /* If the references has the same address, only prefetch the
555 ref
->prefetch_before
= 0;
562 /* If the reference addresses are invariant and fall into the
563 same cache line, prefetch just the first one. */
567 if (ddown (ref
->delta
, PREFETCH_BLOCK
)
568 != ddown (by
->delta
, PREFETCH_BLOCK
))
571 ref
->prefetch_before
= 0;
575 /* Only prune the reference that is behind in the array. */
581 /* Transform the data so that we may assume that the accesses
585 delta_r
= PREFETCH_BLOCK
- 1 - delta_r
;
586 delta_b
= PREFETCH_BLOCK
- 1 - delta_b
;
594 /* Check whether the two references are likely to hit the same cache
595 line, and how distant the iterations in that it occurs are from
598 if (step
<= PREFETCH_BLOCK
)
600 /* The accesses are sure to meet. Let us check when. */
601 hit_from
= ddown (delta_b
, PREFETCH_BLOCK
) * PREFETCH_BLOCK
;
602 prefetch_before
= (hit_from
- delta_r
+ step
- 1) / step
;
604 if (prefetch_before
< ref
->prefetch_before
)
605 ref
->prefetch_before
= prefetch_before
;
610 /* A more complicated case. First let us ensure that size of cache line
611 and step are coprime (here we assume that PREFETCH_BLOCK is a power
613 prefetch_block
= PREFETCH_BLOCK
;
614 while ((step
& 1) == 0
615 && prefetch_block
> 1)
618 prefetch_block
>>= 1;
622 /* Now step > prefetch_block, and step and prefetch_block are coprime.
623 Determine the probability that the accesses hit the same cache line. */
625 prefetch_before
= delta
/ step
;
627 if ((unsigned HOST_WIDE_INT
) delta
628 <= (prefetch_block
* ACCEPTABLE_MISS_RATE
/ 1000))
630 if (prefetch_before
< ref
->prefetch_before
)
631 ref
->prefetch_before
= prefetch_before
;
636 /* Try also the following iteration. */
638 delta
= step
- delta
;
639 if ((unsigned HOST_WIDE_INT
) delta
640 <= (prefetch_block
* ACCEPTABLE_MISS_RATE
/ 1000))
642 if (prefetch_before
< ref
->prefetch_before
)
643 ref
->prefetch_before
= prefetch_before
;
648 /* The ref probably does not reuse by. */
652 /* Prune the prefetch candidate REF using the reuses with other references
656 prune_ref_by_reuse (struct mem_ref
*ref
, struct mem_ref
*refs
)
658 struct mem_ref
*prune_by
;
661 prune_ref_by_self_reuse (ref
);
663 for (prune_by
= refs
; prune_by
; prune_by
= prune_by
->next
)
671 if (!WRITE_CAN_USE_READ_PREFETCH
673 && !prune_by
->write_p
)
675 if (!READ_CAN_USE_WRITE_PREFETCH
677 && prune_by
->write_p
)
680 prune_ref_by_group_reuse (ref
, prune_by
, before
);
684 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
687 prune_group_by_reuse (struct mem_ref_group
*group
)
689 struct mem_ref
*ref_pruned
;
691 for (ref_pruned
= group
->refs
; ref_pruned
; ref_pruned
= ref_pruned
->next
)
693 prune_ref_by_reuse (ref_pruned
, group
->refs
);
695 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
697 fprintf (dump_file
, "Reference %p:", (void *) ref_pruned
);
699 if (ref_pruned
->prefetch_before
== PREFETCH_ALL
700 && ref_pruned
->prefetch_mod
== 1)
701 fprintf (dump_file
, " no restrictions");
702 else if (ref_pruned
->prefetch_before
== 0)
703 fprintf (dump_file
, " do not prefetch");
704 else if (ref_pruned
->prefetch_before
<= ref_pruned
->prefetch_mod
)
705 fprintf (dump_file
, " prefetch once");
708 if (ref_pruned
->prefetch_before
!= PREFETCH_ALL
)
710 fprintf (dump_file
, " prefetch before ");
711 fprintf (dump_file
, HOST_WIDE_INT_PRINT_DEC
,
712 ref_pruned
->prefetch_before
);
714 if (ref_pruned
->prefetch_mod
!= 1)
716 fprintf (dump_file
, " prefetch mod ");
717 fprintf (dump_file
, HOST_WIDE_INT_PRINT_DEC
,
718 ref_pruned
->prefetch_mod
);
721 fprintf (dump_file
, "\n");
726 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
729 prune_by_reuse (struct mem_ref_group
*groups
)
731 for (; groups
; groups
= groups
->next
)
732 prune_group_by_reuse (groups
);
735 /* Returns true if we should issue prefetch for REF. */
738 should_issue_prefetch_p (struct mem_ref
*ref
)
740 /* For now do not issue prefetches for only first few of the
742 if (ref
->prefetch_before
!= PREFETCH_ALL
)
748 /* Decide which of the prefetch candidates in GROUPS to prefetch.
749 AHEAD is the number of iterations to prefetch ahead (which corresponds
750 to the number of simultaneous instances of one prefetch running at a
751 time). UNROLL_FACTOR is the factor by that the loop is going to be
752 unrolled. Returns true if there is anything to prefetch. */
755 schedule_prefetches (struct mem_ref_group
*groups
, unsigned unroll_factor
,
758 unsigned max_prefetches
, n_prefetches
;
762 max_prefetches
= (SIMULTANEOUS_PREFETCHES
* unroll_factor
) / ahead
;
763 if (max_prefetches
> (unsigned) SIMULTANEOUS_PREFETCHES
)
764 max_prefetches
= SIMULTANEOUS_PREFETCHES
;
766 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
767 fprintf (dump_file
, "Max prefetches to issue: %d.\n", max_prefetches
);
772 /* For now we just take memory references one by one and issue
773 prefetches for as many as possible. The groups are sorted
774 starting with the largest step, since the references with
775 large step are more likely to cause many cache misses. */
777 for (; groups
; groups
= groups
->next
)
778 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
780 if (!should_issue_prefetch_p (ref
))
783 ref
->issue_prefetch_p
= true;
785 /* If prefetch_mod is less then unroll_factor, we need to insert
786 several prefetches for the reference. */
787 n_prefetches
= ((unroll_factor
+ ref
->prefetch_mod
- 1)
788 / ref
->prefetch_mod
);
789 if (max_prefetches
<= n_prefetches
)
792 max_prefetches
-= n_prefetches
;
799 /* Determine whether there is any reference suitable for prefetching
803 anything_to_prefetch_p (struct mem_ref_group
*groups
)
807 for (; groups
; groups
= groups
->next
)
808 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
809 if (should_issue_prefetch_p (ref
))
815 /* Issue prefetches for the reference REF into loop as decided before.
816 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
817 is the factor by which LOOP was unrolled. */
820 issue_prefetch_ref (struct mem_ref
*ref
, unsigned unroll_factor
, unsigned ahead
)
823 tree addr
, addr_base
, prefetch
, params
, write_p
;
824 block_stmt_iterator bsi
;
825 unsigned n_prefetches
, ap
;
827 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
828 fprintf (dump_file
, "Issued prefetch for %p.\n", (void *) ref
);
830 bsi
= bsi_for_stmt (ref
->stmt
);
832 n_prefetches
= ((unroll_factor
+ ref
->prefetch_mod
- 1)
833 / ref
->prefetch_mod
);
834 addr_base
= build_fold_addr_expr_with_type (ref
->mem
, ptr_type_node
);
835 addr_base
= force_gimple_operand_bsi (&bsi
, unshare_expr (addr_base
), true, NULL
);
837 for (ap
= 0; ap
< n_prefetches
; ap
++)
839 /* Determine the address to prefetch. */
840 delta
= (ahead
+ ap
* ref
->prefetch_mod
) * ref
->group
->step
;
841 addr
= fold_build2 (PLUS_EXPR
, ptr_type_node
,
842 addr_base
, build_int_cst (ptr_type_node
, delta
));
843 addr
= force_gimple_operand_bsi (&bsi
, unshare_expr (addr
), true, NULL
);
845 /* Create the prefetch instruction. */
846 write_p
= ref
->write_p
? integer_one_node
: integer_zero_node
;
847 params
= tree_cons (NULL_TREE
, addr
,
848 tree_cons (NULL_TREE
, write_p
, NULL_TREE
));
850 prefetch
= build_function_call_expr (built_in_decls
[BUILT_IN_PREFETCH
],
852 bsi_insert_before (&bsi
, prefetch
, BSI_SAME_STMT
);
856 /* Issue prefetches for the references in GROUPS into loop as decided before.
857 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
858 factor by that LOOP was unrolled. */
861 issue_prefetches (struct mem_ref_group
*groups
,
862 unsigned unroll_factor
, unsigned ahead
)
866 for (; groups
; groups
= groups
->next
)
867 for (ref
= groups
->refs
; ref
; ref
= ref
->next
)
868 if (ref
->issue_prefetch_p
)
869 issue_prefetch_ref (ref
, unroll_factor
, ahead
);
872 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
873 this is the case, fill in DESC by the description of number of
877 should_unroll_loop_p (struct loop
*loop
, struct tree_niter_desc
*desc
,
880 if (!can_unroll_loop_p (loop
, factor
, desc
))
883 /* We only consider loops without control flow for unrolling. This is not
884 a hard restriction -- tree_unroll_loop works with arbitrary loops
885 as well; but the unrolling/prefetching is usually more profitable for
886 loops consisting of a single basic block, and we want to limit the
888 if (loop
->num_nodes
> 2)
894 /* Determine the coefficient by that unroll LOOP, from the information
895 contained in the list of memory references REFS. Description of
896 umber of iterations of LOOP is stored to DESC. AHEAD is the number
897 of iterations ahead that we need to prefetch. NINSNS is number of
898 insns of the LOOP. */
901 determine_unroll_factor (struct loop
*loop
, struct mem_ref_group
*refs
,
902 unsigned ahead
, unsigned ninsns
,
903 struct tree_niter_desc
*desc
)
905 unsigned upper_bound
, size_factor
, constraint_factor
;
906 unsigned factor
, max_mod_constraint
, ahead_factor
;
907 struct mem_ref_group
*agp
;
910 upper_bound
= PARAM_VALUE (PARAM_MAX_UNROLL_TIMES
);
912 /* First check whether the loop is not too large to unroll. */
913 size_factor
= PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS
) / ninsns
;
914 if (size_factor
<= 1)
917 if (size_factor
< upper_bound
)
918 upper_bound
= size_factor
;
920 max_mod_constraint
= 1;
921 for (agp
= refs
; agp
; agp
= agp
->next
)
922 for (ref
= agp
->refs
; ref
; ref
= ref
->next
)
923 if (should_issue_prefetch_p (ref
)
924 && ref
->prefetch_mod
> max_mod_constraint
)
925 max_mod_constraint
= ref
->prefetch_mod
;
927 /* Set constraint_factor as large as needed to be able to satisfy the
928 largest modulo constraint. */
929 constraint_factor
= max_mod_constraint
;
931 /* If ahead is too large in comparison with the number of available
932 prefetches, unroll the loop as much as needed to be able to prefetch
933 at least partially some of the references in the loop. */
934 ahead_factor
= ((ahead
+ SIMULTANEOUS_PREFETCHES
- 1)
935 / SIMULTANEOUS_PREFETCHES
);
937 /* Unroll as much as useful, but bound the code size growth. */
938 if (constraint_factor
< ahead_factor
)
939 factor
= ahead_factor
;
941 factor
= constraint_factor
;
942 if (factor
> upper_bound
)
943 factor
= upper_bound
;
945 if (!should_unroll_loop_p (loop
, desc
, factor
))
951 /* Issue prefetch instructions for array references in LOOP. Returns
952 true if the LOOP was unrolled. LOOPS is the array containing all
956 loop_prefetch_arrays (struct loops
*loops
, struct loop
*loop
)
958 struct mem_ref_group
*refs
;
959 unsigned ahead
, ninsns
, unroll_factor
;
960 struct tree_niter_desc desc
;
961 bool unrolled
= false;
963 /* Step 1: gather the memory references. */
964 refs
= gather_memory_references (loop
);
966 /* Step 2: estimate the reuse effects. */
967 prune_by_reuse (refs
);
969 if (!anything_to_prefetch_p (refs
))
972 /* Step 3: determine the ahead and unroll factor. */
974 /* FIXME: We should use not size of the loop, but the average number of
975 instructions executed per iteration of the loop. */
976 ninsns
= tree_num_loop_insns (loop
);
977 ahead
= (PREFETCH_LATENCY
+ ninsns
- 1) / ninsns
;
978 unroll_factor
= determine_unroll_factor (loop
, refs
, ahead
, ninsns
,
980 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
981 fprintf (dump_file
, "Ahead %d, unroll factor %d\n", ahead
, unroll_factor
);
983 /* If the loop rolls less than the required unroll factor, prefetching
985 if (unroll_factor
> 1
986 && cst_and_fits_in_hwi (desc
.niter
)
987 && (unsigned HOST_WIDE_INT
) int_cst_value (desc
.niter
) < unroll_factor
)
990 /* Step 4: what to prefetch? */
991 if (!schedule_prefetches (refs
, unroll_factor
, ahead
))
994 /* Step 5: unroll the loop. TODO -- peeling of first and last few
995 iterations so that we do not issue superfluous prefetches. */
996 if (unroll_factor
!= 1)
998 tree_unroll_loop (loops
, loop
, unroll_factor
,
999 single_dom_exit (loop
), &desc
);
1003 /* Step 6: issue the prefetches. */
1004 issue_prefetches (refs
, unroll_factor
, ahead
);
1007 release_mem_refs (refs
);
1011 /* Issue prefetch instructions for array references in LOOPS. */
1014 tree_ssa_prefetch_arrays (struct loops
*loops
)
1018 bool unrolled
= false;
1022 /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1023 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1024 of processor costs and i486 does not have prefetch, but
1025 -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1026 || PREFETCH_BLOCK
== 0)
1029 initialize_original_copy_tables ();
1031 if (!built_in_decls
[BUILT_IN_PREFETCH
])
1033 tree type
= build_function_type (void_type_node
,
1034 tree_cons (NULL_TREE
,
1035 const_ptr_type_node
,
1037 tree decl
= lang_hooks
.builtin_function ("__builtin_prefetch", type
,
1038 BUILT_IN_PREFETCH
, BUILT_IN_NORMAL
,
1040 DECL_IS_NOVOPS (decl
) = true;
1041 built_in_decls
[BUILT_IN_PREFETCH
] = decl
;
1044 /* We assume that size of cache line is a power of two, so verify this
1046 gcc_assert ((PREFETCH_BLOCK
& (PREFETCH_BLOCK
- 1)) == 0);
1048 for (i
= loops
->num
- 1; i
> 0; i
--)
1050 loop
= loops
->parray
[i
];
1052 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1053 fprintf (dump_file
, "Processing loop %d:\n", loop
->num
);
1056 unrolled
|= loop_prefetch_arrays (loops
, loop
);
1058 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1059 fprintf (dump_file
, "\n\n");
1065 todo_flags
|= TODO_cleanup_cfg
;
1068 free_original_copy_tables ();