/* Array prefetching.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */

21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "tree.h"
26 #include "rtl.h"
27 #include "tm_p.h"
28 #include "hard-reg-set.h"
29 #include "basic-block.h"
30 #include "output.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "timevar.h"
35 #include "cfgloop.h"
36 #include "varray.h"
37 #include "expr.h"
38 #include "tree-pass.h"
39 #include "ggc.h"
40 #include "insn-config.h"
41 #include "recog.h"
42 #include "hashtab.h"
43 #include "tree-chrec.h"
44 #include "tree-scalar-evolution.h"
45 #include "toplev.h"
46 #include "params.h"
47 #include "langhooks.h"
/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
          location 64 iterations before it, and PREFETCH_MOD 64 (since
          it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
          the cache line accessed by (4) is the same with probability only
          7/32.
      (5) has PREFETCH_MOD 1 as well.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We prefetch as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
         in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

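/* For instance, with the default ACCEPTABLE_MISS_RATE of 50, both accesses
   keep their prefetches unless the probability that they hit the same cache
   line is at least 950 per mille (95%).  */
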
#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

/* The group of memory references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;                    /* Base of the reference.  */
  HOST_WIDE_INT step;           /* Step of the reference.  */
  struct mem_ref *refs;         /* References in the group.  */
  struct mem_ref_group *next;   /* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  tree stmt;                    /* Statement in which the reference appears.  */
  tree mem;                     /* The reference.  */
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
  bool write_p;                 /* Is it a write?  */
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
                                /* Prefetch only each PREFETCH_MOD-th
                                   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
                                /* Prefetch only first PREFETCH_BEFORE
                                   iterations.  */
  bool issue_prefetch_p;        /* Should we really issue the prefetch?  */
  struct mem_ref *next;         /* The next reference in the group.  */
};

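/* Roughly, in the overview example above, a[i] and a[i + 64] fall into the
   same mem_ref_group (same base, step 1) and are represented by two mem_refs
   with delta 0 and delta 64; the reuse pruning below then sets
   prefetch_before to 64 and prefetch_mod to 64 for the former.  */
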
/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It may not be possible for a write reference to reuse a read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;            /* Loop of the reference.  */
  tree stmt;                    /* Statement of the reference.  */
  HOST_WIDE_INT *step;          /* Step of the memory reference.  */
  HOST_WIDE_INT *delta;         /* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
	return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

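/* As a hypothetical example, for an ARRAY_REF such as b[3*i + 7] on a 4-byte
   element type, with i starting at 0, idx_analyze_ref finds the induction
   variable with base 7 and step 3 and multiplies both by the element size,
   so analyze_ref reports STEP 12 and DELTA 28, i.e.
   REF = &b[0] + 12 * iter + 28.  */
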
/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}

/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that references that occur earlier in
     the body precede the later ones in the list.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  stmt = bsi_stmt (bsi);
	  if (TREE_CODE (stmt) != MODIFY_EXPR)
	    continue;

	  lhs = TREE_OPERAND (stmt, 0);
	  rhs = TREE_OPERAND (stmt, 1);

	  if (REFERENCE_CLASS_P (rhs))
	    gather_memory_references_ref (loop, &refs, rhs, false, stmt);
	  if (REFERENCE_CLASS_P (lhs))
	    gather_memory_references_ref (loop, &refs, lhs, true, stmt);
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

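/* For instance, with PREFETCH_BLOCK 64 and the access a[16*i] from the
   overview comment (step 16 on a 1-byte type, no hardware prefetch),
   prune_ref_by_self_reuse leaves prefetch_before at PREFETCH_ALL and sets
   prefetch_mod to 64 / 16 = 4: only every fourth iteration touches a new
   cache line.  */
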
/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

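/* The callers below use ddown (delta, PREFETCH_BLOCK) to find which cache
   line a byte offset falls into; e.g. ddown (100, 64) is 1.  */
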
/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which it occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that size of cache line
     and step are coprime (here we assume that PREFETCH_BLOCK is a power
     of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
	 && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}

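/* In the large-step case above, the reuse is honored only if, after dividing
   out whole steps, the remaining distance between the accesses is at most
   prefetch_block * ACCEPTABLE_MISS_RATE / 1000 bytes; with the defaults
   (64 and 50, and no reduction by the loop) that is 64 * 50 / 1000 = 3
   bytes.  This is why references (4) and (5) in the overview comment do not
   get a PREFETCH_BEFORE.  */
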
/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for references that would only be
     prefetched in the first few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
	     slots_per_prefetch);

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
	   and we unroll the loop UNROLL_FACTOR times, we need to insert
	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
	   iteration.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
	prefetch_slots = n_prefetches * slots_per_prefetch;

	/* If more than half of the prefetches would be lost anyway, do not
	   issue the prefetch.  */
	if (2 * remaining_prefetch_slots < prefetch_slots)
	  continue;

	ref->issue_prefetch_p = true;

	if (remaining_prefetch_slots <= prefetch_slots)
	  return true;
	remaining_prefetch_slots -= prefetch_slots;
	any = true;
      }

  return any;
}

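/* To illustrate the bookkeeping above with made-up numbers: if AHEAD is 8
   and UNROLL_FACTOR is 4, each prefetch instruction occupies
   (8 + 4/2) / 4 = 2 slots; a reference with prefetch_mod 2 then needs
   ceil (4 / 2) = 2 prefetch instructions per unrolled body, i.e. 4 of the
   SIMULTANEOUS_PREFETCHES slots.  */
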
/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	return true;

  return false;
}

/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
  write_p = ref->write_p ? integer_one_node : integer_zero_node;

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
			  addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}

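/* As an illustration (with made-up numbers): for a reference with step 4 and
   prefetch_mod 1 in a loop unrolled 2 times with AHEAD 10, two
   __builtin_prefetch calls are inserted before the statement, with address
   operands addr_base + 40 and addr_base + 44 and the write flag as the
   second argument.  */
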
/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fills in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ninsns, struct tree_niter_desc *desc)
{
  unsigned upper_bound;
  unsigned nfactor, factor, mod_constraint;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  /* First check whether the loop is not too large to unroll.  We ignore
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
     from unrolling them enough to make exactly one cache line covered by each
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
     us from unrolling the loops too many times in cases where we only expect
     gains from better scheduling and decreasing loop overhead, which is not
     the case here.  */
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (upper_bound <= 1)
    return 1;

  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
  factor = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	{
	  mod_constraint = ref->prefetch_mod;
	  nfactor = least_common_multiple (mod_constraint, factor);
	  if (nfactor <= upper_bound)
	    factor = nfactor;
	}

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}

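/* For the overview example, references (2) and (3) have prefetch_mod 64 and
   4 respectively, so the desired factor is lcm (4, 64) = 64; it is only used
   if PARAM_MAX_UNROLLED_INSNS / ninsns and should_unroll_loop_p allow it.  */
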
/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number of
     instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

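/* The ahead computation is a ceiling division: if, say, the target's
   PREFETCH_LATENCY is 200 and the loop body has 25 insns, we prefetch
   ahead = (200 + 25 - 1) / 25 = 8 iterations.  */
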
/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  unsigned i;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
	 of processor costs and i486 does not have prefetch, but
	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
	       SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    L1 cache size: %d (%d bytes)\n",
	       L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
					NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the size of a cache line is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = current_loops->num - 1; i > 0; i--)
    {
      loop = current_loops->parray[i];
      if (!loop)
	continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}