Add files that I missed when importing NaCl changes earlier
[gcc/nacl-gcc.git] / gcc / tree-ssa-loop-prefetch.c
blob a93008a70748bb381ea94e0429c47b0d585f8bb3
/* Array prefetching.
   Copyright (C) 2005, 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"
/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
          location 64 iterations before it, and PREFETCH_MOD 64 (since
          it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
          the cache line accessed by (4) is the same with probability only
          7/32.
      (5) has PREFETCH_MOD 1 as well.

   3) We determine how far ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with the lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODO:
      -- write and use more general reuse analysis (that could also be used
         in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif
/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif
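
/* For illustration only (not part of any real target description): a
   hypothetical target with 64-byte cache lines, eight outstanding
   prefetches and a forward hardware prefetcher could override the
   defaults above from its target headers like so:

     #define PREFETCH_BLOCK 64
     #define SIMULTANEOUS_PREFETCHES 8
     #define HAVE_FORWARD_PREFETCH 1

   The numbers here are made up; the real values come from the processor
   cost tables of the port.  */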
/* The group of references among which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)
/* The memory reference.  */

struct mem_ref
{
  tree stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  bool write_p;			/* Is it a write?  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  bool issue_prefetch_p;	/* Should we really issue the prefetch?  */
  struct mem_ref *next;		/* The next reference in the group.  */
};
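
/* To make the two structures above concrete (illustration only, reusing the
   example from the comment at the top of the file with char *a): roughly
   speaking, a[i] and a[i + 64] end up in one mem_ref_group with step 1 and
   appear in it as two mem_refs with delta 0 and delta 64, while a[16*i]
   lands in a separate group with step 16.  The exact base expression stored
   in the group depends on how scalar evolution folds the index.  */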
/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}
/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}
/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It does not have to be possible for a write reference to reuse a
	 read prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}
/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}
/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;		/* Loop of the reference.  */
  tree stmt;			/* Statement of the reference.  */
  HOST_WIDE_INT *step;		/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;		/* Offset of the memory reference.  */
};
/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
	return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}
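
/* For illustration (not an additional code path): for an ARRAY_REF such as
   b[2*i + 3], where i starts at 0 with step 1 and the elements of b are
   4 bytes wide, simple_iv gives base 3 and step 2 for the index, and the
   element-size scaling above turns that into istep = 8 and idelta = 12,
   which are then accumulated into the step and delta of the whole
   reference.  */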
/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}
/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that the former references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  stmt = bsi_stmt (bsi);
	  if (TREE_CODE (stmt) != MODIFY_EXPR)
	    continue;

	  lhs = TREE_OPERAND (stmt, 0);
	  rhs = TREE_OPERAND (stmt, 1);

	  if (REFERENCE_CLASS_P (rhs))
	    gather_memory_references_ref (loop, &refs, rhs, false, stmt);
	  if (REFERENCE_CLASS_P (lhs))
	    gather_memory_references_ref (loop, &refs, lhs, true, stmt);
	}
    }
  free (body);

  return refs;
}
/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
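
/* A worked example of the self-reuse pruning above (illustration only,
   using the default PREFETCH_BLOCK of 32 and no hardware prefetcher):
   a reference with step 4 touches a new cache line only every
   32 / 4 = 8 iterations, so it gets prefetch_mod 8; a reference with
   step 40 exceeds the cache line size and is left with prefetch_mod 1,
   i.e. it is prefetched in every iteration.  */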
/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}
/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how far apart the iterations in which that happens are
     from each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that size of cache line
     and step are coprime (here we assume that PREFETCH_BLOCK is a power
     of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
	 && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}
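
/* A worked example of the group-reuse pruning above (illustration only,
   with the default PREFETCH_BLOCK of 32): let REF and BY be 4-byte accesses
   with group step 4, delta_r = 0 and delta_b = 100 (think a[i] and
   a[i + 25]).  Since step <= PREFETCH_BLOCK the accesses are sure to meet:
   hit_from = ddown (100, 32) * 32 = 96 and
   prefetch_before = (96 - 0 + 4 - 1) / 4 = 24, i.e. from iteration 24 on
   REF only touches cache lines that BY has already visited, so prefetching
   REF pays off only in the first 24 iterations.  */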
/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}
/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}
/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}
/* Returns true if we should issue prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for references that would only be
     useful in the first few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}
/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned max_prefetches, n_prefetches;
  struct mem_ref *ref;
  bool any = false;

  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
    max_prefetches = SIMULTANEOUS_PREFETCHES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);

  if (!max_prefetches)
    return false;

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	ref->issue_prefetch_p = true;

	/* If prefetch_mod is less than unroll_factor, we need to insert
	   several prefetches for the reference.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
	if (max_prefetches <= n_prefetches)
	  return true;

	max_prefetches -= n_prefetches;
	any = true;
      }

  return any;
}
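
/* To illustrate the budget computed above (numbers are an example only,
   using the default SIMULTANEOUS_PREFETCHES of 3): with unroll_factor 4
   and ahead 2 we get (3 * 4) / 2 = 6, which is then capped back to 3; a
   reference with prefetch_mod 2 in such a loop consumes
   (4 + 2 - 1) / 2 = 2 of those prefetch slots.  */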
/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	return true;

  return false;
}
/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
			  addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      write_p = ref->write_p ? integer_one_node : integer_zero_node;
      params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}
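
/* The prefetch distances used above, on an example (illustration only):
   for a reference with group step 16, prefetch_mod 2, ahead 5 and
   unroll_factor 4, two prefetches are emitted per original access, at
   byte offsets (5 + 0 * 2) * 16 = 80 and (5 + 1 * 2) * 16 = 112 from the
   current address of the reference.  */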
/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}
/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC by the description of number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}
/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  Description of
   number of iterations of LOOP is stored to DESC.  AHEAD is the number
   of iterations ahead that we need to prefetch.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ahead, unsigned ninsns,
			 struct tree_niter_desc *desc)
{
  unsigned upper_bound, size_factor, constraint_factor;
  unsigned factor, max_mod_constraint, ahead_factor;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* First check whether the loop is not too large to unroll.  */
  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (size_factor <= 1)
    return 1;

  if (size_factor < upper_bound)
    upper_bound = size_factor;

  max_mod_constraint = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref)
	  && ref->prefetch_mod > max_mod_constraint)
	max_mod_constraint = ref->prefetch_mod;

  /* Set constraint_factor as large as needed to be able to satisfy the
     largest modulo constraint.  */
  constraint_factor = max_mod_constraint;

  /* If ahead is too large in comparison with the number of available
     prefetches, unroll the loop as much as needed to be able to prefetch
     at least partially some of the references in the loop.  */
  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
		  / SIMULTANEOUS_PREFETCHES);

  /* Unroll as much as useful, but bound the code size growth.  */
  if (constraint_factor < ahead_factor)
    factor = ahead_factor;
  else
    factor = constraint_factor;
  if (factor > upper_bound)
    factor = upper_bound;

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}
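
/* A numeric walk through the choice above (all inputs hypothetical):
   suppose PARAM_MAX_UNROLLED_INSNS / ninsns gives size_factor 8,
   PARAM_MAX_UNROLL_TIMES is 4, the largest prefetch_mod among the
   candidate references is 8, and ahead is 7 with the default 3
   simultaneous prefetches.  Then upper_bound = min (4, 8) = 4,
   constraint_factor = 8, ahead_factor = (7 + 2) / 3 = 3, and the desired
   factor max (8, 3) = 8 is clipped to the upper bound 4.  */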
/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  LOOPS is the array containing all
   loops.  */

static bool
loop_prefetch_arrays (struct loops *loops, struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number
     of instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
					   &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loops, loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}
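
/* An example of the ahead computation in step 3 above (numbers are
   illustrative only): with the default PREFETCH_LATENCY of 200 and a loop
   body of ninsns = 30, ahead = (200 + 30 - 1) / 30 = 7, i.e. each address
   is prefetched roughly seven iterations before it is used, so that the
   fetch has time to complete.  */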
/* Issue prefetch instructions for array references in LOOPS.  */

unsigned int
tree_ssa_prefetch_arrays (struct loops *loops)
{
  unsigned i;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
	 of the processor costs and i486 does not have prefetch, but
	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
					       BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
					       NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that size of cache line is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = loops->num - 1; i > 0; i--)
    {
      loop = loops->parray[i];

      /* Removed loops leave a NULL slot in the array; do not touch
	 loop->num in that case.  */
      if (loop && dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      if (loop)
	unrolled |= loop_prefetch_arrays (loops, loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}