1 /* Array prefetching.
2 Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "rtl.h"
26 #include "tm_p.h"
27 #include "hard-reg-set.h"
28 #include "basic-block.h"
29 #include "output.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "tree-pass.h"
37 #include "ggc.h"
38 #include "insn-config.h"
39 #include "recog.h"
40 #include "hashtab.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "toplev.h"
44 #include "params.h"
45 #include "langhooks.h"
46 #include "tree-inline.h"
47 #include "tree-data-ref.h"
48 #include "optabs.h"
50 /* This pass inserts prefetch instructions to optimize cache usage during
51 accesses to arrays in loops. It processes loops sequentially and:
53 1) Gathers all memory references in the single loop.
54 2) For each of the references it decides when it is profitable to prefetch
55 it. To do it, we evaluate the reuse among the accesses, and determine
56 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
57 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
58 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
59 iterations of the loop that are zero modulo PREFETCH_MOD). For example
60 (assuming cache line size is 64 bytes, char has size 1 byte and there
61 is no hardware sequential prefetch):
63 char *a;
64 for (i = 0; i < max; i++)
66 a[255] = ...; (0)
67 a[i] = ...; (1)
68 a[i + 64] = ...; (2)
69 a[16*i] = ...; (3)
70 a[187*i] = ...; (4)
71 a[187*i + 50] = ...; (5)
74 (0) obviously has PREFETCH_BEFORE 1
75 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
76 location 64 iterations before it, and PREFETCH_MOD 64 (since
77 it hits the same cache line otherwise).
78 (2) has PREFETCH_MOD 64
79 (3) has PREFETCH_MOD 4
80 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
81 the cache line accessed by (4) is the same with probability only
82 7/32.
83 (5) has PREFETCH_MOD 1 as well.
85 Additionally, we use data dependence analysis to determine for each
86 reference the distance till the first reuse; this information is used
87 to determine the temporality of the issued prefetch instruction.
89 3) We determine how much ahead we need to prefetch. The number of
90 iterations needed is time to fetch / time spent in one iteration of
91 the loop. The problem is that we do not know either of these values,
92 so we just make a heuristic guess based on a magic (possibly)
93 target-specific constant and size of the loop.
95 4) Determine which of the references we prefetch. We take into account
96 that there is a maximum number of simultaneous prefetches (provided
97 by machine description). We issue as many prefetches as possible
98 while still within this bound (starting with those with lowest
99 prefetch_mod, since they are responsible for most of the cache
100 misses).
102 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
103 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
104 prefetching nonaccessed memory.
105 TODO -- actually implement peeling.
107 6) We actually emit the prefetch instructions. ??? Perhaps emit the
108 prefetch instructions with guards in cases where 5) was not sufficient
109 to satisfy the constraints?
111 The function is_loop_prefetching_profitable() implements a cost model
112 to determine if prefetching is profitable for a given loop. The cost
113 model has two heuristics:
114 1. A heuristic that determines whether the given loop has enough CPU
115 ops that can be overlapped with cache missing memory ops.
116 If not, the loop won't benefit from prefetching. This is implemented
117 by requiring the ratio between the instruction count and the mem ref
118 count to be above a certain minimum.
119 2. A heuristic that disables prefetching in a loop with an unknown trip
120 count if the prefetching cost is above a certain limit. The relative
121 prefetching cost is estimated by taking the ratio between the
122 prefetch count and the total instruction count (this models the I-cache
123 cost).
124 The limits used in these heuristics are defined as parameters with
125 reasonable default values. Machine-specific default values will be
126 added later.
128 Some other TODO:
129 -- write and use more general reuse analysis (that could be also used
130 in other cache aimed loop optimizations)
131 -- make it behave sanely together with the prefetches given by user
132 (now we just ignore them; at the very least we should avoid
133 optimizing loops in which the user put his own prefetches)
134 -- we assume cache line size alignment of arrays; this could be
135 improved. */
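/* As a rough, hand-written illustration of the transformation this pass
   aims for (not actual pass output; the cache line size, unroll factor
   and prefetch distance below are assumed values):  */
#if 0
/* Standalone sketch, compilable on its own.  Assumes 64-byte cache
   lines (8 doubles), an unroll factor of 8 and a prefetch distance of
   4 unrolled iterations, i.e. 32 elements.  */
void
before (double *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = 2.0 * a[i];
}

void
after (double *a, int n)
{
  int i;
  for (i = 0; i + 8 <= n; i += 8)
    {
      /* One prefetch per cache line, issued 32 elements ahead; the
	 second argument marks it as a write, the third requests high
	 temporal locality.  Prefetching slightly past the end of the
	 array is harmless, since prefetches are only hints.  */
      __builtin_prefetch (&a[i + 32], 1, 3);
      for (int j = 0; j < 8; j++)
	a[i + j] = 2.0 * a[i + j];
    }
  for (; i < n; i++)
    a[i] = 2.0 * a[i];
}
#endif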
137 /* Magic constants follow. These should be replaced by machine specific
138 numbers. */
140 /* True if write can be prefetched by a read prefetch. */
142 #ifndef WRITE_CAN_USE_READ_PREFETCH
143 #define WRITE_CAN_USE_READ_PREFETCH 1
144 #endif
146 /* True if read can be prefetched by a write prefetch. */
148 #ifndef READ_CAN_USE_WRITE_PREFETCH
149 #define READ_CAN_USE_WRITE_PREFETCH 0
150 #endif
152 /* The size of the block loaded by a single prefetch. Usually, this is
153 the same as cache line size (at the moment, we only consider one level
154 of cache hierarchy). */
156 #ifndef PREFETCH_BLOCK
157 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
158 #endif
160 /* Do we have forward hardware sequential prefetching? */
162 #ifndef HAVE_FORWARD_PREFETCH
163 #define HAVE_FORWARD_PREFETCH 0
164 #endif
166 /* Do we have backward hardware sequential prefetching? */
168 #ifndef HAVE_BACKWARD_PREFETCH
169 #define HAVE_BACKWARD_PREFETCH 0
170 #endif
172 /* In some cases we are only able to determine that there is a certain
173 probability that the two accesses hit the same cache line. In this
174 case, we issue the prefetches for both of them if this probability
175 is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
177 #ifndef ACCEPTABLE_MISS_RATE
178 #define ACCEPTABLE_MISS_RATE 50
179 #endif
181 #ifndef HAVE_prefetch
182 #define HAVE_prefetch 0
183 #endif
185 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
186 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
188 /* We consider a memory access nontemporal if it is not reused sooner than
189 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
190 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
191 so that we use nontemporal prefetches e.g. if a single memory location
192 is accessed several times in a single iteration of the loop. */
193 #define NONTEMPORAL_FRACTION 16
195 /* In case we have to emit a memory fence instruction after the loop that
196 uses nontemporal stores, this defines the builtin to use. */
198 #ifndef FENCE_FOLLOWING_MOVNT
199 #define FENCE_FOLLOWING_MOVNT NULL_TREE
200 #endif
202 /* The group of references between which reuse may occur. */
204 struct mem_ref_group
206 tree base; /* Base of the reference. */
207 HOST_WIDE_INT step; /* Step of the reference. */
208 struct mem_ref *refs; /* References in the group. */
209 struct mem_ref_group *next; /* Next group of references. */
212 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
214 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
216 /* The memory reference. */
218 struct mem_ref
220 gimple stmt; /* Statement in which the reference appears. */
221 tree mem; /* The reference. */
222 HOST_WIDE_INT delta; /* Constant offset of the reference. */
223 struct mem_ref_group *group; /* The group of references it belongs to. */
224 unsigned HOST_WIDE_INT prefetch_mod;
225 /* Prefetch only each PREFETCH_MOD-th
226 iteration. */
227 unsigned HOST_WIDE_INT prefetch_before;
228 /* Prefetch only first PREFETCH_BEFORE
229 iterations. */
230 unsigned reuse_distance; /* The amount of data accessed before the first
231 reuse of this value. */
232 struct mem_ref *next; /* The next reference in the group. */
233 unsigned write_p : 1; /* Is it a write? */
234 unsigned independent_p : 1; /* True if the reference is independent of
235 all other references inside the loop. */
236 unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
237 unsigned storent_p : 1; /* True if we changed the store to a
238 nontemporal one. */
241 /* Dumps information about reference REF to FILE. */
243 static void
244 dump_mem_ref (FILE *file, struct mem_ref *ref)
246 fprintf (file, "Reference %p:\n", (void *) ref);
248 fprintf (file, " group %p (base ", (void *) ref->group);
249 print_generic_expr (file, ref->group->base, TDF_SLIM);
250 fprintf (file, ", step ");
251 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
252 fprintf (file, ")\n");
254 fprintf (file, " delta ");
255 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
256 fprintf (file, "\n");
258 fprintf (file, " %s\n", ref->write_p ? "write" : "read");
260 fprintf (file, "\n");
263 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
264 exist. */
266 static struct mem_ref_group *
267 find_or_create_group (struct mem_ref_group **groups, tree base,
268 HOST_WIDE_INT step)
270 struct mem_ref_group *group;
272 for (; *groups; groups = &(*groups)->next)
274 if ((*groups)->step == step
275 && operand_equal_p ((*groups)->base, base, 0))
276 return *groups;
278 /* Keep the list of groups sorted by decreasing step. */
279 if ((*groups)->step < step)
280 break;
283 group = XNEW (struct mem_ref_group);
284 group->base = base;
285 group->step = step;
286 group->refs = NULL;
287 group->next = *groups;
288 *groups = group;
290 return group;
293 /* Records a memory reference MEM in GROUP with offset DELTA and write status
294 WRITE_P. The reference occurs in statement STMT. */
296 static void
297 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
298 HOST_WIDE_INT delta, bool write_p)
300 struct mem_ref **aref;
302 /* Do not record the same address twice. */
303 for (aref = &group->refs; *aref; aref = &(*aref)->next)
305 /* It need not be possible for a write reference to reuse a read
306 prefetch, or vice versa. */
307 if (!WRITE_CAN_USE_READ_PREFETCH
308 && write_p
309 && !(*aref)->write_p)
310 continue;
311 if (!READ_CAN_USE_WRITE_PREFETCH
312 && !write_p
313 && (*aref)->write_p)
314 continue;
316 if ((*aref)->delta == delta)
317 return;
320 (*aref) = XNEW (struct mem_ref);
321 (*aref)->stmt = stmt;
322 (*aref)->mem = mem;
323 (*aref)->delta = delta;
324 (*aref)->write_p = write_p;
325 (*aref)->prefetch_before = PREFETCH_ALL;
326 (*aref)->prefetch_mod = 1;
327 (*aref)->reuse_distance = 0;
328 (*aref)->issue_prefetch_p = false;
329 (*aref)->group = group;
330 (*aref)->next = NULL;
331 (*aref)->independent_p = false;
332 (*aref)->storent_p = false;
334 if (dump_file && (dump_flags & TDF_DETAILS))
335 dump_mem_ref (dump_file, *aref);
338 /* Release memory references in GROUPS. */
340 static void
341 release_mem_refs (struct mem_ref_group *groups)
343 struct mem_ref_group *next_g;
344 struct mem_ref *ref, *next_r;
346 for (; groups; groups = next_g)
348 next_g = groups->next;
349 for (ref = groups->refs; ref; ref = next_r)
351 next_r = ref->next;
352 free (ref);
354 free (groups);
358 /* A structure used to pass arguments to idx_analyze_ref. */
360 struct ar_data
362 struct loop *loop; /* Loop of the reference. */
363 gimple stmt; /* Statement of the reference. */
364 HOST_WIDE_INT *step; /* Step of the memory reference. */
365 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
368 /* Analyzes a single INDEX of a memory reference to obtain information
369 described in analyze_ref. Callback for for_each_index. */
371 static bool
372 idx_analyze_ref (tree base, tree *index, void *data)
374 struct ar_data *ar_data = (struct ar_data *) data;
375 tree ibase, step, stepsize;
376 HOST_WIDE_INT istep, idelta = 0, imult = 1;
377 affine_iv iv;
379 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
380 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
381 return false;
383 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
384 *index, &iv, false))
385 return false;
386 ibase = iv.base;
387 step = iv.step;
389 if (!cst_and_fits_in_hwi (step))
390 return false;
391 istep = int_cst_value (step);
393 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
394 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
396 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
397 ibase = TREE_OPERAND (ibase, 0);
399 if (cst_and_fits_in_hwi (ibase))
401 idelta += int_cst_value (ibase);
402 ibase = build_int_cst (TREE_TYPE (ibase), 0);
405 if (TREE_CODE (base) == ARRAY_REF)
407 stepsize = array_ref_element_size (base);
408 if (!cst_and_fits_in_hwi (stepsize))
409 return false;
410 imult = int_cst_value (stepsize);
412 istep *= imult;
413 idelta *= imult;
416 *ar_data->step += istep;
417 *ar_data->delta += idelta;
418 *index = ibase;
420 return true;
423 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
424 STEP are integer constants and iter is the number of iterations of LOOP. The
425 reference occurs in statement STMT. Strips nonaddressable component
426 references from REF_P. */
428 static bool
429 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
430 HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
431 gimple stmt)
433 struct ar_data ar_data;
434 tree off;
435 HOST_WIDE_INT bit_offset;
436 tree ref = *ref_p;
438 *step = 0;
439 *delta = 0;
441 /* First strip off the component references. Ignore bitfields. */
442 if (TREE_CODE (ref) == COMPONENT_REF
443 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
444 ref = TREE_OPERAND (ref, 0);
446 *ref_p = ref;
448 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
450 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
451 bit_offset = TREE_INT_CST_LOW (off);
452 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
454 *delta += bit_offset / BITS_PER_UNIT;
457 *base = unshare_expr (ref);
458 ar_data.loop = loop;
459 ar_data.stmt = stmt;
460 ar_data.step = step;
461 ar_data.delta = delta;
462 return for_each_index (base, idx_analyze_ref, &ar_data);
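/* A standalone illustration (with a made-up reference, not pass output)
   of the decomposition computed by analyze_ref/idx_analyze_ref above:
   for a reference such as a[2 * i + 3] with 4-byte elements, STEP is 8
   and DELTA is 12, so the address is &BASE + STEP * iter + DELTA.  */
#if 0
#include <assert.h>

int
main (void)
{
  int a[100];
  for (long i = 0; i < 40; i++)
    /* The address of a[2 * i + 3] advances by 8 bytes per iteration,
       starting 12 bytes past the base of the array.  */
    assert ((char *) &a[2 * i + 3] == (char *) &a[0] + 8 * i + 12);
  return 0;
}
#endif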
465 /* Record a memory reference REF to the list REFS. The reference occurs in
466 LOOP in statement STMT and it is write if WRITE_P. Returns true if the
467 reference was recorded, false otherwise. */
469 static bool
470 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
471 tree ref, bool write_p, gimple stmt)
473 tree base;
474 HOST_WIDE_INT step, delta;
475 struct mem_ref_group *agrp;
477 if (get_base_address (ref) == NULL)
478 return false;
480 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
481 return false;
483 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
484 are integer constants. */
485 agrp = find_or_create_group (refs, base, step);
486 record_ref (agrp, stmt, ref, delta, write_p);
488 return true;
491 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to true
492 if there are no other memory references inside the loop, and REF_COUNT to the number of references recorded. */
494 static struct mem_ref_group *
495 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
497 basic_block *body = get_loop_body_in_dom_order (loop);
498 basic_block bb;
499 unsigned i;
500 gimple_stmt_iterator bsi;
501 gimple stmt;
502 tree lhs, rhs;
503 struct mem_ref_group *refs = NULL;
505 *no_other_refs = true;
506 *ref_count = 0;
508 /* Scan the loop body in order, so that earlier references precede
509 the later ones. */
510 for (i = 0; i < loop->num_nodes; i++)
512 bb = body[i];
513 if (bb->loop_father != loop)
514 continue;
516 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
518 stmt = gsi_stmt (bsi);
520 if (gimple_code (stmt) != GIMPLE_ASSIGN)
522 if (gimple_vuse (stmt)
523 || (is_gimple_call (stmt)
524 && !(gimple_call_flags (stmt) & ECF_CONST)))
525 *no_other_refs = false;
526 continue;
529 lhs = gimple_assign_lhs (stmt);
530 rhs = gimple_assign_rhs1 (stmt);
532 if (REFERENCE_CLASS_P (rhs))
534 *no_other_refs &= gather_memory_references_ref (loop, &refs,
535 rhs, false, stmt);
536 *ref_count += 1;
538 if (REFERENCE_CLASS_P (lhs))
540 *no_other_refs &= gather_memory_references_ref (loop, &refs,
541 lhs, true, stmt);
542 *ref_count += 1;
546 free (body);
548 return refs;
551 /* Prune the prefetch candidate REF using the self-reuse. */
553 static void
554 prune_ref_by_self_reuse (struct mem_ref *ref)
556 HOST_WIDE_INT step = ref->group->step;
557 bool backward = step < 0;
559 if (step == 0)
561 /* Prefetch references to invariant address just once. */
562 ref->prefetch_before = 1;
563 return;
566 if (backward)
567 step = -step;
569 if (step > PREFETCH_BLOCK)
570 return;
572 if ((backward && HAVE_BACKWARD_PREFETCH)
573 || (!backward && HAVE_FORWARD_PREFETCH))
575 ref->prefetch_before = 1;
576 return;
579 ref->prefetch_mod = PREFETCH_BLOCK / step;
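/* For example (a hypothetical target with a 64-byte PREFETCH_BLOCK and a
   reference with a 16-byte step, as in case (3) of the header comment),
   the self-reuse pruning above yields PREFETCH_MOD = 4:  */
#if 0
#include <stdio.h>

int
main (void)
{
  long prefetch_block = 64;	/* assumed cache line size */
  long step = 16;		/* byte step of the reference */
  /* Four consecutive accesses share one cache line, so prefetching
     every fourth iteration suffices.  */
  if (step != 0 && step <= prefetch_block)
    printf ("PREFETCH_MOD = %ld\n", prefetch_block / step);	/* 4 */
  return 0;
}
#endif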
582 /* Divides X by BY, rounding down. */
584 static HOST_WIDE_INT
585 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
587 gcc_assert (by > 0);
589 if (x >= 0)
590 return x / by;
591 else
592 return (x + by - 1) / by;
595 /* Given a CACHE_LINE_SIZE and two inductive memory references
596 with a common STEP greater than CACHE_LINE_SIZE and an address
597 difference DELTA, compute how often (per thousand) they fall
598 in different cache lines. DISTINCT_ITERS is the number of
599 distinct iterations after which the pattern repeats itself.
600 ALIGN_UNIT is the unit of alignment in bytes. */
602 static int
603 compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
604 HOST_WIDE_INT step, HOST_WIDE_INT delta,
605 unsigned HOST_WIDE_INT distinct_iters,
606 int align_unit)
608 unsigned align, iter;
609 int total_positions, miss_positions, miss_rate;
610 int address1, address2, cache_line1, cache_line2;
612 total_positions = 0;
613 miss_positions = 0;
615 /* Iterate through all possible alignments of the first
616 memory reference within its cache line. */
617 for (align = 0; align < cache_line_size; align += align_unit)
619 /* Iterate through all distinct iterations. */
620 for (iter = 0; iter < distinct_iters; iter++)
622 address1 = align + step * iter;
623 address2 = address1 + delta;
624 cache_line1 = address1 / cache_line_size;
625 cache_line2 = address2 / cache_line_size;
626 total_positions += 1;
627 if (cache_line1 != cache_line2)
628 miss_positions += 1;
630 miss_rate = 1000 * miss_positions / total_positions;
631 return miss_rate;
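/* A standalone sketch of the miss-rate estimate above, using assumed
   numbers: 64-byte cache lines, a step of 96 bytes, a delta of 32 bytes
   and a 4-byte alignment unit.  Reducing step / line size to lowest
   terms (96 / 64 -> 3 / 2) gives DISTINCT_ITERS = 2, as done in
   prune_ref_by_group_reuse below.  */
#if 0
#include <stdio.h>

static int
miss_rate_sketch (unsigned cache_line_size, long step, long delta,
		  unsigned distinct_iters, int align_unit)
{
  int total_positions = 0, miss_positions = 0;

  /* Try every alignment of the first reference within its cache line
     and every distinct iteration, exactly as compute_miss_rate does.  */
  for (unsigned align = 0; align < cache_line_size; align += align_unit)
    for (unsigned iter = 0; iter < distinct_iters; iter++)
      {
	long address1 = align + step * iter;
	long address2 = address1 + delta;
	total_positions++;
	if (address1 / cache_line_size != address2 / cache_line_size)
	  miss_positions++;
      }
  return 1000 * miss_positions / total_positions;
}

int
main (void)
{
  /* Prints the estimated misses per thousand for the sample values.  */
  printf ("miss rate: %d / 1000\n", miss_rate_sketch (64, 96, 32, 2, 4));
  return 0;
}
#endif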
634 /* Prune the prefetch candidate REF using the reuse with BY.
635 If BY_IS_BEFORE is true, BY is before REF in the loop. */
637 static void
638 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
639 bool by_is_before)
641 HOST_WIDE_INT step = ref->group->step;
642 bool backward = step < 0;
643 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
644 HOST_WIDE_INT delta = delta_b - delta_r;
645 HOST_WIDE_INT hit_from;
646 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
647 int miss_rate;
648 HOST_WIDE_INT reduced_step;
649 unsigned HOST_WIDE_INT reduced_prefetch_block;
650 tree ref_type;
651 int align_unit;
653 if (delta == 0)
655 /* If the references have the same address, only prefetch the
656 former. */
657 if (by_is_before)
658 ref->prefetch_before = 0;
660 return;
663 if (!step)
665 /* If the reference addresses are invariant and fall into the
666 same cache line, prefetch just the first one. */
667 if (!by_is_before)
668 return;
670 if (ddown (ref->delta, PREFETCH_BLOCK)
671 != ddown (by->delta, PREFETCH_BLOCK))
672 return;
674 ref->prefetch_before = 0;
675 return;
678 /* Only prune the reference that is behind in the array. */
679 if (backward)
681 if (delta > 0)
682 return;
684 /* Transform the data so that we may assume that the accesses
685 are forward. */
686 delta = - delta;
687 step = -step;
688 delta_r = PREFETCH_BLOCK - 1 - delta_r;
689 delta_b = PREFETCH_BLOCK - 1 - delta_b;
691 else
693 if (delta < 0)
694 return;
697 /* Check whether the two references are likely to hit the same cache
698 line, and how distant from each other the iterations in which
699 this occurs are. */
701 if (step <= PREFETCH_BLOCK)
703 /* The accesses are sure to meet. Let us check when. */
704 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
705 prefetch_before = (hit_from - delta_r + step - 1) / step;
707 if (prefetch_before < ref->prefetch_before)
708 ref->prefetch_before = prefetch_before;
710 return;
713 /* A more complicated case with step > prefetch_block. First reduce
714 the ratio between the step and the cache line size to its simplest
715 terms. The resulting denominator will then represent the number of
716 distinct iterations after which each address will go back to its
717 initial location within the cache line. This computation assumes
718 that PREFETCH_BLOCK is a power of two. */
719 prefetch_block = PREFETCH_BLOCK;
720 reduced_prefetch_block = prefetch_block;
721 reduced_step = step;
722 while ((reduced_step & 1) == 0
723 && reduced_prefetch_block > 1)
725 reduced_step >>= 1;
726 reduced_prefetch_block >>= 1;
729 prefetch_before = delta / step;
730 delta %= step;
731 ref_type = TREE_TYPE (ref->mem);
732 align_unit = TYPE_ALIGN (ref_type) / 8;
733 miss_rate = compute_miss_rate(prefetch_block, step, delta,
734 reduced_prefetch_block, align_unit);
735 if (miss_rate <= ACCEPTABLE_MISS_RATE)
737 if (prefetch_before < ref->prefetch_before)
738 ref->prefetch_before = prefetch_before;
740 return;
743 /* Try also the following iteration. */
744 prefetch_before++;
745 delta = step - delta;
746 miss_rate = compute_miss_rate(prefetch_block, step, delta,
747 reduced_prefetch_block, align_unit);
748 if (miss_rate <= ACCEPTABLE_MISS_RATE)
750 if (prefetch_before < ref->prefetch_before)
751 ref->prefetch_before = prefetch_before;
753 return;
756 /* The ref probably does not reuse by. */
757 return;
760 /* Prune the prefetch candidate REF using the reuses with other references
761 in REFS. */
763 static void
764 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
766 struct mem_ref *prune_by;
767 bool before = true;
769 prune_ref_by_self_reuse (ref);
771 for (prune_by = refs; prune_by; prune_by = prune_by->next)
773 if (prune_by == ref)
775 before = false;
776 continue;
779 if (!WRITE_CAN_USE_READ_PREFETCH
780 && ref->write_p
781 && !prune_by->write_p)
782 continue;
783 if (!READ_CAN_USE_WRITE_PREFETCH
784 && !ref->write_p
785 && prune_by->write_p)
786 continue;
788 prune_ref_by_group_reuse (ref, prune_by, before);
792 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
794 static void
795 prune_group_by_reuse (struct mem_ref_group *group)
797 struct mem_ref *ref_pruned;
799 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
801 prune_ref_by_reuse (ref_pruned, group->refs);
803 if (dump_file && (dump_flags & TDF_DETAILS))
805 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
807 if (ref_pruned->prefetch_before == PREFETCH_ALL
808 && ref_pruned->prefetch_mod == 1)
809 fprintf (dump_file, " no restrictions");
810 else if (ref_pruned->prefetch_before == 0)
811 fprintf (dump_file, " do not prefetch");
812 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
813 fprintf (dump_file, " prefetch once");
814 else
816 if (ref_pruned->prefetch_before != PREFETCH_ALL)
818 fprintf (dump_file, " prefetch before ");
819 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
820 ref_pruned->prefetch_before);
822 if (ref_pruned->prefetch_mod != 1)
824 fprintf (dump_file, " prefetch mod ");
825 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
826 ref_pruned->prefetch_mod);
829 fprintf (dump_file, "\n");
834 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
836 static void
837 prune_by_reuse (struct mem_ref_group *groups)
839 for (; groups; groups = groups->next)
840 prune_group_by_reuse (groups);
843 /* Returns true if we should issue prefetch for REF. */
845 static bool
846 should_issue_prefetch_p (struct mem_ref *ref)
848 /* For now, do not issue prefetches that would be useful only for the
849 first few iterations. */
850 if (ref->prefetch_before != PREFETCH_ALL)
851 return false;
853 /* Do not prefetch nontemporal stores. */
854 if (ref->storent_p)
855 return false;
857 return true;
860 /* Decide which of the prefetch candidates in GROUPS to prefetch.
861 AHEAD is the number of iterations to prefetch ahead (which corresponds
862 to the number of simultaneous instances of one prefetch running at a
863 time). UNROLL_FACTOR is the factor by which the loop is going to be
864 unrolled. Returns true if there is anything to prefetch. */
866 static bool
867 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
868 unsigned ahead)
870 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
871 unsigned slots_per_prefetch;
872 struct mem_ref *ref;
873 bool any = false;
875 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
876 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
878 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
879 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
880 it will need a prefetch slot. */
881 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
882 if (dump_file && (dump_flags & TDF_DETAILS))
883 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
884 slots_per_prefetch);
886 /* For now we just take memory references one by one and issue
887 prefetches for as many as possible. The groups are sorted
888 starting with the largest step, since the references with
889 large step are more likely to cause many cache misses. */
891 for (; groups; groups = groups->next)
892 for (ref = groups->refs; ref; ref = ref->next)
894 if (!should_issue_prefetch_p (ref))
895 continue;
897 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
898 and we unroll the loop UNROLL_FACTOR times, we need to insert
899 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
900 iteration. */
901 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
902 / ref->prefetch_mod);
903 prefetch_slots = n_prefetches * slots_per_prefetch;
905 /* If more than half of the prefetches would be lost anyway, do not
906 issue the prefetch. */
907 if (2 * remaining_prefetch_slots < prefetch_slots)
908 continue;
910 ref->issue_prefetch_p = true;
912 if (remaining_prefetch_slots <= prefetch_slots)
913 return true;
914 remaining_prefetch_slots -= prefetch_slots;
915 any = true;
918 return any;
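/* Illustrative slot arithmetic with assumed values (not part of the
   pass): with AHEAD = 8, UNROLL_FACTOR = 4 and PREFETCH_MOD = 2, each
   prefetch occupies (8 + 4 / 2) / 4 = 2 slots, 4 / 2 = 2 prefetch
   instructions are inserted per unrolled iteration, and the reference
   therefore consumes 4 of the available prefetch slots.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned ahead = 8, unroll_factor = 4, prefetch_mod = 2;
  unsigned simultaneous_prefetches = 6;	/* assumed target limit */

  unsigned slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  unsigned n_prefetches = (unroll_factor + prefetch_mod - 1) / prefetch_mod;
  unsigned prefetch_slots = n_prefetches * slots_per_prefetch;

  /* As in schedule_prefetches, the reference would be scheduled only if
     at most twice the remaining slots are needed.  */
  printf ("slots per prefetch %u, prefetches %u, total slots %u of %u\n",
	  slots_per_prefetch, n_prefetches, prefetch_slots,
	  simultaneous_prefetches);
  return 0;
}
#endif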
921 /* Estimate the number of prefetches in the given GROUPS. */
923 static int
924 estimate_prefetch_count (struct mem_ref_group *groups)
926 struct mem_ref *ref;
927 int prefetch_count = 0;
929 for (; groups; groups = groups->next)
930 for (ref = groups->refs; ref; ref = ref->next)
931 if (should_issue_prefetch_p (ref))
932 prefetch_count++;
934 return prefetch_count;
937 /* Issue prefetches for the reference REF into loop as decided before.
938 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
939 is the factor by which LOOP was unrolled. */
941 static void
942 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
944 HOST_WIDE_INT delta;
945 tree addr, addr_base, write_p, local;
946 gimple prefetch;
947 gimple_stmt_iterator bsi;
948 unsigned n_prefetches, ap;
949 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
951 if (dump_file && (dump_flags & TDF_DETAILS))
952 fprintf (dump_file, "Issued%s prefetch for %p.\n",
953 nontemporal ? " nontemporal" : "",
954 (void *) ref);
956 bsi = gsi_for_stmt (ref->stmt);
958 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
959 / ref->prefetch_mod);
960 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
961 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
962 true, NULL, true, GSI_SAME_STMT);
963 write_p = ref->write_p ? integer_one_node : integer_zero_node;
964 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
966 for (ap = 0; ap < n_prefetches; ap++)
968 /* Determine the address to prefetch. */
969 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
970 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
971 addr_base, size_int (delta));
972 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
973 true, GSI_SAME_STMT);
975 /* Create the prefetch instruction. */
976 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
977 3, addr, write_p, local);
978 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
982 /* Issue prefetches for the references in GROUPS into loop as decided before.
983 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
984 factor by which LOOP was unrolled. */
986 static void
987 issue_prefetches (struct mem_ref_group *groups,
988 unsigned unroll_factor, unsigned ahead)
990 struct mem_ref *ref;
992 for (; groups; groups = groups->next)
993 for (ref = groups->refs; ref; ref = ref->next)
994 if (ref->issue_prefetch_p)
995 issue_prefetch_ref (ref, unroll_factor, ahead);
998 /* Returns true if REF is a memory write for which a nontemporal store insn
999 can be used. */
1001 static bool
1002 nontemporal_store_p (struct mem_ref *ref)
1004 enum machine_mode mode;
1005 enum insn_code code;
1007 /* REF must be a write that is not reused. We require it to be independent
1008 of all other memory references in the loop, as the nontemporal stores may
1009 be reordered with respect to other memory references. */
1010 if (!ref->write_p
1011 || !ref->independent_p
1012 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1013 return false;
1015 /* Check that we have the storent instruction for the mode. */
1016 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1017 if (mode == BLKmode)
1018 return false;
1020 code = optab_handler (storent_optab, mode)->insn_code;
1021 return code != CODE_FOR_nothing;
1024 /* If REF is a nontemporal store, we mark the corresponding modify statement
1025 and return true. Otherwise, we return false. */
1027 static bool
1028 mark_nontemporal_store (struct mem_ref *ref)
1030 if (!nontemporal_store_p (ref))
1031 return false;
1033 if (dump_file && (dump_flags & TDF_DETAILS))
1034 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1035 (void *) ref);
1037 gimple_assign_set_nontemporal_move (ref->stmt, true);
1038 ref->storent_p = true;
1040 return true;
1043 /* Issue a memory fence instruction after LOOP. */
1045 static void
1046 emit_mfence_after_loop (struct loop *loop)
1048 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1049 edge exit;
1050 gimple call;
1051 gimple_stmt_iterator bsi;
1052 unsigned i;
1054 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1056 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1058 if (!single_pred_p (exit->dest)
1059 /* If possible, we prefer not to insert the fence on other paths
1060 in cfg. */
1061 && !(exit->flags & EDGE_ABNORMAL))
1062 split_loop_exit_edge (exit);
1063 bsi = gsi_after_labels (exit->dest);
1065 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1066 mark_virtual_ops_for_renaming (call);
1069 VEC_free (edge, heap, exits);
1070 update_ssa (TODO_update_ssa_only_virtuals);
1073 /* Returns true if we can use storent in loop, false otherwise. */
1075 static bool
1076 may_use_storent_in_loop_p (struct loop *loop)
1078 bool ret = true;
1080 if (loop->inner != NULL)
1081 return false;
1083 /* If we must issue an mfence insn after using storent, check that there
1084 is a suitable place for it at each of the loop exits. */
1085 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1087 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1088 unsigned i;
1089 edge exit;
1091 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1092 if ((exit->flags & EDGE_ABNORMAL)
1093 && exit->dest == EXIT_BLOCK_PTR)
1094 ret = false;
1096 VEC_free (edge, heap, exits);
1099 return ret;
1102 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1103 references in the loop. */
1105 static void
1106 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1108 struct mem_ref *ref;
1109 bool any = false;
1111 if (!may_use_storent_in_loop_p (loop))
1112 return;
1114 for (; groups; groups = groups->next)
1115 for (ref = groups->refs; ref; ref = ref->next)
1116 any |= mark_nontemporal_store (ref);
1118 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1119 emit_mfence_after_loop (loop);
1122 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1123 this is the case, fill in DESC with the description of the number of
1124 iterations. */
1126 static bool
1127 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1128 unsigned factor)
1130 if (!can_unroll_loop_p (loop, factor, desc))
1131 return false;
1133 /* We only consider loops without control flow for unrolling. This is not
1134 a hard restriction -- tree_unroll_loop works with arbitrary loops
1135 as well; but the unrolling/prefetching is usually more profitable for
1136 loops consisting of a single basic block, and we want to limit the
1137 code growth. */
1138 if (loop->num_nodes > 2)
1139 return false;
1141 return true;
1144 /* Determine the factor by which to unroll LOOP, from the information
1145 contained in the list of memory references REFS. The description of the
1146 number of iterations of LOOP is stored to DESC. NINSNS is the number of
1147 insns of the LOOP. EST_NITER is the estimated number of iterations of
1148 the loop, or -1 if no estimate is available. */
1150 static unsigned
1151 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1152 unsigned ninsns, struct tree_niter_desc *desc,
1153 HOST_WIDE_INT est_niter)
1155 unsigned upper_bound;
1156 unsigned nfactor, factor, mod_constraint;
1157 struct mem_ref_group *agp;
1158 struct mem_ref *ref;
1160 /* First check whether the loop is not too large to unroll. We ignore
1161 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1162 from unrolling them enough to make exactly one cache line covered by each
1163 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1164 us from unrolling the loops too many times in cases where we only expect
1165 gains from better scheduling and decreasing loop overhead, which is not
1166 the case here. */
1167 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1169 /* If we unrolled the loop more times than it iterates, the unrolled version
1170 of the loop would never be entered. */
1171 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1172 upper_bound = est_niter;
1174 if (upper_bound <= 1)
1175 return 1;
1177 /* Choose the factor so that we may prefetch each cache line just once,
1178 but bound the unrolling by UPPER_BOUND. */
1179 factor = 1;
1180 for (agp = refs; agp; agp = agp->next)
1181 for (ref = agp->refs; ref; ref = ref->next)
1182 if (should_issue_prefetch_p (ref))
1184 mod_constraint = ref->prefetch_mod;
1185 nfactor = least_common_multiple (mod_constraint, factor);
1186 if (nfactor <= upper_bound)
1187 factor = nfactor;
1190 if (!should_unroll_loop_p (loop, desc, factor))
1191 return 1;
1193 return factor;
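/* A hypothetical example of the factor selection above: with references
   whose PREFETCH_MOD values are 4 and 6, the factor grows as
   lcm (1, 4) = 4 and then lcm (4, 6) = 12, provided 12 does not exceed
   the bound derived from PARAM_MAX_UNROLLED_INSNS and the estimated
   trip count.  */
#if 0
#include <stdio.h>

static unsigned
gcd (unsigned a, unsigned b)
{
  while (b)
    {
      unsigned t = a % b;
      a = b;
      b = t;
    }
  return a;
}

int
main (void)
{
  unsigned mods[] = { 4, 6 };	/* assumed PREFETCH_MOD values */
  unsigned upper_bound = 16;	/* assumed size/trip-count bound */
  unsigned factor = 1;

  for (unsigned i = 0; i < sizeof (mods) / sizeof (mods[0]); i++)
    {
      /* Least common multiple of the current factor and PREFETCH_MOD.  */
      unsigned nfactor = mods[i] / gcd (mods[i], factor) * factor;
      if (nfactor <= upper_bound)
	factor = nfactor;
    }
  printf ("unroll factor %u\n", factor);	/* prints 12 */
  return 0;
}
#endif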
1196 /* Returns the total volume of the memory references REFS, taking into account
1197 reuses in the innermost loop and cache line size. TODO -- we should also
1198 take into account reuses across the iterations of the loops in the loop
1199 nest. */
1201 static unsigned
1202 volume_of_references (struct mem_ref_group *refs)
1204 unsigned volume = 0;
1205 struct mem_ref_group *gr;
1206 struct mem_ref *ref;
1208 for (gr = refs; gr; gr = gr->next)
1209 for (ref = gr->refs; ref; ref = ref->next)
1211 /* Almost always reuses another value? */
1212 if (ref->prefetch_before != PREFETCH_ALL)
1213 continue;
1215 /* If several iterations access the same cache line, use the size of
1216 the line divided by this number. Otherwise, a cache line is
1217 accessed in each iteration. TODO -- in the latter case, we should
1218 take the size of the reference into account, rounding it up to a cache
1219 line size multiple. */
1220 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1222 return volume;
1225 /* Returns the volume of memory references accessed across VEC iterations of
1226 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1227 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1229 static unsigned
1230 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1232 unsigned i;
1234 for (i = 0; i < n; i++)
1235 if (vec[i] != 0)
1236 break;
1238 if (i == n)
1239 return 0;
1241 gcc_assert (vec[i] > 0);
1243 /* We ignore the parts of the distance vector in subloops, since usually
1244 the numbers of iterations are much smaller. */
1245 return loop_sizes[i] * vec[i];
1248 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1249 at the position corresponding to the loop of the step. N is the depth
1250 of the considered loop nest, and LOOP is its innermost loop. */
1252 static void
1253 add_subscript_strides (tree access_fn, unsigned stride,
1254 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1256 struct loop *aloop;
1257 tree step;
1258 HOST_WIDE_INT astep;
1259 unsigned min_depth = loop_depth (loop) - n;
1261 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1263 aloop = get_chrec_loop (access_fn);
1264 step = CHREC_RIGHT (access_fn);
1265 access_fn = CHREC_LEFT (access_fn);
1267 if ((unsigned) loop_depth (aloop) <= min_depth)
1268 continue;
1270 if (host_integerp (step, 0))
1271 astep = tree_low_cst (step, 0);
1272 else
1273 astep = L1_CACHE_LINE_SIZE;
1275 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1280 /* Returns the volume of memory references accessed between two consecutive
1281 self-reuses of the reference DR. We consider the subscripts of DR in N
1282 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1283 loops. LOOP is the innermost loop of the current loop nest. */
1285 static unsigned
1286 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1287 struct loop *loop)
1289 tree stride, access_fn;
1290 HOST_WIDE_INT *strides, astride;
1291 VEC (tree, heap) *access_fns;
1292 tree ref = DR_REF (dr);
1293 unsigned i, ret = ~0u;
1295 /* In the following example:
1297 for (i = 0; i < N; i++)
1298 for (j = 0; j < N; j++)
1299 use (a[j][i]);
1300 the same cache line is accessed each N steps (except if the change from
1301 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1302 we cannot rely purely on the results of the data dependence analysis.
1304 Instead, we compute the stride of the reference in each loop, and consider
1305 the innermost loop in which the stride is less than the cache size. */
1307 strides = XCNEWVEC (HOST_WIDE_INT, n);
1308 access_fns = DR_ACCESS_FNS (dr);
1310 for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
1312 /* Keep track of the reference corresponding to the subscript, so that we
1313 know its stride. */
1314 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1315 ref = TREE_OPERAND (ref, 0);
1317 if (TREE_CODE (ref) == ARRAY_REF)
1319 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1320 if (host_integerp (stride, 1))
1321 astride = tree_low_cst (stride, 1);
1322 else
1323 astride = L1_CACHE_LINE_SIZE;
1325 ref = TREE_OPERAND (ref, 0);
1327 else
1328 astride = 1;
1330 add_subscript_strides (access_fn, astride, strides, n, loop);
1333 for (i = n; i-- > 0; )
1335 unsigned HOST_WIDE_INT s;
1337 s = strides[i] < 0 ? -strides[i] : strides[i];
1339 if (s < (unsigned) L1_CACHE_LINE_SIZE
1340 && (loop_sizes[i]
1341 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1343 ret = loop_sizes[i];
1344 break;
1348 free (strides);
1349 return ret;
1352 /* Determines the distance till the first reuse of each reference in REFS
1353 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1354 memory references in the loop. */
1356 static void
1357 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1358 bool no_other_refs)
1360 struct loop *nest, *aloop;
1361 VEC (data_reference_p, heap) *datarefs = NULL;
1362 VEC (ddr_p, heap) *dependences = NULL;
1363 struct mem_ref_group *gr;
1364 struct mem_ref *ref, *refb;
1365 VEC (loop_p, heap) *vloops = NULL;
1366 unsigned *loop_data_size;
1367 unsigned i, j, n;
1368 unsigned volume, dist, adist;
1369 HOST_WIDE_INT vol;
1370 data_reference_p dr;
1371 ddr_p dep;
1373 if (loop->inner)
1374 return;
1376 /* Find the outermost loop of the loop nest of LOOP (we require that
1377 there are no sibling loops inside the nest). */
1378 nest = loop;
1379 while (1)
1381 aloop = loop_outer (nest);
1383 if (aloop == current_loops->tree_root
1384 || aloop->inner->next)
1385 break;
1387 nest = aloop;
1390 /* For each loop, determine the amount of data accessed in each iteration.
1391 We use this to estimate whether the reference is evicted from the
1392 cache before its reuse. */
1393 find_loop_nest (nest, &vloops);
1394 n = VEC_length (loop_p, vloops);
1395 loop_data_size = XNEWVEC (unsigned, n);
1396 volume = volume_of_references (refs);
1397 i = n;
1398 while (i-- != 0)
1400 loop_data_size[i] = volume;
1401 /* Bound the volume by the L2 cache size, since above this bound,
1402 all dependence distances are equivalent. */
1403 if (volume > L2_CACHE_SIZE_BYTES)
1404 continue;
1406 aloop = VEC_index (loop_p, vloops, i);
1407 vol = estimated_loop_iterations_int (aloop, false);
1408 if (vol < 0)
1409 vol = expected_loop_iterations (aloop);
1410 volume *= vol;
1413 /* Prepare the references in the form suitable for data dependence
1414 analysis. We ignore unanalyzable data references (the results
1415 are used just as a heuristic to estimate the temporality of the
1416 references, hence we do not need to worry about correctness). */
1417 for (gr = refs; gr; gr = gr->next)
1418 for (ref = gr->refs; ref; ref = ref->next)
1420 dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
1422 if (dr)
1424 ref->reuse_distance = volume;
1425 dr->aux = ref;
1426 VEC_safe_push (data_reference_p, heap, datarefs, dr);
1428 else
1429 no_other_refs = false;
1432 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
1434 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1435 ref = (struct mem_ref *) dr->aux;
1436 if (ref->reuse_distance > dist)
1437 ref->reuse_distance = dist;
1439 if (no_other_refs)
1440 ref->independent_p = true;
1443 compute_all_dependences (datarefs, &dependences, vloops, true);
1445 for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
1447 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1448 continue;
1450 ref = (struct mem_ref *) DDR_A (dep)->aux;
1451 refb = (struct mem_ref *) DDR_B (dep)->aux;
1453 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1454 || DDR_NUM_DIST_VECTS (dep) == 0)
1456 /* If the dependence cannot be analyzed, assume that there might be
1457 a reuse. */
1458 dist = 0;
1460 ref->independent_p = false;
1461 refb->independent_p = false;
1463 else
1465 /* The distance vectors are normalized to be always lexicographically
1466 positive, hence we cannot tell just from them whether DDR_A comes
1467 before DDR_B or vice versa. However, it is not important,
1468 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1469 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1470 in cache (and marking it as nontemporal would not affect
1471 anything). */
1473 dist = volume;
1474 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1476 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1477 loop_data_size, n);
1479 /* If this is a dependence in the innermost loop (i.e., the
1480 distances in all superloops are zero) and it is not
1481 the trivial self-dependence with distance zero, record that
1482 the references are not completely independent. */
1483 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1484 && (ref != refb
1485 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1487 ref->independent_p = false;
1488 refb->independent_p = false;
1491 /* Ignore accesses closer than
1492 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1493 so that we use nontemporal prefetches e.g. if a single memory
1494 location is accessed several times in a single iteration of
1495 the loop. */
1496 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1497 continue;
1499 if (adist < dist)
1500 dist = adist;
1504 if (ref->reuse_distance > dist)
1505 ref->reuse_distance = dist;
1506 if (refb->reuse_distance > dist)
1507 refb->reuse_distance = dist;
1510 free_dependence_relations (dependences);
1511 free_data_refs (datarefs);
1512 free (loop_data_size);
1514 if (dump_file && (dump_flags & TDF_DETAILS))
1516 fprintf (dump_file, "Reuse distances:\n");
1517 for (gr = refs; gr; gr = gr->next)
1518 for (ref = gr->refs; ref; ref = ref->next)
1519 fprintf (dump_file, " ref %p distance %u\n",
1520 (void *) ref, ref->reuse_distance);
1524 /* Do a cost-benefit analysis to determine if prefetching is profitable
1525 for the current loop given the following parameters:
1526 AHEAD: the iteration ahead distance,
1527 EST_NITER: the estimated trip count,
1528 NINSNS: estimated number of instructions in the loop,
1529 PREFETCH_COUNT: an estimate of the number of prefetches,
1530 MEM_REF_COUNT: total number of memory references in the loop. */
1532 static bool
1533 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
1534 unsigned ninsns, unsigned prefetch_count,
1535 unsigned mem_ref_count)
1537 int insn_to_mem_ratio, insn_to_prefetch_ratio;
1539 if (mem_ref_count == 0)
1540 return false;
1542 /* Prefetching improves performance by overlapping cache missing
1543 memory accesses with CPU operations. If the loop does not have
1544 enough CPU operations to overlap with memory operations, prefetching
1545 won't give a significant benefit. One approximate way of checking
1546 this is to require the ratio of instructions to memory references to
1547 be above a certain limit. This approximation works well in practice.
1548 TODO: Implement a more precise computation by estimating the time
1549 for each CPU or memory op in the loop. Time estimates for memory ops
1550 should account for cache misses. */
1551 insn_to_mem_ratio = ninsns / mem_ref_count;
1553 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1554 return false;
1556 /* Profitability of prefetching is highly dependent on the trip count.
1557 For a given AHEAD distance, the first AHEAD iterations do not benefit
1558 from prefetching, and the last AHEAD iterations execute useless
1559 prefetches. So, if the trip count is not large enough relative to AHEAD,
1560 prefetching may cause serious performance degradation. To avoid this
1561 problem when the trip count is not known at compile time, we
1562 conservatively skip loops with high prefetching costs. For now, only
1563 the I-cache cost is considered. The relative I-cache cost is estimated
1564 by taking the ratio between the number of prefetches and the total
1565 number of instructions. Since we are using integer arithmetic, we
1566 compute the reciprocal of this ratio.
1567 TODO: Account for loop unrolling, which may reduce the costs of
1568 shorter stride prefetches. Note that not accounting for loop
1569 unrolling over-estimates the cost and hence gives more conservative
1570 results. */
1571 if (est_niter < 0)
1573 insn_to_prefetch_ratio = ninsns / prefetch_count;
1574 return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
1577 if (est_niter <= (HOST_WIDE_INT) ahead)
1579 if (dump_file && (dump_flags & TDF_DETAILS))
1580 fprintf (dump_file,
1581 "Not prefetching -- loop estimated to roll only %d times\n",
1582 (int) est_niter);
1583 return false;
1585 return true;
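/* Assumed numbers for the two heuristics above (purely illustrative):
   a loop with 40 insns, 8 memory references and 4 prefetches has an
   insn-to-mem ratio of 5 and, if the trip count is unknown, an
   insn-to-prefetch ratio of 10; prefetching is kept only if these are
   at or above PREFETCH_MIN_INSN_TO_MEM_RATIO and
   MIN_INSN_TO_PREFETCH_RATIO respectively.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned ninsns = 40, mem_ref_count = 8, prefetch_count = 4;
  int min_insn_to_mem_ratio = 3;	/* assumed parameter values */
  int min_insn_to_prefetch_ratio = 9;

  int insn_to_mem_ratio = ninsns / mem_ref_count;
  int insn_to_prefetch_ratio = ninsns / prefetch_count;

  printf ("insn/mem %d (min %d), insn/prefetch %d (min %d)\n",
	  insn_to_mem_ratio, min_insn_to_mem_ratio,
	  insn_to_prefetch_ratio, min_insn_to_prefetch_ratio);
  return 0;
}
#endif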
1589 /* Issue prefetch instructions for array references in LOOP. Returns
1590 true if the LOOP was unrolled. */
1592 static bool
1593 loop_prefetch_arrays (struct loop *loop)
1595 struct mem_ref_group *refs;
1596 unsigned ahead, ninsns, time, unroll_factor;
1597 HOST_WIDE_INT est_niter;
1598 struct tree_niter_desc desc;
1599 bool unrolled = false, no_other_refs;
1600 unsigned prefetch_count;
1601 unsigned mem_ref_count;
1603 if (optimize_loop_nest_for_size_p (loop))
1605 if (dump_file && (dump_flags & TDF_DETAILS))
1606 fprintf (dump_file, " ignored (cold area)\n");
1607 return false;
1610 /* Step 1: gather the memory references. */
1611 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1613 /* Step 2: estimate the reuse effects. */
1614 prune_by_reuse (refs);
1616 prefetch_count = estimate_prefetch_count (refs);
1617 if (prefetch_count == 0)
1618 goto fail;
1620 determine_loop_nest_reuse (loop, refs, no_other_refs);
1622 /* Step 3: determine the ahead and unroll factor. */
1624 /* FIXME: the time should be weighted by the probabilities of the blocks in
1625 the loop body. */
1626 time = tree_num_loop_insns (loop, &eni_time_weights);
1627 ahead = (PREFETCH_LATENCY + time - 1) / time;
1628 est_niter = estimated_loop_iterations_int (loop, false);
1630 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1631 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1632 est_niter);
1633 if (dump_file && (dump_flags & TDF_DETAILS))
1634 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1635 HOST_WIDE_INT_PRINT_DEC "\n"
1636 "insn count %d, mem ref count %d, prefetch count %d\n",
1637 ahead, unroll_factor, est_niter,
1638 ninsns, mem_ref_count, prefetch_count);
1640 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns,
1641 prefetch_count, mem_ref_count))
1642 goto fail;
1644 mark_nontemporal_stores (loop, refs);
1646 /* Step 4: what to prefetch? */
1647 if (!schedule_prefetches (refs, unroll_factor, ahead))
1648 goto fail;
1650 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1651 iterations so that we do not issue superfluous prefetches. */
1652 if (unroll_factor != 1)
1654 tree_unroll_loop (loop, unroll_factor,
1655 single_dom_exit (loop), &desc);
1656 unrolled = true;
1659 /* Step 6: issue the prefetches. */
1660 issue_prefetches (refs, unroll_factor, ahead);
1662 fail:
1663 release_mem_refs (refs);
1664 return unrolled;
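/* A hypothetical instance of the AHEAD computation in step 3 above:
   with PREFETCH_LATENCY = 200 and an estimated cost of 30 per loop
   iteration, data must be requested ceil (200 / 30) = 7 iterations
   before its use.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned prefetch_latency = 200;	/* assumed target latency */
  unsigned time_per_iter = 30;		/* assumed loop body cost */
  unsigned ahead = (prefetch_latency + time_per_iter - 1) / time_per_iter;
  printf ("prefetch ahead distance: %u iterations\n", ahead);	/* 7 */
  return 0;
}
#endif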
1667 /* Issue prefetch instructions for array references in loops. */
1669 unsigned int
1670 tree_ssa_prefetch_arrays (void)
1672 loop_iterator li;
1673 struct loop *loop;
1674 bool unrolled = false;
1675 int todo_flags = 0;
1677 if (!HAVE_prefetch
1678 /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1679 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1680 of processor costs and i486 does not have prefetch, but
1681 -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1682 || PREFETCH_BLOCK == 0)
1683 return 0;
1685 if (dump_file && (dump_flags & TDF_DETAILS))
1687 fprintf (dump_file, "Prefetching parameters:\n");
1688 fprintf (dump_file, " simultaneous prefetches: %d\n",
1689 SIMULTANEOUS_PREFETCHES);
1690 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1691 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1692 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1693 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1694 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1695 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1696 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1697 MIN_INSN_TO_PREFETCH_RATIO);
1698 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1699 PREFETCH_MIN_INSN_TO_MEM_RATIO);
1700 fprintf (dump_file, "\n");
1703 initialize_original_copy_tables ();
1705 if (!built_in_decls[BUILT_IN_PREFETCH])
1707 tree type = build_function_type (void_type_node,
1708 tree_cons (NULL_TREE,
1709 const_ptr_type_node,
1710 NULL_TREE));
1711 tree decl = add_builtin_function ("__builtin_prefetch", type,
1712 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1713 NULL, NULL_TREE);
1714 DECL_IS_NOVOPS (decl) = true;
1715 built_in_decls[BUILT_IN_PREFETCH] = decl;
1718 /* We assume that the size of a cache line is a power of two, so verify
1719 this here. */
1720 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1722 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1724 if (dump_file && (dump_flags & TDF_DETAILS))
1725 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1727 unrolled |= loop_prefetch_arrays (loop);
1729 if (dump_file && (dump_flags & TDF_DETAILS))
1730 fprintf (dump_file, "\n\n");
1733 if (unrolled)
1735 scev_reset ();
1736 todo_flags |= TODO_cleanup_cfg;
1739 free_original_copy_tables ();
1740 return todo_flags;