1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40 /* Need to include rtl.h, expr.h, etc. for optabs. */
41 #include "expr.h"
42 #include "optabs.h"
44 /* Return true if load- or store-lanes optab OPTAB is implemented for
45 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
47 static bool
48 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
49 tree vectype, unsigned HOST_WIDE_INT count)
51 enum machine_mode mode, array_mode;
52 bool limit_p;
54 mode = TYPE_MODE (vectype);
55 limit_p = !targetm.array_mode_supported_p (mode, count);
56 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
57 MODE_INT, limit_p);
59 if (array_mode == BLKmode)
61 if (dump_enabled_p ())
62 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
63 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
64 GET_MODE_NAME (mode), count);
65 return false;
68 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "cannot use %s<%s><%s>", name,
73 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
74 return false;
77 if (dump_enabled_p ())
78 dump_printf_loc (MSG_NOTE, vect_location,
79 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
80 GET_MODE_NAME (mode));
82 return true;
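/* As an illustrative example (not part of the original source): with
   VECTYPE V4SI the vector mode is 128 bits (16 bytes), so for COUNT == 3
   the code above asks mode_for_size for an integer mode of
   3 * 128 = 384 bits.  If the target provides no such mode, the result
   is BLKmode and the load/store-lanes optab is reported as unusable for
   that pairing.  */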
86 /* Return the smallest scalar part of STMT.
87 This is used to determine the vectype of the stmt. We generally set the
88 vectype according to the type of the result (lhs). For stmts whose
89 result-type is different than the type of the arguments (e.g., demotion,
90 promotion), vectype will be reset appropriately (later). Note that we have
91 to visit the smallest datatype in this function, because that determines the
92 VF. If the smallest datatype in the loop is present only as the rhs of a
93 promotion operation - we'd miss it.
94 Such a case, where a variable of this datatype does not appear in the lhs
95 anywhere in the loop, can only occur if it's an invariant: e.g.:
96 'int_x = (int) short_inv', which we'd expect to have been optimized away by
97 invariant motion. However, we cannot rely on invariant motion to always
98 take invariants out of the loop, and so in the case of promotion we also
99 have to check the rhs.
100 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
101 types. */
103 tree
104 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
105 HOST_WIDE_INT *rhs_size_unit)
107 tree scalar_type = gimple_expr_type (stmt);
108 HOST_WIDE_INT lhs, rhs;
110 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
112 if (is_gimple_assign (stmt)
113 && (gimple_assign_cast_p (stmt)
114 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
115 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
116 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
118 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
120 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
121 if (rhs < lhs)
122 scalar_type = rhs_type;
125 *lhs_size_unit = lhs;
126 *rhs_size_unit = rhs;
127 return scalar_type;
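/* Illustrative example: for a promotion statement such as
   int_x = (int) short_y, TYPE_SIZE_UNIT gives lhs == 4 and rhs == 2,
   so the type of short_y is returned.  With 16-byte vectors this yields
   a vectorization factor of 8 (16 / 2) rather than the 4 that the int
   result type alone would suggest.  */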
131 /* Check whether the data references pointed to by DR_I and DR_J are the
132 same or belong to the same interleaving group. Return FALSE if the drs
133 are different, otherwise return TRUE. */
135 static bool
136 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
138 gimple stmt_i = DR_STMT (dr_i);
139 gimple stmt_j = DR_STMT (dr_j);
141 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
142 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
143 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
144 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
145 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
146 return true;
147 else
148 return false;
151 /* If address ranges represented by DDR_I and DDR_J are equal,
152 return TRUE, otherwise return FALSE. */
154 static bool
155 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
157 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
158 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
159 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
160 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
161 return true;
162 else
163 return false;
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
170 static bool
171 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
176 return false;
178 if (dump_enabled_p ())
180 dump_printf_loc (MSG_NOTE, vect_location,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
183 dump_printf (MSG_NOTE, " and ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
187 if (optimize_loop_nest_for_size_p (loop))
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
191 "versioning not supported when optimizing for size.");
192 return false;
195 /* FORNOW: We don't support versioning with outer-loop vectorization. */
196 if (loop->inner)
198 if (dump_enabled_p ())
199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
200 "versioning not yet supported for outer-loops.");
201 return false;
204 /* FORNOW: We don't support creating runtime alias tests for non-constant
205 step. */
206 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
207 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
209 if (dump_enabled_p ())
210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
211 "versioning not yet supported for non-constant "
212 "step");
213 return false;
216 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
217 return true;
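/* Conceptually, the run-time test later emitted for such a ddr checks
   that the address ranges touched by the two references over the
   vectorized iterations do not overlap, roughly

     addr_a + segment_len_a <= addr_b || addr_b + segment_len_b <= addr_a

   and the scalar version of the loop is executed when the check fails.
   (Sketch only; the exact expression is built elsewhere when the
   versioning condition is generated.)  */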
221 /* Function vect_analyze_data_ref_dependence.
223 Return TRUE if there (might) exist a dependence between a memory-reference
224 DRA and a memory-reference DRB. When the dependence can instead be
225 checked at run time by versioning for alias, return FALSE. Adjust *MAX_VF
226 according to the data dependence. */
228 static bool
229 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
230 loop_vec_info loop_vinfo, int *max_vf)
232 unsigned int i;
233 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
234 struct data_reference *dra = DDR_A (ddr);
235 struct data_reference *drb = DDR_B (ddr);
236 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
237 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
238 lambda_vector dist_v;
239 unsigned int loop_depth;
241 /* In loop analysis all data references should be vectorizable. */
242 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
243 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
244 gcc_unreachable ();
246 /* Independent data accesses. */
247 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
248 return false;
250 if (dra == drb
251 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
252 return false;
254 /* Unknown data dependence. */
255 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
257 /* If user asserted safelen consecutive iterations can be
258 executed concurrently, assume independence. */
259 if (loop->safelen >= 2)
261 if (loop->safelen < *max_vf)
262 *max_vf = loop->safelen;
263 return false;
266 if (STMT_VINFO_GATHER_P (stmtinfo_a)
267 || STMT_VINFO_GATHER_P (stmtinfo_b))
269 if (dump_enabled_p ())
271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
272 "versioning for alias not supported for: "
273 "can't determine dependence between ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 DR_REF (dra));
276 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 DR_REF (drb));
280 return true;
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
286 "versioning for alias required: "
287 "can't determine dependence between ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (dra));
290 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (drb));
295 /* Add to list of ddrs that need to be tested at run-time. */
296 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
299 /* Known data dependence. */
300 if (DDR_NUM_DIST_VECTS (ddr) == 0)
302 /* If user asserted safelen consecutive iterations can be
303 executed concurrently, assume independence. */
304 if (loop->safelen >= 2)
306 if (loop->safelen < *max_vf)
307 *max_vf = loop->safelen;
308 return false;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a)
312 || STMT_VINFO_GATHER_P (stmtinfo_b))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
344 int dist = dist_v[loop_depth];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.", dist);
350 if (dist == 0)
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
364 .. = a[i];
365 .. = a[i+1];
366 a[i] = ..;
367 a[i+1] = ..;
368 *p = ..;
369 .. = a[i];
370 .. = a[i+1];
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing. */
374 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 && GROUP_SAME_DR_STMT (stmtinfo_a))
376 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
377 && GROUP_SAME_DR_STMT (stmtinfo_b)))
379 gimple earlier_stmt;
380 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
381 if (DR_IS_WRITE
382 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
384 if (dump_enabled_p ())
385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
386 "READ_WRITE dependence in interleaving.");
387 return true;
391 continue;
394 if (dist > 0 && DDR_REVERSED_P (ddr))
396 /* If DDR_REVERSED_P the order of the data-refs in DDR was
397 reversed (to make distance vector positive), and the actual
398 distance is negative. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
401 "dependence distance negative.");
402 continue;
405 if (abs (dist) >= 2
406 && abs (dist) < *max_vf)
408 /* The dependence distance requires reduction of the maximal
409 vectorization factor. */
410 *max_vf = abs (dist);
411 if (dump_enabled_p ())
412 dump_printf_loc (MSG_NOTE, vect_location,
413 "adjusting maximal vectorization factor to %i",
414 *max_vf);
417 if (abs (dist) >= *max_vf)
419 /* Dependence distance does not create dependence, as far as
420 vectorization is concerned, in this case. */
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_NOTE, vect_location,
423 "dependence distance >= VF.");
424 continue;
427 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
430 "not vectorized, possible dependence "
431 "between data-refs ");
432 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
433 dump_printf (MSG_NOTE, " and ");
434 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
437 return true;
440 return false;
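/* A minimal sketch of the distance handling above, assuming a
   non-negative distance and ignoring the reversed-relation and
   interleaving special cases (the helper name is hypothetical, not part
   of GCC):

     static bool
     dist_blocks_vectorization (int dist, int *max_vf)
     {
       if (dist == 0)
         return false;
       if (dist >= 2 && dist < *max_vf)
         *max_vf = dist;
       if (dist >= *max_vf)
         return false;
       return true;
     }

   For example, a write to a[i] paired with a read of a[i-3] gives
   dist == 3; a candidate *max_vf of 8 is reduced to 3, after which the
   dependence no longer prevents vectorization.  */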
443 /* Function vect_analyze_data_ref_dependences.
445 Examine all the data references in the loop, and make sure there do not
446 exist any data dependences between them. Set *MAX_VF according to
447 the maximum vectorization factor the data dependences allow. */
449 bool
450 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
452 unsigned int i;
453 struct data_dependence_relation *ddr;
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE, vect_location,
457 "=== vect_analyze_data_ref_dependences ===");
459 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
460 &LOOP_VINFO_DDRS (loop_vinfo),
461 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
462 return false;
464 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
465 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
466 return false;
468 return true;
472 /* Function vect_slp_analyze_data_ref_dependence.
474 Return TRUE if there (might) exist a dependence between a memory-reference
475 DRA and a memory-reference DRB that would prevent vectorization of
476 the basic block, otherwise return FALSE. */
479 static bool
480 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
482 struct data_reference *dra = DDR_A (ddr);
483 struct data_reference *drb = DDR_B (ddr);
485 /* We need to check dependences of statements marked as unvectorizable
486 as well, they still can prohibit vectorization. */
488 /* Independent data accesses. */
489 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
490 return false;
492 if (dra == drb)
493 return false;
495 /* Read-read is OK. */
496 if (DR_IS_READ (dra) && DR_IS_READ (drb))
497 return false;
499 /* If dra and drb are part of the same interleaving chain consider
500 them independent. */
501 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
502 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
503 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
504 return false;
506 /* Unknown data dependence. */
507 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
509 gimple earlier_stmt;
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
514 "can't determine dependence between ");
515 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
516 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
517 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
520 /* We do not vectorize basic blocks with write-write dependencies. */
521 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
522 return true;
524 /* Check that it's not a load-after-store dependence. */
525 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
526 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
527 return true;
529 return false;
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "determined dependence between ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
537 dump_printf (MSG_NOTE, " and ");
538 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
541 /* Do not vectorize basic blocks with write-write dependences. */
542 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
543 return true;
545 /* Check dependence between DRA and DRB for basic block vectorization.
546 If the accesses share same bases and offsets, we can compare their initial
547 constant offsets to decide whether they differ or not. In case of a read-
548 write dependence we check that the load is before the store to ensure that
549 vectorization will not change the order of the accesses. */
551 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
552 gimple earlier_stmt;
554 /* Check that the data-refs have same bases and offsets. If not, we can't
555 determine if they are dependent. */
556 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
557 || !dr_equal_offsets_p (dra, drb))
558 return true;
560 /* Check the types. */
561 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
562 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
564 if (type_size_a != type_size_b
565 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
566 TREE_TYPE (DR_REF (drb))))
567 return true;
569 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
570 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
572 /* Two different locations - no dependence. */
573 if (init_a != init_b)
574 return false;
576 /* We have a read-write dependence. Check that the load is before the store.
577 When we vectorize basic blocks, vector load can be only before
578 corresponding scalar load, and vector store can be only after its
580 corresponding scalar store. So the order of the accesses is preserved in
580 case the load is before the store. */
581 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
582 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
583 return false;
585 return true;
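/* Worked example: in a basic block containing  x = p[4];  followed by
   p[4] = y;  both refs share base and offset and have equal DR_INIT,
   so they name the same location.  The earlier statement is the load,
   so the read-before-write order is preserved by BB vectorization and
   the dependence is not a problem.  If the store came first, TRUE is
   returned and the block is not vectorized.  */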
589 /* Function vect_slp_analyze_data_ref_dependences.
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them that would prevent
593 vectorization of the basic block. */
595 bool
596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
598 struct data_dependence_relation *ddr;
599 unsigned int i;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "=== vect_slp_analyze_data_ref_dependences ===");
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
606 &BB_VINFO_DDRS (bb_vinfo),
607 vNULL, true))
608 return false;
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
611 if (vect_slp_analyze_data_ref_dependence (ddr))
612 return false;
614 return true;
618 /* Function vect_compute_data_ref_alignment
620 Compute the misalignment of the data reference DR.
622 Output:
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
630 static bool
631 vect_compute_data_ref_alignment (struct data_reference *dr)
633 gimple stmt = DR_STMT (dr);
634 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
636 struct loop *loop = NULL;
637 tree ref = DR_REF (dr);
638 tree vectype;
639 tree base, base_addr;
640 bool base_aligned;
641 tree misalign;
642 tree aligned_to, alignment;
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE, vect_location,
646 "vect_compute_data_ref_alignment:");
648 if (loop_vinfo)
649 loop = LOOP_VINFO_LOOP (loop_vinfo);
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr, -1);
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
657 return true;
659 misalign = DR_INIT (dr);
660 aligned_to = DR_ALIGNED_TO (dr);
661 base_addr = DR_BASE_ADDRESS (dr);
662 vectype = STMT_VINFO_VECTYPE (stmt_info);
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
666 relative to the outer-loop (LOOP). This is ok only if the misalignment
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop is a
669 multiple of the vector size. */
670 if (loop && nested_in_vect_loop_p (loop, stmt))
672 tree step = DR_STEP (dr);
673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
675 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_NOTE, vect_location,
679 "inner step divides the vector-size.");
680 misalign = STMT_VINFO_DR_INIT (stmt_info);
681 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
682 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
684 else
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "inner step doesn't divide the vector-size.");
689 misalign = NULL_TREE;
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref is a multiple
697 of the vector size. */
698 if (!loop)
700 tree step = DR_STEP (dr);
701 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
703 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "SLP: step doesn't divide the vector-size.");
708 misalign = NULL_TREE;
712 base = build_fold_indirect_ref (base_addr);
713 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
715 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
716 || !misalign)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
724 return true;
727 if ((DECL_P (base)
728 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
729 alignment) >= 0)
730 || (TREE_CODE (base_addr) == SSA_NAME
731 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
732 TREE_TYPE (base_addr)))),
733 alignment) >= 0)
734 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
735 base_aligned = true;
736 else
737 base_aligned = false;
739 if (!base_aligned)
741 /* Do not change the alignment of global variables here if
742 flag_section_anchors is enabled as we already generated
743 RTL for other functions. Most global variables should
744 have been aligned during the IPA increase_alignment pass. */
745 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
746 || (TREE_STATIC (base) && flag_section_anchors))
748 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE, vect_location,
751 "can't force alignment of ref: ");
752 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
754 return true;
757 /* Force the alignment of the decl.
758 NOTE: This is the only change to the code we make during
759 the analysis phase, before deciding to vectorize the loop. */
760 if (dump_enabled_p ())
762 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
763 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
766 ((dataref_aux *)dr->aux)->base_decl = base;
767 ((dataref_aux *)dr->aux)->base_misaligned = true;
770 /* If this is a backward running DR then the first access in the larger
771 vectype actually is N-1 elements before the address in the DR.
772 Adjust misalign accordingly. */
773 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
775 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
776 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
777 otherwise we wouldn't be here. */
778 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
779 /* PLUS because DR_STEP was negative. */
780 misalign = size_binop (PLUS_EXPR, misalign, offset);
783 /* Modulo alignment. */
784 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
786 if (!host_integerp (misalign, 1))
788 /* Negative or overflowed misalignment value. */
789 if (dump_enabled_p ())
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "unexpected misalign value");
792 return false;
795 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
797 if (dump_enabled_p ())
799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
800 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
801 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
804 return true;
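/* Illustrative numbers: for a V4SI access the alignment above is
   16 bytes.  If the base is known to be 16-byte aligned and
   DR_INIT (dr) is 20, the recorded misalignment is 20 mod 16 = 4.
   For a backward-running access with step -4 and DR_INIT 0, the
   adjustment adds 3 * -4 = -12 first, and the floor modulo then
   records a misalignment of 4 as well.  */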
808 /* Function vect_compute_data_refs_alignment
810 Compute the misalignment of data references in the loop.
811 Return FALSE if a data reference is found that cannot be vectorized. */
813 static bool
814 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
815 bb_vec_info bb_vinfo)
817 vec<data_reference_p> datarefs;
818 struct data_reference *dr;
819 unsigned int i;
821 if (loop_vinfo)
822 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
823 else
824 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
826 FOR_EACH_VEC_ELT (datarefs, i, dr)
827 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
828 && !vect_compute_data_ref_alignment (dr))
830 if (bb_vinfo)
832 /* Mark unsupported statement as unvectorizable. */
833 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
834 continue;
836 else
837 return false;
840 return true;
844 /* Function vect_update_misalignment_for_peel
846 DR - the data reference whose misalignment is to be adjusted.
847 DR_PEEL - the data reference whose misalignment is being made
848 zero in the vector loop by the peel.
849 NPEEL - the number of iterations in the peel loop if the misalignment
850 of DR_PEEL is known at compile time. */
852 static void
853 vect_update_misalignment_for_peel (struct data_reference *dr,
854 struct data_reference *dr_peel, int npeel)
856 unsigned int i;
857 vec<dr_p> same_align_drs;
858 struct data_reference *current_dr;
859 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
860 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
861 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
862 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
864 /* For interleaved data accesses the step in the loop must be multiplied by
865 the size of the interleaving group. */
866 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
867 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
868 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
869 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
871 /* It can be assumed that the data refs with the same alignment as dr_peel
872 are aligned in the vector loop. */
873 same_align_drs
874 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
875 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
877 if (current_dr != dr)
878 continue;
879 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
880 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
881 SET_DR_MISALIGNMENT (dr, 0);
882 return;
885 if (known_alignment_for_access_p (dr)
886 && known_alignment_for_access_p (dr_peel))
888 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
889 int misal = DR_MISALIGNMENT (dr);
890 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
891 misal += negative ? -npeel * dr_size : npeel * dr_size;
892 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
893 SET_DR_MISALIGNMENT (dr, misal);
894 return;
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
899 SET_DR_MISALIGNMENT (dr, -1);
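/* Illustrative numbers: for a forward access with a 4-byte element, a
   16-byte vectype, a known misalignment of 4 and npeel == 3, the update
   above computes (4 + 3 * 4) & 15 == 0, i.e. the access becomes aligned
   after the peel.  For grouped accesses dr_size is first multiplied by
   the group size, so the same formula accounts for whole-group steps.  */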
903 /* Function vect_verify_datarefs_alignment
905 Return TRUE if all data references in the loop can be
906 handled with respect to alignment. */
908 bool
909 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
911 vec<data_reference_p> datarefs;
912 struct data_reference *dr;
913 enum dr_alignment_support supportable_dr_alignment;
914 unsigned int i;
916 if (loop_vinfo)
917 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
918 else
919 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
921 FOR_EACH_VEC_ELT (datarefs, i, dr)
923 gimple stmt = DR_STMT (dr);
924 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
926 if (!STMT_VINFO_RELEVANT_P (stmt_info))
927 continue;
929 /* For interleaving, only the alignment of the first access matters.
930 Skip statements marked as not vectorizable. */
931 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
932 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
933 || !STMT_VINFO_VECTORIZABLE (stmt_info))
934 continue;
936 /* Strided loads perform only component accesses, alignment is
937 irrelevant for them. */
938 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
939 continue;
941 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
942 if (!supportable_dr_alignment)
944 if (dump_enabled_p ())
946 if (DR_IS_READ (dr))
947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
948 "not vectorized: unsupported unaligned load.");
949 else
950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 "not vectorized: unsupported unaligned "
952 "store.");
954 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
955 DR_REF (dr));
957 return false;
959 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE, vect_location,
961 "Vectorizing an unaligned access.");
963 return true;
966 /* Given a memory reference EXP, return whether its alignment is less
967 than its size. */
969 static bool
970 not_size_aligned (tree exp)
972 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
973 return true;
975 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
976 > get_object_alignment (exp));
979 /* Function vector_alignment_reachable_p
981 Return true if vector alignment for DR is reachable by peeling
982 a few loop iterations. Return false otherwise. */
984 static bool
985 vector_alignment_reachable_p (struct data_reference *dr)
987 gimple stmt = DR_STMT (dr);
988 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
989 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
991 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
993 /* For interleaved accesses we peel only if the number of iterations in
994 the prologue loop (VF - misalignment) is a multiple of the
995 number of interleaved accesses.
996 int elem_size, mis_in_elements;
997 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
999 /* FORNOW: handle only known alignment. */
1000 if (!known_alignment_for_access_p (dr))
1001 return false;
1003 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1004 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1006 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1007 return false;
1010 /* If the misalignment is known at compile time then allow peeling
1011 only if natural alignment is reachable through peeling. */
1012 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1014 HOST_WIDE_INT elmsize =
1015 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1016 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_NOTE, vect_location,
1019 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1020 dump_printf (MSG_NOTE,
1021 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1023 if (DR_MISALIGNMENT (dr) % elmsize)
1025 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1027 "data size does not divide the misalignment.\n");
1028 return false;
1032 if (!known_alignment_for_access_p (dr))
1034 tree type = TREE_TYPE (DR_REF (dr));
1035 bool is_packed = not_size_aligned (DR_REF (dr));
1036 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "Unknown misalignment, is_packed = %d",is_packed);
1039 if ((TYPE_USER_ALIGN (type) && !is_packed)
1040 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1041 return true;
1042 else
1043 return false;
1046 return true;
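/* Illustrative numbers: with a 4-element vectype and GROUP_SIZE == 2,
   a known misalignment of one element would need 4 - 1 = 3 prologue
   iterations; 3 is not a multiple of 2, so the check above rejects
   peeling for such a grouped access.  */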
1050 /* Calculate the cost of the memory access represented by DR. */
1052 static void
1053 vect_get_data_access_cost (struct data_reference *dr,
1054 unsigned int *inside_cost,
1055 unsigned int *outside_cost,
1056 stmt_vector_for_cost *body_cost_vec)
1058 gimple stmt = DR_STMT (dr);
1059 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1060 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1061 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1062 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1063 int ncopies = vf / nunits;
1065 if (DR_IS_READ (dr))
1066 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1067 NULL, body_cost_vec, false);
1068 else
1069 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1071 if (dump_enabled_p ())
1072 dump_printf_loc (MSG_NOTE, vect_location,
1073 "vect_get_data_access_cost: inside_cost = %d, "
1074 "outside_cost = %d.", *inside_cost, *outside_cost);
1078 /* Insert DR into peeling hash table with NPEEL as key. */
1080 static void
1081 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1082 int npeel)
1084 struct _vect_peel_info elem, *slot;
1085 _vect_peel_info **new_slot;
1086 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1088 elem.npeel = npeel;
1089 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1090 if (slot)
1091 slot->count++;
1092 else
1094 slot = XNEW (struct _vect_peel_info);
1095 slot->npeel = npeel;
1096 slot->dr = dr;
1097 slot->count = 1;
1098 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1099 *new_slot = slot;
1102 if (!supportable_dr_alignment && !flag_vect_cost_model)
1103 slot->count += VECT_MAX_COST;
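/* The VECT_MAX_COST boost above appears intended to make the
   most-frequent selection below all but guarantee that a peeling amount
   which aligns an access with no unaligned support is chosen, since
   aligning such an access is the only way to vectorize the loop when
   the cost model is disabled.  */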
1107 /* Traverse the peeling hash table to find the peeling option that aligns
1108 the maximum number of data accesses. */
1110 static int
1111 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1112 _vect_peel_extended_info *max)
1114 vect_peel_info elem = *slot;
1116 if (elem->count > max->peel_info.count
1117 || (elem->count == max->peel_info.count
1118 && max->peel_info.npeel > elem->npeel))
1120 max->peel_info.npeel = elem->npeel;
1121 max->peel_info.count = elem->count;
1122 max->peel_info.dr = elem->dr;
1125 return 1;
1129 /* Traverse the peeling hash table and calculate the cost of each peeling
1130 option. Find the one with the lowest cost. */
1132 static int
1133 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1134 _vect_peel_extended_info *min)
1136 vect_peel_info elem = *slot;
1137 int save_misalignment, dummy;
1138 unsigned int inside_cost = 0, outside_cost = 0, i;
1139 gimple stmt = DR_STMT (elem->dr);
1140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1141 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1142 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1143 struct data_reference *dr;
1144 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1145 int single_iter_cost;
1147 prologue_cost_vec.create (2);
1148 body_cost_vec.create (2);
1149 epilogue_cost_vec.create (2);
1151 FOR_EACH_VEC_ELT (datarefs, i, dr)
1153 stmt = DR_STMT (dr);
1154 stmt_info = vinfo_for_stmt (stmt);
1155 /* For interleaving, only the alignment of the first access
1156 matters. */
1157 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1158 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1159 continue;
1161 save_misalignment = DR_MISALIGNMENT (dr);
1162 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1163 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1164 &body_cost_vec);
1165 SET_DR_MISALIGNMENT (dr, save_misalignment);
1168 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1169 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1170 &dummy, single_iter_cost,
1171 &prologue_cost_vec,
1172 &epilogue_cost_vec);
1174 /* Prologue and epilogue costs are added to the target model later.
1175 These costs depend only on the scalar iteration cost, the
1176 number of peeling iterations finally chosen, and the number of
1177 misaligned statements. So discard the information found here. */
1178 prologue_cost_vec.release ();
1179 epilogue_cost_vec.release ();
1181 if (inside_cost < min->inside_cost
1182 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1184 min->inside_cost = inside_cost;
1185 min->outside_cost = outside_cost;
1186 min->body_cost_vec.release ();
1187 min->body_cost_vec = body_cost_vec;
1188 min->peel_info.dr = elem->dr;
1189 min->peel_info.npeel = elem->npeel;
1191 else
1192 body_cost_vec.release ();
1194 return 1;
1198 /* Choose best peeling option by traversing peeling hash table and either
1199 choosing an option with the lowest cost (if cost model is enabled) or the
1200 option that aligns as many accesses as possible. */
1202 static struct data_reference *
1203 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1204 unsigned int *npeel,
1205 stmt_vector_for_cost *body_cost_vec)
1207 struct _vect_peel_extended_info res;
1209 res.peel_info.dr = NULL;
1210 res.body_cost_vec = stmt_vector_for_cost();
1212 if (flag_vect_cost_model)
1214 res.inside_cost = INT_MAX;
1215 res.outside_cost = INT_MAX;
1216 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1217 .traverse <_vect_peel_extended_info *,
1218 vect_peeling_hash_get_lowest_cost> (&res);
1220 else
1222 res.peel_info.count = 0;
1223 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1224 .traverse <_vect_peel_extended_info *,
1225 vect_peeling_hash_get_most_frequent> (&res);
1228 *npeel = res.peel_info.npeel;
1229 *body_cost_vec = res.body_cost_vec;
1230 return res.peel_info.dr;
1234 /* Function vect_enhance_data_refs_alignment
1236 This pass will use loop versioning and loop peeling in order to enhance
1237 the alignment of data references in the loop.
1239 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1240 original loop is to be vectorized. Any other loops that are created by
1241 the transformations performed in this pass - are not supposed to be
1242 vectorized. This restriction will be relaxed.
1244 This pass will require a cost model to guide it whether to apply peeling
1245 or versioning or a combination of the two. For example, the scheme that
1246 Intel uses when given a loop with several memory accesses is as follows:
1247 choose one memory access ('p') whose alignment you want to force by doing
1248 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1249 other accesses are not necessarily aligned, or (2) use loop versioning to
1250 generate one loop in which all accesses are aligned, and another loop in
1251 which only 'p' is necessarily aligned.
1253 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1254 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1255 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1257 Devising a cost model is the most critical aspect of this work. It will
1258 guide us on which access to peel for, whether to use loop versioning, how
1259 many versions to create, etc. The cost model will probably consist of
1260 generic considerations as well as target specific considerations (on
1261 powerpc for example, misaligned stores are more painful than misaligned
1262 loads).
1264 Here are the general steps involved in alignment enhancements:
1266 -- original loop, before alignment analysis:
1267 for (i=0; i<N; i++){
1268 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1269 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1272 -- After vect_compute_data_refs_alignment:
1273 for (i=0; i<N; i++){
1274 x = q[i]; # DR_MISALIGNMENT(q) = 3
1275 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1278 -- Possibility 1: we do loop versioning:
1279 if (p is aligned) {
1280 for (i=0; i<N; i++){ # loop 1A
1281 x = q[i]; # DR_MISALIGNMENT(q) = 3
1282 p[i] = y; # DR_MISALIGNMENT(p) = 0
1285 else {
1286 for (i=0; i<N; i++){ # loop 1B
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1292 -- Possibility 2: we do loop peeling:
1293 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1294 x = q[i];
1295 p[i] = y;
1297 for (i = 3; i < N; i++){ # loop 2A
1298 x = q[i]; # DR_MISALIGNMENT(q) = 0
1299 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1302 -- Possibility 3: combination of loop peeling and versioning:
1303 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1304 x = q[i];
1305 p[i] = y;
1307 if (p is aligned) {
1308 for (i = 3; i<N; i++){ # loop 3A
1309 x = q[i]; # DR_MISALIGNMENT(q) = 0
1310 p[i] = y; # DR_MISALIGNMENT(p) = 0
1313 else {
1314 for (i = 3; i<N; i++){ # loop 3B
1315 x = q[i]; # DR_MISALIGNMENT(q) = 0
1316 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1320 These loops are later passed to loop_transform to be vectorized. The
1321 vectorizer will use the alignment information to guide the transformation
1322 (whether to generate regular loads/stores, or with special handling for
1323 misalignment). */
1325 bool
1326 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1328 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1329 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1330 enum dr_alignment_support supportable_dr_alignment;
1331 struct data_reference *dr0 = NULL, *first_store = NULL;
1332 struct data_reference *dr;
1333 unsigned int i, j;
1334 bool do_peeling = false;
1335 bool do_versioning = false;
1336 bool stat;
1337 gimple stmt;
1338 stmt_vec_info stmt_info;
1339 unsigned int npeel = 0;
1340 bool all_misalignments_unknown = true;
1341 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1342 unsigned possible_npeel_number = 1;
1343 tree vectype;
1344 unsigned int nelements, mis, same_align_drs_max = 0;
1345 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_NOTE, vect_location,
1349 "=== vect_enhance_data_refs_alignment ===");
1351 /* While cost model enhancements are expected in the future, the high level
1352 view of the code at this time is as follows:
1354 A) If there is a misaligned access then see if peeling to align
1355 this access can make all data references satisfy
1356 vect_supportable_dr_alignment. If so, update data structures
1357 as needed and return true.
1359 B) If peeling wasn't possible and there is a data reference with an
1360 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1361 then see if loop versioning checks can be used to make all data
1362 references satisfy vect_supportable_dr_alignment. If so, update
1363 data structures as needed and return true.
1365 C) If neither peeling nor versioning were successful then return false if
1366 any data reference does not satisfy vect_supportable_dr_alignment.
1368 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1370 Note, Possibility 3 above (which is peeling and versioning together) is not
1371 being done at this time. */
1373 /* (1) Peeling to force alignment. */
1375 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1376 Considerations:
1377 + How many accesses will become aligned due to the peeling
1378 - How many accesses will become unaligned due to the peeling,
1379 and the cost of misaligned accesses.
1380 - The cost of peeling (the extra runtime checks, the increase
1381 in code size). */
1383 FOR_EACH_VEC_ELT (datarefs, i, dr)
1385 stmt = DR_STMT (dr);
1386 stmt_info = vinfo_for_stmt (stmt);
1388 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1389 continue;
1391 /* For interleaving, only the alignment of the first access
1392 matters. */
1393 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1394 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1395 continue;
1397 /* For invariant accesses there is nothing to enhance. */
1398 if (integer_zerop (DR_STEP (dr)))
1399 continue;
1401 /* Strided loads perform only component accesses, alignment is
1402 irrelevant for them. */
1403 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1404 continue;
1406 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1407 do_peeling = vector_alignment_reachable_p (dr);
1408 if (do_peeling)
1410 if (known_alignment_for_access_p (dr))
1412 unsigned int npeel_tmp;
1413 bool negative = tree_int_cst_compare (DR_STEP (dr),
1414 size_zero_node) < 0;
1416 /* Save info about DR in the hash table. */
1417 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1418 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1420 vectype = STMT_VINFO_VECTYPE (stmt_info);
1421 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1422 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1423 TREE_TYPE (DR_REF (dr))));
1424 npeel_tmp = (negative
1425 ? (mis - nelements) : (nelements - mis))
1426 & (nelements - 1);
1428 /* For multiple types, it is possible that the bigger type access
1429 will have more than one peeling option. E.g., a loop with two
1430 types: one of size (vector size / 4), and the other one of
1431 size (vector size / 8). The vectorization factor will be 8. If both
1432 accesses are misaligned by 3, the first one needs one scalar
1433 iteration to be aligned, and the second one needs 5. But the
1434 first one will also be aligned by peeling 5 scalar
1435 iterations, and in that case both accesses will be aligned.
1436 Hence, except for the immediate peeling amount, we also want
1437 to try to add full vector size, while we don't exceed
1438 vectorization factor.
1439 We do this automatically for the cost model, since we calculate the cost
1440 for every peeling option. */
1441 if (!flag_vect_cost_model)
1442 possible_npeel_number = vf / nelements;
1444 /* Handle the aligned case. We may decide to align some other
1445 access, making DR unaligned. */
1446 if (DR_MISALIGNMENT (dr) == 0)
1448 npeel_tmp = 0;
1449 if (!flag_vect_cost_model)
1450 possible_npeel_number++;
1453 for (j = 0; j < possible_npeel_number; j++)
1455 gcc_assert (npeel_tmp <= vf);
1456 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1457 npeel_tmp += nelements;
1460 all_misalignments_unknown = false;
1461 /* Data-ref that was chosen for the case that all the
1462 misalignments are unknown is not relevant anymore, since we
1463 have a data-ref with known alignment. */
1464 dr0 = NULL;
1466 else
1468 /* If we don't know any misalignment values, we prefer
1469 peeling for the data-ref that has the maximum number of data-refs
1470 with the same alignment, unless the target prefers to align
1471 stores over loads. */
1472 if (all_misalignments_unknown)
1474 unsigned same_align_drs
1475 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1476 if (!dr0
1477 || same_align_drs_max < same_align_drs)
1479 same_align_drs_max = same_align_drs;
1480 dr0 = dr;
1482 /* For data-refs with the same number of related
1483 accesses prefer the one where the misalign
1484 computation will be invariant in the outermost loop. */
1485 else if (same_align_drs_max == same_align_drs)
1487 struct loop *ivloop0, *ivloop;
1488 ivloop0 = outermost_invariant_loop_for_expr
1489 (loop, DR_BASE_ADDRESS (dr0));
1490 ivloop = outermost_invariant_loop_for_expr
1491 (loop, DR_BASE_ADDRESS (dr));
1492 if ((ivloop && !ivloop0)
1493 || (ivloop && ivloop0
1494 && flow_loop_nested_p (ivloop, ivloop0)))
1495 dr0 = dr;
1498 if (!first_store && DR_IS_WRITE (dr))
1499 first_store = dr;
1502 /* If there are both known and unknown misaligned accesses in the
1503 loop, we choose peeling amount according to the known
1504 accesses. */
1505 if (!supportable_dr_alignment)
1507 dr0 = dr;
1508 if (!first_store && DR_IS_WRITE (dr))
1509 first_store = dr;
1513 else
1515 if (!aligned_access_p (dr))
1517 if (dump_enabled_p ())
1518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519 "vector alignment may not be reachable");
1520 break;
1525 /* Check if we can possibly peel the loop. */
1526 if (!vect_can_advance_ivs_p (loop_vinfo)
1527 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1528 do_peeling = false;
1530 if (do_peeling && all_misalignments_unknown
1531 && vect_supportable_dr_alignment (dr0, false))
1534 /* Check if the target requires us to prefer stores over loads, i.e., if
1535 misaligned stores are more expensive than misaligned loads (taking
1536 drs with same alignment into account). */
1537 if (first_store && DR_IS_READ (dr0))
1539 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1540 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1541 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1542 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1543 stmt_vector_for_cost dummy;
1544 dummy.create (2);
1546 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1547 &dummy);
1548 vect_get_data_access_cost (first_store, &store_inside_cost,
1549 &store_outside_cost, &dummy);
1551 dummy.release ();
1553 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1554 aligning the load DR0). */
1555 load_inside_penalty = store_inside_cost;
1556 load_outside_penalty = store_outside_cost;
1557 for (i = 0;
1558 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1559 DR_STMT (first_store))).iterate (i, &dr);
1560 i++)
1561 if (DR_IS_READ (dr))
1563 load_inside_penalty += load_inside_cost;
1564 load_outside_penalty += load_outside_cost;
1566 else
1568 load_inside_penalty += store_inside_cost;
1569 load_outside_penalty += store_outside_cost;
1572 /* Calculate the penalty for leaving DR0 unaligned (by
1573 aligning the FIRST_STORE). */
1574 store_inside_penalty = load_inside_cost;
1575 store_outside_penalty = load_outside_cost;
1576 for (i = 0;
1577 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1578 DR_STMT (dr0))).iterate (i, &dr);
1579 i++)
1580 if (DR_IS_READ (dr))
1582 store_inside_penalty += load_inside_cost;
1583 store_outside_penalty += load_outside_cost;
1585 else
1587 store_inside_penalty += store_inside_cost;
1588 store_outside_penalty += store_outside_cost;
1591 if (load_inside_penalty > store_inside_penalty
1592 || (load_inside_penalty == store_inside_penalty
1593 && load_outside_penalty > store_outside_penalty))
1594 dr0 = first_store;
1597 /* In case there are only loads with different unknown misalignments, use
1598 peeling only if it may help to align other accesses in the loop. */
1599 if (!first_store
1600 && !STMT_VINFO_SAME_ALIGN_REFS (
1601 vinfo_for_stmt (DR_STMT (dr0))).length ()
1602 && vect_supportable_dr_alignment (dr0, false)
1603 != dr_unaligned_supported)
1604 do_peeling = false;
1607 if (do_peeling && !dr0)
1609 /* Peeling is possible, but there is no data access that is unsupported
1610 when unaligned. So we try to choose the best possible peeling. */
1612 /* We should get here only if there are drs with known misalignment. */
1613 gcc_assert (!all_misalignments_unknown);
1615 /* Choose the best peeling from the hash table. */
1616 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1617 &body_cost_vec);
1618 if (!dr0 || !npeel)
1619 do_peeling = false;
1622 if (do_peeling)
1624 stmt = DR_STMT (dr0);
1625 stmt_info = vinfo_for_stmt (stmt);
1626 vectype = STMT_VINFO_VECTYPE (stmt_info);
1627 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1629 if (known_alignment_for_access_p (dr0))
1631 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1632 size_zero_node) < 0;
1633 if (!npeel)
1635 /* Since it's known at compile time, compute the number of
1636 iterations in the peeled loop (the peeling factor) for use in
1637 updating DR_MISALIGNMENT values. The peeling factor is the
1638 vectorization factor minus the misalignment as an element
1639 count. */
1640 mis = DR_MISALIGNMENT (dr0);
1641 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1642 npeel = ((negative ? mis - nelements : nelements - mis)
1643 & (nelements - 1));
1646 /* For interleaved data access every iteration accesses all the
1647 members of the group, therefore we divide the number of iterations
1648 by the group size. */
1649 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1650 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1651 npeel /= GROUP_SIZE (stmt_info);
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_NOTE, vect_location,
1655 "Try peeling by %d", npeel);
1658 /* Ensure that all data refs can be vectorized after the peel. */
1659 FOR_EACH_VEC_ELT (datarefs, i, dr)
1661 int save_misalignment;
1663 if (dr == dr0)
1664 continue;
1666 stmt = DR_STMT (dr);
1667 stmt_info = vinfo_for_stmt (stmt);
1668 /* For interleaving, only the alignment of the first access
1669 matters. */
1670 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1671 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1672 continue;
1674 /* Strided loads perform only component accesses, alignment is
1675 irrelevant for them. */
1676 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1677 continue;
1679 save_misalignment = DR_MISALIGNMENT (dr);
1680 vect_update_misalignment_for_peel (dr, dr0, npeel);
1681 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1682 SET_DR_MISALIGNMENT (dr, save_misalignment);
1684 if (!supportable_dr_alignment)
1686 do_peeling = false;
1687 break;
1691 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1693 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1694 if (!stat)
1695 do_peeling = false;
1696 else
1698 body_cost_vec.release ();
1699 return stat;
1703 if (do_peeling)
1705 stmt_info_for_cost *si;
1706 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1708 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1709 If the misalignment of DR_i is identical to that of dr0 then set
1710 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1711 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1712 by the peeling factor times the element size of DR_i (MOD the
1713 vectorization factor times the size). Otherwise, the
1714 misalignment of DR_i must be set to unknown. */
1715 FOR_EACH_VEC_ELT (datarefs, i, dr)
1716 if (dr != dr0)
1717 vect_update_misalignment_for_peel (dr, dr0, npeel);
1719 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1720 if (npeel)
1721 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1722 else
1723 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1724 SET_DR_MISALIGNMENT (dr0, 0);
1725 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_NOTE, vect_location,
1728 "Alignment of access forced using peeling.");
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "Peeling for alignment will be applied.");
1732 /* We've delayed passing the inside-loop peeling costs to the
1733 target cost model until we were sure peeling would happen.
1734 Do so now. */
1735 if (body_cost_vec.exists ())
1737 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1739 struct _stmt_vec_info *stmt_info
1740 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1741 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1742 si->misalign, vect_body);
1744 body_cost_vec.release ();
1747 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1748 gcc_assert (stat);
1749 return stat;
1753 body_cost_vec.release ();
1755 /* (2) Versioning to force alignment. */
1757 /* Try versioning if:
1758 1) flag_tree_vect_loop_version is TRUE
1759 2) optimize loop for speed
1760 3) there is at least one unsupported misaligned data ref with an unknown
1761 misalignment, and
1762 4) all misaligned data refs with a known misalignment are supported, and
1763 5) the number of runtime alignment checks is within reason. */
1765 do_versioning =
1766 flag_tree_vect_loop_version
1767 && optimize_loop_nest_for_speed_p (loop)
1768 && (!loop->inner); /* FORNOW */
1770 if (do_versioning)
1772 FOR_EACH_VEC_ELT (datarefs, i, dr)
1774 stmt = DR_STMT (dr);
1775 stmt_info = vinfo_for_stmt (stmt);
1777 /* For interleaving, only the alignment of the first access
1778 matters. */
1779 if (aligned_access_p (dr)
1780 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1781 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1782 continue;
1784 /* Strided loads perform only component accesses, alignment is
1785 irrelevant for them. */
1786 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1787 continue;
1789 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1791 if (!supportable_dr_alignment)
1793 gimple stmt;
1794 int mask;
1795 tree vectype;
1797 if (known_alignment_for_access_p (dr)
1798 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1799 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1801 do_versioning = false;
1802 break;
1805 stmt = DR_STMT (dr);
1806 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1807 gcc_assert (vectype);
1809 /* The rightmost bits of an aligned address must be zeros.
1810 Construct the mask needed for this test. For example,
1811 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1812 mask must be 15 = 0xf. */
1813 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
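/* A hypothetical example: for V2DF the mode size is likewise 16 bytes,
   so the mask is again 0xf; a runtime test of the form
   (addr & 0xf) == 0 then establishes the 16-byte alignment the vector
   accesses need.  The mask depends only on the vector size in bytes.  */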
1815 /* FORNOW: use the same mask to test all potentially unaligned
1816 references in the loop. The vectorizer currently supports
1817 a single vector size, see the reference to
1818 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1819 vectorization factor is computed. */
1820 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1821 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1822 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1823 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1824 DR_STMT (dr));
1828 /* Versioning requires at least one misaligned data reference. */
1829 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1830 do_versioning = false;
1831 else if (!do_versioning)
1832 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1835 if (do_versioning)
1837 vec<gimple> may_misalign_stmts
1838 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1839 gimple stmt;
1841 /* It can now be assumed that the data references in the statements
1842 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1843 of the loop being vectorized. */
1844 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1846 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1847 dr = STMT_VINFO_DATA_REF (stmt_info);
1848 SET_DR_MISALIGNMENT (dr, 0);
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Alignment of access forced using versioning.");
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE, vect_location,
1856 "Versioning for alignment will be applied.");
1858 /* Peeling and versioning can't be done together at this time. */
1859 gcc_assert (! (do_peeling && do_versioning));
1861 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1862 gcc_assert (stat);
1863 return stat;
1866 /* This point is reached if neither peeling nor versioning is being done. */
1867 gcc_assert (! (do_peeling || do_versioning));
1869 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1870 return stat;
1874 /* Function vect_find_same_alignment_drs.
1876 Update group and alignment relations according to the chosen
1877 vectorization factor. */
1879 static void
1880 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1881 loop_vec_info loop_vinfo)
1883 unsigned int i;
1884 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1885 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1886 struct data_reference *dra = DDR_A (ddr);
1887 struct data_reference *drb = DDR_B (ddr);
1888 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1889 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1890 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1891 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1892 lambda_vector dist_v;
1893 unsigned int loop_depth;
1895 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1896 return;
1898 if (dra == drb)
1899 return;
1901 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1902 return;
1904 /* Loop-based vectorization and known data dependence. */
1905 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1906 return;
1908 /* Data-dependence analysis reports a distance vector of zero
1909 for data-references that overlap only in the first iteration
1910 but have different sign step (see PR45764).
1911 So as a sanity check require equal DR_STEP. */
1912 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1913 return;
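/* A hypothetical example of the test below: with 4-byte ints and a
   vectorization factor of 4, the references a[i] and a[i + 8] have
   dependence distance 8; since 8 % 4 == 0 and both accesses have the
   same size, whatever aligns one of them aligns the other too, so they
   can be recorded as having the same alignment.  */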
1915 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1916 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1918 int dist = dist_v[loop_depth];
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_NOTE, vect_location,
1922 "dependence distance = %d.", dist);
1924 /* Same loop iteration. */
1925 if (dist == 0
1926 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1928 /* Two references with distance zero have the same alignment. */
1929 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1930 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1931 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_NOTE, vect_location,
1934 "accesses have the same alignment.");
1935 dump_printf (MSG_NOTE,
1936 "dependence distance modulo vf == 0 between ");
1937 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1938 dump_printf (MSG_NOTE, " and ");
1939 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1946 /* Function vect_analyze_data_refs_alignment
1948 Analyze the alignment of the data-references in the loop.
1949 Return FALSE if a data reference is found that cannot be vectorized. */
1951 bool
1952 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1953 bb_vec_info bb_vinfo)
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_NOTE, vect_location,
1957 "=== vect_analyze_data_refs_alignment ===");
1959 /* Mark groups of data references with same alignment using
1960 data dependence information. */
1961 if (loop_vinfo)
1963 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1964 struct data_dependence_relation *ddr;
1965 unsigned int i;
1967 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1968 vect_find_same_alignment_drs (ddr, loop_vinfo);
1971 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1975 "not vectorized: can't calculate alignment "
1976 "for data ref.");
1977 return false;
1980 return true;
1984 /* Analyze groups of accesses: check that DR belongs to a group of
1985 accesses of legal size, step, etc. Detect gaps, single element
1986 interleaving, and other special cases. Set grouped access info.
1987 Collect groups of strided stores for further use in SLP analysis. */
1989 static bool
1990 vect_analyze_group_access (struct data_reference *dr)
1992 tree step = DR_STEP (dr);
1993 tree scalar_type = TREE_TYPE (DR_REF (dr));
1994 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1995 gimple stmt = DR_STMT (dr);
1996 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1997 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1998 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1999 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2000 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2001 bool slp_impossible = false;
2002 struct loop *loop = NULL;
2004 if (loop_vinfo)
2005 loop = LOOP_VINFO_LOOP (loop_vinfo);
2007 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2008 size of the interleaving group (including gaps). */
2009 groupsize = absu_hwi (dr_step) / type_size;
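/* A hypothetical example: for the interleaved accesses a[3*i], a[3*i+1]
   and a[3*i+2] to a 4-byte type, DR_STEP is 12 bytes and GROUPSIZE is
   therefore 12 / 4 = 3.  */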
2011 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2012 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2014 /* Check if this DR is a part of interleaving, and is a single
2015 element of the group that is accessed in the loop. */
2017 /* Gaps are supported only for loads. STEP must be a multiple of the type
2018 size. The size of the group must be a power of 2. */
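/* A hypothetical example: a loop that only reads a[2*i], with a 4-byte
   element type, has DR_STEP 8 and GROUPSIZE 8 / 4 = 2 (a power of 2)
   with no other member in the group; it is recorded as single element
   interleaving with a gap, which in turn requires a scalar epilogue
   loop (see below).  */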
2019 if (DR_IS_READ (dr)
2020 && (dr_step % type_size) == 0
2021 && groupsize > 0
2022 && exact_log2 (groupsize) != -1)
2024 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2025 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2026 if (dump_enabled_p ())
2028 dump_printf_loc (MSG_NOTE, vect_location,
2029 "Detected single element interleaving ");
2030 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2031 dump_printf (MSG_NOTE, " step ");
2032 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2035 if (loop_vinfo)
2037 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "Data access with gaps requires scalar "
2040 "epilogue loop");
2041 if (loop->inner)
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2045 "Peeling for outer loop is not"
2046 " supported");
2047 return false;
2050 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2053 return true;
2056 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "not consecutive access ");
2060 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2063 if (bb_vinfo)
2065 /* Mark the statement as unvectorizable. */
2066 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2067 return true;
2070 return false;
2073 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2075 /* First stmt in the interleaving chain. Check the chain. */
2076 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2077 struct data_reference *data_ref = dr;
2078 unsigned int count = 1;
2079 tree prev_init = DR_INIT (data_ref);
2080 gimple prev = stmt;
2081 HOST_WIDE_INT diff, gaps = 0;
2082 unsigned HOST_WIDE_INT count_in_bytes;
2084 while (next)
2086 /* Skip same data-refs. In case that two or more stmts share
2087 data-ref (supported only for loads), we vectorize only the first
2088 stmt, and the rest get their vectorized loads from the first
2089 one. */
2090 if (!tree_int_cst_compare (DR_INIT (data_ref),
2091 DR_INIT (STMT_VINFO_DATA_REF (
2092 vinfo_for_stmt (next)))))
2094 if (DR_IS_WRITE (data_ref))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "Two store stmts share the same dr.");
2099 return false;
2102 /* For load use the same data-ref load. */
2103 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2105 prev = next;
2106 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2107 continue;
2110 prev = next;
2111 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2113 /* All group members have the same STEP by construction. */
2114 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2116 /* Check that the distance between two accesses is equal to the type
2117 size. Otherwise, we have gaps. */
2118 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2119 - TREE_INT_CST_LOW (prev_init)) / type_size;
2120 if (diff != 1)
2122 /* FORNOW: SLP of accesses with gaps is not supported. */
2123 slp_impossible = true;
2124 if (DR_IS_WRITE (data_ref))
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "interleaved store with gaps");
2129 return false;
2132 gaps += diff - 1;
2135 last_accessed_element += diff;
2137 /* Store the gap from the previous member of the group. If there is no
2138 gap in the access, GROUP_GAP is always 1. */
2139 GROUP_GAP (vinfo_for_stmt (next)) = diff;
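/* A hypothetical example: for group members loading a[4*i] and
   a[4*i + 2] (4-byte elements), the DR_INITs differ by 8 bytes, so
   DIFF is 2: GROUP_GAP of the second member becomes 2 and one
   element's worth of gap (DIFF - 1) is accumulated in GAPS.  */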
2141 prev_init = DR_INIT (data_ref);
2142 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2143 /* Count the number of data-refs in the chain. */
2144 count++;
2147 /* COUNT is the number of accesses found; we multiply it by the size of
2148 the type to get COUNT_IN_BYTES. */
2149 count_in_bytes = type_size * count;
2151 /* Check that the size of the interleaving (including gaps) is not
2152 greater than STEP. */
2153 if (dr_step != 0
2154 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2156 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2159 "interleaving size is greater than step for ");
2160 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2162 return false;
2165 /* Check that the size of the interleaving is equal to STEP for stores,
2166 i.e., that there are no gaps. */
2167 if (dr_step != 0
2168 && absu_hwi (dr_step) != count_in_bytes)
2170 if (DR_IS_READ (dr))
2172 slp_impossible = true;
2173 /* There is a gap after the last load in the group. This gap is a
2174 difference between the groupsize and the number of elements.
2175 When there is no gap, this difference should be 0. */
2176 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2178 else
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "interleaved store with gaps");
2183 return false;
2187 /* Check that STEP is a multiple of type size. */
2188 if (dr_step != 0
2189 && (dr_step % type_size) != 0)
2191 if (dump_enabled_p ())
2193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2194 "step is not a multiple of type size: step ");
2195 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2196 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2197 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2198 TYPE_SIZE_UNIT (scalar_type));
2200 return false;
2203 if (groupsize == 0)
2204 groupsize = count;
2206 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2207 if (dump_enabled_p ())
2208 dump_printf_loc (MSG_NOTE, vect_location,
2209 "Detected interleaving of size %d", (int)groupsize);
2211 /* SLP: create an SLP data structure for every interleaving group of
2212 stores for further analysis in vect_analyze_slp. */
2213 if (DR_IS_WRITE (dr) && !slp_impossible)
2215 if (loop_vinfo)
2216 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2217 if (bb_vinfo)
2218 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2221 /* There is a gap in the end of the group. */
2222 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 "Data access with gaps requires scalar "
2227 "epilogue loop");
2228 if (loop->inner)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "Peeling for outer loop is not supported");
2233 return false;
2236 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2240 return true;
2244 /* Analyze the access pattern of the data-reference DR.
2245 In case of non-consecutive accesses call vect_analyze_group_access() to
2246 analyze groups of accesses. */
2248 static bool
2249 vect_analyze_data_ref_access (struct data_reference *dr)
2251 tree step = DR_STEP (dr);
2252 tree scalar_type = TREE_TYPE (DR_REF (dr));
2253 gimple stmt = DR_STMT (dr);
2254 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2255 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2256 struct loop *loop = NULL;
2258 if (loop_vinfo)
2259 loop = LOOP_VINFO_LOOP (loop_vinfo);
2261 if (loop_vinfo && !step)
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 "bad data-ref access in loop");
2266 return false;
2269 /* Allow invariant loads in not nested loops. */
2270 if (loop_vinfo && integer_zerop (step))
2272 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2273 if (nested_in_vect_loop_p (loop, stmt))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_NOTE, vect_location,
2277 "zero step in inner loop of nest");
2278 return false;
2280 return DR_IS_READ (dr);
2283 if (loop && nested_in_vect_loop_p (loop, stmt))
2285 /* Interleaved accesses are not yet supported within outer-loop
2286 vectorization for references in the inner-loop. */
2287 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2289 /* For the rest of the analysis we use the outer-loop step. */
2290 step = STMT_VINFO_DR_STEP (stmt_info);
2291 if (integer_zerop (step))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "zero step in outer loop.");
2296 if (DR_IS_READ (dr))
2297 return true;
2298 else
2299 return false;
2303 /* Consecutive? */
2304 if (TREE_CODE (step) == INTEGER_CST)
2306 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2307 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2308 || (dr_step < 0
2309 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2311 /* Mark that it is not interleaving. */
2312 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2313 return true;
2317 if (loop && nested_in_vect_loop_p (loop, stmt))
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "grouped access in outer loop.");
2322 return false;
2325 /* Assume this is a DR handled by non-constant strided load case. */
2326 if (TREE_CODE (step) != INTEGER_CST)
2327 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2329 /* Not consecutive access - check if it's a part of interleaving group. */
2330 return vect_analyze_group_access (dr);
2335 /* A helper function used in the comparator function to sort data
2336 references. T1 and T2 are two data references to be compared.
2337 The function returns -1, 0, or 1. */
2339 static int
2340 compare_tree (tree t1, tree t2)
2342 int i, cmp;
2343 enum tree_code code;
2344 char tclass;
2346 if (t1 == t2)
2347 return 0;
2348 if (t1 == NULL)
2349 return -1;
2350 if (t2 == NULL)
2351 return 1;
2354 if (TREE_CODE (t1) != TREE_CODE (t2))
2355 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2357 code = TREE_CODE (t1);
2358 switch (code)
2360 /* For const values, we can just use hash values for comparisons. */
2361 case INTEGER_CST:
2362 case REAL_CST:
2363 case FIXED_CST:
2364 case STRING_CST:
2365 case COMPLEX_CST:
2366 case VECTOR_CST:
2368 hashval_t h1 = iterative_hash_expr (t1, 0);
2369 hashval_t h2 = iterative_hash_expr (t2, 0);
2370 if (h1 != h2)
2371 return h1 < h2 ? -1 : 1;
2372 break;
2375 case SSA_NAME:
2376 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2377 if (cmp != 0)
2378 return cmp;
2380 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2381 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2382 break;
2384 default:
2385 tclass = TREE_CODE_CLASS (code);
2387 /* For var-decl, we could compare their UIDs. */
2388 if (tclass == tcc_declaration)
2390 if (DECL_UID (t1) != DECL_UID (t2))
2391 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2392 break;
2395 /* For expressions with operands, compare their operands recursively. */
2396 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2398 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2399 if (cmp != 0)
2400 return cmp;
2404 return 0;
2408 /* Compare two data-references DRA and DRB to group them into chunks
2409 suitable for grouping. */
2411 static int
2412 dr_group_sort_cmp (const void *dra_, const void *drb_)
2414 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2415 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2416 int cmp;
2418 /* Stabilize sort. */
2419 if (dra == drb)
2420 return 0;
2422 /* Ordering of DRs according to base. */
2423 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2425 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2426 if (cmp != 0)
2427 return cmp;
2430 /* And according to DR_OFFSET. */
2431 if (!dr_equal_offsets_p (dra, drb))
2433 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2434 if (cmp != 0)
2435 return cmp;
2438 /* Put reads before writes. */
2439 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2440 return DR_IS_READ (dra) ? -1 : 1;
2442 /* Then sort after access size. */
2443 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2444 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2446 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2447 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2448 if (cmp != 0)
2449 return cmp;
2452 /* And after step. */
2453 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2455 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2456 if (cmp != 0)
2457 return cmp;
2460 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2461 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2462 if (cmp == 0)
2463 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2464 return cmp;
2467 /* Function vect_analyze_data_ref_accesses.
2469 Analyze the access pattern of all the data references in the loop.
2471 FORNOW: the only access pattern that is considered vectorizable is a
2472 simple step 1 (consecutive) access.
2474 FORNOW: handle only arrays and pointer accesses. */
2476 bool
2477 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2479 unsigned int i;
2480 vec<data_reference_p> datarefs;
2481 struct data_reference *dr;
2483 if (dump_enabled_p ())
2484 dump_printf_loc (MSG_NOTE, vect_location,
2485 "=== vect_analyze_data_ref_accesses ===");
2487 if (loop_vinfo)
2488 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2489 else
2490 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2492 if (datarefs.is_empty ())
2493 return true;
2495 /* Sort the array of datarefs to make building the interleaving chains
2496 linear. */
2497 qsort (datarefs.address(), datarefs.length (),
2498 sizeof (data_reference_p), dr_group_sort_cmp);
2500 /* Build the interleaving chains. */
2501 for (i = 0; i < datarefs.length () - 1;)
2503 data_reference_p dra = datarefs[i];
2504 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2505 stmt_vec_info lastinfo = NULL;
2506 for (i = i + 1; i < datarefs.length (); ++i)
2508 data_reference_p drb = datarefs[i];
2509 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2511 /* ??? Imperfect sorting (non-compatible types, non-modulo
2512 accesses, same accesses) can lead to a group to be artificially
2513 split here as we don't just skip over those. If it really
2514 matters we can push those to a worklist and re-iterate
2515 over them. Then we can just skip ahead to the next DR here. */
2517 /* Check that the data-refs have the same first location (except init)
2518 and they are both either store or load (not load and store). */
2519 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2520 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2521 DR_BASE_ADDRESS (drb), 0)
2522 || !dr_equal_offsets_p (dra, drb))
2523 break;
2525 /* Check that the data-refs have the same constant size and step. */
2526 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2527 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2528 if (!host_integerp (sza, 1)
2529 || !host_integerp (szb, 1)
2530 || !tree_int_cst_equal (sza, szb)
2531 || !host_integerp (DR_STEP (dra), 0)
2532 || !host_integerp (DR_STEP (drb), 0)
2533 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2534 break;
2536 /* Do not place the same access in the interleaving chain twice. */
2537 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2538 break;
2540 /* Check the types are compatible.
2541 ??? We don't distinguish this during sorting. */
2542 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2543 TREE_TYPE (DR_REF (drb))))
2544 break;
2546 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2547 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2548 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2549 gcc_assert (init_a < init_b);
2551 /* If init_b == init_a + the size of the type * k, we have an
2552 interleaving, and DRA is accessed before DRB. */
2553 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2554 if ((init_b - init_a) % type_size_a != 0)
2555 break;
2557 /* The step (if not zero) is greater than the difference between
2558 data-refs' inits. This splits groups into suitable sizes. */
2559 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2560 if (step != 0 && step <= (init_b - init_a))
2561 break;
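/* A hypothetical example of the checks above: stores to a[2*i] and
   a[2*i + 1] with 4-byte elements have DR_INITs 0 and 4 and equal step
   8; (4 - 0) % 4 == 0 and 8 > 4, so the second DR is linked into the
   group of the first.  A further access a[2*i + 2] (DR_INIT 8) would
   fail the step check (8 <= 8) and start a new group, which is how
   groups are split into suitable sizes.  */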
2563 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE, vect_location,
2566 "Detected interleaving ");
2567 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2568 dump_printf (MSG_NOTE, " and ");
2569 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2572 /* Link the found element into the group list. */
2573 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2575 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2576 lastinfo = stmtinfo_a;
2578 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2579 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2580 lastinfo = stmtinfo_b;
2584 FOR_EACH_VEC_ELT (datarefs, i, dr)
2585 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2586 && !vect_analyze_data_ref_access (dr))
2588 if (dump_enabled_p ())
2589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2590 "not vectorized: complicated access pattern.");
2592 if (bb_vinfo)
2594 /* Mark the statement as not vectorizable. */
2595 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2596 continue;
2598 else
2599 return false;
2602 return true;
2605 /* Function vect_prune_runtime_alias_test_list.
2607 Prune a list of ddrs to be tested at run-time by versioning for alias.
2608 Return FALSE if the resulting list of ddrs is longer than allowed by
2609 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2611 bool
2612 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2614 vec<ddr_p> ddrs =
2615 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2616 unsigned i, j;
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_NOTE, vect_location,
2620 "=== vect_prune_runtime_alias_test_list ===");
2622 for (i = 0; i < ddrs.length (); )
2624 bool found;
2625 ddr_p ddr_i;
2627 ddr_i = ddrs[i];
2628 found = false;
2630 for (j = 0; j < i; j++)
2632 ddr_p ddr_j = ddrs[j];
2634 if (vect_vfa_range_equal (ddr_i, ddr_j))
2636 if (dump_enabled_p ())
2638 dump_printf_loc (MSG_NOTE, vect_location,
2639 "found equal ranges ");
2640 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2641 dump_printf (MSG_NOTE, ", ");
2642 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2643 dump_printf (MSG_NOTE, " and ");
2644 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2645 dump_printf (MSG_NOTE, ", ");
2646 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2648 found = true;
2649 break;
2653 if (found)
2655 ddrs.ordered_remove (i);
2656 continue;
2658 i++;
2661 if (ddrs.length () >
2662 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2664 if (dump_enabled_p ())
2666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2667 "disable versioning for alias - max number of "
2668 "generated checks exceeded.");
2671 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2673 return false;
2676 return true;
2679 /* Check whether a non-affine read in stmt is suitable for gather load
2680 and if so, return a builtin decl for that operation. */
2682 tree
2683 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2684 tree *offp, int *scalep)
2686 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2687 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2688 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2689 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2690 tree offtype = NULL_TREE;
2691 tree decl, base, off;
2692 enum machine_mode pmode;
2693 int punsignedp, pvolatilep;
2695 /* The gather builtins need an address of the form
2696 loop_invariant + vector * {1, 2, 4, 8}, or
2698 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2699 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2700 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2701 multiplications and additions in it. To get a vector, we need
2702 a single SSA_NAME that will be defined in the loop and will
2703 contain everything that is not loop invariant and that can be
2704 vectorized. The following code attempts to find such a preexisting
2705 SSA_NAME OFF and put the loop invariants into a tree BASE
2706 that can be gimplified before the loop. */
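/* A hypothetical example of the decomposition: for a read a[b[i]] with
   4-byte int elements, BASE ends up as &a (plus any constant offset),
   OFF as the SSA_NAME holding b[i] (defined inside the loop), and
   SCALE as 4, matching the loop_invariant + vector * 4 form that the
   gather builtins expect.  */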
2707 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2708 &pmode, &punsignedp, &pvolatilep, false);
2709 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2711 if (TREE_CODE (base) == MEM_REF)
2713 if (!integer_zerop (TREE_OPERAND (base, 1)))
2715 if (off == NULL_TREE)
2717 double_int moff = mem_ref_offset (base);
2718 off = double_int_to_tree (sizetype, moff);
2720 else
2721 off = size_binop (PLUS_EXPR, off,
2722 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2724 base = TREE_OPERAND (base, 0);
2726 else
2727 base = build_fold_addr_expr (base);
2729 if (off == NULL_TREE)
2730 off = size_zero_node;
2732 /* If base is not loop invariant, then, if off is 0, we start with just
2733 the constant offset in the loop invariant BASE and continue with base
2734 as OFF; otherwise give up.
2735 We could handle that case by gimplifying the addition of base + off
2736 into some SSA_NAME and use that as off, but for now punt. */
2737 if (!expr_invariant_in_loop_p (loop, base))
2739 if (!integer_zerop (off))
2740 return NULL_TREE;
2741 off = base;
2742 base = size_int (pbitpos / BITS_PER_UNIT);
2744 /* Otherwise put base + constant offset into the loop invariant BASE
2745 and continue with OFF. */
2746 else
2748 base = fold_convert (sizetype, base);
2749 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2752 /* OFF at this point may be either a SSA_NAME or some tree expression
2753 from get_inner_reference. Try to peel off loop invariants from it
2754 into BASE as long as possible. */
2755 STRIP_NOPS (off);
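/* A hypothetical example of the peeling below: if OFF is currently
   (sizetype) idx_5 * 4 + 32, with idx_5 defined in the loop, the
   PLUS_EXPR case moves 32 into BASE, the MULT_EXPR case sets SCALE to
   4, and OFF ends up as idx_5, the single in-loop SSA_NAME the gather
   builtin can consume.  */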
2756 while (offtype == NULL_TREE)
2758 enum tree_code code;
2759 tree op0, op1, add = NULL_TREE;
2761 if (TREE_CODE (off) == SSA_NAME)
2763 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2765 if (expr_invariant_in_loop_p (loop, off))
2766 return NULL_TREE;
2768 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2769 break;
2771 op0 = gimple_assign_rhs1 (def_stmt);
2772 code = gimple_assign_rhs_code (def_stmt);
2773 op1 = gimple_assign_rhs2 (def_stmt);
2775 else
2777 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2778 return NULL_TREE;
2779 code = TREE_CODE (off);
2780 extract_ops_from_tree (off, &code, &op0, &op1);
2782 switch (code)
2784 case POINTER_PLUS_EXPR:
2785 case PLUS_EXPR:
2786 if (expr_invariant_in_loop_p (loop, op0))
2788 add = op0;
2789 off = op1;
2790 do_add:
2791 add = fold_convert (sizetype, add);
2792 if (scale != 1)
2793 add = size_binop (MULT_EXPR, add, size_int (scale));
2794 base = size_binop (PLUS_EXPR, base, add);
2795 continue;
2797 if (expr_invariant_in_loop_p (loop, op1))
2799 add = op1;
2800 off = op0;
2801 goto do_add;
2803 break;
2804 case MINUS_EXPR:
2805 if (expr_invariant_in_loop_p (loop, op1))
2807 add = fold_convert (sizetype, op1);
2808 add = size_binop (MINUS_EXPR, size_zero_node, add);
2809 off = op0;
2810 goto do_add;
2812 break;
2813 case MULT_EXPR:
2814 if (scale == 1 && host_integerp (op1, 0))
2816 scale = tree_low_cst (op1, 0);
2817 off = op0;
2818 continue;
2820 break;
2821 case SSA_NAME:
2822 off = op0;
2823 continue;
2824 CASE_CONVERT:
2825 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2826 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2827 break;
2828 if (TYPE_PRECISION (TREE_TYPE (op0))
2829 == TYPE_PRECISION (TREE_TYPE (off)))
2831 off = op0;
2832 continue;
2834 if (TYPE_PRECISION (TREE_TYPE (op0))
2835 < TYPE_PRECISION (TREE_TYPE (off)))
2837 off = op0;
2838 offtype = TREE_TYPE (off);
2839 STRIP_NOPS (off);
2840 continue;
2842 break;
2843 default:
2844 break;
2846 break;
2849 /* If at the end OFF still isn't a SSA_NAME or isn't
2850 defined in the loop, punt. */
2851 if (TREE_CODE (off) != SSA_NAME
2852 || expr_invariant_in_loop_p (loop, off))
2853 return NULL_TREE;
2855 if (offtype == NULL_TREE)
2856 offtype = TREE_TYPE (off);
2858 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2859 offtype, scale);
2860 if (decl == NULL_TREE)
2861 return NULL_TREE;
2863 if (basep)
2864 *basep = base;
2865 if (offp)
2866 *offp = off;
2867 if (scalep)
2868 *scalep = scale;
2869 return decl;
2872 /* Function vect_analyze_data_refs.
2874 Find all the data references in the loop or basic block.
2876 The general structure of the analysis of data refs in the vectorizer is as
2877 follows:
2878 1- vect_analyze_data_refs(loop/bb): call
2879 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2880 in the loop/bb and their dependences.
2881 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2882 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2883 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2887 bool
2888 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2889 bb_vec_info bb_vinfo,
2890 int *min_vf)
2892 struct loop *loop = NULL;
2893 basic_block bb = NULL;
2894 unsigned int i;
2895 vec<data_reference_p> datarefs;
2896 struct data_reference *dr;
2897 tree scalar_type;
2899 if (dump_enabled_p ())
2900 dump_printf_loc (MSG_NOTE, vect_location,
2901 "=== vect_analyze_data_refs ===\n");
2903 if (loop_vinfo)
2905 loop = LOOP_VINFO_LOOP (loop_vinfo);
2906 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2907 || find_data_references_in_loop
2908 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "not vectorized: loop contains function calls"
2913 " or data references that cannot be analyzed");
2914 return false;
2917 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2919 else
2921 gimple_stmt_iterator gsi;
2923 bb = BB_VINFO_BB (bb_vinfo);
2924 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2926 gimple stmt = gsi_stmt (gsi);
2927 if (!find_data_references_in_stmt (NULL, stmt,
2928 &BB_VINFO_DATAREFS (bb_vinfo)))
2930 /* Mark the rest of the basic-block as unvectorizable. */
2931 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2933 stmt = gsi_stmt (gsi);
2934 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2936 break;
2940 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2943 /* Go through the data-refs, check that the analysis succeeded. Update
2944 pointer from stmt_vec_info struct to DR and vectype. */
2946 FOR_EACH_VEC_ELT (datarefs, i, dr)
2948 gimple stmt;
2949 stmt_vec_info stmt_info;
2950 tree base, offset, init;
2951 bool gather = false;
2952 bool simd_lane_access = false;
2953 int vf;
2955 again:
2956 if (!dr || !DR_REF (dr))
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2960 "not vectorized: unhandled data-ref ");
2961 return false;
2964 stmt = DR_STMT (dr);
2965 stmt_info = vinfo_for_stmt (stmt);
2967 /* Discard clobbers from the dataref vector. We will remove
2968 clobber stmts during vectorization. */
2969 if (gimple_clobber_p (stmt))
2971 if (i == datarefs.length () - 1)
2973 datarefs.pop ();
2974 break;
2976 datarefs[i] = datarefs.pop ();
2977 goto again;
2980 /* Check that analysis of the data-ref succeeded. */
2981 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2982 || !DR_STEP (dr))
2984 bool maybe_gather
2985 = DR_IS_READ (dr)
2986 && !TREE_THIS_VOLATILE (DR_REF (dr))
2987 && targetm.vectorize.builtin_gather != NULL;
2988 bool maybe_simd_lane_access
2989 = loop_vinfo && loop->simduid;
2991 /* If target supports vector gather loads, or if this might be
2992 a SIMD lane access, see whether they can be used. */
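/* Roughly, a SIMD lane access is recognized below when the new DR's
   offset has the form lane * elem_size, where lane is (after stripping
   conversions) the result of an IFN_GOMP_SIMD_LANE call on this loop's
   simduid; the DR is then rewritten to offset 0 and step elem_size,
   i.e. each lane accesses its own private element.  */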
2993 if (loop_vinfo
2994 && (maybe_gather || maybe_simd_lane_access)
2995 && !nested_in_vect_loop_p (loop, stmt))
2997 struct data_reference *newdr
2998 = create_data_ref (NULL, loop_containing_stmt (stmt),
2999 DR_REF (dr), stmt, true);
3000 gcc_assert (newdr != NULL && DR_REF (newdr));
3001 if (DR_BASE_ADDRESS (newdr)
3002 && DR_OFFSET (newdr)
3003 && DR_INIT (newdr)
3004 && DR_STEP (newdr)
3005 && integer_zerop (DR_STEP (newdr)))
3007 if (maybe_simd_lane_access)
3009 tree off = DR_OFFSET (newdr);
3010 STRIP_NOPS (off);
3011 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3012 && TREE_CODE (off) == MULT_EXPR
3013 && host_integerp (TREE_OPERAND (off, 1), 1))
3015 tree step = TREE_OPERAND (off, 1);
3016 off = TREE_OPERAND (off, 0);
3017 STRIP_NOPS (off);
3018 if (CONVERT_EXPR_P (off)
3019 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3020 0)))
3021 < TYPE_PRECISION (TREE_TYPE (off)))
3022 off = TREE_OPERAND (off, 0);
3023 if (TREE_CODE (off) == SSA_NAME)
3025 gimple def = SSA_NAME_DEF_STMT (off);
3026 tree reft = TREE_TYPE (DR_REF (newdr));
3027 if (gimple_call_internal_p (def)
3028 && gimple_call_internal_fn (def)
3029 == IFN_GOMP_SIMD_LANE)
3031 tree arg = gimple_call_arg (def, 0);
3032 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3033 arg = SSA_NAME_VAR (arg);
3034 if (arg == loop->simduid
3035 /* For now. */
3036 && tree_int_cst_equal
3037 (TYPE_SIZE_UNIT (reft),
3038 step))
3040 DR_OFFSET (newdr) = ssize_int (0);
3041 DR_STEP (newdr) = step;
3042 dr = newdr;
3043 simd_lane_access = true;
3049 if (!simd_lane_access && maybe_gather)
3051 dr = newdr;
3052 gather = true;
3055 if (!gather && !simd_lane_access)
3056 free_data_ref (newdr);
3059 if (!gather && !simd_lane_access)
3061 if (dump_enabled_p ())
3063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3064 "not vectorized: data ref analysis "
3065 "failed ");
3066 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3069 if (bb_vinfo)
3070 break;
3072 return false;
3076 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3080 "not vectorized: base addr of dr is a "
3081 "constant");
3083 if (bb_vinfo)
3084 break;
3086 if (gather || simd_lane_access)
3087 free_data_ref (dr);
3088 return false;
3091 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3093 if (dump_enabled_p ())
3095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3096 "not vectorized: volatile type ");
3097 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3100 if (bb_vinfo)
3101 break;
3103 return false;
3106 if (stmt_can_throw_internal (stmt))
3108 if (dump_enabled_p ())
3110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3111 "not vectorized: statement can throw an "
3112 "exception ");
3113 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3116 if (bb_vinfo)
3117 break;
3119 if (gather || simd_lane_access)
3120 free_data_ref (dr);
3121 return false;
3124 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3125 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3127 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "not vectorized: statement is bitfield "
3131 "access ");
3132 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3135 if (bb_vinfo)
3136 break;
3138 if (gather || simd_lane_access)
3139 free_data_ref (dr);
3140 return false;
3143 base = unshare_expr (DR_BASE_ADDRESS (dr));
3144 offset = unshare_expr (DR_OFFSET (dr));
3145 init = unshare_expr (DR_INIT (dr));
3147 if (is_gimple_call (stmt))
3149 if (dump_enabled_p ())
3151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3152 "not vectorized: dr in a call ");
3153 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3156 if (bb_vinfo)
3157 break;
3159 if (gather || simd_lane_access)
3160 free_data_ref (dr);
3161 return false;
3164 /* Update DR field in stmt_vec_info struct. */
3166 /* If the dataref is in an inner-loop of the loop that is considered
3167 for vectorization, we also want to analyze the access relative to
3168 the outer-loop (DR contains information only relative to the
3169 inner-most enclosing loop). We do that by building a reference to the
3170 first location accessed by the inner-loop, and analyze it relative to
3171 the outer-loop. */
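/* A hypothetical example: for a ref a[i][j] in a j-loop nested in the
   i-loop being vectorized, the DR describes the access relative to the
   j-loop (step = element size); the code below rebuilds base, offset,
   init and step relative to the i-loop, where the step is instead one
   row of a.  */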
3172 if (loop && nested_in_vect_loop_p (loop, stmt))
3174 tree outer_step, outer_base, outer_init;
3175 HOST_WIDE_INT pbitsize, pbitpos;
3176 tree poffset;
3177 enum machine_mode pmode;
3178 int punsignedp, pvolatilep;
3179 affine_iv base_iv, offset_iv;
3180 tree dinit;
3182 /* Build a reference to the first location accessed by the
3183 inner-loop: *(BASE+INIT). (The first location is actually
3184 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3185 tree inner_base = build_fold_indirect_ref
3186 (fold_build_pointer_plus (base, init));
3188 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "analyze in outer-loop: ");
3192 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3195 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3196 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3197 gcc_assert (outer_base != NULL_TREE);
3199 if (pbitpos % BITS_PER_UNIT != 0)
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3203 "failed: bit offset alignment.\n");
3204 return false;
3207 outer_base = build_fold_addr_expr (outer_base);
3208 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3209 &base_iv, false))
3211 if (dump_enabled_p ())
3212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3213 "failed: evolution of base is not affine.\n");
3214 return false;
3217 if (offset)
3219 if (poffset)
3220 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3221 poffset);
3222 else
3223 poffset = offset;
3226 if (!poffset)
3228 offset_iv.base = ssize_int (0);
3229 offset_iv.step = ssize_int (0);
3231 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3232 &offset_iv, false))
3234 if (dump_enabled_p ())
3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236 "evolution of offset is not affine.\n");
3237 return false;
3240 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3241 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3242 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3243 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3244 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3246 outer_step = size_binop (PLUS_EXPR,
3247 fold_convert (ssizetype, base_iv.step),
3248 fold_convert (ssizetype, offset_iv.step));
3250 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3251 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3252 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3253 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3254 STMT_VINFO_DR_OFFSET (stmt_info) =
3255 fold_convert (ssizetype, offset_iv.base);
3256 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3257 size_int (highest_pow2_factor (offset_iv.base));
3259 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "\touter base_address: ");
3263 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3264 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3265 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3266 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3267 STMT_VINFO_DR_OFFSET (stmt_info));
3268 dump_printf (MSG_NOTE,
3269 "\n\touter constant offset from base address: ");
3270 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3271 STMT_VINFO_DR_INIT (stmt_info));
3272 dump_printf (MSG_NOTE, "\n\touter step: ");
3273 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3274 STMT_VINFO_DR_STEP (stmt_info));
3275 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3276 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3277 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3281 if (STMT_VINFO_DATA_REF (stmt_info))
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3286 "not vectorized: more than one data ref "
3287 "in stmt: ");
3288 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3291 if (bb_vinfo)
3292 break;
3294 if (gather || simd_lane_access)
3295 free_data_ref (dr);
3296 return false;
3299 STMT_VINFO_DATA_REF (stmt_info) = dr;
3300 if (simd_lane_access)
3302 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3303 datarefs[i] = dr;
3306 /* Set vectype for STMT. */
3307 scalar_type = TREE_TYPE (DR_REF (dr));
3308 STMT_VINFO_VECTYPE (stmt_info) =
3309 get_vectype_for_scalar_type (scalar_type);
3310 if (!STMT_VINFO_VECTYPE (stmt_info))
3312 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3315 "not vectorized: no vectype for stmt: ");
3316 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3317 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3319 scalar_type);
3322 if (bb_vinfo)
3323 break;
3325 if (gather || simd_lane_access)
3327 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3328 free_data_ref (dr);
3330 return false;
3332 else
3334 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE, vect_location,
3337 "got vectype for stmt: ");
3338 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3339 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3340 STMT_VINFO_VECTYPE (stmt_info));
3344 /* Adjust the minimal vectorization factor according to the
3345 vector type. */
3346 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3347 if (vf > *min_vf)
3348 *min_vf = vf;
3350 if (gather)
3352 tree off;
3354 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3355 if (gather
3356 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3357 gather = false;
3358 if (!gather)
3360 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3361 free_data_ref (dr);
3362 if (dump_enabled_p ())
3364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3365 "not vectorized: not suitable for gather "
3366 "load ");
3367 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3369 return false;
3372 datarefs[i] = dr;
3373 STMT_VINFO_GATHER_P (stmt_info) = true;
3375 else if (loop_vinfo
3376 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3378 if (nested_in_vect_loop_p (loop, stmt)
3379 || !DR_IS_READ (dr))
3381 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "not vectorized: not suitable for strided "
3385 "load ");
3386 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3388 return false;
3390 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3394 /* If we stopped analysis at the first dataref we could not analyze
3395 when trying to vectorize a basic-block, mark the rest of the datarefs
3396 as not vectorizable and truncate the vector of datarefs. That
3397 avoids spending useless time in analyzing their dependence. */
3398 if (i != datarefs.length ())
3400 gcc_assert (bb_vinfo != NULL);
3401 for (unsigned j = i; j < datarefs.length (); ++j)
3403 data_reference_p dr = datarefs[j];
3404 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3405 free_data_ref (dr);
3407 datarefs.truncate (i);
3410 return true;
3414 /* Function vect_get_new_vect_var.
3416 Returns a name for a new variable. The current naming scheme appends the
3417 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3418 the name of vectorizer generated variables, and appends that to NAME if
3419 provided. */
3421 tree
3422 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3424 const char *prefix;
3425 tree new_vect_var;
3427 switch (var_kind)
3429 case vect_simple_var:
3430 prefix = "vect";
3431 break;
3432 case vect_scalar_var:
3433 prefix = "stmp";
3434 break;
3435 case vect_pointer_var:
3436 prefix = "vectp";
3437 break;
3438 default:
3439 gcc_unreachable ();
3442 if (name)
3444 char* tmp = concat (prefix, "_", name, NULL);
3445 new_vect_var = create_tmp_reg (type, tmp);
3446 free (tmp);
3448 else
3449 new_vect_var = create_tmp_reg (type, prefix);
3451 return new_vect_var;
3455 /* Function vect_create_addr_base_for_vector_ref.
3457 Create an expression that computes the address of the first memory location
3458 that will be accessed for a data reference.
3460 Input:
3461 STMT: The statement containing the data reference.
3462 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3463 OFFSET: Optional. If supplied, it is added to the initial address.
3464 LOOP: Specify relative to which loop-nest should the address be computed.
3465 For example, when the dataref is in an inner-loop nested in an
3466 outer-loop that is now being vectorized, LOOP can be either the
3467 outer-loop, or the inner-loop. The first memory location accessed
3468 by the following dataref ('in' points to short):
3470 for (i=0; i<N; i++)
3471 for (j=0; j<M; j++)
3472 s += in[i+j]
3474 is as follows:
3475 if LOOP=i_loop: &in (relative to i_loop)
3476 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3478 Output:
3479 1. Return an SSA_NAME whose value is the address of the memory location of
3480 the first vector of the data reference.
3481 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3482 these statement(s) which define the returned SSA_NAME.
3484 FORNOW: We are only handling array accesses with step 1. */
3486 tree
3487 vect_create_addr_base_for_vector_ref (gimple stmt,
3488 gimple_seq *new_stmt_list,
3489 tree offset,
3490 struct loop *loop)
3492 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3493 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3494 tree data_ref_base;
3495 const char *base_name;
3496 tree addr_base;
3497 tree dest;
3498 gimple_seq seq = NULL;
3499 tree base_offset;
3500 tree init;
3501 tree vect_ptr_type;
3502 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3503 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3505 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3507 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3509 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3511 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3512 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3513 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3515 else
3517 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3518 base_offset = unshare_expr (DR_OFFSET (dr));
3519 init = unshare_expr (DR_INIT (dr));
3522 if (loop_vinfo)
3523 base_name = get_name (data_ref_base);
3524 else
3526 base_offset = ssize_int (0);
3527 init = ssize_int (0);
3528 base_name = get_name (DR_REF (dr));
3531 /* Create base_offset */
3532 base_offset = size_binop (PLUS_EXPR,
3533 fold_convert (sizetype, base_offset),
3534 fold_convert (sizetype, init));
3536 if (offset)
3538 offset = fold_build2 (MULT_EXPR, sizetype,
3539 fold_convert (sizetype, offset), step);
3540 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3541 base_offset, offset);
3544 /* base + base_offset */
3545 if (loop_vinfo)
3546 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3547 else
3549 addr_base = build1 (ADDR_EXPR,
3550 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3551 unshare_expr (DR_REF (dr)));
3554 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3555 addr_base = fold_convert (vect_ptr_type, addr_base);
3556 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3557 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3558 gimple_seq_add_seq (new_stmt_list, seq);
3560 if (DR_PTR_INFO (dr)
3561 && TREE_CODE (addr_base) == SSA_NAME)
3563 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3564 if (offset)
3565 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3568 if (dump_enabled_p ())
3570 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3571 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3574 return addr_base;
3578 /* Function vect_create_data_ref_ptr.
3580 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3581 location accessed in the loop by STMT, along with the def-use update
3582 chain to appropriately advance the pointer through the loop iterations.
3583 Also set aliasing information for the pointer. This pointer is used by
3584 the callers to this function to create a memory reference expression for
3585 vector load/store access.
3587 Input:
3588 1. STMT: a stmt that references memory. Expected to be of the form
3589 GIMPLE_ASSIGN <name, data-ref> or
3590 GIMPLE_ASSIGN <data-ref, name>.
3591 2. AGGR_TYPE: the type of the reference, which should be either a vector
3592 or an array.
3593 3. AT_LOOP: the loop where the vector memref is to be created.
3594 4. OFFSET (optional): an offset to be added to the initial address accessed
3595 by the data-ref in STMT.
3596 5. BSI: location where the new stmts are to be placed if there is no loop
3597 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3598 pointing to the initial address.
3600 Output:
3601 1. Declare a new ptr to vector_type, and have it point to the base of the
3602 data reference (initial address accessed by the data reference).
3603 For example, for vector of type V8HI, the following code is generated:
3605 v8hi *ap;
3606 ap = (v8hi *)initial_address;
3608 if OFFSET is not supplied:
3609 initial_address = &a[init];
3610 if OFFSET is supplied:
3611 initial_address = &a[init + OFFSET];
3613 Return the initial_address in INITIAL_ADDRESS.
3615 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3616 update the pointer in each iteration of the loop.
3618 Return the increment stmt that updates the pointer in PTR_INCR.
3620 3. Set INV_P to true if the access pattern of the data reference in the
3621 vectorized loop is invariant. Set it to false otherwise.
3623 4. Return the pointer. */
3625 tree
3626 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3627 tree offset, tree *initial_address,
3628 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3629 bool only_init, bool *inv_p)
3631 const char *base_name;
3632 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3633 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3634 struct loop *loop = NULL;
3635 bool nested_in_vect_loop = false;
3636 struct loop *containing_loop = NULL;
3637 tree aggr_ptr_type;
3638 tree aggr_ptr;
3639 tree new_temp;
3640 gimple vec_stmt;
3641 gimple_seq new_stmt_list = NULL;
3642 edge pe = NULL;
3643 basic_block new_bb;
3644 tree aggr_ptr_init;
3645 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3646 tree aptr;
3647 gimple_stmt_iterator incr_gsi;
3648 bool insert_after;
3649 tree indx_before_incr, indx_after_incr;
3650 gimple incr;
3651 tree step;
3652 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3654 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3655 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3657 if (loop_vinfo)
3659 loop = LOOP_VINFO_LOOP (loop_vinfo);
3660 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3661 containing_loop = (gimple_bb (stmt))->loop_father;
3662 pe = loop_preheader_edge (loop);
3664 else
3666 gcc_assert (bb_vinfo);
3667 only_init = true;
3668 *ptr_incr = NULL;
3671 /* Check the step (evolution) of the load in LOOP, and record
3672 whether it's invariant. */
3673 if (nested_in_vect_loop)
3674 step = STMT_VINFO_DR_STEP (stmt_info);
3675 else
3676 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3678 if (integer_zerop (step))
3679 *inv_p = true;
3680 else
3681 *inv_p = false;
3683 /* Create an expression for the first address accessed by this load
3684 in LOOP. */
3685 base_name = get_name (DR_BASE_ADDRESS (dr));
3687 if (dump_enabled_p ())
3689 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "create %s-pointer variable to type: ",
3692 tree_code_name[(int) TREE_CODE (aggr_type)]);
3693 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3694 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3695 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3696 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3697 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3698 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3699 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3700 else
3701 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3702 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3705 /* (1) Create the new aggregate-pointer variable.
3706 Vector and array types inherit the alias set of their component
3707 type by default so we need to use a ref-all pointer if the data
3708 reference does not conflict with the created aggregated data
3709 reference because it is not addressable. */
3710 bool need_ref_all = false;
3711 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3712 get_alias_set (DR_REF (dr))))
3713 need_ref_all = true;
3714 /* Likewise for any of the data references in the stmt group. */
3715 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3717 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3720 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3721 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3722 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3723 get_alias_set (DR_REF (sdr))))
3725 need_ref_all = true;
3726 break;
3728 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3730 while (orig_stmt);
3732 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3733 need_ref_all);
3734 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3737 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3738 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3739 def-use update cycles for the pointer: one relative to the outer-loop
3740 (LOOP), which is what steps (3) and (4) below do. The other is relative
3741 to the inner-loop (which is the inner-most loop containing the dataref),
3742 and this is done by step (5) below.
3744 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3745 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3746 redundant. Steps (3),(4) create the following:
3748 vp0 = &base_addr;
3749 LOOP: vp1 = phi(vp0,vp2)
3752 vp2 = vp1 + step
3753 goto LOOP
3755 If there is an inner-loop nested in loop, then step (5) will also be
3756 applied, and an additional update in the inner-loop will be created:
3758 vp0 = &base_addr;
3759 LOOP: vp1 = phi(vp0,vp2)
3761 inner: vp3 = phi(vp1,vp4)
3762 vp4 = vp3 + inner_step
3763 if () goto inner
3765 vp2 = vp1 + step
3766 if () goto LOOP */
3768 /* (2) Calculate the initial address of the aggregate-pointer, and set
3769 the aggregate-pointer to point to it before the loop. */
3771 /* Create: &(base[init_val+offset]) in the loop preheader. */
3773 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3774 offset, loop);
3775 if (new_stmt_list)
3777 if (pe)
3779 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3780 gcc_assert (!new_bb);
3782 else
3783 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3786 *initial_address = new_temp;
3788 /* Create: p = (aggr_type *) initial_base */
3789 if (TREE_CODE (new_temp) != SSA_NAME
3790 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3792 vec_stmt = gimple_build_assign (aggr_ptr,
3793 fold_convert (aggr_ptr_type, new_temp));
3794 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3795 /* Copy the points-to information if it exists. */
3796 if (DR_PTR_INFO (dr))
3797 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3798 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3799 if (pe)
3801 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3802 gcc_assert (!new_bb);
3804 else
3805 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3807 else
3808 aggr_ptr_init = new_temp;
3810 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3811 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3812 inner-loop nested in LOOP (during outer-loop vectorization). */
3814 /* No update in loop is required. */
3815 if (only_init && (!loop_vinfo || at_loop == loop))
3816 aptr = aggr_ptr_init;
3817 else
3819 /* The step of the aggregate pointer is the type size. */
3820 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3821 /* One exception to the above is when the scalar step of the load in
3822 LOOP is zero. In this case the step here is also zero. */
3823 if (*inv_p)
3824 iv_step = size_zero_node;
3825 else if (tree_int_cst_sgn (step) == -1)
3826 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3828 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3830 create_iv (aggr_ptr_init,
3831 fold_convert (aggr_ptr_type, iv_step),
3832 aggr_ptr, loop, &incr_gsi, insert_after,
3833 &indx_before_incr, &indx_after_incr);
3834 incr = gsi_stmt (incr_gsi);
3835 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3837 /* Copy the points-to information if it exists. */
3838 if (DR_PTR_INFO (dr))
3840 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3841 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3843 if (ptr_incr)
3844 *ptr_incr = incr;
3846 aptr = indx_before_incr;
3849 if (!nested_in_vect_loop || only_init)
3850 return aptr;
3853 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3854 nested in LOOP, if it exists. */
3856 gcc_assert (nested_in_vect_loop);
3857 if (!only_init)
3859 standard_iv_increment_position (containing_loop, &incr_gsi,
3860 &insert_after);
3861 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3862 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3863 &indx_after_incr);
3864 incr = gsi_stmt (incr_gsi);
3865 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3867 /* Copy the points-to information if it exists. */
3868 if (DR_PTR_INFO (dr))
3870 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3871 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3873 if (ptr_incr)
3874 *ptr_incr = incr;
3876 return indx_before_incr;
3878 else
3879 gcc_unreachable ();
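/* Illustrative sketch (not part of the pass): a typical call to
   vect_create_data_ref_ptr as issued from the load/store vectorization
   code.  The local names (dataref_ptr, dummy, ptr_incr, inv_p) are
   hypothetical and only show how the output parameters are wired up:

     tree dataref_ptr, dummy;
     gimple ptr_incr;
     bool inv_p;

     dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, at_loop,
					     offset, &dummy, gsi,
					     &ptr_incr, false, &inv_p);

   On return, dataref_ptr addresses the first aggregate accessed in the
   loop, and ptr_incr (when an update in the loop was created) is the stmt
   that bumps it across iterations; it is later adjusted via
   bump_vector_ptr.  */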
3883 /* Function bump_vector_ptr
3885 Increment a pointer (to a vector type) by vector-size. If requested,
3886 i.e. if PTR_INCR is given, then also connect the new increment stmt
3887 to the existing def-use update-chain of the pointer, by modifying
3888 the PTR_INCR as illustrated below:
3890 The pointer def-use update-chain before this function:
3891 DATAREF_PTR = phi (p_0, p_2)
3892 ....
3893 PTR_INCR: p_2 = DATAREF_PTR + step
3895 The pointer def-use update-chain after this function:
3896 DATAREF_PTR = phi (p_0, p_2)
3897 ....
3898 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3899 ....
3900 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3902 Input:
3903 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3904 in the loop.
3905 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3906 the loop. The increment amount across iterations is expected
3907 to be vector_size.
3908 GSI - location where the new update stmt is to be placed.
3909 STMT - the original scalar memory-access stmt that is being vectorized.
3910 BUMP - optional. The offset by which to bump the pointer. If not given,
3911 the offset is assumed to be vector_size.
3913 Output: Return NEW_DATAREF_PTR as illustrated above.
3917 tree
3918 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3919 gimple stmt, tree bump)
3921 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3922 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3923 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3924 tree update = TYPE_SIZE_UNIT (vectype);
3925 gimple incr_stmt;
3926 ssa_op_iter iter;
3927 use_operand_p use_p;
3928 tree new_dataref_ptr;
3930 if (bump)
3931 update = bump;
3933 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3934 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3935 dataref_ptr, update);
3936 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3938 /* Copy the points-to information if it exists. */
3939 if (DR_PTR_INFO (dr))
3941 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3942 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3945 if (!ptr_incr)
3946 return new_dataref_ptr;
3948 /* Update the vector-pointer's cross-iteration increment. */
3949 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3951 tree use = USE_FROM_PTR (use_p);
3953 if (use == dataref_ptr)
3954 SET_USE (use_p, new_dataref_ptr);
3955 else
3956 gcc_assert (tree_int_cst_compare (use, update) == 0);
3959 return new_dataref_ptr;
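/* Illustrative usage sketch (not part of the pass), assuming a vectorized
   access that needs several vector copies per scalar access; the loop
   below is hypothetical and only shows how the helper above is chained:

     for (j = 1; j < vec_num; j++)
       {
	 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
					stmt, NULL_TREE);
	 ... emit the j-th vector load/store through dataref_ptr ...
       }

   Passing NULL_TREE as BUMP makes the bump default to
   TYPE_SIZE_UNIT (vectype), as handled at the top of this function.  */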
3963 /* Function vect_create_destination_var.
3965 Create a new temporary of type VECTYPE. */
3967 tree
3968 vect_create_destination_var (tree scalar_dest, tree vectype)
3970 tree vec_dest;
3971 const char *name;
3972 char *new_name;
3973 tree type;
3974 enum vect_var_kind kind;
3976 kind = vectype ? vect_simple_var : vect_scalar_var;
3977 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3979 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3981 name = get_name (scalar_dest);
3982 if (name)
3983 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3984 else
3985 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3986 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3987 free (new_name);
3989 return vec_dest;
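/* Example (illustrative): for a scalar destination named sum_12
   (get_name () == "sum", SSA version 12) and a V4SF vectype,

     vec_dest = vect_create_destination_var (scalar_dest, vectype);

   creates a fresh V4SF variable whose name is derived from "sum_12";
   the kind-specific prefix (typically "vect_" for vect_simple_var) is
   added by vect_get_new_vect_var.  */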
3992 /* Function vect_grouped_store_supported.
3994 Returns TRUE if interleave high and interleave low permutations
3995 are supported, and FALSE otherwise. */
3997 bool
3998 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4000 enum machine_mode mode = TYPE_MODE (vectype);
4002 /* vect_permute_store_chain requires the group size to be a power of two. */
4003 if (exact_log2 (count) == -1)
4005 if (dump_enabled_p ())
4006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4007 "the size of the group of accesses"
4008 " is not a power of 2");
4009 return false;
4012 /* Check that the permutation is supported. */
4013 if (VECTOR_MODE_P (mode))
4015 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4016 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4017 for (i = 0; i < nelt / 2; i++)
4019 sel[i * 2] = i;
4020 sel[i * 2 + 1] = i + nelt;
4022 if (can_vec_perm_p (mode, false, sel))
4024 for (i = 0; i < nelt; i++)
4025 sel[i] += nelt / 2;
4026 if (can_vec_perm_p (mode, false, sel))
4027 return true;
4031 if (dump_enabled_p ())
4032 dump_printf (MSG_MISSED_OPTIMIZATION,
4033 "interleave op not supported by target.");
4034 return false;
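/* Worked example (illustrative): for a V8HI vector type (nelt == 8) the
   two selectors tested above are

     interleave high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the {0, nelt, 1, nelt+1, ...} mask used by
   vect_permute_store_chain, and the same mask with nelt/2 added to every
   element.  */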
4038 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4039 type VECTYPE. */
4041 bool
4042 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4044 return vect_lanes_optab_supported_p ("vec_store_lanes",
4045 vec_store_lanes_optab,
4046 vectype, count);
4050 /* Function vect_permute_store_chain.
4052 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4053 a power of 2, generate interleave_high/low stmts to reorder the data
4054 correctly for the stores. Return the final references for stores in
4055 RESULT_CHAIN.
4057 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4058 The input is 4 vectors each containing 8 elements. We assign a number to
4059 each element, the input sequence is:
4061 1st vec: 0 1 2 3 4 5 6 7
4062 2nd vec: 8 9 10 11 12 13 14 15
4063 3rd vec: 16 17 18 19 20 21 22 23
4064 4th vec: 24 25 26 27 28 29 30 31
4066 The output sequence should be:
4068 1st vec: 0 8 16 24 1 9 17 25
4069 2nd vec: 2 10 18 26 3 11 19 27
4070 3rd vec: 4 12 20 28 5 13 21 29
4071 4th vec: 6 14 22 30 7 15 23 31
4073 i.e., we interleave the contents of the four vectors in their order.
4075 We use interleave_high/low instructions to create such output. The input of
4076 each interleave_high/low operation is two vectors:
4077 1st vec 2nd vec
4078 0 1 2 3 4 5 6 7
4079 the even elements of the result vector are obtained left-to-right from the
4080 high/low elements of the first vector. The odd elements of the result are
4081 obtained left-to-right from the high/low elements of the second vector.
4082 The output of interleave_high will be: 0 4 1 5
4083 and of interleave_low: 2 6 3 7
4086 The permutation is done in log LENGTH stages. In each stage interleave_high
4087 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4088 where the first argument is taken from the first half of DR_CHAIN and the
4089 second argument from its second half.
4090 In our example,
4092 I1: interleave_high (1st vec, 3rd vec)
4093 I2: interleave_low (1st vec, 3rd vec)
4094 I3: interleave_high (2nd vec, 4th vec)
4095 I4: interleave_low (2nd vec, 4th vec)
4097 The output for the first stage is:
4099 I1: 0 16 1 17 2 18 3 19
4100 I2: 4 20 5 21 6 22 7 23
4101 I3: 8 24 9 25 10 26 11 27
4102 I4: 12 28 13 29 14 30 15 31
4104 The output of the second stage, i.e. the final result is:
4106 I1: 0 8 16 24 1 9 17 25
4107 I2: 2 10 18 26 3 11 19 27
4108 I3: 4 12 20 28 5 13 21 29
4109 I4: 6 14 22 30 7 15 23 31. */
4111 void
4112 vect_permute_store_chain (vec<tree> dr_chain,
4113 unsigned int length,
4114 gimple stmt,
4115 gimple_stmt_iterator *gsi,
4116 vec<tree> *result_chain)
4118 tree vect1, vect2, high, low;
4119 gimple perm_stmt;
4120 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4121 tree perm_mask_low, perm_mask_high;
4122 unsigned int i, n;
4123 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4124 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4126 result_chain->quick_grow (length);
4127 memcpy (result_chain->address (), dr_chain.address (),
4128 length * sizeof (tree));
4130 for (i = 0, n = nelt / 2; i < n; i++)
4132 sel[i * 2] = i;
4133 sel[i * 2 + 1] = i + nelt;
4135 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4136 gcc_assert (perm_mask_high != NULL);
4138 for (i = 0; i < nelt; i++)
4139 sel[i] += nelt / 2;
4140 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4141 gcc_assert (perm_mask_low != NULL);
4143 for (i = 0, n = exact_log2 (length); i < n; i++)
4145 for (j = 0; j < length/2; j++)
4147 vect1 = dr_chain[j];
4148 vect2 = dr_chain[j+length/2];
4150 /* Create interleaving stmt:
4151 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4152 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4153 perm_stmt
4154 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4155 vect1, vect2, perm_mask_high);
4156 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4157 (*result_chain)[2*j] = high;
4159 /* Create interleaving stmt:
4160 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4161 nelt*3/2+1, ...}> */
4162 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4163 perm_stmt
4164 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4165 vect1, vect2, perm_mask_low);
4166 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4167 (*result_chain)[2*j+1] = low;
4169 memcpy (dr_chain.address (), result_chain->address (),
4170 length * sizeof (tree));
4174 /* Function vect_setup_realignment
4176 This function is called when vectorizing an unaligned load using
4177 the dr_explicit_realign[_optimized] scheme.
4178 This function generates the following code at the loop prolog:
4180 p = initial_addr;
4181 x msq_init = *(floor(p)); # prolog load
4182 realignment_token = call target_builtin;
4183 loop:
4184 x msq = phi (msq_init, ---)
4186 The stmts marked with x are generated only for the case of
4187 dr_explicit_realign_optimized.
4189 The code above sets up a new (vector) pointer, pointing to the first
4190 location accessed by STMT, and a "floor-aligned" load using that pointer.
4191 It also generates code to compute the "realignment-token" (if the relevant
4192 target hook was defined), and creates a phi-node at the loop-header bb
4193 whose arguments are the result of the prolog-load (created by this
4194 function) and the result of a load that takes place in the loop (to be
4195 created by the caller to this function).
4197 For the case of dr_explicit_realign_optimized:
4198 The caller to this function uses the phi-result (msq) to create the
4199 realignment code inside the loop, and sets up the missing phi argument,
4200 as follows:
4201 loop:
4202 msq = phi (msq_init, lsq)
4203 lsq = *(floor(p')); # load in loop
4204 result = realign_load (msq, lsq, realignment_token);
4206 For the case of dr_explicit_realign:
4207 loop:
4208 msq = *(floor(p)); # load in loop
4209 p' = p + (VS-1);
4210 lsq = *(floor(p')); # load in loop
4211 result = realign_load (msq, lsq, realignment_token);
4213 Input:
4214 STMT - (scalar) load stmt to be vectorized. This load accesses
4215 a memory location that may be unaligned.
4216 GSI - place where new code is to be inserted.
4217 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4218 is used.
4220 Output:
4221 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4222 target hook, if defined.
4223 Return value - the result of the loop-header phi node. */
4225 tree
4226 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4227 tree *realignment_token,
4228 enum dr_alignment_support alignment_support_scheme,
4229 tree init_addr,
4230 struct loop **at_loop)
4232 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4233 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4234 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4235 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4236 struct loop *loop = NULL;
4237 edge pe = NULL;
4238 tree scalar_dest = gimple_assign_lhs (stmt);
4239 tree vec_dest;
4240 gimple inc;
4241 tree ptr;
4242 tree data_ref;
4243 gimple new_stmt;
4244 basic_block new_bb;
4245 tree msq_init = NULL_TREE;
4246 tree new_temp;
4247 gimple phi_stmt;
4248 tree msq = NULL_TREE;
4249 gimple_seq stmts = NULL;
4250 bool inv_p;
4251 bool compute_in_loop = false;
4252 bool nested_in_vect_loop = false;
4253 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4254 struct loop *loop_for_initial_load = NULL;
4256 if (loop_vinfo)
4258 loop = LOOP_VINFO_LOOP (loop_vinfo);
4259 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4262 gcc_assert (alignment_support_scheme == dr_explicit_realign
4263 || alignment_support_scheme == dr_explicit_realign_optimized);
4265 /* We need to generate three things:
4266 1. the misalignment computation
4267 2. the extra vector load (for the optimized realignment scheme).
4268 3. the phi node for the two vectors from which the realignment is
4269 done (for the optimized realignment scheme). */
4271 /* 1. Determine where to generate the misalignment computation.
4273 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4274 calculation will be generated by this function, outside the loop (in the
4275 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4276 caller, inside the loop.
4278 Background: If the misalignment remains fixed throughout the iterations of
4279 the loop, then both realignment schemes are applicable, and also the
4280 misalignment computation can be done outside LOOP. This is because we are
4281 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4282 are a multiple of VS (the Vector Size), and therefore the misalignment in
4283 different vectorized LOOP iterations is always the same.
4284 The problem arises only if the memory access is in an inner-loop nested
4285 inside LOOP, which is now being vectorized using outer-loop vectorization.
4286 This is the only case when the misalignment of the memory access may not
4287 remain fixed throughout the iterations of the inner-loop (as explained in
4288 detail in vect_supportable_dr_alignment). In this case, not only is the
4289 optimized realignment scheme not applicable, but also the misalignment
4290 computation (and generation of the realignment token that is passed to
4291 REALIGN_LOAD) has to be done inside the loop.
4293 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4294 or not, which in turn determines if the misalignment is computed inside
4295 the inner-loop, or outside LOOP. */
4297 if (init_addr != NULL_TREE || !loop_vinfo)
4299 compute_in_loop = true;
4300 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4304 /* 2. Determine where to generate the extra vector load.
4306 For the optimized realignment scheme, instead of generating two vector
4307 loads in each iteration, we generate a single extra vector load in the
4308 preheader of the loop, and in each iteration reuse the result of the
4309 vector load from the previous iteration. In case the memory access is in
4310 an inner-loop nested inside LOOP, which is now being vectorized using
4311 outer-loop vectorization, we need to determine whether this initial vector
4312 load should be generated at the preheader of the inner-loop, or can be
4313 generated at the preheader of LOOP. If the memory access has no evolution
4314 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4315 to be generated inside LOOP (in the preheader of the inner-loop). */
4317 if (nested_in_vect_loop)
4319 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4320 bool invariant_in_outerloop =
4321 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4322 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4324 else
4325 loop_for_initial_load = loop;
4326 if (at_loop)
4327 *at_loop = loop_for_initial_load;
4329 if (loop_for_initial_load)
4330 pe = loop_preheader_edge (loop_for_initial_load);
4332 /* 3. For the case of the optimized realignment, create the first vector
4333 load at the loop preheader. */
4335 if (alignment_support_scheme == dr_explicit_realign_optimized)
4337 /* Create msq_init = *(floor(p1)) in the loop preheader */
4339 gcc_assert (!compute_in_loop);
4340 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4341 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4342 NULL_TREE, &init_addr, NULL, &inc,
4343 true, &inv_p);
4344 new_temp = copy_ssa_name (ptr, NULL);
4345 new_stmt = gimple_build_assign_with_ops
4346 (BIT_AND_EXPR, new_temp, ptr,
4347 build_int_cst (TREE_TYPE (ptr),
4348 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4349 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4350 gcc_assert (!new_bb);
4351 data_ref
4352 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4353 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4354 new_stmt = gimple_build_assign (vec_dest, data_ref);
4355 new_temp = make_ssa_name (vec_dest, new_stmt);
4356 gimple_assign_set_lhs (new_stmt, new_temp);
4357 if (pe)
4359 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4360 gcc_assert (!new_bb);
4362 else
4363 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4365 msq_init = gimple_assign_lhs (new_stmt);
4368 /* 4. Create realignment token using a target builtin, if available.
4369 It is done either inside the containing loop, or before LOOP (as
4370 determined above). */
4372 if (targetm.vectorize.builtin_mask_for_load)
4374 tree builtin_decl;
4376 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4377 if (!init_addr)
4379 /* Generate the INIT_ADDR computation outside LOOP. */
4380 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4381 NULL_TREE, loop);
4382 if (loop)
4384 pe = loop_preheader_edge (loop);
4385 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4386 gcc_assert (!new_bb);
4388 else
4389 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4392 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4393 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4394 vec_dest =
4395 vect_create_destination_var (scalar_dest,
4396 gimple_call_return_type (new_stmt));
4397 new_temp = make_ssa_name (vec_dest, new_stmt);
4398 gimple_call_set_lhs (new_stmt, new_temp);
4400 if (compute_in_loop)
4401 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4402 else
4404 /* Generate the misalignment computation outside LOOP. */
4405 pe = loop_preheader_edge (loop);
4406 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4407 gcc_assert (!new_bb);
4410 *realignment_token = gimple_call_lhs (new_stmt);
4412 /* The result of the CALL_EXPR to this builtin is determined from
4413 the value of the parameter and no global variables are touched
4414 which makes the builtin a "const" function. Requiring the
4415 builtin to have the "const" attribute makes it unnecessary
4416 to call mark_call_clobbered. */
4417 gcc_assert (TREE_READONLY (builtin_decl));
4420 if (alignment_support_scheme == dr_explicit_realign)
4421 return msq;
4423 gcc_assert (!compute_in_loop);
4424 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4427 /* 5. Create msq = phi <msq_init, lsq> in loop */
4429 pe = loop_preheader_edge (containing_loop);
4430 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4431 msq = make_ssa_name (vec_dest, NULL);
4432 phi_stmt = create_phi_node (msq, containing_loop->header);
4433 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4435 return msq;
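/* Illustrative sketch (not part of the pass) of how the result is
   consumed by the load vectorization code in tree-vect-stmts.c for the
   dr_explicit_realign_optimized scheme; the local names are hypothetical:

     msq = vect_setup_realignment (stmt, gsi, &realignment_token,
				   dr_explicit_realign_optimized,
				   NULL_TREE, &at_loop);
     ...
     lsq = ... vector load emitted inside the loop ...;
     new_stmt = gimple_build_assign_with_ops (REALIGN_LOAD_EXPR, vec_dest,
					      msq, lsq, realignment_token);

   The missing PHI argument of msq is then set to lsq, closing the
   cross-iteration cycle shown in the comment above.  */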
4439 /* Function vect_grouped_load_supported.
4441 Returns TRUE if even and odd permutations are supported,
4442 and FALSE otherwise. */
4444 bool
4445 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4447 enum machine_mode mode = TYPE_MODE (vectype);
4449 /* vect_permute_load_chain requires the group size to be a power of two. */
4450 if (exact_log2 (count) == -1)
4452 if (dump_enabled_p ())
4453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4454 "the size of the group of accesses"
4455 " is not a power of 2");
4456 return false;
4459 /* Check that the permutation is supported. */
4460 if (VECTOR_MODE_P (mode))
4462 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4463 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4465 for (i = 0; i < nelt; i++)
4466 sel[i] = i * 2;
4467 if (can_vec_perm_p (mode, false, sel))
4469 for (i = 0; i < nelt; i++)
4470 sel[i] = i * 2 + 1;
4471 if (can_vec_perm_p (mode, false, sel))
4472 return true;
4476 if (dump_enabled_p ())
4477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4478 "extract even/odd not supported by target");
4479 return false;
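/* Worked example (illustrative): for a V8HI vector type (nelt == 8) the
   two selectors tested above are

     extract even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     extract odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }

   which are exactly the VEC_PERM_EXPR masks later generated by
   vect_permute_load_chain.  */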
4482 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4483 type VECTYPE. */
4485 bool
4486 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4488 return vect_lanes_optab_supported_p ("vec_load_lanes",
4489 vec_load_lanes_optab,
4490 vectype, count);
4493 /* Function vect_permute_load_chain.
4495 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4496 a power of 2, generate extract_even/odd stmts to reorder the input data
4497 correctly. Return the final references for loads in RESULT_CHAIN.
4499 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4500 The input is 4 vectors each containing 8 elements. We assign a number to each
4501 element, the input sequence is:
4503 1st vec: 0 1 2 3 4 5 6 7
4504 2nd vec: 8 9 10 11 12 13 14 15
4505 3rd vec: 16 17 18 19 20 21 22 23
4506 4th vec: 24 25 26 27 28 29 30 31
4508 The output sequence should be:
4510 1st vec: 0 4 8 12 16 20 24 28
4511 2nd vec: 1 5 9 13 17 21 25 29
4512 3rd vec: 2 6 10 14 18 22 26 30
4513 4th vec: 3 7 11 15 19 23 27 31
4515 i.e., the first output vector should contain the first elements of each
4516 interleaving group, etc.
4518 We use extract_even/odd instructions to create such output. The input of
4519 each extract_even/odd operation is two vectors
4520 1st vec 2nd vec
4521 0 1 2 3 4 5 6 7
4523 and the output is the vector of extracted even/odd elements. The output of
4524 extract_even will be: 0 2 4 6
4525 and of extract_odd: 1 3 5 7
4528 The permutation is done in log LENGTH stages. In each stage extract_even
4529 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4530 their order. In our example,
4532 E1: extract_even (1st vec, 2nd vec)
4533 E2: extract_odd (1st vec, 2nd vec)
4534 E3: extract_even (3rd vec, 4th vec)
4535 E4: extract_odd (3rd vec, 4th vec)
4537 The output for the first stage will be:
4539 E1: 0 2 4 6 8 10 12 14
4540 E2: 1 3 5 7 9 11 13 15
4541 E3: 16 18 20 22 24 26 28 30
4542 E4: 17 19 21 23 25 27 29 31
4544 In order to proceed and create the correct sequence for the next stage (or
4545 for the correct output, if the second stage is the last one, as in our
4546 example), we first put the output of extract_even operation and then the
4547 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4548 The input for the second stage is:
4550 1st vec (E1): 0 2 4 6 8 10 12 14
4551 2nd vec (E3): 16 18 20 22 24 26 28 30
4552 3rd vec (E2): 1 3 5 7 9 11 13 15
4553 4th vec (E4): 17 19 21 23 25 27 29 31
4555 The output of the second stage:
4557 E1: 0 4 8 12 16 20 24 28
4558 E2: 2 6 10 14 18 22 26 30
4559 E3: 1 5 9 13 17 21 25 29
4560 E4: 3 7 11 15 19 23 27 31
4562 And RESULT_CHAIN after reordering:
4564 1st vec (E1): 0 4 8 12 16 20 24 28
4565 2nd vec (E3): 1 5 9 13 17 21 25 29
4566 3rd vec (E2): 2 6 10 14 18 22 26 30
4567 4th vec (E4): 3 7 11 15 19 23 27 31. */
4569 static void
4570 vect_permute_load_chain (vec<tree> dr_chain,
4571 unsigned int length,
4572 gimple stmt,
4573 gimple_stmt_iterator *gsi,
4574 vec<tree> *result_chain)
4576 tree data_ref, first_vect, second_vect;
4577 tree perm_mask_even, perm_mask_odd;
4578 gimple perm_stmt;
4579 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4580 unsigned int i, j, log_length = exact_log2 (length);
4581 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4582 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4584 result_chain->quick_grow (length);
4585 memcpy (result_chain->address (), dr_chain.address (),
4586 length * sizeof (tree));
4588 for (i = 0; i < nelt; ++i)
4589 sel[i] = i * 2;
4590 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4591 gcc_assert (perm_mask_even != NULL);
4593 for (i = 0; i < nelt; ++i)
4594 sel[i] = i * 2 + 1;
4595 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4596 gcc_assert (perm_mask_odd != NULL);
4598 for (i = 0; i < log_length; i++)
4600 for (j = 0; j < length; j += 2)
4602 first_vect = dr_chain[j];
4603 second_vect = dr_chain[j+1];
4605 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4606 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4607 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4608 first_vect, second_vect,
4609 perm_mask_even);
4610 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4611 (*result_chain)[j/2] = data_ref;
4613 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4614 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4615 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4616 first_vect, second_vect,
4617 perm_mask_odd);
4618 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4619 (*result_chain)[j/2+length/2] = data_ref;
4621 memcpy (dr_chain.address (), result_chain->address (),
4622 length * sizeof (tree));
4627 /* Function vect_transform_grouped_load.
4629 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4630 to perform their permutation and ascribe the resulting vectorized
4631 statements to the scalar statements.
4634 void
4635 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4636 gimple_stmt_iterator *gsi)
4638 vec<tree> result_chain = vNULL;
4640 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4641 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4642 vectors, that are ready for vector computation. */
4643 result_chain.create (size);
4644 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4645 vect_record_grouped_load_vectors (stmt, result_chain);
4646 result_chain.release ();
4649 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4650 generated as part of the vectorization of STMT. Assign the statement
4651 for each vector to the associated scalar statement. */
4653 void
4654 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4656 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4657 gimple next_stmt, new_stmt;
4658 unsigned int i, gap_count;
4659 tree tmp_data_ref;
4661 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4662 Since we scan the chain starting from its first node, their order
4663 corresponds to the order of data-refs in RESULT_CHAIN. */
4664 next_stmt = first_stmt;
4665 gap_count = 1;
4666 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4668 if (!next_stmt)
4669 break;
4671 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4672 code elimination pass later. No need to check for the first stmt in
4673 the group, since it always exists.
4674 GROUP_GAP is the number of steps in elements from the previous
4675 access (if there is no gap GROUP_GAP is 1). We skip loads that
4676 correspond to the gaps. */
4677 if (next_stmt != first_stmt
4678 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4680 gap_count++;
4681 continue;
4684 while (next_stmt)
4686 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4687 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4688 copies, and we put the new vector statement in the first available
4689 RELATED_STMT. */
4690 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4691 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4692 else
4694 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4696 gimple prev_stmt =
4697 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4698 gimple rel_stmt =
4699 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4700 while (rel_stmt)
4702 prev_stmt = rel_stmt;
4703 rel_stmt =
4704 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4707 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4708 new_stmt;
4712 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4713 gap_count = 1;
4714 /* If NEXT_STMT accesses the same DR as the previous statement,
4715 put the same TMP_DATA_REF as its vectorized statement; otherwise
4716 get the next data-ref from RESULT_CHAIN. */
4717 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4718 break;
4723 /* Function vect_can_force_dr_alignment_p.
4725 Returns whether the alignment of a DECL can be forced to be aligned
4726 on an ALIGNMENT-bit boundary. */
4728 bool
4729 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4731 if (TREE_CODE (decl) != VAR_DECL)
4732 return false;
4734 /* We cannot change alignment of common or external symbols as another
4735 translation unit may contain a definition with lower alignment.
4736 The rules of common symbol linking mean that the definition
4737 will override the common symbol. The same is true for constant
4738 pool entries which may be shared and are not properly merged
4739 by LTO. */
4740 if (DECL_EXTERNAL (decl)
4741 || DECL_COMMON (decl)
4742 || DECL_IN_CONSTANT_POOL (decl))
4743 return false;
4745 if (TREE_ASM_WRITTEN (decl))
4746 return false;
4748 /* Do not override the alignment as specified by the ABI when the used
4749 attribute is set. */
4750 if (DECL_PRESERVE_P (decl))
4751 return false;
4753 /* Do not override explicit alignment set by the user when an explicit
4754 section name is also used. This is a common idiom used by many
4755 software projects. */
4756 if (DECL_SECTION_NAME (decl) != NULL_TREE
4757 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4758 return false;
4760 if (TREE_STATIC (decl))
4761 return (alignment <= MAX_OFILE_ALIGNMENT);
4762 else
4763 return (alignment <= MAX_STACK_ALIGNMENT);
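/* Illustrative sketch (not part of the pass), assuming BASE is the base
   VAR_DECL of a data reference and VECTYPE the vector type chosen for it;
   this is roughly how callers use the predicate before promoting the
   declaration's alignment:

     if (vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
       {
	 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
	 DECL_USER_ALIGN (base) = 1;
       }
   */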
4767 /* Return whether the data reference DR is supported with respect to its
4768 alignment.
4769 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4770 if it is aligned, i.e., check if it is possible to vectorize it with different
4771 alignment. */
4773 enum dr_alignment_support
4774 vect_supportable_dr_alignment (struct data_reference *dr,
4775 bool check_aligned_accesses)
4777 gimple stmt = DR_STMT (dr);
4778 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4779 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4780 enum machine_mode mode = TYPE_MODE (vectype);
4781 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4782 struct loop *vect_loop = NULL;
4783 bool nested_in_vect_loop = false;
4785 if (aligned_access_p (dr) && !check_aligned_accesses)
4786 return dr_aligned;
4788 if (loop_vinfo)
4790 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4791 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4794 /* Possibly unaligned access. */
4796 /* We can choose between using the implicit realignment scheme (generating
4797 a misaligned_move stmt) and the explicit realignment scheme (generating
4798 aligned loads with a REALIGN_LOAD). There are two variants to the
4799 explicit realignment scheme: optimized, and unoptimized.
4800 We can optimize the realignment only if the step between consecutive
4801 vector loads is equal to the vector size. Since the vector memory
4802 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4803 is guaranteed that the misalignment amount remains the same throughout the
4804 execution of the vectorized loop. Therefore, we can create the
4805 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4806 at the loop preheader.
4808 However, in the case of outer-loop vectorization, when vectorizing a
4809 memory access in the inner-loop nested within the LOOP that is now being
4810 vectorized, while it is guaranteed that the misalignment of the
4811 vectorized memory access will remain the same in different outer-loop
4812 iterations, it is *not* guaranteed that it will remain the same throughout
4813 the execution of the inner-loop. This is because the inner-loop advances
4814 with the original scalar step (and not in steps of VS). If the inner-loop
4815 step happens to be a multiple of VS, then the misalignment remains fixed
4816 and we can use the optimized realignment scheme. For example:
4818 for (i=0; i<N; i++)
4819 for (j=0; j<M; j++)
4820 s += a[i+j];
4822 When vectorizing the i-loop in the above example, the step between
4823 consecutive vector loads is 1, and so the misalignment does not remain
4824 fixed across the execution of the inner-loop, and the realignment cannot
4825 be optimized (as illustrated in the following pseudo vectorized loop):
4827 for (i=0; i<N; i+=4)
4828 for (j=0; j<M; j++){
4829 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4830 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4831 // (assuming that we start from an aligned address).
4834 We therefore have to use the unoptimized realignment scheme:
4836 for (i=0; i<N; i+=4)
4837 for (j=k; j<M; j+=4)
4838 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4839 // that the misalignment of the initial address is
4840 // 0).
4842 The loop can then be vectorized as follows:
4844 for (k=0; k<4; k++){
4845 rt = get_realignment_token (&vp[k]);
4846 for (i=0; i<N; i+=4){
4847 v1 = vp[i+k];
4848 for (j=k; j<M; j+=4){
4849 v2 = vp[i+j+VS-1];
4850 va = REALIGN_LOAD <v1,v2,rt>;
4851 vs += va;
4852 v1 = v2;
4855 } */
4857 if (DR_IS_READ (dr))
4859 bool is_packed = false;
4860 tree type = (TREE_TYPE (DR_REF (dr)));
4862 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4863 && (!targetm.vectorize.builtin_mask_for_load
4864 || targetm.vectorize.builtin_mask_for_load ()))
4866 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4867 if ((nested_in_vect_loop
4868 && (TREE_INT_CST_LOW (DR_STEP (dr))
4869 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4870 || !loop_vinfo)
4871 return dr_explicit_realign;
4872 else
4873 return dr_explicit_realign_optimized;
4875 if (!known_alignment_for_access_p (dr))
4876 is_packed = not_size_aligned (DR_REF (dr));
4878 if ((TYPE_USER_ALIGN (type) && !is_packed)
4879 || targetm.vectorize.
4880 support_vector_misalignment (mode, type,
4881 DR_MISALIGNMENT (dr), is_packed))
4882 /* Can't software pipeline the loads, but can at least do them. */
4883 return dr_unaligned_supported;
4885 else
4887 bool is_packed = false;
4888 tree type = (TREE_TYPE (DR_REF (dr)));
4890 if (!known_alignment_for_access_p (dr))
4891 is_packed = not_size_aligned (DR_REF (dr));
4893 if ((TYPE_USER_ALIGN (type) && !is_packed)
4894 || targetm.vectorize.
4895 support_vector_misalignment (mode, type,
4896 DR_MISALIGNMENT (dr), is_packed))
4897 return dr_unaligned_supported;
4900 /* Unsupported. */
4901 return dr_unaligned_unsupported;
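/* Illustrative sketch (not part of the pass): the load/store
   vectorization code dispatches on the returned value roughly as follows
   (the surrounding names are hypothetical):

     switch (vect_supportable_dr_alignment (dr, false))
       {
       case dr_aligned:
       case dr_unaligned_supported:
	 ... emit the vector MEM_REF directly (a misaligned_move for the
	     unaligned case) ...
	 break;
       case dr_explicit_realign:
       case dr_explicit_realign_optimized:
	 ... use vect_setup_realignment and REALIGN_LOAD as described
	     above ...
	 break;
       default:
	 ... the access cannot be vectorized with this alignment ...
       }  */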