[official-gcc.git] / gcc / tree-vect-data-refs.c
blob 85e7c584c1bab97522b5a83b422db7d3636783e9
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tm_p.h"
40 #include "target.h"
41 #include "predict.h"
42 #include "hard-reg-set.h"
43 #include "input.h"
44 #include "function.h"
45 #include "dominance.h"
46 #include "cfg.h"
47 #include "basic-block.h"
48 #include "gimple-pretty-print.h"
49 #include "tree-ssa-alias.h"
50 #include "internal-fn.h"
51 #include "tree-eh.h"
52 #include "gimple-expr.h"
53 #include "is-a.h"
54 #include "gimple.h"
55 #include "gimplify.h"
56 #include "gimple-iterator.h"
57 #include "gimplify-me.h"
58 #include "gimple-ssa.h"
59 #include "tree-phinodes.h"
60 #include "ssa-iterators.h"
61 #include "stringpool.h"
62 #include "tree-ssanames.h"
63 #include "tree-ssa-loop-ivopts.h"
64 #include "tree-ssa-loop-manip.h"
65 #include "tree-ssa-loop.h"
66 #include "dumpfile.h"
67 #include "cfgloop.h"
68 #include "tree-chrec.h"
69 #include "tree-scalar-evolution.h"
70 #include "tree-vectorizer.h"
71 #include "diagnostic-core.h"
72 #include "hash-map.h"
73 #include "plugin-api.h"
74 #include "ipa-ref.h"
75 #include "cgraph.h"
76 /* Need to include rtl.h, expr.h, etc. for optabs. */
77 #include "expr.h"
78 #include "insn-codes.h"
79 #include "optabs.h"
80 #include "builtins.h"
81 #include "varasm.h"
83 /* Return true if load- or store-lanes optab OPTAB is implemented for
84 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
86 static bool
87 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
88 tree vectype, unsigned HOST_WIDE_INT count)
90 machine_mode mode, array_mode;
91 bool limit_p;
93 mode = TYPE_MODE (vectype);
94 limit_p = !targetm.array_mode_supported_p (mode, count);
95 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
96 MODE_INT, limit_p);
98 if (array_mode == BLKmode)
100 if (dump_enabled_p ())
101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
102 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
103 GET_MODE_NAME (mode), count);
104 return false;
107 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
109 if (dump_enabled_p ())
110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
111 "cannot use %s<%s><%s>\n", name,
112 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
113 return false;
116 if (dump_enabled_p ())
117 dump_printf_loc (MSG_NOTE, vect_location,
118 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
119 GET_MODE_NAME (mode));
121 return true;
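/* Illustrative note: for a 128-bit V4SI vector type and COUNT == 2 this
asks for a 256-bit integer mode to act as the array mode and then for a
vec_load_lanes / vec_store_lanes optab handler converting between that
array mode and V4SImode; whether such a mode and handler exist is
entirely target-dependent. */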
125 /* Return the smallest scalar part of STMT.
126 This is used to determine the vectype of the stmt. We generally set the
127 vectype according to the type of the result (lhs). For stmts whose
128 result-type is different than the type of the arguments (e.g., demotion,
129 promotion), vectype will be reset appropriately (later). Note that we have
130 to visit the smallest datatype in this function, because that determines the
131 VF. If the smallest datatype in the loop is present only as the rhs of a
132 promotion operation - we'd miss it.
133 Such a case, where a variable of this datatype does not appear in the lhs
134 anywhere in the loop, can only occur if it's an invariant: e.g.:
135 'int_x = (int) short_inv', which we'd expect to have been optimized away by
136 invariant motion. However, we cannot rely on invariant motion to always
137 take invariants out of the loop, and so in the case of promotion we also
138 have to check the rhs.
139 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
140 types. */
142 tree
143 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
144 HOST_WIDE_INT *rhs_size_unit)
146 tree scalar_type = gimple_expr_type (stmt);
147 HOST_WIDE_INT lhs, rhs;
149 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
151 if (is_gimple_assign (stmt)
152 && (gimple_assign_cast_p (stmt)
153 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
154 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
155 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
157 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
159 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
160 if (rhs < lhs)
161 scalar_type = rhs_type;
164 *lhs_size_unit = lhs;
165 *rhs_size_unit = rhs;
166 return scalar_type;
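/* Worked example (illustrative, assuming a 2-byte short and a 4-byte int):
for the promotion 'int_x = (int) short_y' the lhs is 4 bytes and the rhs
is 2 bytes, so the function returns the short type and sets
*LHS_SIZE_UNIT = 4, *RHS_SIZE_UNIT = 2. */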
170 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
171 tested at run-time. Return TRUE if DDR was successfully inserted.
172 Return false if versioning is not supported. */
174 static bool
175 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
177 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
179 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
180 return false;
182 if (dump_enabled_p ())
184 dump_printf_loc (MSG_NOTE, vect_location,
185 "mark for run-time aliasing test between ");
186 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
187 dump_printf (MSG_NOTE, " and ");
188 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
189 dump_printf (MSG_NOTE, "\n");
192 if (optimize_loop_nest_for_size_p (loop))
194 if (dump_enabled_p ())
195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
196 "versioning not supported when optimizing"
197 " for size.\n");
198 return false;
201 /* FORNOW: We don't support versioning with outer-loop vectorization. */
202 if (loop->inner)
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
206 "versioning not yet supported for outer-loops.\n");
207 return false;
210 /* FORNOW: We don't support creating runtime alias tests for non-constant
211 step. */
212 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
213 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not yet supported for non-constant "
218 "step\n");
219 return false;
222 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
223 return true;
227 /* Function vect_analyze_data_ref_dependence.
229 Return TRUE if there (might) exist a dependence between a memory-reference
230 DRA and a memory-reference DRB. If versioning for alias can check the
231 dependence at run-time instead, return FALSE. Adjust *MAX_VF according to
232 the data dependence. */
234 static bool
235 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo, int *max_vf)
238 unsigned int i;
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 struct data_reference *dra = DDR_A (ddr);
241 struct data_reference *drb = DDR_B (ddr);
242 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
243 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
244 lambda_vector dist_v;
245 unsigned int loop_depth;
247 /* In loop analysis all data references should be vectorizable. */
248 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
249 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
250 gcc_unreachable ();
252 /* Independent data accesses. */
253 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
254 return false;
256 if (dra == drb
257 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
258 return false;
260 /* Even if we have an anti-dependence then, as the vectorized loop covers at
261 least two scalar iterations, there is always also a true dependence.
262 As the vectorizer does not re-order loads and stores we can ignore
263 the anti-dependence if TBAA can disambiguate both DRs similar to the
264 case with known negative distance anti-dependences (positive
265 distance anti-dependences would violate TBAA constraints). */
266 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
267 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
268 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
269 get_alias_set (DR_REF (drb))))
270 return false;
272 /* Unknown data dependence. */
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
275 /* If user asserted safelen consecutive iterations can be
276 executed concurrently, assume independence. */
277 if (loop->safelen >= 2)
279 if (loop->safelen < *max_vf)
280 *max_vf = loop->safelen;
281 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
282 return false;
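/* Illustrative example: for
#pragma omp simd safelen(4)
for (i = 0; i < n; i++)
a[b[i]]++;
the dependence through a[b[i]] is chrec_dont_know, but loop->safelen
is 4, so it is assumed not to occur within any 4 consecutive iterations
and *max_vf is capped at 4. */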
285 if (STMT_VINFO_GATHER_P (stmtinfo_a)
286 || STMT_VINFO_GATHER_P (stmtinfo_b))
288 if (dump_enabled_p ())
290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
291 "versioning for alias not supported for: "
292 "can't determine dependence between ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (dra));
295 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (drb));
298 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
300 return true;
303 if (dump_enabled_p ())
305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
306 "versioning for alias required: "
307 "can't determine dependence between ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (dra));
310 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (drb));
313 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
316 /* Add to list of ddrs that need to be tested at run-time. */
317 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
320 /* Known data dependence. */
321 if (DDR_NUM_DIST_VECTS (ddr) == 0)
323 /* If user asserted safelen consecutive iterations can be
324 executed concurrently, assume independence. */
325 if (loop->safelen >= 2)
327 if (loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 return false;
333 if (STMT_VINFO_GATHER_P (stmtinfo_a)
334 || STMT_VINFO_GATHER_P (stmtinfo_b))
336 if (dump_enabled_p ())
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias not supported for: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 DR_REF (dra));
343 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (drb));
346 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
348 return true;
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
354 "versioning for alias required: "
355 "bad dist vector for ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
361 /* Add to list of ddrs that need to be tested at run-time. */
362 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
365 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
366 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
368 int dist = dist_v[loop_depth];
370 if (dump_enabled_p ())
371 dump_printf_loc (MSG_NOTE, vect_location,
372 "dependence distance = %d.\n", dist);
374 if (dist == 0)
376 if (dump_enabled_p ())
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance == 0 between ");
380 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
381 dump_printf (MSG_NOTE, " and ");
382 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
383 dump_printf (MSG_NOTE, "\n");
386 /* When we perform grouped accesses and perform implicit CSE
387 by detecting equal accesses and doing disambiguation with
388 runtime alias tests like for
389 .. = a[i];
390 .. = a[i+1];
391 a[i] = ..;
392 a[i+1] = ..;
393 *p = ..;
394 .. = a[i];
395 .. = a[i+1];
396 where we will end up loading { a[i], a[i+1] } once, make
397 sure that inserting group loads before the first load and
398 stores after the last store will do the right thing.
399 Similar for groups like
400 a[i] = ...;
401 ... = a[i];
402 a[i+1] = ...;
403 where loads from the group interleave with the store. */
404 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
405 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
407 gimple earlier_stmt;
408 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
409 if (DR_IS_WRITE
410 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "READ_WRITE dependence in interleaving."
415 "\n");
416 return true;
420 continue;
423 if (dist > 0 && DDR_REVERSED_P (ddr))
425 /* If DDR_REVERSED_P the order of the data-refs in DDR was
426 reversed (to make distance vector positive), and the actual
427 distance is negative. */
428 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
430 "dependence distance negative.\n");
431 /* Record a negative dependence distance to later limit the
432 amount of stmt copying / unrolling we can perform.
433 Only need to handle read-after-write dependence. */
434 if (DR_IS_READ (drb)
435 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
436 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
437 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
438 continue;
441 if (abs (dist) >= 2
442 && abs (dist) < *max_vf)
444 /* The dependence distance requires reduction of the maximal
445 vectorization factor. */
446 *max_vf = abs (dist);
447 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "adjusting maximal vectorization factor to %i\n",
450 *max_vf);
453 if (abs (dist) >= *max_vf)
455 /* Dependence distance does not create dependence, as far as
456 vectorization is concerned, in this case. */
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_NOTE, vect_location,
459 "dependence distance >= VF.\n");
460 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
466 "not vectorized, possible dependence "
467 "between data-refs ");
468 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
469 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
470 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
471 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
474 return true;
477 return false;
480 /* Function vect_analyze_data_ref_dependences.
482 Examine all the data references in the loop, and make sure there do not
483 exist any data dependences between them. Set *MAX_VF according to
484 the maximum vectorization factor the data dependences allow. */
486 bool
487 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
489 unsigned int i;
490 struct data_dependence_relation *ddr;
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_NOTE, vect_location,
494 "=== vect_analyze_data_ref_dependences ===\n");
496 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
497 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
498 &LOOP_VINFO_DDRS (loop_vinfo),
499 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
500 return false;
502 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
503 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
504 return false;
506 return true;
510 /* Function vect_slp_analyze_data_ref_dependence.
512 Return TRUE if there (might) exist a dependence between a memory-reference
513 DRA and a memory-reference DRB that prohibits SLP vectorization of the
514 enclosing basic-block. Return FALSE otherwise. */
517 static bool
518 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
520 struct data_reference *dra = DDR_A (ddr);
521 struct data_reference *drb = DDR_B (ddr);
523 /* We need to check dependences of statements marked as unvectorizable
524 as well, they still can prohibit vectorization. */
526 /* Independent data accesses. */
527 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
528 return false;
530 if (dra == drb)
531 return false;
533 /* Read-read is OK. */
534 if (DR_IS_READ (dra) && DR_IS_READ (drb))
535 return false;
537 /* If dra and drb are part of the same interleaving chain consider
538 them independent. */
539 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
540 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
541 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
542 return false;
544 /* Unknown data dependence. */
545 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
550 "can't determine dependence between ");
551 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
552 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
553 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
554 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
557 else if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "determined dependence between ");
561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
562 dump_printf (MSG_NOTE, " and ");
563 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
564 dump_printf (MSG_NOTE, "\n");
567 /* We do not vectorize basic blocks with write-write dependencies. */
568 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
569 return true;
571 /* If we have a read-write dependence check that the load is before the store.
572 When we vectorize basic blocks, vector load can be only before
573 corresponding scalar load, and vector store can be only after its
574 corresponding scalar store. So the order of the accesses is preserved in
575 case the load is before the store. */
576 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
577 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
579 /* That only holds for load-store pairs taking part in vectorization. */
580 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
581 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
582 return false;
585 return true;
589 /* Function vect_slp_analyze_data_ref_dependences.
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them. */
595 bool
596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
598 struct data_dependence_relation *ddr;
599 unsigned int i;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "=== vect_slp_analyze_data_ref_dependences ===\n");
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
606 &BB_VINFO_DDRS (bb_vinfo),
607 vNULL, true))
608 return false;
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
611 if (vect_slp_analyze_data_ref_dependence (ddr))
612 return false;
614 return true;
618 /* Function vect_compute_data_ref_alignment
620 Compute the misalignment of the data reference DR.
622 Output:
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
630 static bool
631 vect_compute_data_ref_alignment (struct data_reference *dr)
633 gimple stmt = DR_STMT (dr);
634 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
636 struct loop *loop = NULL;
637 tree ref = DR_REF (dr);
638 tree vectype;
639 tree base, base_addr;
640 bool base_aligned;
641 tree misalign;
642 tree aligned_to, alignment;
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE, vect_location,
646 "vect_compute_data_ref_alignment:\n");
648 if (loop_vinfo)
649 loop = LOOP_VINFO_LOOP (loop_vinfo);
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr, -1);
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
657 return true;
659 misalign = DR_INIT (dr);
660 aligned_to = DR_ALIGNED_TO (dr);
661 base_addr = DR_BASE_ADDRESS (dr);
662 vectype = STMT_VINFO_VECTYPE (stmt_info);
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
666 relative to the outer-loop (LOOP). This is ok only if the misalignment
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop is a
669 multiple of the vector size. */
670 if (loop && nested_in_vect_loop_p (loop, stmt))
672 tree step = DR_STEP (dr);
673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
675 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_NOTE, vect_location,
679 "inner step divides the vector-size.\n");
680 misalign = STMT_VINFO_DR_INIT (stmt_info);
681 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
682 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
684 else
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "inner step doesn't divide the vector-size.\n");
689 misalign = NULL_TREE;
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref is a multiple
697 of the vector size. */
698 if (!loop)
700 tree step = DR_STEP (dr);
701 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
703 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "SLP: step doesn't divide the vector-size.\n");
708 misalign = NULL_TREE;
712 base = build_fold_indirect_ref (base_addr);
713 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
715 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
716 || !misalign)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
723 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
725 return true;
728 if ((DECL_P (base)
729 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
730 alignment) >= 0)
731 || (TREE_CODE (base_addr) == SSA_NAME
732 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
733 TREE_TYPE (base_addr)))),
734 alignment) >= 0)
735 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
736 base_aligned = true;
737 else
738 base_aligned = false;
740 if (!base_aligned)
742 /* Do not change the alignment of global variables here if
743 flag_section_anchors is enabled as we already generated
744 RTL for other functions. Most global variables should
745 have been aligned during the IPA increase_alignment pass. */
746 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
747 || (TREE_STATIC (base) && flag_section_anchors))
749 if (dump_enabled_p ())
751 dump_printf_loc (MSG_NOTE, vect_location,
752 "can't force alignment of ref: ");
753 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
754 dump_printf (MSG_NOTE, "\n");
756 return true;
759 /* Force the alignment of the decl.
760 NOTE: This is the only change to the code we make during
761 the analysis phase, before deciding to vectorize the loop. */
762 if (dump_enabled_p ())
764 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
765 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
766 dump_printf (MSG_NOTE, "\n");
769 ((dataref_aux *)dr->aux)->base_decl = base;
770 ((dataref_aux *)dr->aux)->base_misaligned = true;
773 /* If this is a backward running DR then the first access in the larger
774 vectype actually is N-1 elements before the address in the DR.
775 Adjust misalign accordingly. */
776 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
778 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
779 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
780 otherwise we wouldn't be here. */
781 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
782 /* PLUS because DR_STEP was negative. */
783 misalign = size_binop (PLUS_EXPR, misalign, offset);
786 /* Modulo alignment. */
787 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
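/* Worked example (illustrative): for a backward V4SI access with
DR_STEP == -4, DR_INIT == 0 and ALIGNMENT == 16, OFFSET is
3 * -4 == -12, MISALIGN becomes 0 + -12 == -12, and the floor modulo
by 16 yields a final misalignment of 4 bytes. */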
789 if (!tree_fits_uhwi_p (misalign))
791 /* Negative or overflowed misalignment value. */
792 if (dump_enabled_p ())
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "unexpected misalign value\n");
795 return false;
798 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
803 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
804 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
805 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
808 return true;
812 /* Function vect_compute_data_refs_alignment
814 Compute the misalignment of data references in the loop.
815 Return FALSE if a data reference is found that cannot be vectorized. */
817 static bool
818 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
819 bb_vec_info bb_vinfo)
821 vec<data_reference_p> datarefs;
822 struct data_reference *dr;
823 unsigned int i;
825 if (loop_vinfo)
826 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
827 else
828 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
830 FOR_EACH_VEC_ELT (datarefs, i, dr)
831 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
832 && !vect_compute_data_ref_alignment (dr))
834 if (bb_vinfo)
836 /* Mark unsupported statement as unvectorizable. */
837 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
838 continue;
840 else
841 return false;
844 return true;
848 /* Function vect_update_misalignment_for_peel
850 DR - the data reference whose misalignment is to be adjusted.
851 DR_PEEL - the data reference whose misalignment is being made
852 zero in the vector loop by the peel.
853 NPEEL - the number of iterations in the peel loop if the misalignment
854 of DR_PEEL is known at compile time. */
856 static void
857 vect_update_misalignment_for_peel (struct data_reference *dr,
858 struct data_reference *dr_peel, int npeel)
860 unsigned int i;
861 vec<dr_p> same_align_drs;
862 struct data_reference *current_dr;
863 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
864 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
865 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
866 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
868 /* For interleaved data accesses the step in the loop must be multiplied by
869 the size of the interleaving group. */
870 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
871 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
872 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
873 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
875 /* It can be assumed that the data refs with the same alignment as dr_peel
876 are aligned in the vector loop. */
877 same_align_drs
878 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
879 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
881 if (current_dr != dr)
882 continue;
883 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
884 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
885 SET_DR_MISALIGNMENT (dr, 0);
886 return;
889 if (known_alignment_for_access_p (dr)
890 && known_alignment_for_access_p (dr_peel))
892 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
893 int misal = DR_MISALIGNMENT (dr);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
895 misal += negative ? -npeel * dr_size : npeel * dr_size;
896 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
897 SET_DR_MISALIGNMENT (dr, misal);
898 return;
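/* Worked example (illustrative): with a 16-byte vector alignment,
DR_SIZE == 4, a forward step, NPEEL == 1 and a previous misalignment
of 12, the new value is (12 + 1 * 4) & 15 == 0, i.e. DR becomes
aligned by the peel. */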
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
903 SET_DR_MISALIGNMENT (dr, -1);
907 /* Function vect_verify_datarefs_alignment
909 Return TRUE if all data references in the loop can be
910 handled with respect to alignment. */
912 bool
913 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
915 vec<data_reference_p> datarefs;
916 struct data_reference *dr;
917 enum dr_alignment_support supportable_dr_alignment;
918 unsigned int i;
920 if (loop_vinfo)
921 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
922 else
923 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
925 FOR_EACH_VEC_ELT (datarefs, i, dr)
927 gimple stmt = DR_STMT (dr);
928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
930 if (!STMT_VINFO_RELEVANT_P (stmt_info))
931 continue;
933 /* For interleaving, only the alignment of the first access matters.
934 Skip statements marked as not vectorizable. */
935 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
936 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
937 || !STMT_VINFO_VECTORIZABLE (stmt_info))
938 continue;
940 /* Strided loads perform only component accesses, alignment is
941 irrelevant for them. */
942 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
943 continue;
945 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
946 if (!supportable_dr_alignment)
948 if (dump_enabled_p ())
950 if (DR_IS_READ (dr))
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "not vectorized: unsupported unaligned load.");
953 else
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "not vectorized: unsupported unaligned "
956 "store.");
958 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 DR_REF (dr));
960 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
962 return false;
964 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
965 dump_printf_loc (MSG_NOTE, vect_location,
966 "Vectorizing an unaligned access.\n");
968 return true;
971 /* Given a memory reference EXP return whether its alignment is less
972 than its size. */
974 static bool
975 not_size_aligned (tree exp)
977 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
978 return true;
980 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
981 > get_object_alignment (exp));
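/* Illustrative example: a reference of a 128-bit vector type whose
object alignment can only be proven to be 32 bits is considered not
size-aligned (and hence packed for the alignment hooks), while one
known to be aligned to 128 bits or more is size-aligned. */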
984 /* Function vector_alignment_reachable_p
986 Return true if vector alignment for DR is reachable by peeling
987 a few loop iterations. Return false otherwise. */
989 static bool
990 vector_alignment_reachable_p (struct data_reference *dr)
992 gimple stmt = DR_STMT (dr);
993 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
994 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
998 /* For interleaved access we peel only if the number of iterations in
999 the prolog loop (VF - misalignment) is a multiple of the
1000 number of interleaved accesses. */
1001 int elem_size, mis_in_elements;
1002 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1004 /* FORNOW: handle only known alignment. */
1005 if (!known_alignment_for_access_p (dr))
1006 return false;
1008 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1009 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1011 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1012 return false;
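/* Illustrative example: for V4SI (4 elements of 4 bytes) misaligned by
8 bytes, mis_in_elements is 2 and the prologue would peel 4 - 2 == 2
iterations; that suits a group of 2 accesses, but not a group of 3,
since (4 - 2) % 3 != 0. */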
1015 /* If misalignment is known at the compile time then allow peeling
1016 only if natural alignment is reachable through peeling. */
1017 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1019 HOST_WIDE_INT elmsize =
1020 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1021 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1025 dump_printf (MSG_NOTE,
1026 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1028 if (DR_MISALIGNMENT (dr) % elmsize)
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "data size does not divide the misalignment.\n");
1033 return false;
1037 if (!known_alignment_for_access_p (dr))
1039 tree type = TREE_TYPE (DR_REF (dr));
1040 bool is_packed = not_size_aligned (DR_REF (dr));
1041 if (dump_enabled_p ())
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043 "Unknown misalignment, is_packed = %d\n",is_packed);
1044 if ((TYPE_USER_ALIGN (type) && !is_packed)
1045 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1046 return true;
1047 else
1048 return false;
1051 return true;
1055 /* Calculate the cost of the memory access represented by DR. */
1057 static void
1058 vect_get_data_access_cost (struct data_reference *dr,
1059 unsigned int *inside_cost,
1060 unsigned int *outside_cost,
1061 stmt_vector_for_cost *body_cost_vec)
1063 gimple stmt = DR_STMT (dr);
1064 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1065 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1066 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1067 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1068 int ncopies = vf / nunits;
1070 if (DR_IS_READ (dr))
1071 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1072 NULL, body_cost_vec, false);
1073 else
1074 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "vect_get_data_access_cost: inside_cost = %d, "
1079 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1083 /* Insert DR into peeling hash table with NPEEL as key. */
1085 static void
1086 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1087 int npeel)
1089 struct _vect_peel_info elem, *slot;
1090 _vect_peel_info **new_slot;
1091 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1093 elem.npeel = npeel;
1094 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1095 if (slot)
1096 slot->count++;
1097 else
1099 slot = XNEW (struct _vect_peel_info);
1100 slot->npeel = npeel;
1101 slot->dr = dr;
1102 slot->count = 1;
1103 new_slot
1104 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1105 *new_slot = slot;
1108 if (!supportable_dr_alignment
1109 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1110 slot->count += VECT_MAX_COST;
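/* Illustrative example: if two data references both become aligned by
peeling one scalar iteration, the NPEEL == 1 slot ends up with
count == 2; under the unlimited cost model an access whose misaligned
form is unsupported additionally inflates its slot by VECT_MAX_COST,
so vect_peeling_hash_get_most_frequent will strongly prefer aligning
it. */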
1114 /* Traverse peeling hash table to find peeling option that aligns maximum
1115 number of data accesses. */
1118 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1119 _vect_peel_extended_info *max)
1121 vect_peel_info elem = *slot;
1123 if (elem->count > max->peel_info.count
1124 || (elem->count == max->peel_info.count
1125 && max->peel_info.npeel > elem->npeel))
1127 max->peel_info.npeel = elem->npeel;
1128 max->peel_info.count = elem->count;
1129 max->peel_info.dr = elem->dr;
1132 return 1;
1136 /* Traverse peeling hash table and calculate cost for each peeling option.
1137 Find the one with the lowest cost. */
1140 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1141 _vect_peel_extended_info *min)
1143 vect_peel_info elem = *slot;
1144 int save_misalignment, dummy;
1145 unsigned int inside_cost = 0, outside_cost = 0, i;
1146 gimple stmt = DR_STMT (elem->dr);
1147 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1149 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1150 struct data_reference *dr;
1151 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1152 int single_iter_cost;
1154 prologue_cost_vec.create (2);
1155 body_cost_vec.create (2);
1156 epilogue_cost_vec.create (2);
1158 FOR_EACH_VEC_ELT (datarefs, i, dr)
1160 stmt = DR_STMT (dr);
1161 stmt_info = vinfo_for_stmt (stmt);
1162 /* For interleaving, only the alignment of the first access
1163 matters. */
1164 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1165 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1166 continue;
1168 save_misalignment = DR_MISALIGNMENT (dr);
1169 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1170 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1171 &body_cost_vec);
1172 SET_DR_MISALIGNMENT (dr, save_misalignment);
1175 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1176 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1177 &dummy, single_iter_cost,
1178 &prologue_cost_vec,
1179 &epilogue_cost_vec);
1181 /* Prologue and epilogue costs are added to the target model later.
1182 These costs depend only on the scalar iteration cost, the
1183 number of peeling iterations finally chosen, and the number of
1184 misaligned statements. So discard the information found here. */
1185 prologue_cost_vec.release ();
1186 epilogue_cost_vec.release ();
1188 if (inside_cost < min->inside_cost
1189 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1191 min->inside_cost = inside_cost;
1192 min->outside_cost = outside_cost;
1193 min->body_cost_vec.release ();
1194 min->body_cost_vec = body_cost_vec;
1195 min->peel_info.dr = elem->dr;
1196 min->peel_info.npeel = elem->npeel;
1198 else
1199 body_cost_vec.release ();
1201 return 1;
1205 /* Choose best peeling option by traversing peeling hash table and either
1206 choosing an option with the lowest cost (if cost model is enabled) or the
1207 option that aligns as many accesses as possible. */
1209 static struct data_reference *
1210 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1211 unsigned int *npeel,
1212 stmt_vector_for_cost *body_cost_vec)
1214 struct _vect_peel_extended_info res;
1216 res.peel_info.dr = NULL;
1217 res.body_cost_vec = stmt_vector_for_cost ();
1219 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1221 res.inside_cost = INT_MAX;
1222 res.outside_cost = INT_MAX;
1223 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1224 ->traverse <_vect_peel_extended_info *,
1225 vect_peeling_hash_get_lowest_cost> (&res);
1227 else
1229 res.peel_info.count = 0;
1230 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1231 ->traverse <_vect_peel_extended_info *,
1232 vect_peeling_hash_get_most_frequent> (&res);
1235 *npeel = res.peel_info.npeel;
1236 *body_cost_vec = res.body_cost_vec;
1237 return res.peel_info.dr;
1241 /* Function vect_enhance_data_refs_alignment
1243 This pass will use loop versioning and loop peeling in order to enhance
1244 the alignment of data references in the loop.
1246 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1247 original loop is to be vectorized. Any other loops that are created by
1248 the transformations performed in this pass - are not supposed to be
1249 vectorized. This restriction will be relaxed.
1251 This pass will require a cost model to guide it whether to apply peeling
1252 or versioning or a combination of the two. For example, the scheme that
1253 Intel uses when given a loop with several memory accesses is as follows:
1254 choose one memory access ('p') whose alignment you want to force by doing
1255 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1256 other accesses are not necessarily aligned, or (2) use loop versioning to
1257 generate one loop in which all accesses are aligned, and another loop in
1258 which only 'p' is necessarily aligned.
1260 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1261 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1262 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1264 Devising a cost model is the most critical aspect of this work. It will
1265 guide us on which access to peel for, whether to use loop versioning, how
1266 many versions to create, etc. The cost model will probably consist of
1267 generic considerations as well as target specific considerations (on
1268 powerpc for example, misaligned stores are more painful than misaligned
1269 loads).
1271 Here are the general steps involved in alignment enhancements:
1273 -- original loop, before alignment analysis:
1274 for (i=0; i<N; i++){
1275 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1276 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1279 -- After vect_compute_data_refs_alignment:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = 3
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1285 -- Possibility 1: we do loop versioning:
1286 if (p is aligned) {
1287 for (i=0; i<N; i++){ # loop 1A
1288 x = q[i]; # DR_MISALIGNMENT(q) = 3
1289 p[i] = y; # DR_MISALIGNMENT(p) = 0
1292 else {
1293 for (i=0; i<N; i++){ # loop 1B
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1299 -- Possibility 2: we do loop peeling:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1304 for (i = 3; i < N; i++){ # loop 2A
1305 x = q[i]; # DR_MISALIGNMENT(q) = 0
1306 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1309 -- Possibility 3: combination of loop peeling and versioning:
1310 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1311 x = q[i];
1312 p[i] = y;
1314 if (p is aligned) {
1315 for (i = 3; i<N; i++){ # loop 3A
1316 x = q[i]; # DR_MISALIGNMENT(q) = 0
1317 p[i] = y; # DR_MISALIGNMENT(p) = 0
1320 else {
1321 for (i = 3; i<N; i++){ # loop 3B
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1327 These loops are later passed to loop_transform to be vectorized. The
1328 vectorizer will use the alignment information to guide the transformation
1329 (whether to generate regular loads/stores, or with special handling for
1330 misalignment). */
1332 bool
1333 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1335 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1336 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1337 enum dr_alignment_support supportable_dr_alignment;
1338 struct data_reference *dr0 = NULL, *first_store = NULL;
1339 struct data_reference *dr;
1340 unsigned int i, j;
1341 bool do_peeling = false;
1342 bool do_versioning = false;
1343 bool stat;
1344 gimple stmt;
1345 stmt_vec_info stmt_info;
1346 unsigned int npeel = 0;
1347 bool all_misalignments_unknown = true;
1348 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1349 unsigned possible_npeel_number = 1;
1350 tree vectype;
1351 unsigned int nelements, mis, same_align_drs_max = 0;
1352 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_NOTE, vect_location,
1356 "=== vect_enhance_data_refs_alignment ===\n");
1358 /* While cost model enhancements are expected in the future, the high level
1359 view of the code at this time is as follows:
1361 A) If there is a misaligned access then see if peeling to align
1362 this access can make all data references satisfy
1363 vect_supportable_dr_alignment. If so, update data structures
1364 as needed and return true.
1366 B) If peeling wasn't possible and there is a data reference with an
1367 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1368 then see if loop versioning checks can be used to make all data
1369 references satisfy vect_supportable_dr_alignment. If so, update
1370 data structures as needed and return true.
1372 C) If neither peeling nor versioning were successful then return false if
1373 any data reference does not satisfy vect_supportable_dr_alignment.
1375 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1377 Note, Possibility 3 above (which is peeling and versioning together) is not
1378 being done at this time. */
1380 /* (1) Peeling to force alignment. */
1382 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1383 Considerations:
1384 + How many accesses will become aligned due to the peeling
1385 - How many accesses will become unaligned due to the peeling,
1386 and the cost of misaligned accesses.
1387 - The cost of peeling (the extra runtime checks, the increase
1388 in code size). */
1390 FOR_EACH_VEC_ELT (datarefs, i, dr)
1392 stmt = DR_STMT (dr);
1393 stmt_info = vinfo_for_stmt (stmt);
1395 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1396 continue;
1398 /* For interleaving, only the alignment of the first access
1399 matters. */
1400 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1401 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1402 continue;
1404 /* For invariant accesses there is nothing to enhance. */
1405 if (integer_zerop (DR_STEP (dr)))
1406 continue;
1408 /* Strided loads perform only component accesses, alignment is
1409 irrelevant for them. */
1410 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1411 continue;
1413 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1414 do_peeling = vector_alignment_reachable_p (dr);
1415 if (do_peeling)
1417 if (known_alignment_for_access_p (dr))
1419 unsigned int npeel_tmp;
1420 bool negative = tree_int_cst_compare (DR_STEP (dr),
1421 size_zero_node) < 0;
1423 /* Save info about DR in the hash table. */
1424 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1425 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1426 = new hash_table<peel_info_hasher> (1);
1428 vectype = STMT_VINFO_VECTYPE (stmt_info);
1429 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1430 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1431 TREE_TYPE (DR_REF (dr))));
1432 npeel_tmp = (negative
1433 ? (mis - nelements) : (nelements - mis))
1434 & (nelements - 1);
1436 /* For multiple types, it is possible that the bigger type access
1437 will have more than one peeling option. E.g., a loop with two
1438 types: one of size (vector size / 4), and the other one of
1439 size (vector size / 8). The vectorization factor will be 8. If both
1440 accesses are misaligned by 3, the first one needs one scalar
1441 iteration to be aligned, and the second one needs 5. But the
1442 first one will also be aligned by peeling 5 scalar
1443 iterations, and in that case both accesses will be aligned.
1444 Hence, except for the immediate peeling amount, we also want
1445 to try to add a full vector size, while we don't exceed the
1446 vectorization factor.
1447 We do this automatically for the cost model, since we calculate the cost
1448 for every peeling option. */
1449 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1450 possible_npeel_number = vf / nelements;
1452 /* Handle the aligned case. We may decide to align some other
1453 access, making DR unaligned. */
1454 if (DR_MISALIGNMENT (dr) == 0)
1456 npeel_tmp = 0;
1457 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1458 possible_npeel_number++;
1461 for (j = 0; j < possible_npeel_number; j++)
1463 gcc_assert (npeel_tmp <= vf);
1464 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1465 npeel_tmp += nelements;
1468 all_misalignments_unknown = false;
1469 /* Data-ref that was chosen for the case that all the
1470 misalignments are unknown is not relevant anymore, since we
1471 have a data-ref with known alignment. */
1472 dr0 = NULL;
1474 else
1476 /* If we don't know any misalignment values, we prefer
1477 peeling for the data-ref that has the maximum number of data-refs
1478 with the same alignment, unless the target prefers to align
1479 stores over loads.
1480 if (all_misalignments_unknown)
1482 unsigned same_align_drs
1483 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1484 if (!dr0
1485 || same_align_drs_max < same_align_drs)
1487 same_align_drs_max = same_align_drs;
1488 dr0 = dr;
1490 /* For data-refs with the same number of related
1491 accesses prefer the one where the misalign
1492 computation will be invariant in the outermost loop. */
1493 else if (same_align_drs_max == same_align_drs)
1495 struct loop *ivloop0, *ivloop;
1496 ivloop0 = outermost_invariant_loop_for_expr
1497 (loop, DR_BASE_ADDRESS (dr0));
1498 ivloop = outermost_invariant_loop_for_expr
1499 (loop, DR_BASE_ADDRESS (dr));
1500 if ((ivloop && !ivloop0)
1501 || (ivloop && ivloop0
1502 && flow_loop_nested_p (ivloop, ivloop0)))
1503 dr0 = dr;
1506 if (!first_store && DR_IS_WRITE (dr))
1507 first_store = dr;
1510 /* If there are both known and unknown misaligned accesses in the
1511 loop, we choose peeling amount according to the known
1512 accesses. */
1513 if (!supportable_dr_alignment)
1515 dr0 = dr;
1516 if (!first_store && DR_IS_WRITE (dr))
1517 first_store = dr;
1521 else
1523 if (!aligned_access_p (dr))
1525 if (dump_enabled_p ())
1526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527 "vector alignment may not be reachable\n");
1528 break;
1533 /* Check if we can possibly peel the loop. */
1534 if (!vect_can_advance_ivs_p (loop_vinfo)
1535 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1536 do_peeling = false;
1538 /* If we don't know how many times the peeling loop will run
1539 assume it will run VF-1 times and disable peeling if the remaining
1540 iters are less than the vectorization factor. */
1541 if (do_peeling
1542 && all_misalignments_unknown
1543 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1545 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1546 do_peeling = false;
1548 if (do_peeling
1549 && all_misalignments_unknown
1550 && vect_supportable_dr_alignment (dr0, false))
1552 /* Check if the target requires us to prefer stores over loads, i.e., if
1553 misaligned stores are more expensive than misaligned loads (taking
1554 drs with same alignment into account). */
1555 if (first_store && DR_IS_READ (dr0))
1557 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1558 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1559 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1560 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1561 stmt_vector_for_cost dummy;
1562 dummy.create (2);
1564 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1565 &dummy);
1566 vect_get_data_access_cost (first_store, &store_inside_cost,
1567 &store_outside_cost, &dummy);
1569 dummy.release ();
1571 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1572 aligning the load DR0). */
1573 load_inside_penalty = store_inside_cost;
1574 load_outside_penalty = store_outside_cost;
1575 for (i = 0;
1576 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577 DR_STMT (first_store))).iterate (i, &dr);
1578 i++)
1579 if (DR_IS_READ (dr))
1581 load_inside_penalty += load_inside_cost;
1582 load_outside_penalty += load_outside_cost;
1584 else
1586 load_inside_penalty += store_inside_cost;
1587 load_outside_penalty += store_outside_cost;
1590 /* Calculate the penalty for leaving DR0 unaligned (by
1591 aligning the FIRST_STORE). */
1592 store_inside_penalty = load_inside_cost;
1593 store_outside_penalty = load_outside_cost;
1594 for (i = 0;
1595 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1596 DR_STMT (dr0))).iterate (i, &dr);
1597 i++)
1598 if (DR_IS_READ (dr))
1600 store_inside_penalty += load_inside_cost;
1601 store_outside_penalty += load_outside_cost;
1603 else
1605 store_inside_penalty += store_inside_cost;
1606 store_outside_penalty += store_outside_cost;
1609 if (load_inside_penalty > store_inside_penalty
1610 || (load_inside_penalty == store_inside_penalty
1611 && load_outside_penalty > store_outside_penalty))
1612 dr0 = first_store;
1615 /* In case there are only loads with different unknown misalignments, use
1616 peeling only if it may help to align other accesses in the loop. */
1617 if (!first_store
1618 && !STMT_VINFO_SAME_ALIGN_REFS (
1619 vinfo_for_stmt (DR_STMT (dr0))).length ()
1620 && vect_supportable_dr_alignment (dr0, false)
1621 != dr_unaligned_supported)
1622 do_peeling = false;
1625 if (do_peeling && !dr0)
1627 /* Peeling is possible, but there is no data access that is not supported
1628 unless aligned. So we try to choose the best possible peeling. */
1630 /* We should get here only if there are drs with known misalignment. */
1631 gcc_assert (!all_misalignments_unknown);
1633 /* Choose the best peeling from the hash table. */
1634 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1635 &body_cost_vec);
1636 if (!dr0 || !npeel)
1637 do_peeling = false;
1639 /* If peeling by npeel will result in a remaining loop not iterating
1640 enough to be vectorized then do not peel. */
1641 if (do_peeling
1642 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1643 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1644 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1645 do_peeling = false;
1648 if (do_peeling)
1650 stmt = DR_STMT (dr0);
1651 stmt_info = vinfo_for_stmt (stmt);
1652 vectype = STMT_VINFO_VECTYPE (stmt_info);
1653 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1655 if (known_alignment_for_access_p (dr0))
1657 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1658 size_zero_node) < 0;
1659 if (!npeel)
1661 /* Since it's known at compile time, compute the number of
1662 iterations in the peeled loop (the peeling factor) for use in
1663 updating DR_MISALIGNMENT values. The peeling factor is the
1664 vectorization factor minus the misalignment as an element
1665 count. */
1666 mis = DR_MISALIGNMENT (dr0);
1667 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1668 npeel = ((negative ? mis - nelements : nelements - mis)
1669 & (nelements - 1));
1672 /* For interleaved data access every iteration accesses all the
1673 members of the group, therefore we divide the number of iterations
1674 by the group size. */
1675 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1676 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1677 npeel /= GROUP_SIZE (stmt_info);
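/* Worked example (illustrative): for a group of 2 V4SI accesses
misaligned by 8 bytes, mis == 2 elements and npeel == (4 - 2) & 3 == 2;
dividing by the group size gives 1 peeled scalar iteration, since each
scalar iteration accesses both group members. */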
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "Try peeling by %d\n", npeel);
1684 /* Ensure that all data refs can be vectorized after the peel. */
1685 FOR_EACH_VEC_ELT (datarefs, i, dr)
1687 int save_misalignment;
1689 if (dr == dr0)
1690 continue;
1692 stmt = DR_STMT (dr);
1693 stmt_info = vinfo_for_stmt (stmt);
1694 /* For interleaving, only the alignment of the first access
1695 matters. */
1696 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1697 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1698 continue;
1700 /* Strided loads perform only component accesses, alignment is
1701 irrelevant for them. */
1702 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1703 continue;
1705 save_misalignment = DR_MISALIGNMENT (dr);
1706 vect_update_misalignment_for_peel (dr, dr0, npeel);
1707 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1708 SET_DR_MISALIGNMENT (dr, save_misalignment);
1710 if (!supportable_dr_alignment)
1712 do_peeling = false;
1713 break;
1717 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1719 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1720 if (!stat)
1721 do_peeling = false;
1722 else
1724 body_cost_vec.release ();
1725 return stat;
1729 if (do_peeling)
1731 unsigned max_allowed_peel
1732 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1733 if (max_allowed_peel != (unsigned)-1)
1735 unsigned max_peel = npeel;
1736 if (max_peel == 0)
1738 gimple dr_stmt = DR_STMT (dr0);
1739 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1740 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1741 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1743 if (max_peel > max_allowed_peel)
1745 do_peeling = false;
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "Disable peeling, max peels reached: %d\n", max_peel);
1753 if (do_peeling)
1755 stmt_info_for_cost *si;
1756 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1758 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1759 If the misalignment of DR_i is identical to that of dr0 then set
1760 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1761 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1762 by the peeling factor times the element size of DR_i (MOD the
1763 vectorization factor times the size). Otherwise, the
1764 misalignment of DR_i must be set to unknown. */
1765 FOR_EACH_VEC_ELT (datarefs, i, dr)
1766 if (dr != dr0)
1767 vect_update_misalignment_for_peel (dr, dr0, npeel);
1769 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1770 if (npeel)
1771 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1772 else
1773 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1774 = DR_MISALIGNMENT (dr0);
1775 SET_DR_MISALIGNMENT (dr0, 0);
1776 if (dump_enabled_p ())
1778 dump_printf_loc (MSG_NOTE, vect_location,
1779 "Alignment of access forced using peeling.\n");
1780 dump_printf_loc (MSG_NOTE, vect_location,
1781 "Peeling for alignment will be applied.\n");
1783 /* We've delayed passing the inside-loop peeling costs to the
1784 target cost model until we were sure peeling would happen.
1785 Do so now. */
1786 if (body_cost_vec.exists ())
1788 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1790 struct _stmt_vec_info *stmt_info
1791 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1792 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1793 si->misalign, vect_body);
1795 body_cost_vec.release ();
1798 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1799 gcc_assert (stat);
1800 return stat;
1804 body_cost_vec.release ();
1806 /* (2) Versioning to force alignment. */
1808 /* Try versioning if:
1809 1) the loop is optimized for speed, and
1810 2) there is at least one unsupported misaligned data ref with an unknown
1811 misalignment, and
1812 3) all misaligned data refs with a known misalignment are supported, and
1813 4) the number of runtime alignment checks is within reason. */
1815 do_versioning =
1816 optimize_loop_nest_for_speed_p (loop)
1817 && (!loop->inner); /* FORNOW */
1819 if (do_versioning)
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1823 stmt = DR_STMT (dr);
1824 stmt_info = vinfo_for_stmt (stmt);
1826 /* For interleaving, only the alignment of the first access
1827 matters. */
1828 if (aligned_access_p (dr)
1829 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1830 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1831 continue;
1833 /* Strided loads perform only component accesses; alignment is
1834 irrelevant for them. */
1835 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1836 continue;
1838 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1840 if (!supportable_dr_alignment)
1842 gimple stmt;
1843 int mask;
1844 tree vectype;
1846 if (known_alignment_for_access_p (dr)
1847 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1848 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1850 do_versioning = false;
1851 break;
1854 stmt = DR_STMT (dr);
1855 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1856 gcc_assert (vectype);
1858 /* The rightmost bits of an aligned address must be zeros.
1859 Construct the mask needed for this test. For example,
1860 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1861 mask must be 15 = 0xf. */
1862 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
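/* Illustrative sketch (an assumption about the runtime guard that is
   emitted later when the loop is versioned, not code from this file):
   for V4SI, mask == 0xf and the versioned loop is guarded roughly by

     if ((((uintptr_t) addr_1 | ... | (uintptr_t) addr_n) & 0xf) == 0)
       ... vectorized loop ...
     else
       ... original scalar loop ...

   where addr_1 .. addr_n are the addresses of the statements pushed
   onto LOOP_VINFO_MAY_MISALIGN_STMTS below.  */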
1864 /* FORNOW: use the same mask to test all potentially unaligned
1865 references in the loop. The vectorizer currently supports
1866 a single vector size, see the reference to
1867 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1868 vectorization factor is computed. */
1869 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1870 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1871 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1872 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1873 DR_STMT (dr));
1877 /* Versioning requires at least one misaligned data reference. */
1878 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1879 do_versioning = false;
1880 else if (!do_versioning)
1881 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1884 if (do_versioning)
1886 vec<gimple> may_misalign_stmts
1887 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1888 gimple stmt;
1890 /* It can now be assumed that the data references in the statements
1891 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1892 of the loop being vectorized. */
1893 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1896 dr = STMT_VINFO_DATA_REF (stmt_info);
1897 SET_DR_MISALIGNMENT (dr, 0);
1898 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using versioning.\n");
1903 if (dump_enabled_p ())
1904 dump_printf_loc (MSG_NOTE, vect_location,
1905 "Versioning for alignment will be applied.\n");
1907 /* Peeling and versioning can't be done together at this time. */
1908 gcc_assert (! (do_peeling && do_versioning));
1910 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1911 gcc_assert (stat);
1912 return stat;
1915 /* This point is reached if neither peeling nor versioning is being done. */
1916 gcc_assert (! (do_peeling || do_versioning));
1918 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1919 return stat;
1923 /* Function vect_find_same_alignment_drs.
1925 Update group and alignment relations according to the chosen
1926 vectorization factor. */
1928 static void
1929 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1930 loop_vec_info loop_vinfo)
1932 unsigned int i;
1933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1934 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935 struct data_reference *dra = DDR_A (ddr);
1936 struct data_reference *drb = DDR_B (ddr);
1937 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1938 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1939 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1940 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1941 lambda_vector dist_v;
1942 unsigned int loop_depth;
1944 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1945 return;
1947 if (dra == drb)
1948 return;
1950 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1951 return;
1953 /* Loop-based vectorization and known data dependence. */
1954 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1955 return;
1957 /* Data-dependence analysis reports a distance vector of zero
1958 for data-references that overlap only in the first iteration
1959 but have steps of different sign (see PR45764).
1960 So as a sanity check require equal DR_STEP. */
1961 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1962 return;
1964 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1965 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1967 int dist = dist_v[loop_depth];
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "dependence distance = %d.\n", dist);
1973 /* Same loop iteration. */
1974 if (dist == 0
1975 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1977 /* Two references with distance zero have the same alignment. */
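/* For example (illustrative): with a vectorization factor of 4, the
   accesses a[i] and a[i+4] have a dependence distance of 4, which is
   0 modulo the vectorization factor, so they too are recorded as
   having the same alignment.  */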
1978 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1979 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1980 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "accesses have the same alignment.\n");
1984 dump_printf (MSG_NOTE,
1985 "dependence distance modulo vf == 0 between ");
1986 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1987 dump_printf (MSG_NOTE, " and ");
1988 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1989 dump_printf (MSG_NOTE, "\n");
1996 /* Function vect_analyze_data_refs_alignment
1998 Analyze the alignment of the data-references in the loop.
1999 Return FALSE if a data reference is found that cannot be vectorized. */
2001 bool
2002 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2003 bb_vec_info bb_vinfo)
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE, vect_location,
2007 "=== vect_analyze_data_refs_alignment ===\n");
2009 /* Mark groups of data references with same alignment using
2010 data dependence information. */
2011 if (loop_vinfo)
2013 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2014 struct data_dependence_relation *ddr;
2015 unsigned int i;
2017 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2018 vect_find_same_alignment_drs (ddr, loop_vinfo);
2021 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2023 if (dump_enabled_p ())
2024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025 "not vectorized: can't calculate alignment "
2026 "for data ref.\n");
2027 return false;
2030 return true;
2034 /* Analyze groups of accesses: check that DR belongs to a group of
2035 accesses of legal size, step, etc. Detect gaps, single element
2036 interleaving, and other special cases. Set grouped access info.
2037 Collect groups of strided stores for further use in SLP analysis. */
2039 static bool
2040 vect_analyze_group_access (struct data_reference *dr)
2042 tree step = DR_STEP (dr);
2043 tree scalar_type = TREE_TYPE (DR_REF (dr));
2044 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2045 gimple stmt = DR_STMT (dr);
2046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2047 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2048 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2049 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2050 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2051 bool slp_impossible = false;
2052 struct loop *loop = NULL;
2054 if (loop_vinfo)
2055 loop = LOOP_VINFO_LOOP (loop_vinfo);
2057 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2058 size of the interleaving group (including gaps). */
2059 groupsize = absu_hwi (dr_step) / type_size;
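/* For example (illustrative): for the pair of 4-byte accesses a[2*i]
   and a[2*i+1], DR_STEP is 8 bytes and the type size is 4, so
   groupsize = 8 / 4 = 2.  */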
2061 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2062 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2064 /* Check whether this DR is part of an interleaving group, and is a single
2065 element of the group that is accessed in the loop. */
2067 /* Gaps are supported only for loads. STEP must be a multiple of the type
2068 size. The size of the group must be a power of 2. */
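/* For example (illustrative): a load of a[4*i] with 4-byte elements
   has step 16, so it forms a single-element group of size 4 with a
   gap of 3 elements after it; that gap is why a scalar epilogue loop
   is required below.  */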
2069 if (DR_IS_READ (dr)
2070 && (dr_step % type_size) == 0
2071 && groupsize > 0
2072 && exact_log2 (groupsize) != -1)
2074 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2075 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2076 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "Detected single element interleaving ");
2080 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2081 dump_printf (MSG_NOTE, " step ");
2082 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2083 dump_printf (MSG_NOTE, "\n");
2086 if (loop_vinfo)
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE, vect_location,
2090 "Data access with gaps requires scalar "
2091 "epilogue loop\n");
2092 if (loop->inner)
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "Peeling for outer loop is not"
2097 " supported\n");
2098 return false;
2101 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2104 return true;
2107 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2110 "not consecutive access ");
2111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2112 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2115 if (bb_vinfo)
2117 /* Mark the statement as unvectorizable. */
2118 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2119 return true;
2122 return false;
2125 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2127 /* First stmt in the interleaving chain. Check the chain. */
2128 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2129 struct data_reference *data_ref = dr;
2130 unsigned int count = 1;
2131 tree prev_init = DR_INIT (data_ref);
2132 gimple prev = stmt;
2133 HOST_WIDE_INT diff, gaps = 0;
2134 unsigned HOST_WIDE_INT count_in_bytes;
2136 while (next)
2138 /* Skip identical data-refs. When two or more stmts share a
2139 data-ref (supported only for loads), we vectorize only the first
2140 stmt, and the rest get their vectorized loads from the first
2141 one. */
2142 if (!tree_int_cst_compare (DR_INIT (data_ref),
2143 DR_INIT (STMT_VINFO_DATA_REF (
2144 vinfo_for_stmt (next)))))
2146 if (DR_IS_WRITE (data_ref))
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "Two store stmts share the same dr.\n");
2151 return false;
2154 /* For loads, reuse the same data-ref load. */
2155 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2157 prev = next;
2158 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2159 continue;
2162 prev = next;
2163 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2165 /* All group members have the same STEP by construction. */
2166 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2168 /* Check that the distance between two accesses is equal to the type
2169 size. Otherwise, we have gaps. */
2170 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2171 - TREE_INT_CST_LOW (prev_init)) / type_size;
2172 if (diff != 1)
2174 /* FORNOW: SLP of accesses with gaps is not supported. */
2175 slp_impossible = true;
2176 if (DR_IS_WRITE (data_ref))
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "interleaved store with gaps\n");
2181 return false;
2184 gaps += diff - 1;
2187 last_accessed_element += diff;
2189 /* Store the gap from the previous member of the group. If there is no
2190 gap in the access, GROUP_GAP is always 1. */
2191 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2193 prev_init = DR_INIT (data_ref);
2194 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2195 /* Count the number of data-refs in the chain. */
2196 count++;
2199 /* COUNT is the number of accesses found; we multiply it by the size of
2200 the type to get COUNT_IN_BYTES. */
2201 count_in_bytes = type_size * count;
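/* For example (illustrative): three 4-byte members with a single
   one-element gap between two of them give count_in_bytes == 12 and
   gaps == 1, so the check below requires |step| >= 12 + 1 * 4 = 16.  */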
2203 /* Check that the size of the interleaving (including gaps) is not
2204 greater than STEP. */
2205 if (dr_step != 0
2206 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2208 if (dump_enabled_p ())
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "interleaving size is greater than step for ");
2212 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2213 DR_REF (dr));
2214 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2216 return false;
2219 /* Check that the size of the interleaving is equal to STEP for stores,
2220 i.e., that there are no gaps. */
2221 if (dr_step != 0
2222 && absu_hwi (dr_step) != count_in_bytes)
2224 if (DR_IS_READ (dr))
2226 slp_impossible = true;
2227 /* There is a gap after the last load in the group. This gap is the
2228 difference between the groupsize and the number of elements.
2229 When there is no gap, this difference should be 0. */
2230 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2232 else
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "interleaved store with gaps\n");
2237 return false;
2241 /* Check that STEP is a multiple of type size. */
2242 if (dr_step != 0
2243 && (dr_step % type_size) != 0)
2245 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "step is not a multiple of type size: step ");
2249 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2250 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2251 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2252 TYPE_SIZE_UNIT (scalar_type));
2253 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2255 return false;
2258 if (groupsize == 0)
2259 groupsize = count;
2261 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_NOTE, vect_location,
2264 "Detected interleaving of size %d\n", (int)groupsize);
2266 /* SLP: create an SLP data structure for every interleaving group of
2267 stores for further analysis in vect_analyze_slp. */
2268 if (DR_IS_WRITE (dr) && !slp_impossible)
2270 if (loop_vinfo)
2271 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2272 if (bb_vinfo)
2273 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2276 /* There is a gap in the end of the group. */
2277 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2279 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2281 "Data access with gaps requires scalar "
2282 "epilogue loop\n");
2283 if (loop->inner)
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "Peeling for outer loop is not supported\n");
2288 return false;
2291 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2295 return true;
2299 /* Analyze the access pattern of the data-reference DR.
2300 In case of non-consecutive accesses call vect_analyze_group_access() to
2301 analyze groups of accesses. */
2303 static bool
2304 vect_analyze_data_ref_access (struct data_reference *dr)
2306 tree step = DR_STEP (dr);
2307 tree scalar_type = TREE_TYPE (DR_REF (dr));
2308 gimple stmt = DR_STMT (dr);
2309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2310 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2311 struct loop *loop = NULL;
2313 if (loop_vinfo)
2314 loop = LOOP_VINFO_LOOP (loop_vinfo);
2316 if (loop_vinfo && !step)
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "bad data-ref access in loop\n");
2321 return false;
2324 /* Allow invariant loads in loops that are not nested. */
2325 if (loop_vinfo && integer_zerop (step))
2327 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2328 if (nested_in_vect_loop_p (loop, stmt))
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location,
2332 "zero step in inner loop of nest\n");
2333 return false;
2335 return DR_IS_READ (dr);
2338 if (loop && nested_in_vect_loop_p (loop, stmt))
2340 /* Interleaved accesses are not yet supported within outer-loop
2341 vectorization for references in the inner-loop. */
2342 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2344 /* For the rest of the analysis we use the outer-loop step. */
2345 step = STMT_VINFO_DR_STEP (stmt_info);
2346 if (integer_zerop (step))
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "zero step in outer loop.\n");
2351 if (DR_IS_READ (dr))
2352 return true;
2353 else
2354 return false;
2358 /* Consecutive? */
2359 if (TREE_CODE (step) == INTEGER_CST)
2361 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2362 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2363 || (dr_step < 0
2364 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2366 /* Mark that it is not interleaving. */
2367 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2368 return true;
2372 if (loop && nested_in_vect_loop_p (loop, stmt))
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_NOTE, vect_location,
2376 "grouped access in outer loop.\n");
2377 return false;
2380 /* Assume this is a DR handled by the non-constant strided load case. */
2381 if (TREE_CODE (step) != INTEGER_CST)
2382 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2384 /* Not a consecutive access - check whether it is part of an interleaving group. */
2385 return vect_analyze_group_access (dr);
2390 /* A helper function used in the comparator function to sort data
2391 references. T1 and T2 are two data references to be compared.
2392 The function returns -1, 0, or 1. */
2394 static int
2395 compare_tree (tree t1, tree t2)
2397 int i, cmp;
2398 enum tree_code code;
2399 char tclass;
2401 if (t1 == t2)
2402 return 0;
2403 if (t1 == NULL)
2404 return -1;
2405 if (t2 == NULL)
2406 return 1;
2409 if (TREE_CODE (t1) != TREE_CODE (t2))
2410 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2412 code = TREE_CODE (t1);
2413 switch (code)
2415 /* For const values, we can just use hash values for comparisons. */
2416 case INTEGER_CST:
2417 case REAL_CST:
2418 case FIXED_CST:
2419 case STRING_CST:
2420 case COMPLEX_CST:
2421 case VECTOR_CST:
2423 hashval_t h1 = iterative_hash_expr (t1, 0);
2424 hashval_t h2 = iterative_hash_expr (t2, 0);
2425 if (h1 != h2)
2426 return h1 < h2 ? -1 : 1;
2427 break;
2430 case SSA_NAME:
2431 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2432 if (cmp != 0)
2433 return cmp;
2435 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2436 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2437 break;
2439 default:
2440 tclass = TREE_CODE_CLASS (code);
2442 /* For var-decl, we could compare their UIDs. */
2443 if (tclass == tcc_declaration)
2445 if (DECL_UID (t1) != DECL_UID (t2))
2446 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2447 break;
2450 /* For expressions with operands, compare their operands recursively. */
2451 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2453 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2454 if (cmp != 0)
2455 return cmp;
2459 return 0;
2463 /* Compare two data-references DRA and DRB so that they can be sorted
2464 into chunks suitable for grouping. */
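/* For example (illustrative): the 4-byte loads a[i+1], b[i] and a[i]
   (all with the same step) are sorted so that the two accesses based
   on 'a' become adjacent and ordered by DR_INIT, i.e. a[i] directly
   precedes a[i+1]; that is what lets vect_analyze_data_ref_accesses
   build the interleaving chains with a single linear scan.  */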
2466 static int
2467 dr_group_sort_cmp (const void *dra_, const void *drb_)
2469 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2470 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2471 int cmp;
2473 /* Stabilize sort. */
2474 if (dra == drb)
2475 return 0;
2477 /* Ordering of DRs according to base. */
2478 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2480 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2481 if (cmp != 0)
2482 return cmp;
2485 /* And according to DR_OFFSET. */
2486 if (!dr_equal_offsets_p (dra, drb))
2488 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2489 if (cmp != 0)
2490 return cmp;
2493 /* Put reads before writes. */
2494 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2495 return DR_IS_READ (dra) ? -1 : 1;
2497 /* Then sort by access size. */
2498 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2499 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2501 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2502 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2503 if (cmp != 0)
2504 return cmp;
2507 /* And by step. */
2508 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2510 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2511 if (cmp != 0)
2512 return cmp;
2515 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2516 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2517 if (cmp == 0)
2518 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2519 return cmp;
2522 /* Function vect_analyze_data_ref_accesses.
2524 Analyze the access pattern of all the data references in the loop.
2526 FORNOW: the only access pattern that is considered vectorizable is a
2527 simple step 1 (consecutive) access.
2529 FORNOW: handle only arrays and pointer accesses. */
2531 bool
2532 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2534 unsigned int i;
2535 vec<data_reference_p> datarefs;
2536 struct data_reference *dr;
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_NOTE, vect_location,
2540 "=== vect_analyze_data_ref_accesses ===\n");
2542 if (loop_vinfo)
2543 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2544 else
2545 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2547 if (datarefs.is_empty ())
2548 return true;
2550 /* Sort the array of datarefs to make building the interleaving chains
2551 linear. Don't modify the original vector's order, it is needed for
2552 determining what dependencies are reversed. */
2553 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2554 datarefs_copy.qsort (dr_group_sort_cmp);
2556 /* Build the interleaving chains. */
2557 for (i = 0; i < datarefs_copy.length () - 1;)
2559 data_reference_p dra = datarefs_copy[i];
2560 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2561 stmt_vec_info lastinfo = NULL;
2562 for (i = i + 1; i < datarefs_copy.length (); ++i)
2564 data_reference_p drb = datarefs_copy[i];
2565 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2567 /* ??? Imperfect sorting (non-compatible types, non-modulo
2568 accesses, same accesses) can lead to a group being artificially
2569 split here as we don't just skip over those. If it really
2570 matters we can push those to a worklist and re-iterate
2571 over them. Then we can just skip ahead to the next DR here. */
2573 /* Check that the data-refs have the same first location (except init)
2574 and that they are either both loads or both stores (not a mix,
2575 and not masked loads or stores). */
2576 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2577 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2578 DR_BASE_ADDRESS (drb), 0)
2579 || !dr_equal_offsets_p (dra, drb)
2580 || !gimple_assign_single_p (DR_STMT (dra))
2581 || !gimple_assign_single_p (DR_STMT (drb)))
2582 break;
2584 /* Check that the data-refs have the same constant size and step. */
2585 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2586 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2587 if (!tree_fits_uhwi_p (sza)
2588 || !tree_fits_uhwi_p (szb)
2589 || !tree_int_cst_equal (sza, szb)
2590 || !tree_fits_shwi_p (DR_STEP (dra))
2591 || !tree_fits_shwi_p (DR_STEP (drb))
2592 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2593 break;
2595 /* Do not place the same access in the interleaving chain twice. */
2596 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2597 break;
2599 /* Check the types are compatible.
2600 ??? We don't distinguish this during sorting. */
2601 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2602 TREE_TYPE (DR_REF (drb))))
2603 break;
2605 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2606 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2607 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2608 gcc_assert (init_a < init_b);
2610 /* If init_b == init_a + the size of the type * k, we have an
2611 interleaving, and DRA is accessed before DRB. */
2612 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2613 if ((init_b - init_a) % type_size_a != 0)
2614 break;
2616 /* The step (if not zero) must be greater than the difference between
2617 the data-refs' inits. This splits groups into suitable sizes. */
2618 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2619 if (step != 0 && step <= (init_b - init_a))
2620 break;
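/* For example (illustrative): with 4-byte elements, a[2*i] (init 0)
   and a[2*i+1] (init 4) have step 8 > 4, so they stay in the same
   group, whereas a[2*i+2] (init 8) fails the test (8 <= 8) and starts
   a new chain -- it is really a[2*(i+1)] of the next iteration.  */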
2622 if (dump_enabled_p ())
2624 dump_printf_loc (MSG_NOTE, vect_location,
2625 "Detected interleaving ");
2626 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2627 dump_printf (MSG_NOTE, " and ");
2628 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2629 dump_printf (MSG_NOTE, "\n");
2632 /* Link the found element into the group list. */
2633 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2635 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2636 lastinfo = stmtinfo_a;
2638 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2639 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2640 lastinfo = stmtinfo_b;
2644 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2645 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2646 && !vect_analyze_data_ref_access (dr))
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2650 "not vectorized: complicated access pattern.\n");
2652 if (bb_vinfo)
2654 /* Mark the statement as not vectorizable. */
2655 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2656 continue;
2658 else
2660 datarefs_copy.release ();
2661 return false;
2665 datarefs_copy.release ();
2666 return true;
2670 /* Operator == between two dr_with_seg_len objects.
2672 This equality operator is used to make sure two data refs
2673 are the same, so that we can consider combining the
2674 aliasing checks of two pairs of data-dependent data
2675 refs. */
2677 static bool
2678 operator == (const dr_with_seg_len& d1,
2679 const dr_with_seg_len& d2)
2681 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2682 DR_BASE_ADDRESS (d2.dr), 0)
2683 && compare_tree (d1.offset, d2.offset) == 0
2684 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2687 /* Function comp_dr_with_seg_len_pair.
2689 Comparison function for sorting objects of dr_with_seg_len_pair_t
2690 so that we can combine aliasing checks in one scan. */
2692 static int
2693 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2695 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2696 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2698 const dr_with_seg_len &p11 = p1->first,
2699 &p12 = p1->second,
2700 &p21 = p2->first,
2701 &p22 = p2->second;
2703 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2704 if a and c have the same base address and step, and b and d have the
2705 same base address and step. Therefore, if either a and c or b and d do not
2706 have the same address and step, we don't care about the order of those two pairs after sorting. */
2707 int comp_res;
2709 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2710 DR_BASE_ADDRESS (p21.dr))) != 0)
2711 return comp_res;
2712 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2713 DR_BASE_ADDRESS (p22.dr))) != 0)
2714 return comp_res;
2715 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2716 return comp_res;
2717 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2718 return comp_res;
2719 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2720 return comp_res;
2721 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2722 return comp_res;
2724 return 0;
2727 /* Function vect_vfa_segment_size.
2729 Create an expression that computes the size of the segment
2730 that will be accessed for a data reference. The function takes into
2731 account that realignment loads may access one more vector.
2733 Input:
2734 DR: The data reference.
2735 LENGTH_FACTOR: segment length to consider.
2737 Return an expression whose value is the size of segment which will be
2738 accessed by DR. */
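/* For example (illustrative): for a unit-stride 4-byte access with
   DR_STEP == 4 and LENGTH_FACTOR equal to a vectorization factor of 4,
   the segment length is 4 * 4 = 16 bytes, plus one extra vector size
   if the access uses the dr_explicit_realign_optimized scheme.  */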
2740 static tree
2741 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2743 tree segment_length;
2745 if (integer_zerop (DR_STEP (dr)))
2746 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2747 else
2748 segment_length = size_binop (MULT_EXPR,
2749 fold_convert (sizetype, DR_STEP (dr)),
2750 fold_convert (sizetype, length_factor));
2752 if (vect_supportable_dr_alignment (dr, false)
2753 == dr_explicit_realign_optimized)
2755 tree vector_size = TYPE_SIZE_UNIT
2756 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2758 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2760 return segment_length;
2763 /* Function vect_prune_runtime_alias_test_list.
2765 Prune a list of ddrs to be tested at run-time by versioning for alias.
2766 Merge several alias checks into one if possible.
2767 Return FALSE if the resulting list of ddrs is longer than allowed by
2768 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2770 bool
2771 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2773 vec<ddr_p> may_alias_ddrs =
2774 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2775 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2776 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2777 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2778 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2780 ddr_p ddr;
2781 unsigned int i;
2782 tree length_factor;
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "=== vect_prune_runtime_alias_test_list ===\n");
2788 if (may_alias_ddrs.is_empty ())
2789 return true;
2791 /* Basically, for each pair of dependent data refs store_ptr_0
2792 and load_ptr_0, we create an expression:
2794 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2795 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2797 for aliasing checks. However, in some cases we can decrease
2798 the number of checks by combining two checks into one. For
2799 example, suppose we have another pair of data refs store_ptr_0
2800 and load_ptr_1, and if the following condition is satisfied:
2802 load_ptr_0 < load_ptr_1 &&
2803 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2805 (this condition means that, in each iteration of the vectorized loop,
2806 the memory accessed through store_ptr_0 cannot lie between the memory
2807 of load_ptr_0 and load_ptr_1.)
2809 we can then use only the following expression to finish the
2810 aliasing checks between store_ptr_0 & load_ptr_0 and
2811 store_ptr_0 & load_ptr_1:
2813 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2814 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2816 Note that we only consider the case where load_ptr_0 and load_ptr_1
2817 have the same base address. */
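/* Worked example (an illustration, not from the original source): if
   load_ptr_1 == load_ptr_0 + 16, load_segment_length_0 == 16 and
   store_segment_length_0 == 16, the condition above holds
   (16 - 16 = 0 < 16), so the two checks against store_ptr_0 collapse
   into the single combined test, which covers the whole range
   [load_ptr_0, load_ptr_1 + load_segment_length_1).  */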
2819 comp_alias_ddrs.create (may_alias_ddrs.length ());
2821 /* First, we collect all data ref pairs for aliasing checks. */
2822 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2824 struct data_reference *dr_a, *dr_b;
2825 gimple dr_group_first_a, dr_group_first_b;
2826 tree segment_length_a, segment_length_b;
2827 gimple stmt_a, stmt_b;
2829 dr_a = DDR_A (ddr);
2830 stmt_a = DR_STMT (DDR_A (ddr));
2831 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2832 if (dr_group_first_a)
2834 stmt_a = dr_group_first_a;
2835 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2838 dr_b = DDR_B (ddr);
2839 stmt_b = DR_STMT (DDR_B (ddr));
2840 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2841 if (dr_group_first_b)
2843 stmt_b = dr_group_first_b;
2844 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2847 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2848 length_factor = scalar_loop_iters;
2849 else
2850 length_factor = size_int (vect_factor);
2851 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2852 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2854 dr_with_seg_len_pair_t dr_with_seg_len_pair
2855 (dr_with_seg_len (dr_a, segment_length_a),
2856 dr_with_seg_len (dr_b, segment_length_b));
2858 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2859 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2861 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2864 /* Second, we sort the collected data ref pairs so that we can scan
2865 them once to combine all possible aliasing checks. */
2866 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2868 /* Third, we scan the sorted dr pairs and check if we can combine
2869 alias checks of two neighbouring dr pairs. */
2870 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2872 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2873 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2874 *dr_b1 = &comp_alias_ddrs[i-1].second,
2875 *dr_a2 = &comp_alias_ddrs[i].first,
2876 *dr_b2 = &comp_alias_ddrs[i].second;
2878 /* Remove duplicate data ref pairs. */
2879 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2881 if (dump_enabled_p ())
2883 dump_printf_loc (MSG_NOTE, vect_location,
2884 "found equal ranges ");
2885 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2886 DR_REF (dr_a1->dr));
2887 dump_printf (MSG_NOTE, ", ");
2888 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2889 DR_REF (dr_b1->dr));
2890 dump_printf (MSG_NOTE, " and ");
2891 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2892 DR_REF (dr_a2->dr));
2893 dump_printf (MSG_NOTE, ", ");
2894 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2895 DR_REF (dr_b2->dr));
2896 dump_printf (MSG_NOTE, "\n");
2899 comp_alias_ddrs.ordered_remove (i--);
2900 continue;
2903 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2905 /* We consider the case where DR_B1 and DR_B2 are the same memref,
2906 and DR_A1 and DR_A2 are two consecutive memrefs. */
2907 if (*dr_a1 == *dr_a2)
2909 std::swap (dr_a1, dr_b1);
2910 std::swap (dr_a2, dr_b2);
2913 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2914 DR_BASE_ADDRESS (dr_a2->dr),
2916 || !tree_fits_shwi_p (dr_a1->offset)
2917 || !tree_fits_shwi_p (dr_a2->offset))
2918 continue;
2920 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2921 - tree_to_shwi (dr_a1->offset));
2924 /* Now we check if the following condition is satisfied:
2926 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2928 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2929 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2930 have to make a best estimate. We can get the minimum value
2931 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2932 then either of the following two conditions can guarantee the
2933 one above:
2935 1: DIFF <= MIN_SEG_LEN_B
2936 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
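/* For example (illustrative): DIFF == 8 with MIN_SEG_LEN_B == 16
   satisfies condition 1, so the ranges are merged below and dr_a1's
   segment length is set to dr_a2's segment length plus DIFF.  */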
2940 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2941 ? tree_to_shwi (dr_b1->seg_len)
2942 : vect_factor);
2944 if (diff <= min_seg_len_b
2945 || (tree_fits_shwi_p (dr_a1->seg_len)
2946 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2948 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "merging ranges for ");
2952 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2953 DR_REF (dr_a1->dr));
2954 dump_printf (MSG_NOTE, ", ");
2955 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2956 DR_REF (dr_b1->dr));
2957 dump_printf (MSG_NOTE, " and ");
2958 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2959 DR_REF (dr_a2->dr));
2960 dump_printf (MSG_NOTE, ", ");
2961 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2962 DR_REF (dr_b2->dr));
2963 dump_printf (MSG_NOTE, "\n");
2966 dr_a1->seg_len = size_binop (PLUS_EXPR,
2967 dr_a2->seg_len, size_int (diff));
2968 comp_alias_ddrs.ordered_remove (i--);
2973 dump_printf_loc (MSG_NOTE, vect_location,
2974 "improved number of alias checks from %d to %d\n",
2975 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2976 if ((int) comp_alias_ddrs.length () >
2977 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2978 return false;
2980 return true;
2983 /* Check whether a non-affine read in STMT is suitable for a gather load
2984 and if so, return a builtin decl for that operation. */
2986 tree
2987 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2988 tree *offp, int *scalep)
2990 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2991 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2992 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2993 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2994 tree offtype = NULL_TREE;
2995 tree decl, base, off;
2996 machine_mode pmode;
2997 int punsignedp, pvolatilep;
2999 base = DR_REF (dr);
3000 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3001 see if we can use the def stmt of the address. */
3002 if (is_gimple_call (stmt)
3003 && gimple_call_internal_p (stmt)
3004 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3005 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3006 && TREE_CODE (base) == MEM_REF
3007 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3008 && integer_zerop (TREE_OPERAND (base, 1))
3009 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3011 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3012 if (is_gimple_assign (def_stmt)
3013 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3014 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3017 /* The gather builtins need an address of the form
3018 loop_invariant + vector * {1, 2, 4, 8}
3020 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3021 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3022 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3023 multiplications and additions in it. To get a vector, we need
3024 a single SSA_NAME that will be defined in the loop and will
3025 contain everything that is not loop invariant and that can be
3026 vectorized. The following code attempts to find such a preexisting
3027 SSA_NAME OFF and put the loop invariants into a tree BASE
3028 that can be gimplified before the loop. */
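/* Illustrative decomposition (an assumed typical case, not taken from
   this file): for a read p[k[i]] with 'double *p' and 'int *k', BASE
   ends up as the loop-invariant pointer p, OFF as the loop-defined
   (sign-extended) value of k[i], and SCALE as 8, matching the
   loop_invariant + sign_extend (vector) * 8 form described above.  */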
3029 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3030 &pmode, &punsignedp, &pvolatilep, false);
3031 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3033 if (TREE_CODE (base) == MEM_REF)
3035 if (!integer_zerop (TREE_OPERAND (base, 1)))
3037 if (off == NULL_TREE)
3039 offset_int moff = mem_ref_offset (base);
3040 off = wide_int_to_tree (sizetype, moff);
3042 else
3043 off = size_binop (PLUS_EXPR, off,
3044 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3046 base = TREE_OPERAND (base, 0);
3048 else
3049 base = build_fold_addr_expr (base);
3051 if (off == NULL_TREE)
3052 off = size_zero_node;
3054 /* If base is not loop invariant, then if off is 0 we start with just
3055 the constant offset in the loop invariant BASE and continue with base
3056 as OFF; otherwise give up.
3057 We could handle that case by gimplifying the addition of base + off
3058 into some SSA_NAME and using that as off, but for now punt. */
3059 if (!expr_invariant_in_loop_p (loop, base))
3061 if (!integer_zerop (off))
3062 return NULL_TREE;
3063 off = base;
3064 base = size_int (pbitpos / BITS_PER_UNIT);
3066 /* Otherwise put base + constant offset into the loop invariant BASE
3067 and continue with OFF. */
3068 else
3070 base = fold_convert (sizetype, base);
3071 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3074 /* OFF at this point may be either a SSA_NAME or some tree expression
3075 from get_inner_reference. Try to peel off loop invariants from it
3076 into BASE as long as possible. */
3077 STRIP_NOPS (off);
3078 while (offtype == NULL_TREE)
3080 enum tree_code code;
3081 tree op0, op1, add = NULL_TREE;
3083 if (TREE_CODE (off) == SSA_NAME)
3085 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3087 if (expr_invariant_in_loop_p (loop, off))
3088 return NULL_TREE;
3090 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3091 break;
3093 op0 = gimple_assign_rhs1 (def_stmt);
3094 code = gimple_assign_rhs_code (def_stmt);
3095 op1 = gimple_assign_rhs2 (def_stmt);
3097 else
3099 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3100 return NULL_TREE;
3101 code = TREE_CODE (off);
3102 extract_ops_from_tree (off, &code, &op0, &op1);
3104 switch (code)
3106 case POINTER_PLUS_EXPR:
3107 case PLUS_EXPR:
3108 if (expr_invariant_in_loop_p (loop, op0))
3110 add = op0;
3111 off = op1;
3112 do_add:
3113 add = fold_convert (sizetype, add);
3114 if (scale != 1)
3115 add = size_binop (MULT_EXPR, add, size_int (scale));
3116 base = size_binop (PLUS_EXPR, base, add);
3117 continue;
3119 if (expr_invariant_in_loop_p (loop, op1))
3121 add = op1;
3122 off = op0;
3123 goto do_add;
3125 break;
3126 case MINUS_EXPR:
3127 if (expr_invariant_in_loop_p (loop, op1))
3129 add = fold_convert (sizetype, op1);
3130 add = size_binop (MINUS_EXPR, size_zero_node, add);
3131 off = op0;
3132 goto do_add;
3134 break;
3135 case MULT_EXPR:
3136 if (scale == 1 && tree_fits_shwi_p (op1))
3138 scale = tree_to_shwi (op1);
3139 off = op0;
3140 continue;
3142 break;
3143 case SSA_NAME:
3144 off = op0;
3145 continue;
3146 CASE_CONVERT:
3147 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3148 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3149 break;
3150 if (TYPE_PRECISION (TREE_TYPE (op0))
3151 == TYPE_PRECISION (TREE_TYPE (off)))
3153 off = op0;
3154 continue;
3156 if (TYPE_PRECISION (TREE_TYPE (op0))
3157 < TYPE_PRECISION (TREE_TYPE (off)))
3159 off = op0;
3160 offtype = TREE_TYPE (off);
3161 STRIP_NOPS (off);
3162 continue;
3164 break;
3165 default:
3166 break;
3168 break;
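/* For example (illustrative): if OFF is initially 'inv + i * 4' with
   'inv' loop invariant, the PLUS_EXPR case moves 'inv' into BASE, the
   MULT_EXPR case then sets SCALE to 4 and leaves OFF == i, which is
   the single loop-defined SSA_NAME the gather builtin needs.  */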
3171 /* If at the end OFF still isn't a SSA_NAME or isn't
3172 defined in the loop, punt. */
3173 if (TREE_CODE (off) != SSA_NAME
3174 || expr_invariant_in_loop_p (loop, off))
3175 return NULL_TREE;
3177 if (offtype == NULL_TREE)
3178 offtype = TREE_TYPE (off);
3180 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3181 offtype, scale);
3182 if (decl == NULL_TREE)
3183 return NULL_TREE;
3185 if (basep)
3186 *basep = base;
3187 if (offp)
3188 *offp = off;
3189 if (scalep)
3190 *scalep = scale;
3191 return decl;
3194 /* Function vect_analyze_data_refs.
3196 Find all the data references in the loop or basic block.
3198 The general structure of the analysis of data refs in the vectorizer is as
3199 follows:
3200 1- vect_analyze_data_refs(loop/bb): call
3201 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3202 in the loop/bb and their dependences.
3203 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3204 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3205 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3209 bool
3210 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3211 bb_vec_info bb_vinfo,
3212 int *min_vf, unsigned *n_stmts)
3214 struct loop *loop = NULL;
3215 basic_block bb = NULL;
3216 unsigned int i;
3217 vec<data_reference_p> datarefs;
3218 struct data_reference *dr;
3219 tree scalar_type;
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "=== vect_analyze_data_refs ===\n");
3225 if (loop_vinfo)
3227 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3229 loop = LOOP_VINFO_LOOP (loop_vinfo);
3230 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3231 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3233 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3235 "not vectorized: loop contains function calls"
3236 " or data references that cannot be analyzed\n");
3237 return false;
3240 for (i = 0; i < loop->num_nodes; i++)
3242 gimple_stmt_iterator gsi;
3244 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3246 gimple stmt = gsi_stmt (gsi);
3247 if (is_gimple_debug (stmt))
3248 continue;
3249 ++*n_stmts;
3250 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3252 if (is_gimple_call (stmt) && loop->safelen)
3254 tree fndecl = gimple_call_fndecl (stmt), op;
3255 if (fndecl != NULL_TREE)
3257 struct cgraph_node *node = cgraph_node::get (fndecl);
3258 if (node != NULL && node->simd_clones != NULL)
3260 unsigned int j, n = gimple_call_num_args (stmt);
3261 for (j = 0; j < n; j++)
3263 op = gimple_call_arg (stmt, j);
3264 if (DECL_P (op)
3265 || (REFERENCE_CLASS_P (op)
3266 && get_base_address (op)))
3267 break;
3269 op = gimple_call_lhs (stmt);
3270 /* Ignore #pragma omp declare simd functions
3271 if they don't have data references in the
3272 call stmt itself. */
3273 if (j == n
3274 && !(op
3275 && (DECL_P (op)
3276 || (REFERENCE_CLASS_P (op)
3277 && get_base_address (op)))))
3278 continue;
3282 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3283 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285 "not vectorized: loop contains function "
3286 "calls or data references that cannot "
3287 "be analyzed\n");
3288 return false;
3293 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3295 else
3297 gimple_stmt_iterator gsi;
3299 bb = BB_VINFO_BB (bb_vinfo);
3300 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3302 gimple stmt = gsi_stmt (gsi);
3303 if (is_gimple_debug (stmt))
3304 continue;
3305 ++*n_stmts;
3306 if (!find_data_references_in_stmt (NULL, stmt,
3307 &BB_VINFO_DATAREFS (bb_vinfo)))
3309 /* Mark the rest of the basic-block as unvectorizable. */
3310 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3312 stmt = gsi_stmt (gsi);
3313 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3315 break;
3319 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3322 /* Go through the data-refs, check that the analysis succeeded. Update
3323 pointer from stmt_vec_info struct to DR and vectype. */
3325 FOR_EACH_VEC_ELT (datarefs, i, dr)
3327 gimple stmt;
3328 stmt_vec_info stmt_info;
3329 tree base, offset, init;
3330 bool gather = false;
3331 bool simd_lane_access = false;
3332 int vf;
3334 again:
3335 if (!dr || !DR_REF (dr))
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3339 "not vectorized: unhandled data-ref\n");
3340 return false;
3343 stmt = DR_STMT (dr);
3344 stmt_info = vinfo_for_stmt (stmt);
3346 /* Discard clobbers from the dataref vector. We will remove
3347 clobber stmts during vectorization. */
3348 if (gimple_clobber_p (stmt))
3350 free_data_ref (dr);
3351 if (i == datarefs.length () - 1)
3353 datarefs.pop ();
3354 break;
3356 datarefs.ordered_remove (i);
3357 dr = datarefs[i];
3358 goto again;
3361 /* Check that analysis of the data-ref succeeded. */
3362 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3363 || !DR_STEP (dr))
3365 bool maybe_gather
3366 = DR_IS_READ (dr)
3367 && !TREE_THIS_VOLATILE (DR_REF (dr))
3368 && targetm.vectorize.builtin_gather != NULL;
3369 bool maybe_simd_lane_access
3370 = loop_vinfo && loop->simduid;
3372 /* If the target supports vector gather loads, or if this might be
3373 a SIMD lane access, see whether they can be used. */
3374 if (loop_vinfo
3375 && (maybe_gather || maybe_simd_lane_access)
3376 && !nested_in_vect_loop_p (loop, stmt))
3378 struct data_reference *newdr
3379 = create_data_ref (NULL, loop_containing_stmt (stmt),
3380 DR_REF (dr), stmt, true);
3381 gcc_assert (newdr != NULL && DR_REF (newdr));
3382 if (DR_BASE_ADDRESS (newdr)
3383 && DR_OFFSET (newdr)
3384 && DR_INIT (newdr)
3385 && DR_STEP (newdr)
3386 && integer_zerop (DR_STEP (newdr)))
3388 if (maybe_simd_lane_access)
3390 tree off = DR_OFFSET (newdr);
3391 STRIP_NOPS (off);
3392 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3393 && TREE_CODE (off) == MULT_EXPR
3394 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3396 tree step = TREE_OPERAND (off, 1);
3397 off = TREE_OPERAND (off, 0);
3398 STRIP_NOPS (off);
3399 if (CONVERT_EXPR_P (off)
3400 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3401 0)))
3402 < TYPE_PRECISION (TREE_TYPE (off)))
3403 off = TREE_OPERAND (off, 0);
3404 if (TREE_CODE (off) == SSA_NAME)
3406 gimple def = SSA_NAME_DEF_STMT (off);
3407 tree reft = TREE_TYPE (DR_REF (newdr));
3408 if (is_gimple_call (def)
3409 && gimple_call_internal_p (def)
3410 && (gimple_call_internal_fn (def)
3411 == IFN_GOMP_SIMD_LANE))
3413 tree arg = gimple_call_arg (def, 0);
3414 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3415 arg = SSA_NAME_VAR (arg);
3416 if (arg == loop->simduid
3417 /* For now. */
3418 && tree_int_cst_equal
3419 (TYPE_SIZE_UNIT (reft),
3420 step))
3422 DR_OFFSET (newdr) = ssize_int (0);
3423 DR_STEP (newdr) = step;
3424 DR_ALIGNED_TO (newdr)
3425 = size_int (BIGGEST_ALIGNMENT);
3426 dr = newdr;
3427 simd_lane_access = true;
3433 if (!simd_lane_access && maybe_gather)
3435 dr = newdr;
3436 gather = true;
3439 if (!gather && !simd_lane_access)
3440 free_data_ref (newdr);
3443 if (!gather && !simd_lane_access)
3445 if (dump_enabled_p ())
3447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3448 "not vectorized: data ref analysis "
3449 "failed ");
3450 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3451 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3454 if (bb_vinfo)
3455 break;
3457 return false;
3461 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3463 if (dump_enabled_p ())
3464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3465 "not vectorized: base addr of dr is a "
3466 "constant\n");
3468 if (bb_vinfo)
3469 break;
3471 if (gather || simd_lane_access)
3472 free_data_ref (dr);
3473 return false;
3476 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3478 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481 "not vectorized: volatile type ");
3482 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3486 if (bb_vinfo)
3487 break;
3489 return false;
3492 if (stmt_can_throw_internal (stmt))
3494 if (dump_enabled_p ())
3496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3497 "not vectorized: statement can throw an "
3498 "exception ");
3499 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3503 if (bb_vinfo)
3504 break;
3506 if (gather || simd_lane_access)
3507 free_data_ref (dr);
3508 return false;
3511 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3512 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3514 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3517 "not vectorized: statement is bitfield "
3518 "access ");
3519 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3520 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3523 if (bb_vinfo)
3524 break;
3526 if (gather || simd_lane_access)
3527 free_data_ref (dr);
3528 return false;
3531 base = unshare_expr (DR_BASE_ADDRESS (dr));
3532 offset = unshare_expr (DR_OFFSET (dr));
3533 init = unshare_expr (DR_INIT (dr));
3535 if (is_gimple_call (stmt)
3536 && (!gimple_call_internal_p (stmt)
3537 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3538 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3540 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3543 "not vectorized: dr in a call ");
3544 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3548 if (bb_vinfo)
3549 break;
3551 if (gather || simd_lane_access)
3552 free_data_ref (dr);
3553 return false;
3556 /* Update DR field in stmt_vec_info struct. */
3558 /* If the dataref is in an inner-loop of the loop that is considered
3559 for vectorization, we also want to analyze the access relative to
3560 the outer-loop (DR contains information only relative to the
3561 inner-most enclosing loop). We do that by building a reference to the
3562 first location accessed by the inner-loop, and analyze it relative to
3563 the outer-loop. */
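/* For example (illustrative): for 'int a[N][M]' accessed as a[i][j]
   in an inner loop over j nested in an outer loop over i, the
   analysis below yields an outer-loop step (STMT_VINFO_DR_STEP) of
   M * 4 bytes -- the row stride -- while the inner-loop DR_STEP
   stays 4 bytes.  */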
3564 if (loop && nested_in_vect_loop_p (loop, stmt))
3566 tree outer_step, outer_base, outer_init;
3567 HOST_WIDE_INT pbitsize, pbitpos;
3568 tree poffset;
3569 machine_mode pmode;
3570 int punsignedp, pvolatilep;
3571 affine_iv base_iv, offset_iv;
3572 tree dinit;
3574 /* Build a reference to the first location accessed by the
3575 inner-loop: *(BASE+INIT). (The first location is actually
3576 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3577 tree inner_base = build_fold_indirect_ref
3578 (fold_build_pointer_plus (base, init));
3580 if (dump_enabled_p ())
3582 dump_printf_loc (MSG_NOTE, vect_location,
3583 "analyze in outer-loop: ");
3584 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3585 dump_printf (MSG_NOTE, "\n");
3588 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3589 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3590 gcc_assert (outer_base != NULL_TREE);
3592 if (pbitpos % BITS_PER_UNIT != 0)
3594 if (dump_enabled_p ())
3595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3596 "failed: bit offset alignment.\n");
3597 return false;
3600 outer_base = build_fold_addr_expr (outer_base);
3601 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3602 &base_iv, false))
3604 if (dump_enabled_p ())
3605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3606 "failed: evolution of base is not affine.\n");
3607 return false;
3610 if (offset)
3612 if (poffset)
3613 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3614 poffset);
3615 else
3616 poffset = offset;
3619 if (!poffset)
3621 offset_iv.base = ssize_int (0);
3622 offset_iv.step = ssize_int (0);
3624 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3625 &offset_iv, false))
3627 if (dump_enabled_p ())
3628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629 "evolution of offset is not affine.\n");
3630 return false;
3633 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3634 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3635 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3636 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3637 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3639 outer_step = size_binop (PLUS_EXPR,
3640 fold_convert (ssizetype, base_iv.step),
3641 fold_convert (ssizetype, offset_iv.step));
3643 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3644 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3645 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3646 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3647 STMT_VINFO_DR_OFFSET (stmt_info) =
3648 fold_convert (ssizetype, offset_iv.base);
3649 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3650 size_int (highest_pow2_factor (offset_iv.base));
3652 if (dump_enabled_p ())
3654 dump_printf_loc (MSG_NOTE, vect_location,
3655 "\touter base_address: ");
3656 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3658 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_DR_OFFSET (stmt_info));
3661 dump_printf (MSG_NOTE,
3662 "\n\touter constant offset from base address: ");
3663 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3664 STMT_VINFO_DR_INIT (stmt_info));
3665 dump_printf (MSG_NOTE, "\n\touter step: ");
3666 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3667 STMT_VINFO_DR_STEP (stmt_info));
3668 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3669 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3670 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3671 dump_printf (MSG_NOTE, "\n");
3675 if (STMT_VINFO_DATA_REF (stmt_info))
3677 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680 "not vectorized: more than one data ref "
3681 "in stmt: ");
3682 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3686 if (bb_vinfo)
3687 break;
3689 if (gather || simd_lane_access)
3690 free_data_ref (dr);
3691 return false;
3694 STMT_VINFO_DATA_REF (stmt_info) = dr;
3695 if (simd_lane_access)
3697 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3698 free_data_ref (datarefs[i]);
3699 datarefs[i] = dr;
3702 /* Set vectype for STMT. */
3703 scalar_type = TREE_TYPE (DR_REF (dr));
3704 STMT_VINFO_VECTYPE (stmt_info)
3705 = get_vectype_for_scalar_type (scalar_type);
3706 if (!STMT_VINFO_VECTYPE (stmt_info))
3708 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711 "not vectorized: no vectype for stmt: ");
3712 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3713 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3714 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3715 scalar_type);
3716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3719 if (bb_vinfo)
3720 break;
3722 if (gather || simd_lane_access)
3724 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3725 if (gather)
3726 free_data_ref (dr);
3728 return false;
3730 else
3732 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_NOTE, vect_location,
3735 "got vectype for stmt: ");
3736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3737 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3738 STMT_VINFO_VECTYPE (stmt_info));
3739 dump_printf (MSG_NOTE, "\n");
3743 /* Adjust the minimal vectorization factor according to the
3744 vector type. */
3745 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3746 if (vf > *min_vf)
3747 *min_vf = vf;
3749 if (gather)
3751 tree off;
3753 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3754 if (gather
3755 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3756 gather = false;
3757 if (!gather)
3759 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3760 free_data_ref (dr);
3761 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3764 "not vectorized: not suitable for gather "
3765 "load ");
3766 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3767 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3769 return false;
3772 datarefs[i] = dr;
3773 STMT_VINFO_GATHER_P (stmt_info) = true;
3775 else if (loop_vinfo
3776 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3778 if (nested_in_vect_loop_p (loop, stmt)
3779 || !DR_IS_READ (dr))
3781 if (dump_enabled_p ())
3783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784 "not vectorized: not suitable for strided "
3785 "load ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3787 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3789 return false;
3791 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3795 /* If we stopped the analysis at the first dataref we could not analyze
3796 when trying to vectorize a basic block, mark the rest of the datarefs
3797 as not vectorizable and truncate the vector of datarefs. That
3798 avoids spending useless time analyzing their dependences. */
3799 if (i != datarefs.length ())
3801 gcc_assert (bb_vinfo != NULL);
3802 for (unsigned j = i; j < datarefs.length (); ++j)
3804 data_reference_p dr = datarefs[j];
3805 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3806 free_data_ref (dr);
3808 datarefs.truncate (i);
3811 return true;
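/* A minimal standalone sketch (not part of GCC; the access function, the
   array and the helper name below are made up) of what the outer-loop
   analysis above computes for a dataref inside a nested loop: the address
   of the first inner-loop access expressed as a base plus a constant init,
   and its step per outer-loop iteration.  The access modelled is in[i + j]
   with 'in' an array of short.  Compile the fragment on its own (C99 or
   later).  */
#if 0
#include <stdio.h>

static short in[64];

/* Address of the first location the inner j-loop touches for a given
   value of the outer induction variable i (j starts at 0).  */
static char *
first_inner_access (long i)
{
  return (char *) &in[i];
}

int
main (void)
{
  char *base_address = (char *) &in[0];
  long outer_init = (long) (first_inner_access (0) - base_address);
  long outer_step = (long) (first_inner_access (1) - first_inner_access (0));
  /* Prints "outer init: 0 bytes, outer step: 2 bytes" (sizeof (short)).  */
  printf ("outer init: %ld bytes, outer step: %ld bytes\n",
          outer_init, outer_step);
  return 0;
}
#endif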
3815 /* Function vect_get_new_vect_var.
3817 Returns a name for a new variable. The current naming scheme prefixes
3818 vectorizer-generated variables with "vect_", "vectp_", or "stmp_"
3819 (depending on the value of VAR_KIND), and appends NAME to the prefix
3820 if NAME is provided. */
3822 tree
3823 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3825 const char *prefix;
3826 tree new_vect_var;
3828 switch (var_kind)
3830 case vect_simple_var:
3831 prefix = "vect";
3832 break;
3833 case vect_scalar_var:
3834 prefix = "stmp";
3835 break;
3836 case vect_pointer_var:
3837 prefix = "vectp";
3838 break;
3839 default:
3840 gcc_unreachable ();
3843 if (name)
3845 char* tmp = concat (prefix, "_", name, NULL);
3846 new_vect_var = create_tmp_reg (type, tmp);
3847 free (tmp);
3849 else
3850 new_vect_var = create_tmp_reg (type, prefix);
3852 return new_vect_var;
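/* A minimal standalone sketch (not part of GCC; the example NAME is made
   up) of the naming scheme implemented above: the prefix chosen by
   VAR_KIND is glued to NAME with an underscore, or used on its own when
   no NAME is given.  Compile the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* vect_simple_var, vect_scalar_var, vect_pointer_var.  */
  const char *prefix[] = { "vect", "stmp", "vectp" };
  const char *name = "arr";
  for (int i = 0; i < 3; i++)
    printf ("%s_%s\n", prefix[i], name);   /* vect_arr, stmp_arr, vectp_arr */
  printf ("%s\n", prefix[0]);              /* plain "vect" when NAME is NULL */
  return 0;
}
#endif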
3856 /* Function vect_create_addr_base_for_vector_ref.
3858 Create an expression that computes the address of the first memory location
3859 that will be accessed for a data reference.
3861 Input:
3862 STMT: The statement containing the data reference.
3863 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3864 OFFSET: Optional. If supplied, it is added to the initial address.
3865 LOOP: Specify the loop-nest relative to which the address should be computed.
3866 For example, when the dataref is in an inner-loop nested in an
3867 outer-loop that is now being vectorized, LOOP can be either the
3868 outer-loop, or the inner-loop. The first memory location accessed
3869 by the following dataref ('in' points to short):
3871 for (i=0; i<N; i++)
3872 for (j=0; j<M; j++)
3873 s += in[i+j]
3875 is as follows:
3876 if LOOP=i_loop: &in (relative to i_loop)
3877 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3878 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3879 initial address. Unlike OFFSET, which is the number of elements to
3880 be added, BYTE_OFFSET is measured in bytes.
3882 Output:
3883 1. Return an SSA_NAME whose value is the address of the memory location of
3884 the first vector of the data reference.
3885 2. If new_stmt_list is not NULL_TREE after return, then the caller must insert
3886 the statement(s) that define the returned SSA_NAME.
3888 FORNOW: We are only handling array accesses with step 1. */
3890 tree
3891 vect_create_addr_base_for_vector_ref (gimple stmt,
3892 gimple_seq *new_stmt_list,
3893 tree offset,
3894 struct loop *loop,
3895 tree byte_offset)
3897 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3898 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3899 tree data_ref_base;
3900 const char *base_name;
3901 tree addr_base;
3902 tree dest;
3903 gimple_seq seq = NULL;
3904 tree base_offset;
3905 tree init;
3906 tree vect_ptr_type;
3907 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3908 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3910 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3912 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3914 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3916 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3917 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3918 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3920 else
3922 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3923 base_offset = unshare_expr (DR_OFFSET (dr));
3924 init = unshare_expr (DR_INIT (dr));
3927 if (loop_vinfo)
3928 base_name = get_name (data_ref_base);
3929 else
3931 base_offset = ssize_int (0);
3932 init = ssize_int (0);
3933 base_name = get_name (DR_REF (dr));
3936 /* Create base_offset */
3937 base_offset = size_binop (PLUS_EXPR,
3938 fold_convert (sizetype, base_offset),
3939 fold_convert (sizetype, init));
3941 if (offset)
3943 offset = fold_build2 (MULT_EXPR, sizetype,
3944 fold_convert (sizetype, offset), step);
3945 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3946 base_offset, offset);
3948 if (byte_offset)
3950 byte_offset = fold_convert (sizetype, byte_offset);
3951 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3952 base_offset, byte_offset);
3955 /* base + base_offset */
3956 if (loop_vinfo)
3957 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3958 else
3960 addr_base = build1 (ADDR_EXPR,
3961 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3962 unshare_expr (DR_REF (dr)));
3965 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3966 addr_base = fold_convert (vect_ptr_type, addr_base);
3967 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3968 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3969 gimple_seq_add_seq (new_stmt_list, seq);
3971 if (DR_PTR_INFO (dr)
3972 && TREE_CODE (addr_base) == SSA_NAME)
3974 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3975 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3976 int misalign = DR_MISALIGNMENT (dr);
3977 if (offset || byte_offset || (misalign == -1))
3978 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3979 else
3980 set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
3983 if (dump_enabled_p ())
3985 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3986 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3987 dump_printf (MSG_NOTE, "\n");
3990 return addr_base;
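/* A minimal standalone sketch (not part of GCC; the array and the offsets
   are made up) of the address arithmetic performed above: DR_OFFSET and
   DR_INIT are folded into one byte displacement, OFFSET is scaled by the
   element size, BYTE_OFFSET is added as-is, and the sum is applied to the
   base address.  Compile the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>
#include <stddef.h>

int
main (void)
{
  short in[64];
  char *base = (char *) in;        /* DR_BASE_ADDRESS */
  size_t dr_offset = 0;            /* DR_OFFSET, in bytes */
  size_t init = 4;                 /* DR_INIT, in bytes: first access is in[2] */
  size_t offset = 3;               /* OFFSET, in elements */
  size_t byte_offset = 2;          /* BYTE_OFFSET, in bytes */
  size_t step = sizeof (short);    /* TYPE_SIZE_UNIT of the element type */

  size_t base_offset = dr_offset + init + offset * step + byte_offset;
  short *addr_base = (short *) (base + base_offset);

  printf ("first access: &in[%td]\n", addr_base - in);   /* &in[6] */
  return 0;
}
#endif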
3994 /* Function vect_create_data_ref_ptr.
3996 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3997 location accessed in the loop by STMT, along with the def-use update
3998 chain to appropriately advance the pointer through the loop iterations.
3999 Also set aliasing information for the pointer. This pointer is used by
4000 the callers to this function to create a memory reference expression for
4001 vector load/store access.
4003 Input:
4004 1. STMT: a stmt that references memory. Expected to be of the form
4005 GIMPLE_ASSIGN <name, data-ref> or
4006 GIMPLE_ASSIGN <data-ref, name>.
4007 2. AGGR_TYPE: the type of the reference, which should be either a vector
4008 or an array.
4009 3. AT_LOOP: the loop where the vector memref is to be created.
4010 4. OFFSET (optional): an offset to be added to the initial address accessed
4011 by the data-ref in STMT.
4012 5. BSI: location where the new stmts are to be placed if there is no loop.
4013 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4014 pointing to the initial address.
4015 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4016 to the initial address accessed by the data-ref in STMT. This is
4017 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4018 in bytes.
4020 Output:
4021 1. Declare a new ptr to vector_type, and have it point to the base of the
4022 data reference (the initial address accessed by the data reference).
4023 For example, for vector of type V8HI, the following code is generated:
4025 v8hi *ap;
4026 ap = (v8hi *)initial_address;
4028 if OFFSET is not supplied:
4029 initial_address = &a[init];
4030 if OFFSET is supplied:
4031 initial_address = &a[init + OFFSET];
4032 if BYTE_OFFSET is supplied:
4033 initial_address = &a[init] + BYTE_OFFSET;
4035 Return the initial_address in INITIAL_ADDRESS.
4037 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4038 update the pointer in each iteration of the loop.
4040 Return the increment stmt that updates the pointer in PTR_INCR.
4042 3. Set INV_P to true if the access pattern of the data reference in the
4043 vectorized loop is invariant. Set it to false otherwise.
4045 4. Return the pointer. */
4047 tree
4048 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4049 tree offset, tree *initial_address,
4050 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4051 bool only_init, bool *inv_p, tree byte_offset)
4053 const char *base_name;
4054 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4055 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4056 struct loop *loop = NULL;
4057 bool nested_in_vect_loop = false;
4058 struct loop *containing_loop = NULL;
4059 tree aggr_ptr_type;
4060 tree aggr_ptr;
4061 tree new_temp;
4062 gimple vec_stmt;
4063 gimple_seq new_stmt_list = NULL;
4064 edge pe = NULL;
4065 basic_block new_bb;
4066 tree aggr_ptr_init;
4067 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4068 tree aptr;
4069 gimple_stmt_iterator incr_gsi;
4070 bool insert_after;
4071 tree indx_before_incr, indx_after_incr;
4072 gimple incr;
4073 tree step;
4074 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4076 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4077 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4079 if (loop_vinfo)
4081 loop = LOOP_VINFO_LOOP (loop_vinfo);
4082 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4083 containing_loop = (gimple_bb (stmt))->loop_father;
4084 pe = loop_preheader_edge (loop);
4086 else
4088 gcc_assert (bb_vinfo);
4089 only_init = true;
4090 *ptr_incr = NULL;
4093 /* Check the step (evolution) of the load in LOOP, and record
4094 whether it's invariant. */
4095 if (nested_in_vect_loop)
4096 step = STMT_VINFO_DR_STEP (stmt_info);
4097 else
4098 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4100 if (integer_zerop (step))
4101 *inv_p = true;
4102 else
4103 *inv_p = false;
4105 /* Create an expression for the first address accessed by this load
4106 in LOOP. */
4107 base_name = get_name (DR_BASE_ADDRESS (dr));
4109 if (dump_enabled_p ())
4111 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "create %s-pointer variable to type: ",
4114 get_tree_code_name (TREE_CODE (aggr_type)));
4115 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4116 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4117 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4118 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4119 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4120 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4121 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4122 else
4123 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4124 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4125 dump_printf (MSG_NOTE, "\n");
4128 /* (1) Create the new aggregate-pointer variable.
4129 Vector and array types inherit the alias set of their component
4130 type by default so we need to use a ref-all pointer if the data
4131 reference does not conflict with the created aggregated data
4132 reference because it is not addressable. */
4133 bool need_ref_all = false;
4134 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4135 get_alias_set (DR_REF (dr))))
4136 need_ref_all = true;
4137 /* Likewise for any of the data references in the stmt group. */
4138 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4140 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4143 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4144 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4145 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4146 get_alias_set (DR_REF (sdr))))
4148 need_ref_all = true;
4149 break;
4151 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4153 while (orig_stmt);
4155 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4156 need_ref_all);
4157 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4160 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4161 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4162 def-use update cycles for the pointer: one relative to the outer-loop
4163 (LOOP), which is what steps (3) and (4) below do. The other is relative
4164 to the inner-loop (which is the inner-most loop containing the dataref),
4165 and this is done by step (5) below.
4167 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4168 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4169 redundant. Steps (3),(4) create the following:
4171 vp0 = &base_addr;
4172 LOOP: vp1 = phi(vp0,vp2)
4175 vp2 = vp1 + step
4176 goto LOOP
4178 If there is an inner-loop nested in loop, then step (5) will also be
4179 applied, and an additional update in the inner-loop will be created:
4181 vp0 = &base_addr;
4182 LOOP: vp1 = phi(vp0,vp2)
4184 inner: vp3 = phi(vp1,vp4)
4185 vp4 = vp3 + inner_step
4186 if () goto inner
4188 vp2 = vp1 + step
4189 if () goto LOOP */
4191 /* (2) Calculate the initial address of the aggregate-pointer, and set
4192 the aggregate-pointer to point to it before the loop. */
4194 /* Create (&(base[init_val+offset]) + byte_offset) in the loop preheader. */
4196 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4197 offset, loop, byte_offset);
4198 if (new_stmt_list)
4200 if (pe)
4202 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4203 gcc_assert (!new_bb);
4205 else
4206 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4209 *initial_address = new_temp;
4211 /* Create: p = (aggr_type *) initial_base */
4212 if (TREE_CODE (new_temp) != SSA_NAME
4213 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4215 vec_stmt = gimple_build_assign (aggr_ptr,
4216 fold_convert (aggr_ptr_type, new_temp));
4217 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4218 /* Copy the points-to information if it exists. */
4219 if (DR_PTR_INFO (dr))
4220 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4221 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4222 if (pe)
4224 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4225 gcc_assert (!new_bb);
4227 else
4228 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4230 else
4231 aggr_ptr_init = new_temp;
4233 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4234 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4235 inner-loop nested in LOOP (during outer-loop vectorization). */
4237 /* No update in loop is required. */
4238 if (only_init && (!loop_vinfo || at_loop == loop))
4239 aptr = aggr_ptr_init;
4240 else
4242 /* The step of the aggregate pointer is the type size. */
4243 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4244 /* One exception to the above is when the scalar step of the load in
4245 LOOP is zero. In this case the step here is also zero. */
4246 if (*inv_p)
4247 iv_step = size_zero_node;
4248 else if (tree_int_cst_sgn (step) == -1)
4249 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4251 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4253 create_iv (aggr_ptr_init,
4254 fold_convert (aggr_ptr_type, iv_step),
4255 aggr_ptr, loop, &incr_gsi, insert_after,
4256 &indx_before_incr, &indx_after_incr);
4257 incr = gsi_stmt (incr_gsi);
4258 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4260 /* Copy the points-to information if it exists. */
4261 if (DR_PTR_INFO (dr))
4263 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4264 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4266 if (ptr_incr)
4267 *ptr_incr = incr;
4269 aptr = indx_before_incr;
4272 if (!nested_in_vect_loop || only_init)
4273 return aptr;
4276 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4277 nested in LOOP, if it exists. */
4279 gcc_assert (nested_in_vect_loop);
4280 if (!only_init)
4282 standard_iv_increment_position (containing_loop, &incr_gsi,
4283 &insert_after);
4284 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4285 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4286 &indx_after_incr);
4287 incr = gsi_stmt (incr_gsi);
4288 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4290 /* Copy the points-to information if it exists. */
4291 if (DR_PTR_INFO (dr))
4293 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4294 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4296 if (ptr_incr)
4297 *ptr_incr = incr;
4299 return indx_before_incr;
4301 else
4302 gcc_unreachable ();
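/* A minimal standalone sketch (not part of GCC; the array shape and the
   notion of one row per "vector" are made up) of the two pointer update
   cycles described in the comment above: an aggregate pointer set to the
   base address before the loop and bumped by the aggregate size per outer
   iteration, plus a second cycle in the inner loop for the nested case.
   Compile the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>

#define N 4
#define M 3

static short a[N][M];

int
main (void)
{
  short *vp2, *vp1 = &a[0][0];               /* vp0 = &base_addr */
  for (int i = 0; i < N; i++, vp1 = vp2)     /* LOOP: vp1 = phi (vp0, vp2) */
    {
      short *vp4, *vp3 = vp1;                /* inner: vp3 = phi (vp1, vp4) */
      for (int j = 0; j < M; j++, vp3 = vp4)
        {
          *vp3 = (short) (10 * i + j);
          vp4 = vp3 + 1;                     /* vp4 = vp3 + inner_step */
        }
      vp2 = vp1 + M;                         /* vp2 = vp1 + step (one row) */
    }
  printf ("a[%d][%d] = %d\n", N - 1, M - 1, a[N - 1][M - 1]);   /* 32 */
  return 0;
}
#endif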
4306 /* Function bump_vector_ptr
4308 Increment a pointer (to a vector type) by vector-size. If requested,
4309 i.e. if PTR-INCR is given, then also connect the new increment stmt
4310 to the existing def-use update-chain of the pointer, by modifying
4311 the PTR_INCR as illustrated below:
4313 The pointer def-use update-chain before this function:
4314 DATAREF_PTR = phi (p_0, p_2)
4315 ....
4316 PTR_INCR: p_2 = DATAREF_PTR + step
4318 The pointer def-use update-chain after this function:
4319 DATAREF_PTR = phi (p_0, p_2)
4320 ....
4321 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4322 ....
4323 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4325 Input:
4326 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4327 in the loop.
4328 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4329 the loop. The increment amount across iterations is expected
4330 to be vector_size.
4331 BSI - location where the new update stmt is to be placed.
4332 STMT - the original scalar memory-access stmt that is being vectorized.
4333 BUMP - optional. The offset by which to bump the pointer. If not given,
4334 the offset is assumed to be vector_size.
4336 Output: Return NEW_DATAREF_PTR as illustrated above.
4340 tree
4341 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4342 gimple stmt, tree bump)
4344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4345 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4346 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4347 tree update = TYPE_SIZE_UNIT (vectype);
4348 gassign *incr_stmt;
4349 ssa_op_iter iter;
4350 use_operand_p use_p;
4351 tree new_dataref_ptr;
4353 if (bump)
4354 update = bump;
4356 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4357 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4358 dataref_ptr, update);
4359 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4361 /* Copy the points-to information if it exists. */
4362 if (DR_PTR_INFO (dr))
4364 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4365 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4368 if (!ptr_incr)
4369 return new_dataref_ptr;
4371 /* Update the vector-pointer's cross-iteration increment. */
4372 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4374 tree use = USE_FROM_PTR (use_p);
4376 if (use == dataref_ptr)
4377 SET_USE (use_p, new_dataref_ptr);
4378 else
4379 gcc_assert (tree_int_cst_compare (use, update) == 0);
4382 return new_dataref_ptr;
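/* A minimal standalone sketch (not part of GCC; the vector size and the
   buffer are made up) of the effect of bump_vector_ptr: the new pointer is
   DATAREF_PTR displaced by BUMP (the vector size unless a different bump
   is given), and the cross-iteration increment then continues from the
   bumped pointer.  Compile the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>

int
main (void)
{
  char buf[64];
  const long vector_size = 16;     /* TYPE_SIZE_UNIT (vectype) */
  long bump = 0;                   /* 0 here means: use the default bump */
  char *dataref_ptr = buf;

  char *new_dataref_ptr = dataref_ptr + (bump ? bump : vector_size);
  /* PTR_INCR now advances from the bumped pointer, as in the comment.  */
  char *next_iter_ptr = new_dataref_ptr + vector_size;

  printf ("bump to offset %ld, next iteration at offset %ld\n",
          (long) (new_dataref_ptr - buf), (long) (next_iter_ptr - buf));
  return 0;
}
#endif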
4386 /* Function vect_create_destination_var.
4388 Create a new temporary of type VECTYPE. */
4390 tree
4391 vect_create_destination_var (tree scalar_dest, tree vectype)
4393 tree vec_dest;
4394 const char *name;
4395 char *new_name;
4396 tree type;
4397 enum vect_var_kind kind;
4399 kind = vectype ? vect_simple_var : vect_scalar_var;
4400 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4402 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4404 name = get_name (scalar_dest);
4405 if (name)
4406 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4407 else
4408 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4409 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4410 free (new_name);
4412 return vec_dest;
4415 /* Function vect_grouped_store_supported.
4417 Returns TRUE if interleave high and interleave low permutations
4418 are supported, and FALSE otherwise. */
4420 bool
4421 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4423 machine_mode mode = TYPE_MODE (vectype);
4425 /* vect_permute_store_chain requires the group size to be equal to 3 or
4426 be a power of two. */
4427 if (count != 3 && exact_log2 (count) == -1)
4429 if (dump_enabled_p ())
4430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4431 "the size of the group of accesses"
4432 " is not a power of 2 or not eqaul to 3\n");
4433 return false;
4436 /* Check that the permutation is supported. */
4437 if (VECTOR_MODE_P (mode))
4439 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4440 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4442 if (count == 3)
4444 unsigned int j0 = 0, j1 = 0, j2 = 0;
4445 unsigned int i, j;
4447 for (j = 0; j < 3; j++)
4449 int nelt0 = ((3 - j) * nelt) % 3;
4450 int nelt1 = ((3 - j) * nelt + 1) % 3;
4451 int nelt2 = ((3 - j) * nelt + 2) % 3;
4452 for (i = 0; i < nelt; i++)
4454 if (3 * i + nelt0 < nelt)
4455 sel[3 * i + nelt0] = j0++;
4456 if (3 * i + nelt1 < nelt)
4457 sel[3 * i + nelt1] = nelt + j1++;
4458 if (3 * i + nelt2 < nelt)
4459 sel[3 * i + nelt2] = 0;
4461 if (!can_vec_perm_p (mode, false, sel))
4463 if (dump_enabled_p ())
4464 dump_printf (MSG_MISSED_OPTIMIZATION,
4465 "permutaion op not supported by target.\n");
4466 return false;
4469 for (i = 0; i < nelt; i++)
4471 if (3 * i + nelt0 < nelt)
4472 sel[3 * i + nelt0] = 3 * i + nelt0;
4473 if (3 * i + nelt1 < nelt)
4474 sel[3 * i + nelt1] = 3 * i + nelt1;
4475 if (3 * i + nelt2 < nelt)
4476 sel[3 * i + nelt2] = nelt + j2++;
4478 if (!can_vec_perm_p (mode, false, sel))
4480 if (dump_enabled_p ())
4481 dump_printf (MSG_MISSED_OPTIMIZATION,
4482 "permutaion op not supported by target.\n");
4483 return false;
4486 return true;
4488 else
4490 /* If length is not equal to 3 then only power of 2 is supported. */
4491 gcc_assert (exact_log2 (count) != -1);
4493 for (i = 0; i < nelt / 2; i++)
4495 sel[i * 2] = i;
4496 sel[i * 2 + 1] = i + nelt;
4498 if (can_vec_perm_p (mode, false, sel))
4500 for (i = 0; i < nelt; i++)
4501 sel[i] += nelt / 2;
4502 if (can_vec_perm_p (mode, false, sel))
4503 return true;
4508 if (dump_enabled_p ())
4509 dump_printf (MSG_MISSED_OPTIMIZATION,
4510 "permutaion op not supported by target.\n");
4511 return false;
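/* A minimal standalone sketch (not part of GCC; NELT is hypothetical) that
   reproduces the power-of-two selector computation above and prints the
   interleave-high and interleave-low masks the target is asked to support.
   Compile the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>

#define NELT 8

int
main (void)
{
  unsigned char sel[NELT];

  /* Interleave high: {0, NELT, 1, NELT + 1, ...}.  */
  for (int i = 0; i < NELT / 2; i++)
    {
      sel[i * 2] = (unsigned char) i;
      sel[i * 2 + 1] = (unsigned char) (i + NELT);
    }
  printf ("high:");
  for (int i = 0; i < NELT; i++)
    printf (" %d", sel[i]);                  /* 0 8 1 9 2 10 3 11 */

  /* Interleave low: the same mask shifted by NELT / 2.  */
  for (int i = 0; i < NELT; i++)
    sel[i] = (unsigned char) (sel[i] + NELT / 2);
  printf ("\nlow: ");
  for (int i = 0; i < NELT; i++)
    printf (" %d", sel[i]);                  /* 4 12 5 13 6 14 7 15 */
  printf ("\n");
  return 0;
}
#endif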
4515 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4516 type VECTYPE. */
4518 bool
4519 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4521 return vect_lanes_optab_supported_p ("vec_store_lanes",
4522 vec_store_lanes_optab,
4523 vectype, count);
4527 /* Function vect_permute_store_chain.
4529 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4530 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4531 the data correctly for the stores. Return the final references for stores
4532 in RESULT_CHAIN.
4534 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4535 The input is 4 vectors each containing 8 elements. We assign a number to
4536 each element, the input sequence is:
4538 1st vec: 0 1 2 3 4 5 6 7
4539 2nd vec: 8 9 10 11 12 13 14 15
4540 3rd vec: 16 17 18 19 20 21 22 23
4541 4th vec: 24 25 26 27 28 29 30 31
4543 The output sequence should be:
4545 1st vec: 0 8 16 24 1 9 17 25
4546 2nd vec: 2 10 18 26 3 11 19 27
4547 3rd vec: 4 12 20 28 5 13 21 29
4548 4th vec: 6 14 22 30 7 15 23 31
4550 i.e., we interleave the contents of the four vectors in their order.
4552 We use interleave_high/low instructions to create such output. The input of
4553 each interleave_high/low operation is two vectors:
4554 1st vec 2nd vec
4555 0 1 2 3 4 5 6 7
4556 the even elements of the result vector are obtained left-to-right from the
4557 high/low elements of the first vector. The odd elements of the result are
4558 obtained left-to-right from the high/low elements of the second vector.
4559 The output of interleave_high will be: 0 4 1 5
4560 and of interleave_low: 2 6 3 7
4563 The permutation is done in log LENGTH stages. In each stage interleave_high
4564 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4565 where the first argument is taken from the first half of DR_CHAIN and the
4566 second argument from its second half.
4567 In our example,
4569 I1: interleave_high (1st vec, 3rd vec)
4570 I2: interleave_low (1st vec, 3rd vec)
4571 I3: interleave_high (2nd vec, 4th vec)
4572 I4: interleave_low (2nd vec, 4th vec)
4574 The output for the first stage is:
4576 I1: 0 16 1 17 2 18 3 19
4577 I2: 4 20 5 21 6 22 7 23
4578 I3: 8 24 9 25 10 26 11 27
4579 I4: 12 28 13 29 14 30 15 31
4581 The output of the second stage, i.e. the final result is:
4583 I1: 0 8 16 24 1 9 17 25
4584 I2: 2 10 18 26 3 11 19 27
4585 I3: 4 12 20 28 5 13 21 29
4586 I4: 6 14 22 30 7 15 23 31. */
4588 void
4589 vect_permute_store_chain (vec<tree> dr_chain,
4590 unsigned int length,
4591 gimple stmt,
4592 gimple_stmt_iterator *gsi,
4593 vec<tree> *result_chain)
4595 tree vect1, vect2, high, low;
4596 gimple perm_stmt;
4597 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4598 tree perm_mask_low, perm_mask_high;
4599 tree data_ref;
4600 tree perm3_mask_low, perm3_mask_high;
4601 unsigned int i, n, log_length = exact_log2 (length);
4602 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4603 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4605 result_chain->quick_grow (length);
4606 memcpy (result_chain->address (), dr_chain.address (),
4607 length * sizeof (tree));
4609 if (length == 3)
4611 unsigned int j0 = 0, j1 = 0, j2 = 0;
4613 for (j = 0; j < 3; j++)
4615 int nelt0 = ((3 - j) * nelt) % 3;
4616 int nelt1 = ((3 - j) * nelt + 1) % 3;
4617 int nelt2 = ((3 - j) * nelt + 2) % 3;
4619 for (i = 0; i < nelt; i++)
4621 if (3 * i + nelt0 < nelt)
4622 sel[3 * i + nelt0] = j0++;
4623 if (3 * i + nelt1 < nelt)
4624 sel[3 * i + nelt1] = nelt + j1++;
4625 if (3 * i + nelt2 < nelt)
4626 sel[3 * i + nelt2] = 0;
4628 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4630 for (i = 0; i < nelt; i++)
4632 if (3 * i + nelt0 < nelt)
4633 sel[3 * i + nelt0] = 3 * i + nelt0;
4634 if (3 * i + nelt1 < nelt)
4635 sel[3 * i + nelt1] = 3 * i + nelt1;
4636 if (3 * i + nelt2 < nelt)
4637 sel[3 * i + nelt2] = nelt + j2++;
4639 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4641 vect1 = dr_chain[0];
4642 vect2 = dr_chain[1];
4644 /* Create interleaving stmt:
4645 low = VEC_PERM_EXPR <vect1, vect2,
4646 {j, nelt, *, j + 1, nelt + j + 1, *,
4647 j + 2, nelt + j + 2, *, ...}> */
4648 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4649 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4650 vect2, perm3_mask_low);
4651 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4653 vect1 = data_ref;
4654 vect2 = dr_chain[2];
4655 /* Create interleaving stmt:
4656 low = VEC_PERM_EXPR <vect1, vect2,
4657 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4658 6, 7, nelt + j + 2, ...}> */
4659 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4660 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4661 vect2, perm3_mask_high);
4662 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4663 (*result_chain)[j] = data_ref;
4666 else
4668 /* If length is not equal to 3 then only power of 2 is supported. */
4669 gcc_assert (exact_log2 (length) != -1);
4671 for (i = 0, n = nelt / 2; i < n; i++)
4673 sel[i * 2] = i;
4674 sel[i * 2 + 1] = i + nelt;
4676 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4678 for (i = 0; i < nelt; i++)
4679 sel[i] += nelt / 2;
4680 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4682 for (i = 0, n = log_length; i < n; i++)
4684 for (j = 0; j < length/2; j++)
4686 vect1 = dr_chain[j];
4687 vect2 = dr_chain[j+length/2];
4689 /* Create interleaving stmt:
4690 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4691 ...}> */
4692 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4693 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4694 vect2, perm_mask_high);
4695 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4696 (*result_chain)[2*j] = high;
4698 /* Create interleaving stmt:
4699 low = VEC_PERM_EXPR <vect1, vect2,
4700 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4701 ...}> */
4702 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4703 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4704 vect2, perm_mask_low);
4705 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4706 (*result_chain)[2*j+1] = low;
4708 memcpy (dr_chain.address (), result_chain->address (),
4709 length * sizeof (tree));
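/* A minimal standalone sketch (not part of GCC; LENGTH and NELT are
   hypothetical) that replays the power-of-two interleaving above on
   element numbers instead of SSA vectors, reproducing the I1..I4 tables
   in the comment (4 vectors of 8 elements).  Compile the fragment on its
   own (C99 or later).  */
#if 0
#include <stdio.h>
#include <string.h>

#define LENGTH 4
#define NELT 8

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  /* Number the input elements 0..31 as in the comment above.  */
  for (int v = 0; v < LENGTH; v++)
    for (int e = 0; e < NELT; e++)
      chain[v][e] = v * NELT + e;

  for (int stage = 0; stage < 2; stage++)     /* log2 (LENGTH) stages */
    {
      for (int j = 0; j < LENGTH / 2; j++)
        {
          const int *v1 = chain[j], *v2 = chain[j + LENGTH / 2];
          for (int k = 0; k < NELT; k++)
            {
              /* interleave_high: {0, NELT, 1, NELT + 1, ...}.  */
              result[2 * j][k] = (k & 1) ? v2[k / 2] : v1[k / 2];
              /* interleave_low: the same selectors shifted by NELT / 2.  */
              result[2 * j + 1][k]
                = (k & 1) ? v2[NELT / 2 + k / 2] : v1[NELT / 2 + k / 2];
            }
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LENGTH; v++)
    {
      for (int e = 0; e < NELT; e++)
        printf (" %2d", chain[v][e]);
      printf ("\n");      /* 0 8 16 24 1 9 17 25 / 2 10 18 26 ... */
    }
  return 0;
}
#endif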
4714 /* Function vect_setup_realignment
4716 This function is called when vectorizing an unaligned load using
4717 the dr_explicit_realign[_optimized] scheme.
4718 This function generates the following code at the loop prolog:
4720 p = initial_addr;
4721 x msq_init = *(floor(p)); # prolog load
4722 realignment_token = call target_builtin;
4723 loop:
4724 x msq = phi (msq_init, ---)
4726 The stmts marked with x are generated only for the case of
4727 dr_explicit_realign_optimized.
4729 The code above sets up a new (vector) pointer, pointing to the first
4730 location accessed by STMT, and a "floor-aligned" load using that pointer.
4731 It also generates code to compute the "realignment-token" (if the relevant
4732 target hook was defined), and creates a phi-node at the loop-header bb
4733 whose arguments are the result of the prolog-load (created by this
4734 function) and the result of a load that takes place in the loop (to be
4735 created by the caller to this function).
4737 For the case of dr_explicit_realign_optimized:
4738 The caller to this function uses the phi-result (msq) to create the
4739 realignment code inside the loop, and sets up the missing phi argument,
4740 as follows:
4741 loop:
4742 msq = phi (msq_init, lsq)
4743 lsq = *(floor(p')); # load in loop
4744 result = realign_load (msq, lsq, realignment_token);
4746 For the case of dr_explicit_realign:
4747 loop:
4748 msq = *(floor(p)); # load in loop
4749 p' = p + (VS-1);
4750 lsq = *(floor(p')); # load in loop
4751 result = realign_load (msq, lsq, realignment_token);
4753 Input:
4754 STMT - (scalar) load stmt to be vectorized. This load accesses
4755 a memory location that may be unaligned.
4756 BSI - place where new code is to be inserted.
4757 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4758 is used.
4760 Output:
4761 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4762 target hook, if defined.
4763 Return value - the result of the loop-header phi node. */
4765 tree
4766 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4767 tree *realignment_token,
4768 enum dr_alignment_support alignment_support_scheme,
4769 tree init_addr,
4770 struct loop **at_loop)
4772 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4775 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4776 struct loop *loop = NULL;
4777 edge pe = NULL;
4778 tree scalar_dest = gimple_assign_lhs (stmt);
4779 tree vec_dest;
4780 gimple inc;
4781 tree ptr;
4782 tree data_ref;
4783 basic_block new_bb;
4784 tree msq_init = NULL_TREE;
4785 tree new_temp;
4786 gphi *phi_stmt;
4787 tree msq = NULL_TREE;
4788 gimple_seq stmts = NULL;
4789 bool inv_p;
4790 bool compute_in_loop = false;
4791 bool nested_in_vect_loop = false;
4792 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4793 struct loop *loop_for_initial_load = NULL;
4795 if (loop_vinfo)
4797 loop = LOOP_VINFO_LOOP (loop_vinfo);
4798 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4801 gcc_assert (alignment_support_scheme == dr_explicit_realign
4802 || alignment_support_scheme == dr_explicit_realign_optimized);
4804 /* We need to generate three things:
4805 1. the misalignment computation
4806 2. the extra vector load (for the optimized realignment scheme).
4807 3. the phi node for the two vectors from which the realignment is
4808 done (for the optimized realignment scheme). */
4810 /* 1. Determine where to generate the misalignment computation.
4812 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4813 calculation will be generated by this function, outside the loop (in the
4814 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4815 caller, inside the loop.
4817 Background: If the misalignment remains fixed throughout the iterations of
4818 the loop, then both realignment schemes are applicable, and also the
4819 misalignment computation can be done outside LOOP. This is because we are
4820 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4821 are a multiple of VS (the Vector Size), and therefore the misalignment in
4822 different vectorized LOOP iterations is always the same.
4823 The problem arises only if the memory access is in an inner-loop nested
4824 inside LOOP, which is now being vectorized using outer-loop vectorization.
4825 This is the only case when the misalignment of the memory access may not
4826 remain fixed throughout the iterations of the inner-loop (as explained in
4827 detail in vect_supportable_dr_alignment). In this case, not only is the
4828 optimized realignment scheme not applicable, but also the misalignment
4829 computation (and generation of the realignment token that is passed to
4830 REALIGN_LOAD) has to be done inside the loop.
4832 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4833 or not, which in turn determines if the misalignment is computed inside
4834 the inner-loop, or outside LOOP. */
4836 if (init_addr != NULL_TREE || !loop_vinfo)
4838 compute_in_loop = true;
4839 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4843 /* 2. Determine where to generate the extra vector load.
4845 For the optimized realignment scheme, instead of generating two vector
4846 loads in each iteration, we generate a single extra vector load in the
4847 preheader of the loop, and in each iteration reuse the result of the
4848 vector load from the previous iteration. In case the memory access is in
4849 an inner-loop nested inside LOOP, which is now being vectorized using
4850 outer-loop vectorization, we need to determine whether this initial vector
4851 load should be generated at the preheader of the inner-loop, or can be
4852 generated at the preheader of LOOP. If the memory access has no evolution
4853 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4854 to be generated inside LOOP (in the preheader of the inner-loop). */
4856 if (nested_in_vect_loop)
4858 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4859 bool invariant_in_outerloop =
4860 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4861 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4863 else
4864 loop_for_initial_load = loop;
4865 if (at_loop)
4866 *at_loop = loop_for_initial_load;
4868 if (loop_for_initial_load)
4869 pe = loop_preheader_edge (loop_for_initial_load);
4871 /* 3. For the case of the optimized realignment, create the first vector
4872 load at the loop preheader. */
4874 if (alignment_support_scheme == dr_explicit_realign_optimized)
4876 /* Create msq_init = *(floor(p1)) in the loop preheader */
4877 gassign *new_stmt;
4879 gcc_assert (!compute_in_loop);
4880 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4881 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4882 NULL_TREE, &init_addr, NULL, &inc,
4883 true, &inv_p);
4884 new_temp = copy_ssa_name (ptr);
4885 new_stmt = gimple_build_assign
4886 (new_temp, BIT_AND_EXPR, ptr,
4887 build_int_cst (TREE_TYPE (ptr),
4888 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4889 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4890 gcc_assert (!new_bb);
4891 data_ref
4892 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4893 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4894 new_stmt = gimple_build_assign (vec_dest, data_ref);
4895 new_temp = make_ssa_name (vec_dest, new_stmt);
4896 gimple_assign_set_lhs (new_stmt, new_temp);
4897 if (pe)
4899 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4900 gcc_assert (!new_bb);
4902 else
4903 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4905 msq_init = gimple_assign_lhs (new_stmt);
4908 /* 4. Create realignment token using a target builtin, if available.
4909 It is done either inside the containing loop, or before LOOP (as
4910 determined above). */
4912 if (targetm.vectorize.builtin_mask_for_load)
4914 gcall *new_stmt;
4915 tree builtin_decl;
4917 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4918 if (!init_addr)
4920 /* Generate the INIT_ADDR computation outside LOOP. */
4921 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4922 NULL_TREE, loop);
4923 if (loop)
4925 pe = loop_preheader_edge (loop);
4926 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4927 gcc_assert (!new_bb);
4929 else
4930 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4933 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4934 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4935 vec_dest =
4936 vect_create_destination_var (scalar_dest,
4937 gimple_call_return_type (new_stmt));
4938 new_temp = make_ssa_name (vec_dest, new_stmt);
4939 gimple_call_set_lhs (new_stmt, new_temp);
4941 if (compute_in_loop)
4942 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4943 else
4945 /* Generate the misalignment computation outside LOOP. */
4946 pe = loop_preheader_edge (loop);
4947 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4948 gcc_assert (!new_bb);
4951 *realignment_token = gimple_call_lhs (new_stmt);
4953 /* The result of the CALL_EXPR to this builtin is determined from
4954 the value of the parameter and no global variables are touched
4955 which makes the builtin a "const" function. Requiring the
4956 builtin to have the "const" attribute makes it unnecessary
4957 to call mark_call_clobbered. */
4958 gcc_assert (TREE_READONLY (builtin_decl));
4961 if (alignment_support_scheme == dr_explicit_realign)
4962 return msq;
4964 gcc_assert (!compute_in_loop);
4965 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4968 /* 5. Create msq = phi <msq_init, lsq> in loop */
4970 pe = loop_preheader_edge (containing_loop);
4971 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4972 msq = make_ssa_name (vec_dest);
4973 phi_stmt = create_phi_node (msq, containing_loop->header);
4974 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4976 return msq;
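/* A minimal standalone sketch (not part of GCC; the element type, the toy
   vector width VS and the misalignment are made up) of the idea the code
   above sets up: load the two aligned "vectors" straddling an unaligned
   address (msq and lsq) and combine them, with the misalignment playing
   the role of the realignment token.  Compile the fragment on its own
   (C99 or later).  */
#if 0
#include <stdio.h>

#define VS 4   /* elements per vector in this toy model */

/* Toy realign_load: pick VS consecutive elements starting at offset
   OFS (0 <= OFS < VS) out of the concatenation MSQ | LSQ.  */
static void
realign_load (const int *msq, const int *lsq, int ofs, int *out)
{
  for (int i = 0; i < VS; i++)
    out[i] = ofs + i < VS ? msq[ofs + i] : lsq[ofs + i - VS];
}

int
main (void)
{
  int a[3 * VS];
  for (int i = 0; i < 3 * VS; i++)
    a[i] = i;

  int ofs = 1;                  /* misalignment, in elements */
  const int *p = &a[ofs];       /* the unaligned access */
  const int *msq = p - ofs;     /* msq = *(floor (p)): aligned load below p */
  const int *lsq = msq + VS;    /* lsq: the next aligned load */

  int out[VS];
  realign_load (msq, lsq, ofs, out);
  for (int i = 0; i < VS; i++)
    printf ("%d ", out[i]);     /* 1 2 3 4 */
  printf ("\n");
  return 0;
}
#endif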
4980 /* Function vect_grouped_load_supported.
4982 Returns TRUE if even and odd permutations are supported,
4983 and FALSE otherwise. */
4985 bool
4986 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4988 machine_mode mode = TYPE_MODE (vectype);
4990 /* vect_permute_load_chain requires the group size to be equal to 3 or
4991 be a power of two. */
4992 if (count != 3 && exact_log2 (count) == -1)
4994 if (dump_enabled_p ())
4995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4996 "the size of the group of accesses"
4997 " is not a power of 2 or not equal to 3\n");
4998 return false;
5001 /* Check that the permutation is supported. */
5002 if (VECTOR_MODE_P (mode))
5004 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5005 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5007 if (count == 3)
5009 unsigned int k;
5010 for (k = 0; k < 3; k++)
5012 for (i = 0; i < nelt; i++)
5013 if (3 * i + k < 2 * nelt)
5014 sel[i] = 3 * i + k;
5015 else
5016 sel[i] = 0;
5017 if (!can_vec_perm_p (mode, false, sel))
5019 if (dump_enabled_p ())
5020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5021 "shuffle of 3 loads is not supported by"
5022 " target\n");
5023 return false;
5025 for (i = 0, j = 0; i < nelt; i++)
5026 if (3 * i + k < 2 * nelt)
5027 sel[i] = i;
5028 else
5029 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5030 if (!can_vec_perm_p (mode, false, sel))
5032 if (dump_enabled_p ())
5033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5034 "shuffle of 3 loads is not supported by"
5035 " target\n");
5036 return false;
5039 return true;
5041 else
5043 /* If length is not equal to 3 then only power of 2 is supported. */
5044 gcc_assert (exact_log2 (count) != -1);
5045 for (i = 0; i < nelt; i++)
5046 sel[i] = i * 2;
5047 if (can_vec_perm_p (mode, false, sel))
5049 for (i = 0; i < nelt; i++)
5050 sel[i] = i * 2 + 1;
5051 if (can_vec_perm_p (mode, false, sel))
5052 return true;
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "extract even/odd not supported by target\n");
5060 return false;
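/* A minimal standalone sketch (not part of GCC; NELT is hypothetical) that
   builds the two selectors checked above for the three-load case and
   applies them to element numbers, reproducing the
   "0 3 6 ... / 1 4 7 ... / 2 5 8 ..." grouping that vect_permute_load_chain
   produces.  Compile the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>

#define NELT 8

/* out[i] = concat (a, b)[sel[i]], i.e. a VEC_PERM_EXPR on indices.  */
static void
apply_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[3][NELT], tmp[NELT], out[NELT];
  unsigned char sel_low[NELT], sel_high[NELT];

  for (int v = 0; v < 3; v++)
    for (int e = 0; e < NELT; e++)
      chain[v][e] = v * NELT + e;

  for (int k = 0; k < 3; k++)
    {
      /* First selector: {k, 3 + k, 6 + k, ...}, don't-cares as 0.  */
      for (int i = 0; i < NELT; i++)
        sel_low[i] = 3 * i + k < 2 * NELT ? (unsigned char) (3 * i + k) : 0;
      /* Second selector: keep what we have, fill the tail from vec 3.  */
      for (int i = 0, j = 0; i < NELT; i++)
        sel_high[i] = 3 * i + k < 2 * NELT
                      ? (unsigned char) i
                      : (unsigned char) (NELT + (NELT + k) % 3 + 3 * j++);

      apply_perm (chain[0], chain[1], sel_low, tmp);
      apply_perm (tmp, chain[2], sel_high, out);

      for (int i = 0; i < NELT; i++)
        printf (" %2d", out[i]);     /* k = 0: 0 3 6 9 12 15 18 21 */
      printf ("\n");
    }
  return 0;
}
#endif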
5063 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5064 type VECTYPE. */
5066 bool
5067 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5069 return vect_lanes_optab_supported_p ("vec_load_lanes",
5070 vec_load_lanes_optab,
5071 vectype, count);
5074 /* Function vect_permute_load_chain.
5076 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5077 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5078 the input data correctly. Return the final references for loads in
5079 RESULT_CHAIN.
5081 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5082 The input is 4 vectors each containing 8 elements. We assign a number to each
5083 element, the input sequence is:
5085 1st vec: 0 1 2 3 4 5 6 7
5086 2nd vec: 8 9 10 11 12 13 14 15
5087 3rd vec: 16 17 18 19 20 21 22 23
5088 4th vec: 24 25 26 27 28 29 30 31
5090 The output sequence should be:
5092 1st vec: 0 4 8 12 16 20 24 28
5093 2nd vec: 1 5 9 13 17 21 25 29
5094 3rd vec: 2 6 10 14 18 22 26 30
5095 4th vec: 3 7 11 15 19 23 27 31
5097 i.e., the first output vector should contain the first elements of each
5098 interleaving group, etc.
5100 We use extract_even/odd instructions to create such output. The input of
5101 each extract_even/odd operation is two vectors
5102 1st vec 2nd vec
5103 0 1 2 3 4 5 6 7
5105 and the output is the vector of extracted even/odd elements. The output of
5106 extract_even will be: 0 2 4 6
5107 and of extract_odd: 1 3 5 7
5110 The permutation is done in log LENGTH stages. In each stage extract_even
5111 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5112 their order. In our example,
5114 E1: extract_even (1st vec, 2nd vec)
5115 E2: extract_odd (1st vec, 2nd vec)
5116 E3: extract_even (3rd vec, 4th vec)
5117 E4: extract_odd (3rd vec, 4th vec)
5119 The output for the first stage will be:
5121 E1: 0 2 4 6 8 10 12 14
5122 E2: 1 3 5 7 9 11 13 15
5123 E3: 16 18 20 22 24 26 28 30
5124 E4: 17 19 21 23 25 27 29 31
5126 In order to proceed and create the correct sequence for the next stage (or
5127 for the correct output, if the second stage is the last one, as in our
5128 example), we first put the output of extract_even operation and then the
5129 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5130 The input for the second stage is:
5132 1st vec (E1): 0 2 4 6 8 10 12 14
5133 2nd vec (E3): 16 18 20 22 24 26 28 30
5134 3rd vec (E2): 1 3 5 7 9 11 13 15
5135 4th vec (E4): 17 19 21 23 25 27 29 31
5137 The output of the second stage:
5139 E1: 0 4 8 12 16 20 24 28
5140 E2: 2 6 10 14 18 22 26 30
5141 E3: 1 5 9 13 17 21 25 29
5142 E4: 3 7 11 15 19 23 27 31
5144 And RESULT_CHAIN after reordering:
5146 1st vec (E1): 0 4 8 12 16 20 24 28
5147 2nd vec (E3): 1 5 9 13 17 21 25 29
5148 3rd vec (E2): 2 6 10 14 18 22 26 30
5149 4th vec (E4): 3 7 11 15 19 23 27 31. */
5151 static void
5152 vect_permute_load_chain (vec<tree> dr_chain,
5153 unsigned int length,
5154 gimple stmt,
5155 gimple_stmt_iterator *gsi,
5156 vec<tree> *result_chain)
5158 tree data_ref, first_vect, second_vect;
5159 tree perm_mask_even, perm_mask_odd;
5160 tree perm3_mask_low, perm3_mask_high;
5161 gimple perm_stmt;
5162 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5163 unsigned int i, j, log_length = exact_log2 (length);
5164 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5165 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5167 result_chain->quick_grow (length);
5168 memcpy (result_chain->address (), dr_chain.address (),
5169 length * sizeof (tree));
5171 if (length == 3)
5173 unsigned int k;
5175 for (k = 0; k < 3; k++)
5177 for (i = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = 3 * i + k;
5180 else
5181 sel[i] = 0;
5182 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5184 for (i = 0, j = 0; i < nelt; i++)
5185 if (3 * i + k < 2 * nelt)
5186 sel[i] = i;
5187 else
5188 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5190 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5192 first_vect = dr_chain[0];
5193 second_vect = dr_chain[1];
5195 /* Create interleaving stmt (low part of):
5196 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5197 ...}> */
5198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5200 second_vect, perm3_mask_low);
5201 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5203 /* Create interleaving stmt (high part of):
5204 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5205 ...}> */
5206 first_vect = data_ref;
5207 second_vect = dr_chain[2];
5208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5210 second_vect, perm3_mask_high);
5211 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5212 (*result_chain)[k] = data_ref;
5215 else
5217 /* If length is not equal to 3 then only power of 2 is supported. */
5218 gcc_assert (exact_log2 (length) != -1);
5220 for (i = 0; i < nelt; ++i)
5221 sel[i] = i * 2;
5222 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5224 for (i = 0; i < nelt; ++i)
5225 sel[i] = i * 2 + 1;
5226 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5228 for (i = 0; i < log_length; i++)
5230 for (j = 0; j < length; j += 2)
5232 first_vect = dr_chain[j];
5233 second_vect = dr_chain[j+1];
5235 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5236 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5237 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5238 first_vect, second_vect,
5239 perm_mask_even);
5240 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241 (*result_chain)[j/2] = data_ref;
5243 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5244 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5245 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5246 first_vect, second_vect,
5247 perm_mask_odd);
5248 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5249 (*result_chain)[j/2+length/2] = data_ref;
5251 memcpy (dr_chain.address (), result_chain->address (),
5252 length * sizeof (tree));
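/* A minimal standalone sketch (not part of GCC; LENGTH and NELT are
   hypothetical) that replays the power-of-two extract-even/odd stages
   above on element numbers, reproducing the E1..E4 tables and the final
   RESULT_CHAIN order in the comment (4 vectors of 8 elements).  Compile
   the fragment on its own (C99 or later).  */
#if 0
#include <stdio.h>
#include <string.h>

#define LENGTH 4
#define NELT 8

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  for (int v = 0; v < LENGTH; v++)
    for (int e = 0; e < NELT; e++)
      chain[v][e] = v * NELT + e;

  for (int stage = 0; stage < 2; stage++)      /* log2 (LENGTH) stages */
    {
      for (int j = 0; j < LENGTH; j += 2)
        {
          const int *first = chain[j], *second = chain[j + 1];
          for (int i = 0; i < NELT; i++)
            {
              /* Element 2*i (even) resp. 2*i + 1 (odd) of first|second.  */
              int even = 2 * i, odd = 2 * i + 1;
              result[j / 2][i]
                = even < NELT ? first[even] : second[even - NELT];
              result[j / 2 + LENGTH / 2][i]
                = odd < NELT ? first[odd] : second[odd - NELT];
            }
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LENGTH; v++)
    {
      for (int e = 0; e < NELT; e++)
        printf (" %2d", chain[v][e]);
      printf ("\n");    /* 0 4 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31 */
    }
  return 0;
}
#endif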
5257 /* Function vect_shift_permute_load_chain.
5259 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5260 sequence of stmts to reorder the input data accordingly.
5261 Return the final references for loads in RESULT_CHAIN.
5262 Return true if successful, false otherwise.
5264 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5265 The input is 3 vectors each containing 8 elements. We assign a
5266 number to each element, the input sequence is:
5268 1st vec: 0 1 2 3 4 5 6 7
5269 2nd vec: 8 9 10 11 12 13 14 15
5270 3rd vec: 16 17 18 19 20 21 22 23
5272 The output sequence should be:
5274 1st vec: 0 3 6 9 12 15 18 21
5275 2nd vec: 1 4 7 10 13 16 19 22
5276 3rd vec: 2 5 8 11 14 17 20 23
5278 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5280 First we shuffle all 3 vectors to get correct elements order:
5282 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5283 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5284 3rd vec: (16 19 22) (17 20 23) (18 21)
5286 Next we unite and shift vector 3 times:
5288 1st step:
5289 shift right by 6 the concatenation of:
5290 "1st vec" and "2nd vec"
5291 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5292 "2nd vec" and "3rd vec"
5293 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5294 "3rd vec" and "1st vec"
5295 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5296 | New vectors |
5298 So that now new vectors are:
5300 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5301 2nd vec: (10 13) (16 19 22) (17 20 23)
5302 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5304 2nd step:
5305 shift right by 5 the concatenation of:
5306 "1st vec" and "3rd vec"
5307 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5308 "2nd vec" and "1st vec"
5309 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5310 "3rd vec" and "2nd vec"
5311 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5312 | New vectors |
5314 So that now new vectors are:
5316 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5317 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5318 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5320 3rd step:
5321 shift right by 5 the concatenation of:
5322 "1st vec" and "1st vec"
5323 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5324 shift right by 3 the concatenation of:
5325 "2nd vec" and "2nd vec"
5326 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5327 | New vectors |
5329 So that now all vectors are READY:
5330 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5331 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5332 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5334 This algorithm is faster than the one in vect_permute_load_chain if:
5335 1. "shift of a concatenation" is faster than general permutation.
5336 This is usually so.
5337 2. The TARGET machine can't execute vector instructions in parallel.
5338 This is because each step of the algorithm depends on the previous one.
5339 The algorithm in vect_permute_load_chain is much more parallel.
5341 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5344 static bool
5345 vect_shift_permute_load_chain (vec<tree> dr_chain,
5346 unsigned int length,
5347 gimple stmt,
5348 gimple_stmt_iterator *gsi,
5349 vec<tree> *result_chain)
5351 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5352 tree perm2_mask1, perm2_mask2, perm3_mask;
5353 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5354 gimple perm_stmt;
5356 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5357 unsigned int i;
5358 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5359 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5360 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5361 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5363 result_chain->quick_grow (length);
5364 memcpy (result_chain->address (), dr_chain.address (),
5365 length * sizeof (tree));
5367 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5369 unsigned int j, log_length = exact_log2 (length);
5370 for (i = 0; i < nelt / 2; ++i)
5371 sel[i] = i * 2;
5372 for (i = 0; i < nelt / 2; ++i)
5373 sel[nelt / 2 + i] = i * 2 + 1;
5374 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5376 if (dump_enabled_p ())
5377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5378 "shuffle of 2 fields structure is not \
5379 supported by target\n");
5380 return false;
5382 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5384 for (i = 0; i < nelt / 2; ++i)
5385 sel[i] = i * 2 + 1;
5386 for (i = 0; i < nelt / 2; ++i)
5387 sel[nelt / 2 + i] = i * 2;
5388 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5390 if (dump_enabled_p ())
5391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5392 "shuffle of 2 fields structure is not \
5393 supported by target\n");
5394 return false;
5396 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5398 /* Generating permutation constant to shift all elements.
5399 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5400 for (i = 0; i < nelt; i++)
5401 sel[i] = nelt / 2 + i;
5402 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5404 if (dump_enabled_p ())
5405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5406 "shift permutation is not supported by target\n");
5407 return false;
5409 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5411 /* Generating permutation constant to select a vector from two vectors.
5412 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5413 for (i = 0; i < nelt / 2; i++)
5414 sel[i] = i;
5415 for (i = nelt / 2; i < nelt; i++)
5416 sel[i] = nelt + i;
5417 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5419 if (dump_enabled_p ())
5420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421 "select is not supported by target\n");
5422 return false;
5424 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5426 for (i = 0; i < log_length; i++)
5428 for (j = 0; j < length; j += 2)
5430 first_vect = dr_chain[j];
5431 second_vect = dr_chain[j + 1];
5433 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5434 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5435 first_vect, first_vect,
5436 perm2_mask1);
5437 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5438 vect[0] = data_ref;
5440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5442 second_vect, second_vect,
5443 perm2_mask2);
5444 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5445 vect[1] = data_ref;
5447 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5448 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5449 vect[0], vect[1], shift1_mask);
5450 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5451 (*result_chain)[j/2 + length/2] = data_ref;
5453 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5454 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5455 vect[0], vect[1], select_mask);
5456 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5457 (*result_chain)[j/2] = data_ref;
5459 memcpy (dr_chain.address (), result_chain->address (),
5460 length * sizeof (tree));
5462 return true;
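/* Illustrative note added for exposition (not in the original sources):
   for an 8-element vector (nelt == 8) the masks built above are
       perm2_mask1 = { 0 2 4 6 1 3 5 7 }
       perm2_mask2 = { 1 3 5 7 0 2 4 6 }
       shift1_mask = { 4 5 6 7 8 9 10 11 }
       select_mask = { 0 1 2 3 12 13 14 15 }
   so each j-iteration places the even-numbered fields of the two input
   vectors in (*result_chain)[j/2] and the odd-numbered fields in
   (*result_chain)[j/2 + length/2].  */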
5464 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5466 unsigned int k = 0, l = 0;
5468 /* Generating permutation constant to get all elements in right order.
5469 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5470 for (i = 0; i < nelt; i++)
5472 if (3 * k + (l % 3) >= nelt)
5474 k = 0;
5475 l += (3 - (nelt % 3));
5477 sel[i] = 3 * k + (l % 3);
5478 k++;
5480 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5482 if (dump_enabled_p ())
5483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5484 "shuffle of 3 fields structure is not \
5485 supported by target\n");
5486 return false;
5488 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5490 /* Generating permutation constant to shift all elements.
5491 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5492 for (i = 0; i < nelt; i++)
5493 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5494 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498 "shift permutation is not supported by target\n");
5499 return false;
5501 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5503 /* Generating permutation constant to shift all elements.
5504 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5505 for (i = 0; i < nelt; i++)
5506 sel[i] = 2 * (nelt / 3) + 1 + i;
5507 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5509 if (dump_enabled_p ())
5510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5511 "shift permutation is not supported by target\n");
5512 return false;
5514 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5516 /* Generating permutation constant to shift all elements.
5517 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5518 for (i = 0; i < nelt; i++)
5519 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5520 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5522 if (dump_enabled_p ())
5523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524 "shift permutation is not supported by target\n");
5525 return false;
5527 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5529 /* Generating permutation constant to shift all elements.
5530 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5531 for (i = 0; i < nelt; i++)
5532 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shift permutation is not supported by target\n");
5538 return false;
5540 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5542 for (k = 0; k < 3; k++)
5544 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5545 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5546 dr_chain[k], dr_chain[k],
5547 perm3_mask);
5548 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5549 vect[k] = data_ref;
5552 for (k = 0; k < 3; k++)
5554 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5555 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5556 vect[k % 3], vect[(k + 1) % 3],
5557 shift1_mask);
5558 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559 vect_shift[k] = data_ref;
5562 for (k = 0; k < 3; k++)
5564 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5565 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566 vect_shift[(4 - k) % 3],
5567 vect_shift[(3 - k) % 3],
5568 shift2_mask);
5569 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5570 vect[k] = data_ref;
5573 (*result_chain)[3 - (nelt % 3)] = vect[2];
5575 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5576 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5577 vect[0], shift3_mask);
5578 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579 (*result_chain)[nelt % 3] = data_ref;
5581 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5582 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5583 vect[1], shift4_mask);
5584 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5585 (*result_chain)[0] = data_ref;
5586 return true;
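/* Illustrative note added for exposition (not in the original sources):
   for an 8-element vector (nelt == 8) the length == 3 branch above builds
       perm3_mask  = { 0 3 6 1 4 7 2 5 }
       shift1_mask = { 6 7 8 9 10 11 12 13 }
       shift2_mask = { 5 6 7 8 9 10 11 12 }
       shift3_mask = { 3 4 5 6 7 8 9 10 }
       shift4_mask = { 5 6 7 8 9 10 11 12 }
   and the three permuted output vectors end up in (*result_chain)[0],
   (*result_chain)[nelt % 3] == (*result_chain)[2] and
   (*result_chain)[3 - (nelt % 3)] == (*result_chain)[1].  */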
5588 return false;
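/* A minimal scalar model, added for exposition only (not part of the
   original sources; the function name and element type are made up for
   this sketch).  It mimics what the four VEC_PERM_EXPRs generated by the
   power-of-two branch of vect_shift_permute_load_chain compute for
   nelt == 8: the two input vectors of an interleaved two-field access are
   split into one vector of even-numbered fields and one of odd-numbered
   fields.  VEC_PERM_EXPR (v0, v1, m) selects element m[i] from the
   16-element concatenation of v0 and v1 (indices 8..15 refer to v1).  */

static void
shuffle2_scalar_model (const int first[8], const int second[8],
                       int even_out[8], int odd_out[8])
{
  static const unsigned char perm2_mask1[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
  static const unsigned char perm2_mask2[8] = { 1, 3, 5, 7, 0, 2, 4, 6 };
  static const unsigned char shift1_mask[8] = { 4, 5, 6, 7, 8, 9, 10, 11 };
  static const unsigned char select_mask[8] = { 0, 1, 2, 3, 12, 13, 14, 15 };
  int vect0[8], vect1[8];
  unsigned int i;

  /* VEC_PERM_EXPR (first, first, perm2_mask1): even fields of FIRST in the
     low half, odd fields in the high half.  */
  for (i = 0; i < 8; i++)
    vect0[i] = first[perm2_mask1[i] & 7];
  /* VEC_PERM_EXPR (second, second, perm2_mask2): odd fields of SECOND in
     the low half, even fields in the high half.  */
  for (i = 0; i < 8; i++)
    vect1[i] = second[perm2_mask2[i] & 7];
  /* VEC_PERM_EXPR (vect0, vect1, shift1_mask): all odd-numbered fields.  */
  for (i = 0; i < 8; i++)
    odd_out[i] = shift1_mask[i] < 8 ? vect0[shift1_mask[i]]
                                    : vect1[shift1_mask[i] - 8];
  /* VEC_PERM_EXPR (vect0, vect1, select_mask): all even-numbered fields.  */
  for (i = 0; i < 8; i++)
    even_out[i] = select_mask[i] < 8 ? vect0[select_mask[i]]
                                     : vect1[select_mask[i] - 8];
}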
5591 /* Function vect_transform_grouped_load.
5593 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5594 to perform their permutation and attach the resulting vectorized statements
5595 to the scalar statements.
5598 void
5599 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5600 gimple_stmt_iterator *gsi)
5602 machine_mode mode;
5603 vec<tree> result_chain = vNULL;
5605 /* DR_CHAIN contains the input data-refs that are part of the interleaving.
5606 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5607 permuted vectors, which are ready for vector computation. */
5608 result_chain.create (size);
5610 /* If the reassociation width for the vector type is 2 or greater, the target
5611 machine can execute 2 or more vector instructions in parallel. Otherwise
5612 try to get a chain for the load group using vect_shift_permute_load_chain. */
5613 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5614 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5615 || exact_log2 (size) != -1
5616 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5617 gsi, &result_chain))
5618 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5619 vect_record_grouped_load_vectors (stmt, result_chain);
5620 result_chain.release ();
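/* A hypothetical predicate, added only to spell out the condition in the
   function above (not part of the original sources; the name is made up).
   The shift-based scheme is attempted only when both of the tests below
   hold, and vect_permute_load_chain is used whenever it is not attempted
   or fails.  */

static bool
shift_scheme_worth_trying_p (machine_mode mode, int size)
{
  return (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) <= 1
          && exact_log2 (size) == -1);
}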
5623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5624 generated as part of the vectorization of STMT. Assign the statement
5625 for each vector to the associated scalar statement. */
5627 void
5628 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5630 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5631 gimple next_stmt, new_stmt;
5632 unsigned int i, gap_count;
5633 tree tmp_data_ref;
5635 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5636 Since we scan the chain starting from its first node, their order
5637 corresponds to the order of data-refs in RESULT_CHAIN. */
5638 next_stmt = first_stmt;
5639 gap_count = 1;
5640 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5642 if (!next_stmt)
5643 break;
5645 /* Skip the gaps. Loads created for the gaps will be removed later by the
5646 dead code elimination pass. No need to check for the first stmt in
5647 the group, since it always exists.
5648 GROUP_GAP is the number of steps in elements from the previous
5649 access (if there is no gap GROUP_GAP is 1). We skip loads that
5650 correspond to the gaps. */
5651 if (next_stmt != first_stmt
5652 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5654 gap_count++;
5655 continue;
5658 while (next_stmt)
5660 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5661 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5662 copies, and we put the new vector statement in the first available
5663 RELATED_STMT. */
5664 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5665 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5666 else
5668 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5670 gimple prev_stmt =
5671 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5672 gimple rel_stmt =
5673 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5674 while (rel_stmt)
5676 prev_stmt = rel_stmt;
5677 rel_stmt =
5678 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5681 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5682 new_stmt;
5686 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5687 gap_count = 1;
5688 /* If NEXT_STMT accesses the same DR as the previous statement,
5689 put the same TMP_DATA_REF as its vectorized statement; otherwise
5690 get the next data-ref from RESULT_CHAIN. */
5691 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5692 break;
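/* A minimal sketch added for exposition (not part of the original sources;
   the node type and function name are made up).  The else-branch above
   walks the RELATED_STMT chain hanging off the first vector statement and
   links the new vector statement at its tail, which is the same
   "find the tail, then append" pattern shown here on a plain C list.  */

struct related_chain_node
{
  struct related_chain_node *related;
};

static void
append_related_example (struct related_chain_node *first_vec_stmt,
                        struct related_chain_node *new_vec_stmt)
{
  struct related_chain_node *prev = first_vec_stmt;

  /* Walk to the last vector statement already recorded.  */
  while (prev->related)
    prev = prev->related;

  /* Record NEW_VEC_STMT as the next copy.  */
  new_vec_stmt->related = NULL;
  prev->related = new_vec_stmt;
}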
5697 /* Function vect_can_force_dr_alignment_p.
5699 Returns whether the alignment of a DECL can be forced to be aligned
5700 on ALIGNMENT bit boundary. */
5702 bool
5703 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5705 if (TREE_CODE (decl) != VAR_DECL)
5706 return false;
5708 /* With -fno-toplevel-reorder we may have already output the constant. */
5709 if (TREE_ASM_WRITTEN (decl))
5710 return false;
5712 /* Constant pool entries may be shared and not properly merged by LTO. */
5713 if (DECL_IN_CONSTANT_POOL (decl))
5714 return false;
5716 if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5718 symtab_node *snode;
5720 /* We cannot change the alignment of symbols that may bind to symbols
5721 in another translation unit that may contain a definition with a lower
5722 alignment. */
5723 if (!decl_binds_to_current_def_p (decl))
5724 return false;
5726 /* When compiling a partition, be sure the symbol is not output by
5727 another partition. */
5728 snode = symtab_node::get (decl);
5729 if (flag_ltrans
5730 && (snode->in_other_partition
5731 || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
5732 return false;
5735 /* Do not override the alignment as specified by the ABI when the used
5736 attribute is set. */
5737 if (DECL_PRESERVE_P (decl))
5738 return false;
5740 /* Do not override explicit alignment set by the user when an explicit
5741 section name is also used. This is a common idiom used by many
5742 software projects. */
5743 if (TREE_STATIC (decl)
5744 && DECL_SECTION_NAME (decl) != NULL
5745 && !symtab_node::get (decl)->implicit_section)
5746 return false;
5748 /* If symbol is an alias, we need to check that target is OK. */
5749 if (TREE_STATIC (decl))
5751 tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
5752 if (target != decl)
5754 if (DECL_PRESERVE_P (target))
5755 return false;
5756 decl = target;
5760 if (TREE_STATIC (decl))
5761 return (alignment <= MAX_OFILE_ALIGNMENT);
5762 else
5763 return (alignment <= MAX_STACK_ALIGNMENT);
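/* A hypothetical caller, added for exposition only (not part of the
   original sources; the function name is made up and the real call sites
   live elsewhere in the vectorizer).  The predicate above is meant to be
   checked before raising the declared alignment of the base object of a
   data reference, along these lines, assuming the GCC 5-era lvalue
   DECL_ALIGN interface.  */

static void
maybe_force_base_alignment (tree base, tree vectype)
{
  if (vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
    {
      DECL_ALIGN (base) = TYPE_ALIGN (vectype);
      DECL_USER_ALIGN (base) = 1;
    }
}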
5767 /* Return whether the data reference DR is supported with respect to its
5768 alignment.
5769 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5770 if it is aligned, i.e., check if it is possible to vectorize it with a
5771 different alignment. */
5773 enum dr_alignment_support
5774 vect_supportable_dr_alignment (struct data_reference *dr,
5775 bool check_aligned_accesses)
5777 gimple stmt = DR_STMT (dr);
5778 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5779 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5780 machine_mode mode = TYPE_MODE (vectype);
5781 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5782 struct loop *vect_loop = NULL;
5783 bool nested_in_vect_loop = false;
5785 if (aligned_access_p (dr) && !check_aligned_accesses)
5786 return dr_aligned;
5788 /* For now assume all conditional loads/stores support unaligned
5789 access without any special code. */
5790 if (is_gimple_call (stmt)
5791 && gimple_call_internal_p (stmt)
5792 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5793 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5794 return dr_unaligned_supported;
5796 if (loop_vinfo)
5798 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5799 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5802 /* Possibly unaligned access. */
5804 /* We can choose between using the implicit realignment scheme (generating
5805 a misaligned_move stmt) and the explicit realignment scheme (generating
5806 aligned loads with a REALIGN_LOAD). There are two variants to the
5807 explicit realignment scheme: optimized, and unoptimized.
5808 We can optimize the realignment only if the step between consecutive
5809 vector loads is equal to the vector size. Since the vector memory
5810 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5811 is guaranteed that the misalignment amount remains the same throughout the
5812 execution of the vectorized loop. Therefore, we can create the
5813 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5814 at the loop preheader.
5816 However, in the case of outer-loop vectorization, when vectorizing a
5817 memory access in the inner-loop nested within the LOOP that is now being
5818 vectorized, while it is guaranteed that the misalignment of the
5819 vectorized memory access will remain the same in different outer-loop
5820 iterations, it is *not* guaranteed that it will remain the same throughout
5821 the execution of the inner-loop. This is because the inner-loop advances
5822 with the original scalar step (and not in steps of VS). If the inner-loop
5823 step happens to be a multiple of VS, then the misalignment remains fixed
5824 and we can use the optimized realignment scheme. For example:
5826 for (i=0; i<N; i++)
5827 for (j=0; j<M; j++)
5828 s += a[i+j];
5830 When vectorizing the i-loop in the above example, the step between
5831 consecutive vector loads is 1, and so the misalignment does not remain
5832 fixed across the execution of the inner-loop, and the realignment cannot
5833 be optimized (as illustrated in the following pseudo vectorized loop):
5835 for (i=0; i<N; i+=4)
5836 for (j=0; j<M; j++){
5837 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5838 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5839 // (assuming that we start from an aligned address).
5842 We therefore have to use the unoptimized realignment scheme:
5844 for (i=0; i<N; i+=4)
5845 for (j=k; j<M; j+=4)
5846 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5847 // that the misalignment of the initial address is
5848 // 0).
5850 The loop can then be vectorized as follows:
5852 for (k=0; k<4; k++){
5853 rt = get_realignment_token (&vp[k]);
5854 for (i=0; i<N; i+=4){
5855 v1 = vp[i+k];
5856 for (j=k; j<M; j+=4){
5857 v2 = vp[i+j+VS-1];
5858 va = REALIGN_LOAD <v1,v2,rt>;
5859 vs += va;
5860 v1 = v2;
5863 } */
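/* Illustrative note added for exposition (not in the original comment):
   with VS == 4 and an inner-loop step of one element, the misalignment of
   &vp[i+j] cycles through 0,1,2,3,0,1,... as j advances, so the
   realignment token cannot be hoisted and only dr_explicit_realign
   applies; if the inner-loop step were a multiple of VS, the misalignment
   would be identical on every inner iteration and
   dr_explicit_realign_optimized could be used instead.  */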
5865 if (DR_IS_READ (dr))
5867 bool is_packed = false;
5868 tree type = (TREE_TYPE (DR_REF (dr)));
5870 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5871 && (!targetm.vectorize.builtin_mask_for_load
5872 || targetm.vectorize.builtin_mask_for_load ()))
5874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5875 if ((nested_in_vect_loop
5876 && (TREE_INT_CST_LOW (DR_STEP (dr))
5877 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5878 || !loop_vinfo)
5879 return dr_explicit_realign;
5880 else
5881 return dr_explicit_realign_optimized;
5883 if (!known_alignment_for_access_p (dr))
5884 is_packed = not_size_aligned (DR_REF (dr));
5886 if ((TYPE_USER_ALIGN (type) && !is_packed)
5887 || targetm.vectorize.
5888 support_vector_misalignment (mode, type,
5889 DR_MISALIGNMENT (dr), is_packed))
5890 /* Can't software pipeline the loads, but can at least do them. */
5891 return dr_unaligned_supported;
5893 else
5895 bool is_packed = false;
5896 tree type = (TREE_TYPE (DR_REF (dr)));
5898 if (!known_alignment_for_access_p (dr))
5899 is_packed = not_size_aligned (DR_REF (dr));
5901 if ((TYPE_USER_ALIGN (type) && !is_packed)
5902 || targetm.vectorize.
5903 support_vector_misalignment (mode, type,
5904 DR_MISALIGNMENT (dr), is_packed))
5905 return dr_unaligned_supported;
5908 /* Unsupported. */
5909 return dr_unaligned_unsupported;
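/* A hypothetical helper, added for exposition only (not part of the
   original sources; the function name is made up).  It simply spells out
   the five schemes vect_supportable_dr_alignment can return, e.g. for
   dump output.  */

static const char *
dr_alignment_support_name (enum dr_alignment_support scheme)
{
  switch (scheme)
    {
    case dr_aligned:
      return "aligned access";
    case dr_unaligned_supported:
      return "misaligned access supported by target";
    case dr_explicit_realign:
      return "explicit realignment (unoptimized)";
    case dr_explicit_realign_optimized:
      return "explicit realignment (token hoisted to preheader)";
    case dr_unaligned_unsupported:
    default:
      return "unsupported";
    }
}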