[AArch64][14/14] Reuse target_option_current_node when passing pragma string to targe...
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob80df9dfe52e73fd9871c9dd10f1c5787ad67e7eb
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "predict.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "tm_p.h"
36 #include "target.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
51 #include "cgraph.h"
52 /* Need to include rtl.h, expr.h, etc. for optabs. */
53 #include "flags.h"
54 #include "insn-config.h"
55 #include "expmed.h"
56 #include "dojump.h"
57 #include "explow.h"
58 #include "calls.h"
59 #include "emit-rtl.h"
60 #include "varasm.h"
61 #include "stmt.h"
62 #include "expr.h"
63 #include "insn-codes.h"
64 #include "optabs.h"
65 #include "builtins.h"
66 #include "params.h"
68 /* Return true if load- or store-lanes optab OPTAB is implemented for
69 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
71 static bool
72 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
73 tree vectype, unsigned HOST_WIDE_INT count)
75 machine_mode mode, array_mode;
76 bool limit_p;
78 mode = TYPE_MODE (vectype);
79 limit_p = !targetm.array_mode_supported_p (mode, count);
80 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
81 MODE_INT, limit_p);
83 if (array_mode == BLKmode)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
88 GET_MODE_NAME (mode), count);
89 return false;
92 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
96 "cannot use %s<%s><%s>\n", name,
97 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
98 return false;
101 if (dump_enabled_p ())
102 dump_printf_loc (MSG_NOTE, vect_location,
103 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
104 GET_MODE_NAME (mode));
106 return true;
110 /* Return the smallest scalar part of STMT.
111 This is used to determine the vectype of the stmt. We generally set the
112 vectype according to the type of the result (lhs). For stmts whose
113 result-type is different than the type of the arguments (e.g., demotion,
114 promotion), vectype will be reset appropriately (later). Note that we have
115 to visit the smallest datatype in this function, because that determines the
116 VF. If the smallest datatype in the loop is present only as the rhs of a
117 promotion operation - we'd miss it.
118 Such a case, where a variable of this datatype does not appear in the lhs
119 anywhere in the loop, can only occur if it's an invariant: e.g.:
120 'int_x = (int) short_inv', which we'd expect to have been optimized away by
121 invariant motion. However, we cannot rely on invariant motion to always
122 take invariants out of the loop, and so in the case of promotion we also
123 have to check the rhs.
124 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
125 types. */
127 tree
128 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
129 HOST_WIDE_INT *rhs_size_unit)
131 tree scalar_type = gimple_expr_type (stmt);
132 HOST_WIDE_INT lhs, rhs;
134 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
136 if (is_gimple_assign (stmt)
137 && (gimple_assign_cast_p (stmt)
138 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static bool
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return false;
167 if (dump_enabled_p ())
169 dump_printf_loc (MSG_NOTE, vect_location,
170 "mark for run-time aliasing test between ");
171 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
172 dump_printf (MSG_NOTE, " and ");
173 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
174 dump_printf (MSG_NOTE, "\n");
177 if (optimize_loop_nest_for_size_p (loop))
179 if (dump_enabled_p ())
180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
181 "versioning not supported when optimizing"
182 " for size.\n");
183 return false;
186 /* FORNOW: We don't support versioning with outer-loop vectorization. */
187 if (loop->inner)
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
191 "versioning not yet supported for outer-loops.\n");
192 return false;
195 /* FORNOW: We don't support creating runtime alias tests for non-constant
196 step. */
197 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
198 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
202 "versioning not yet supported for non-constant "
203 "step\n");
204 return false;
207 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
208 return true;
212 /* Function vect_analyze_data_ref_dependence.
214 Return TRUE if there (might) exist a dependence between a memory-reference
215 DRA and a memory-reference DRB. When versioning for alias may check a
216 dependence at run-time, return FALSE. Adjust *MAX_VF according to
217 the data dependence. */
219 static bool
220 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
221 loop_vec_info loop_vinfo, int *max_vf)
223 unsigned int i;
224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
225 struct data_reference *dra = DDR_A (ddr);
226 struct data_reference *drb = DDR_B (ddr);
227 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
228 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
229 lambda_vector dist_v;
230 unsigned int loop_depth;
232 /* In loop analysis all data references should be vectorizable. */
233 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
234 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
235 gcc_unreachable ();
237 /* Independent data accesses. */
238 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
239 return false;
241 if (dra == drb
242 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
243 return false;
245 /* Even if we have an anti-dependence then, as the vectorized loop covers at
246 least two scalar iterations, there is always also a true dependence.
247 As the vectorizer does not re-order loads and stores we can ignore
248 the anti-dependence if TBAA can disambiguate both DRs similar to the
249 case with known negative distance anti-dependences (positive
250 distance anti-dependences would violate TBAA constraints). */
251 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
252 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
253 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
254 get_alias_set (DR_REF (drb))))
255 return false;
257 /* Unknown data dependence. */
258 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
260 /* If user asserted safelen consecutive iterations can be
261 executed concurrently, assume independence. */
262 if (loop->safelen >= 2)
264 if (loop->safelen < *max_vf)
265 *max_vf = loop->safelen;
266 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
267 return false;
270 if (STMT_VINFO_GATHER_P (stmtinfo_a)
271 || STMT_VINFO_GATHER_P (stmtinfo_b))
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias not supported for: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
285 return true;
288 if (dump_enabled_p ())
290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
291 "versioning for alias required: "
292 "can't determine dependence between ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (dra));
295 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (drb));
298 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
301 /* Add to list of ddrs that need to be tested at run-time. */
302 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
305 /* Known data dependence. */
306 if (DDR_NUM_DIST_VECTS (ddr) == 0)
308 /* If user asserted safelen consecutive iterations can be
309 executed concurrently, assume independence. */
310 if (loop->safelen >= 2)
312 if (loop->safelen < *max_vf)
313 *max_vf = loop->safelen;
314 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
315 return false;
318 if (STMT_VINFO_GATHER_P (stmtinfo_a)
319 || STMT_VINFO_GATHER_P (stmtinfo_b))
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias not supported for: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
327 DR_REF (dra));
328 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
329 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
330 DR_REF (drb));
331 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
333 return true;
336 if (dump_enabled_p ())
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias required: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
342 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
343 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 /* Add to list of ddrs that need to be tested at run-time. */
347 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
350 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
351 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
353 int dist = dist_v[loop_depth];
355 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "dependence distance = %d.\n", dist);
359 if (dist == 0)
361 if (dump_enabled_p ())
363 dump_printf_loc (MSG_NOTE, vect_location,
364 "dependence distance == 0 between ");
365 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
366 dump_printf (MSG_NOTE, " and ");
367 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
368 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371 /* When we perform grouped accesses and perform implicit CSE
372 by detecting equal accesses and doing disambiguation with
373 runtime alias tests like for
374 .. = a[i];
375 .. = a[i+1];
376 a[i] = ..;
377 a[i+1] = ..;
378 *p = ..;
379 .. = a[i];
380 .. = a[i+1];
381 where we will end up loading { a[i], a[i+1] } once, make
382 sure that inserting group loads before the first load and
383 stores after the last store will do the right thing.
384 Similar for groups like
385 a[i] = ...;
386 ... = a[i];
387 a[i+1] = ...;
388 where loads from the group interleave with the store. */
389 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
390 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
392 gimple earlier_stmt;
393 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
394 if (DR_IS_WRITE
395 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "READ_WRITE dependence in interleaving."
400 "\n");
401 return true;
405 continue;
408 if (dist > 0 && DDR_REVERSED_P (ddr))
410 /* If DDR_REVERSED_P the order of the data-refs in DDR was
411 reversed (to make distance vector positive), and the actual
412 distance is negative. */
413 if (dump_enabled_p ())
414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
415 "dependence distance negative.\n");
416 /* Record a negative dependence distance to later limit the
417 amount of stmt copying / unrolling we can perform.
418 Only need to handle read-after-write dependence. */
419 if (DR_IS_READ (drb)
420 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
421 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
422 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
423 continue;
426 if (abs (dist) >= 2
427 && abs (dist) < *max_vf)
429 /* The dependence distance requires reduction of the maximal
430 vectorization factor. */
431 *max_vf = abs (dist);
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "adjusting maximal vectorization factor to %i\n",
435 *max_vf);
438 if (abs (dist) >= *max_vf)
440 /* Dependence distance does not create dependence, as far as
441 vectorization is concerned, in this case. */
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_NOTE, vect_location,
444 "dependence distance >= VF.\n");
445 continue;
448 if (dump_enabled_p ())
450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
451 "not vectorized, possible dependence "
452 "between data-refs ");
453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
454 dump_printf (MSG_NOTE, " and ");
455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
456 dump_printf (MSG_NOTE, "\n");
459 return true;
462 return false;
465 /* Function vect_analyze_data_ref_dependences.
467 Examine all the data references in the loop, and make sure there do not
468 exist any data dependences between them. Set *MAX_VF according to
469 the maximum vectorization factor the data dependences allow. */
471 bool
472 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
474 unsigned int i;
475 struct data_dependence_relation *ddr;
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE, vect_location,
479 "=== vect_analyze_data_ref_dependences ===\n");
481 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
482 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
483 &LOOP_VINFO_DDRS (loop_vinfo),
484 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
485 return false;
487 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
488 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
489 return false;
491 return true;
495 /* Function vect_slp_analyze_data_ref_dependence.
497 Return TRUE if there (might) exist a dependence between a memory-reference
498 DRA and a memory-reference DRB. When versioning for alias may check a
499 dependence at run-time, return FALSE. Adjust *MAX_VF according to
500 the data dependence. */
502 static bool
503 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
505 struct data_reference *dra = DDR_A (ddr);
506 struct data_reference *drb = DDR_B (ddr);
508 /* We need to check dependences of statements marked as unvectorizable
509 as well, they still can prohibit vectorization. */
511 /* Independent data accesses. */
512 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
513 return false;
515 if (dra == drb)
516 return false;
518 /* Read-read is OK. */
519 if (DR_IS_READ (dra) && DR_IS_READ (drb))
520 return false;
522 /* If dra and drb are part of the same interleaving chain consider
523 them independent. */
524 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
525 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
526 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
527 return false;
529 /* Unknown data dependence. */
530 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "can't determine dependence between ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
537 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
539 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
542 else if (dump_enabled_p ())
544 dump_printf_loc (MSG_NOTE, vect_location,
545 "determined dependence between ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
547 dump_printf (MSG_NOTE, " and ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
549 dump_printf (MSG_NOTE, "\n");
552 /* We do not vectorize basic blocks with write-write dependencies. */
553 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
554 return true;
556 /* If we have a read-write dependence check that the load is before the store.
557 When we vectorize basic blocks, vector load can be only before
558 corresponding scalar load, and vector store can be only after its
559 corresponding scalar store. So the order of the acceses is preserved in
560 case the load is before the store. */
561 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
562 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
564 /* That only holds for load-store pairs taking part in vectorization. */
565 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
566 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
567 return false;
570 return true;
574 /* Function vect_analyze_data_ref_dependences.
576 Examine all the data references in the basic-block, and make sure there
577 do not exist any data dependences between them. Set *MAX_VF according to
578 the maximum vectorization factor the data dependences allow. */
580 bool
581 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
583 struct data_dependence_relation *ddr;
584 unsigned int i;
586 if (dump_enabled_p ())
587 dump_printf_loc (MSG_NOTE, vect_location,
588 "=== vect_slp_analyze_data_ref_dependences ===\n");
590 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
591 &BB_VINFO_DDRS (bb_vinfo),
592 vNULL, true))
593 return false;
595 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
596 if (vect_slp_analyze_data_ref_dependence (ddr))
597 return false;
599 return true;
603 /* Function vect_compute_data_ref_alignment
605 Compute the misalignment of the data reference DR.
607 Output:
608 1. If during the misalignment computation it is found that the data reference
609 cannot be vectorized then false is returned.
610 2. DR_MISALIGNMENT (DR) is defined.
612 FOR NOW: No analysis is actually performed. Misalignment is calculated
613 only for trivial cases. TODO. */
615 static bool
616 vect_compute_data_ref_alignment (struct data_reference *dr)
618 gimple stmt = DR_STMT (dr);
619 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
620 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
621 struct loop *loop = NULL;
622 tree ref = DR_REF (dr);
623 tree vectype;
624 tree base, base_addr;
625 tree misalign = NULL_TREE;
626 tree aligned_to;
627 unsigned HOST_WIDE_INT alignment;
629 if (dump_enabled_p ())
630 dump_printf_loc (MSG_NOTE, vect_location,
631 "vect_compute_data_ref_alignment:\n");
633 if (loop_vinfo)
634 loop = LOOP_VINFO_LOOP (loop_vinfo);
636 /* Initialize misalignment to unknown. */
637 SET_DR_MISALIGNMENT (dr, -1);
639 /* Strided accesses perform only component accesses, misalignment information
640 is irrelevant for them. */
641 if (STMT_VINFO_STRIDED_P (stmt_info)
642 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
643 return true;
645 if (tree_fits_shwi_p (DR_STEP (dr)))
646 misalign = DR_INIT (dr);
647 aligned_to = DR_ALIGNED_TO (dr);
648 base_addr = DR_BASE_ADDRESS (dr);
649 vectype = STMT_VINFO_VECTYPE (stmt_info);
651 /* In case the dataref is in an inner-loop of the loop that is being
652 vectorized (LOOP), we use the base and misalignment information
653 relative to the outer-loop (LOOP). This is ok only if the misalignment
654 stays the same throughout the execution of the inner-loop, which is why
655 we have to check that the stride of the dataref in the inner-loop evenly
656 divides by the vector size. */
657 if (loop && nested_in_vect_loop_p (loop, stmt))
659 tree step = DR_STEP (dr);
661 if (tree_fits_shwi_p (step)
662 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "inner step divides the vector-size.\n");
667 misalign = STMT_VINFO_DR_INIT (stmt_info);
668 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
669 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
671 else
673 if (dump_enabled_p ())
674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
675 "inner step doesn't divide the vector-size.\n");
676 misalign = NULL_TREE;
680 /* Similarly we can only use base and misalignment information relative to
681 an innermost loop if the misalignment stays the same throughout the
682 execution of the loop. As above, this is the case if the stride of
683 the dataref evenly divides by the vector size. */
684 else
686 tree step = DR_STEP (dr);
687 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
689 if (tree_fits_shwi_p (step)
690 && ((tree_to_shwi (step) * vf)
691 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "step doesn't divide the vector-size.\n");
696 misalign = NULL_TREE;
700 /* To look at alignment of the base we have to preserve an inner MEM_REF
701 as that carries alignment information of the actual access. */
702 base = ref;
703 while (handled_component_p (base))
704 base = TREE_OPERAND (base, 0);
705 if (TREE_CODE (base) == MEM_REF)
706 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
707 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
708 unsigned int base_alignment = get_object_alignment (base);
710 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
711 DR_VECT_AUX (dr)->base_element_aligned = true;
713 alignment = TYPE_ALIGN_UNIT (vectype);
715 if ((compare_tree_int (aligned_to, alignment) < 0)
716 || !misalign)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
723 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
725 return true;
728 if (base_alignment < TYPE_ALIGN (vectype))
730 /* Strip an inner MEM_REF to a bare decl if possible. */
731 if (TREE_CODE (base) == MEM_REF
732 && integer_zerop (TREE_OPERAND (base, 1))
733 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
734 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
736 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "can't force alignment of ref: ");
742 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
743 dump_printf (MSG_NOTE, "\n");
745 return true;
748 /* Force the alignment of the decl.
749 NOTE: This is the only change to the code we make during
750 the analysis phase, before deciding to vectorize the loop. */
751 if (dump_enabled_p ())
753 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
754 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
755 dump_printf (MSG_NOTE, "\n");
758 DR_VECT_AUX (dr)->base_decl = base;
759 DR_VECT_AUX (dr)->base_misaligned = true;
760 DR_VECT_AUX (dr)->base_element_aligned = true;
763 /* If this is a backward running DR then first access in the larger
764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
768 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
772 /* PLUS because DR_STEP was negative. */
773 misalign = size_binop (PLUS_EXPR, misalign, offset);
776 SET_DR_MISALIGNMENT (dr,
777 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
779 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
782 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
783 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
784 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
791 /* Function vect_compute_data_refs_alignment
793 Compute the misalignment of data references in the loop.
794 Return FALSE if a data reference is found that cannot be vectorized. */
796 static bool
797 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
798 bb_vec_info bb_vinfo)
800 vec<data_reference_p> datarefs;
801 struct data_reference *dr;
802 unsigned int i;
804 if (loop_vinfo)
805 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
806 else
807 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
809 FOR_EACH_VEC_ELT (datarefs, i, dr)
810 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
811 && !vect_compute_data_ref_alignment (dr))
813 if (bb_vinfo)
815 /* Mark unsupported statement as unvectorizable. */
816 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
817 continue;
819 else
820 return false;
823 return true;
827 /* Function vect_update_misalignment_for_peel
829 DR - the data reference whose misalignment is to be adjusted.
830 DR_PEEL - the data reference whose misalignment is being made
831 zero in the vector loop by the peel.
832 NPEEL - the number of iterations in the peel loop if the misalignment
833 of DR_PEEL is known at compile time. */
835 static void
836 vect_update_misalignment_for_peel (struct data_reference *dr,
837 struct data_reference *dr_peel, int npeel)
839 unsigned int i;
840 vec<dr_p> same_align_drs;
841 struct data_reference *current_dr;
842 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
843 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
844 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
845 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
847 /* For interleaved data accesses the step in the loop must be multiplied by
848 the size of the interleaving group. */
849 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
850 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
851 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
852 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
854 /* It can be assumed that the data refs with the same alignment as dr_peel
855 are aligned in the vector loop. */
856 same_align_drs
857 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
858 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
860 if (current_dr != dr)
861 continue;
862 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
863 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
864 SET_DR_MISALIGNMENT (dr, 0);
865 return;
868 if (known_alignment_for_access_p (dr)
869 && known_alignment_for_access_p (dr_peel))
871 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
872 int misal = DR_MISALIGNMENT (dr);
873 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
874 misal += negative ? -npeel * dr_size : npeel * dr_size;
875 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
876 SET_DR_MISALIGNMENT (dr, misal);
877 return;
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
882 SET_DR_MISALIGNMENT (dr, -1);
886 /* Function vect_verify_datarefs_alignment
888 Return TRUE if all data references in the loop can be
889 handled with respect to alignment. */
891 bool
892 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
894 vec<data_reference_p> datarefs;
895 struct data_reference *dr;
896 enum dr_alignment_support supportable_dr_alignment;
897 unsigned int i;
899 if (loop_vinfo)
900 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
901 else
902 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
904 FOR_EACH_VEC_ELT (datarefs, i, dr)
906 gimple stmt = DR_STMT (dr);
907 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
909 if (!STMT_VINFO_RELEVANT_P (stmt_info))
910 continue;
912 /* For interleaving, only the alignment of the first access matters.
913 Skip statements marked as not vectorizable. */
914 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
915 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
916 || !STMT_VINFO_VECTORIZABLE (stmt_info))
917 continue;
919 /* Strided accesses perform only component accesses, alignment is
920 irrelevant for them. */
921 if (STMT_VINFO_STRIDED_P (stmt_info)
922 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
923 continue;
925 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
926 if (!supportable_dr_alignment)
928 if (dump_enabled_p ())
930 if (DR_IS_READ (dr))
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "not vectorized: unsupported unaligned load.");
933 else
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
935 "not vectorized: unsupported unaligned "
936 "store.");
938 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
939 DR_REF (dr));
940 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
942 return false;
944 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE, vect_location,
946 "Vectorizing an unaligned access.\n");
948 return true;
951 /* Given an memory reference EXP return whether its alignment is less
952 than its size. */
954 static bool
955 not_size_aligned (tree exp)
957 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
958 return true;
960 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
961 > get_object_alignment (exp));
964 /* Function vector_alignment_reachable_p
966 Return true if vector alignment for DR is reachable by peeling
967 a few loop iterations. Return false otherwise. */
969 static bool
970 vector_alignment_reachable_p (struct data_reference *dr)
972 gimple stmt = DR_STMT (dr);
973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
978 /* For interleaved access we peel only if number of iterations in
979 the prolog loop ({VF - misalignment}), is a multiple of the
980 number of the interleaved accesses. */
981 int elem_size, mis_in_elements;
982 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
984 /* FORNOW: handle only known alignment. */
985 if (!known_alignment_for_access_p (dr))
986 return false;
988 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
989 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
991 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
992 return false;
995 /* If misalignment is known at the compile time then allow peeling
996 only if natural alignment is reachable through peeling. */
997 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
999 HOST_WIDE_INT elmsize =
1000 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location,
1004 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1005 dump_printf (MSG_NOTE,
1006 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1008 if (DR_MISALIGNMENT (dr) % elmsize)
1010 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 "data size does not divide the misalignment.\n");
1013 return false;
1017 if (!known_alignment_for_access_p (dr))
1019 tree type = TREE_TYPE (DR_REF (dr));
1020 bool is_packed = not_size_aligned (DR_REF (dr));
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023 "Unknown misalignment, is_packed = %d\n",is_packed);
1024 if ((TYPE_USER_ALIGN (type) && !is_packed)
1025 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1026 return true;
1027 else
1028 return false;
1031 return true;
1035 /* Calculate the cost of the memory access represented by DR. */
1037 static void
1038 vect_get_data_access_cost (struct data_reference *dr,
1039 unsigned int *inside_cost,
1040 unsigned int *outside_cost,
1041 stmt_vector_for_cost *body_cost_vec)
1043 gimple stmt = DR_STMT (dr);
1044 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1045 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1047 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1048 int ncopies = vf / nunits;
1050 if (DR_IS_READ (dr))
1051 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1052 NULL, body_cost_vec, false);
1053 else
1054 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_NOTE, vect_location,
1058 "vect_get_data_access_cost: inside_cost = %d, "
1059 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1063 /* Insert DR into peeling hash table with NPEEL as key. */
1065 static void
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1067 int npeel)
1069 struct _vect_peel_info elem, *slot;
1070 _vect_peel_info **new_slot;
1071 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1073 elem.npeel = npeel;
1074 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1075 if (slot)
1076 slot->count++;
1077 else
1079 slot = XNEW (struct _vect_peel_info);
1080 slot->npeel = npeel;
1081 slot->dr = dr;
1082 slot->count = 1;
1083 new_slot
1084 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1085 *new_slot = slot;
1088 if (!supportable_dr_alignment
1089 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1090 slot->count += VECT_MAX_COST;
1094 /* Traverse peeling hash table to find peeling option that aligns maximum
1095 number of data accesses. */
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099 _vect_peel_extended_info *max)
1101 vect_peel_info elem = *slot;
1103 if (elem->count > max->peel_info.count
1104 || (elem->count == max->peel_info.count
1105 && max->peel_info.npeel > elem->npeel))
1107 max->peel_info.npeel = elem->npeel;
1108 max->peel_info.count = elem->count;
1109 max->peel_info.dr = elem->dr;
1112 return 1;
1116 /* Traverse peeling hash table and calculate cost for each peeling option.
1117 Find the one with the lowest cost. */
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121 _vect_peel_extended_info *min)
1123 vect_peel_info elem = *slot;
1124 int save_misalignment, dummy;
1125 unsigned int inside_cost = 0, outside_cost = 0, i;
1126 gimple stmt = DR_STMT (elem->dr);
1127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130 struct data_reference *dr;
1131 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1133 prologue_cost_vec.create (2);
1134 body_cost_vec.create (2);
1135 epilogue_cost_vec.create (2);
1137 FOR_EACH_VEC_ELT (datarefs, i, dr)
1139 stmt = DR_STMT (dr);
1140 stmt_info = vinfo_for_stmt (stmt);
1141 /* For interleaving, only the alignment of the first access
1142 matters. */
1143 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1144 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1145 continue;
1147 save_misalignment = DR_MISALIGNMENT (dr);
1148 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1149 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1150 &body_cost_vec);
1151 SET_DR_MISALIGNMENT (dr, save_misalignment);
1154 outside_cost += vect_get_known_peeling_cost
1155 (loop_vinfo, elem->npeel, &dummy,
1156 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1157 &prologue_cost_vec, &epilogue_cost_vec);
1159 /* Prologue and epilogue costs are added to the target model later.
1160 These costs depend only on the scalar iteration cost, the
1161 number of peeling iterations finally chosen, and the number of
1162 misaligned statements. So discard the information found here. */
1163 prologue_cost_vec.release ();
1164 epilogue_cost_vec.release ();
1166 if (inside_cost < min->inside_cost
1167 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1169 min->inside_cost = inside_cost;
1170 min->outside_cost = outside_cost;
1171 min->body_cost_vec.release ();
1172 min->body_cost_vec = body_cost_vec;
1173 min->peel_info.dr = elem->dr;
1174 min->peel_info.npeel = elem->npeel;
1176 else
1177 body_cost_vec.release ();
1179 return 1;
1183 /* Choose best peeling option by traversing peeling hash table and either
1184 choosing an option with the lowest cost (if cost model is enabled) or the
1185 option that aligns as many accesses as possible. */
1187 static struct data_reference *
1188 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1189 unsigned int *npeel,
1190 stmt_vector_for_cost *body_cost_vec)
1192 struct _vect_peel_extended_info res;
1194 res.peel_info.dr = NULL;
1195 res.body_cost_vec = stmt_vector_for_cost ();
1197 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1199 res.inside_cost = INT_MAX;
1200 res.outside_cost = INT_MAX;
1201 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1202 ->traverse <_vect_peel_extended_info *,
1203 vect_peeling_hash_get_lowest_cost> (&res);
1205 else
1207 res.peel_info.count = 0;
1208 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1209 ->traverse <_vect_peel_extended_info *,
1210 vect_peeling_hash_get_most_frequent> (&res);
1213 *npeel = res.peel_info.npeel;
1214 *body_cost_vec = res.body_cost_vec;
1215 return res.peel_info.dr;
1219 /* Function vect_enhance_data_refs_alignment
1221 This pass will use loop versioning and loop peeling in order to enhance
1222 the alignment of data references in the loop.
1224 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1225 original loop is to be vectorized. Any other loops that are created by
1226 the transformations performed in this pass - are not supposed to be
1227 vectorized. This restriction will be relaxed.
1229 This pass will require a cost model to guide it whether to apply peeling
1230 or versioning or a combination of the two. For example, the scheme that
1231 intel uses when given a loop with several memory accesses, is as follows:
1232 choose one memory access ('p') which alignment you want to force by doing
1233 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1234 other accesses are not necessarily aligned, or (2) use loop versioning to
1235 generate one loop in which all accesses are aligned, and another loop in
1236 which only 'p' is necessarily aligned.
1238 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1239 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1240 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1242 Devising a cost model is the most critical aspect of this work. It will
1243 guide us on which access to peel for, whether to use loop versioning, how
1244 many versions to create, etc. The cost model will probably consist of
1245 generic considerations as well as target specific considerations (on
1246 powerpc for example, misaligned stores are more painful than misaligned
1247 loads).
1249 Here are the general steps involved in alignment enhancements:
1251 -- original loop, before alignment analysis:
1252 for (i=0; i<N; i++){
1253 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1254 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1257 -- After vect_compute_data_refs_alignment:
1258 for (i=0; i<N; i++){
1259 x = q[i]; # DR_MISALIGNMENT(q) = 3
1260 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1263 -- Possibility 1: we do loop versioning:
1264 if (p is aligned) {
1265 for (i=0; i<N; i++){ # loop 1A
1266 x = q[i]; # DR_MISALIGNMENT(q) = 3
1267 p[i] = y; # DR_MISALIGNMENT(p) = 0
1270 else {
1271 for (i=0; i<N; i++){ # loop 1B
1272 x = q[i]; # DR_MISALIGNMENT(q) = 3
1273 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1277 -- Possibility 2: we do loop peeling:
1278 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1279 x = q[i];
1280 p[i] = y;
1282 for (i = 3; i < N; i++){ # loop 2A
1283 x = q[i]; # DR_MISALIGNMENT(q) = 0
1284 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1287 -- Possibility 3: combination of loop peeling and versioning:
1288 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1289 x = q[i];
1290 p[i] = y;
1292 if (p is aligned) {
1293 for (i = 3; i<N; i++){ # loop 3A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 0
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1298 else {
1299 for (i = 3; i<N; i++){ # loop 3B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 0
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1305 These loops are later passed to loop_transform to be vectorized. The
1306 vectorizer will use the alignment information to guide the transformation
1307 (whether to generate regular loads/stores, or with special handling for
1308 misalignment). */
1310 bool
1311 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1313 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1315 enum dr_alignment_support supportable_dr_alignment;
1316 struct data_reference *dr0 = NULL, *first_store = NULL;
1317 struct data_reference *dr;
1318 unsigned int i, j;
1319 bool do_peeling = false;
1320 bool do_versioning = false;
1321 bool stat;
1322 gimple stmt;
1323 stmt_vec_info stmt_info;
1324 unsigned int npeel = 0;
1325 bool all_misalignments_unknown = true;
1326 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1327 unsigned possible_npeel_number = 1;
1328 tree vectype;
1329 unsigned int nelements, mis, same_align_drs_max = 0;
1330 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_NOTE, vect_location,
1334 "=== vect_enhance_data_refs_alignment ===\n");
1336 /* While cost model enhancements are expected in the future, the high level
1337 view of the code at this time is as follows:
1339 A) If there is a misaligned access then see if peeling to align
1340 this access can make all data references satisfy
1341 vect_supportable_dr_alignment. If so, update data structures
1342 as needed and return true.
1344 B) If peeling wasn't possible and there is a data reference with an
1345 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1346 then see if loop versioning checks can be used to make all data
1347 references satisfy vect_supportable_dr_alignment. If so, update
1348 data structures as needed and return true.
1350 C) If neither peeling nor versioning were successful then return false if
1351 any data reference does not satisfy vect_supportable_dr_alignment.
1353 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1355 Note, Possibility 3 above (which is peeling and versioning together) is not
1356 being done at this time. */
1358 /* (1) Peeling to force alignment. */
1360 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1361 Considerations:
1362 + How many accesses will become aligned due to the peeling
1363 - How many accesses will become unaligned due to the peeling,
1364 and the cost of misaligned accesses.
1365 - The cost of peeling (the extra runtime checks, the increase
1366 in code size). */
1368 FOR_EACH_VEC_ELT (datarefs, i, dr)
1370 stmt = DR_STMT (dr);
1371 stmt_info = vinfo_for_stmt (stmt);
1373 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1374 continue;
1376 /* For interleaving, only the alignment of the first access
1377 matters. */
1378 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1379 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1380 continue;
1382 /* For invariant accesses there is nothing to enhance. */
1383 if (integer_zerop (DR_STEP (dr)))
1384 continue;
1386 /* Strided accesses perform only component accesses, alignment is
1387 irrelevant for them. */
1388 if (STMT_VINFO_STRIDED_P (stmt_info)
1389 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1390 continue;
1392 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1393 do_peeling = vector_alignment_reachable_p (dr);
1394 if (do_peeling)
1396 if (known_alignment_for_access_p (dr))
1398 unsigned int npeel_tmp;
1399 bool negative = tree_int_cst_compare (DR_STEP (dr),
1400 size_zero_node) < 0;
1402 /* Save info about DR in the hash table. */
1403 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1404 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1405 = new hash_table<peel_info_hasher> (1);
1407 vectype = STMT_VINFO_VECTYPE (stmt_info);
1408 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1409 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1410 TREE_TYPE (DR_REF (dr))));
1411 npeel_tmp = (negative
1412 ? (mis - nelements) : (nelements - mis))
1413 & (nelements - 1);
1415 /* For multiple types, it is possible that the bigger type access
1416 will have more than one peeling option. E.g., a loop with two
1417 types: one of size (vector size / 4), and the other one of
1418 size (vector size / 8). Vectorization factor will 8. If both
1419 access are misaligned by 3, the first one needs one scalar
1420 iteration to be aligned, and the second one needs 5. But the
1421 the first one will be aligned also by peeling 5 scalar
1422 iterations, and in that case both accesses will be aligned.
1423 Hence, except for the immediate peeling amount, we also want
1424 to try to add full vector size, while we don't exceed
1425 vectorization factor.
1426 We do this automtically for cost model, since we calculate cost
1427 for every peeling option. */
1428 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430 if (STMT_SLP_TYPE (stmt_info))
1431 possible_npeel_number
1432 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1433 else
1434 possible_npeel_number = vf / nelements;
1437 /* Handle the aligned case. We may decide to align some other
1438 access, making DR unaligned. */
1439 if (DR_MISALIGNMENT (dr) == 0)
1441 npeel_tmp = 0;
1442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443 possible_npeel_number++;
1446 for (j = 0; j < possible_npeel_number; j++)
1448 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1449 npeel_tmp += nelements;
1452 all_misalignments_unknown = false;
1453 /* Data-ref that was chosen for the case that all the
1454 misalignments are unknown is not relevant anymore, since we
1455 have a data-ref with known alignment. */
1456 dr0 = NULL;
1458 else
1460 /* If we don't know any misalignment values, we prefer
1461 peeling for data-ref that has the maximum number of data-refs
1462 with the same alignment, unless the target prefers to align
1463 stores over load. */
1464 if (all_misalignments_unknown)
1466 unsigned same_align_drs
1467 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1468 if (!dr0
1469 || same_align_drs_max < same_align_drs)
1471 same_align_drs_max = same_align_drs;
1472 dr0 = dr;
1474 /* For data-refs with the same number of related
1475 accesses prefer the one where the misalign
1476 computation will be invariant in the outermost loop. */
1477 else if (same_align_drs_max == same_align_drs)
1479 struct loop *ivloop0, *ivloop;
1480 ivloop0 = outermost_invariant_loop_for_expr
1481 (loop, DR_BASE_ADDRESS (dr0));
1482 ivloop = outermost_invariant_loop_for_expr
1483 (loop, DR_BASE_ADDRESS (dr));
1484 if ((ivloop && !ivloop0)
1485 || (ivloop && ivloop0
1486 && flow_loop_nested_p (ivloop, ivloop0)))
1487 dr0 = dr;
1490 if (!first_store && DR_IS_WRITE (dr))
1491 first_store = dr;
1494 /* If there are both known and unknown misaligned accesses in the
1495 loop, we choose peeling amount according to the known
1496 accesses. */
1497 if (!supportable_dr_alignment)
1499 dr0 = dr;
1500 if (!first_store && DR_IS_WRITE (dr))
1501 first_store = dr;
1505 else
1507 if (!aligned_access_p (dr))
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1511 "vector alignment may not be reachable\n");
1512 break;
1517 /* Check if we can possibly peel the loop. */
1518 if (!vect_can_advance_ivs_p (loop_vinfo)
1519 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1520 || loop->inner)
1521 do_peeling = false;
1523 if (do_peeling
1524 && all_misalignments_unknown
1525 && vect_supportable_dr_alignment (dr0, false))
1527 /* Check if the target requires to prefer stores over loads, i.e., if
1528 misaligned stores are more expensive than misaligned loads (taking
1529 drs with same alignment into account). */
1530 if (first_store && DR_IS_READ (dr0))
1532 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1533 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1534 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1535 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1536 stmt_vector_for_cost dummy;
1537 dummy.create (2);
1539 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1540 &dummy);
1541 vect_get_data_access_cost (first_store, &store_inside_cost,
1542 &store_outside_cost, &dummy);
1544 dummy.release ();
1546 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1547 aligning the load DR0). */
1548 load_inside_penalty = store_inside_cost;
1549 load_outside_penalty = store_outside_cost;
1550 for (i = 0;
1551 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1552 DR_STMT (first_store))).iterate (i, &dr);
1553 i++)
1554 if (DR_IS_READ (dr))
1556 load_inside_penalty += load_inside_cost;
1557 load_outside_penalty += load_outside_cost;
1559 else
1561 load_inside_penalty += store_inside_cost;
1562 load_outside_penalty += store_outside_cost;
1565 /* Calculate the penalty for leaving DR0 unaligned (by
1566 aligning the FIRST_STORE). */
1567 store_inside_penalty = load_inside_cost;
1568 store_outside_penalty = load_outside_cost;
1569 for (i = 0;
1570 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1571 DR_STMT (dr0))).iterate (i, &dr);
1572 i++)
1573 if (DR_IS_READ (dr))
1575 store_inside_penalty += load_inside_cost;
1576 store_outside_penalty += load_outside_cost;
1578 else
1580 store_inside_penalty += store_inside_cost;
1581 store_outside_penalty += store_outside_cost;
1584 if (load_inside_penalty > store_inside_penalty
1585 || (load_inside_penalty == store_inside_penalty
1586 && load_outside_penalty > store_outside_penalty))
1587 dr0 = first_store;
1590 /* In case there are only loads with different unknown misalignments, use
1591 peeling only if it may help to align other accesses in the loop or
1592 if it may help improving load bandwith when we'd end up using
1593 unaligned loads. */
1594 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1595 if (!first_store
1596 && !STMT_VINFO_SAME_ALIGN_REFS (
1597 vinfo_for_stmt (DR_STMT (dr0))).length ()
1598 && (vect_supportable_dr_alignment (dr0, false)
1599 != dr_unaligned_supported
1600 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1601 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1602 do_peeling = false;
1605 if (do_peeling && !dr0)
1607 /* Peeling is possible, but there is no data access that is not supported
1608 unless aligned. So we try to choose the best possible peeling. */
1610 /* We should get here only if there are drs with known misalignment. */
1611 gcc_assert (!all_misalignments_unknown);
1613 /* Choose the best peeling from the hash table. */
1614 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1615 &body_cost_vec);
1616 if (!dr0 || !npeel)
1617 do_peeling = false;
1620 if (do_peeling)
1622 stmt = DR_STMT (dr0);
1623 stmt_info = vinfo_for_stmt (stmt);
1624 vectype = STMT_VINFO_VECTYPE (stmt_info);
1625 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1627 if (known_alignment_for_access_p (dr0))
1629 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1630 size_zero_node) < 0;
1631 if (!npeel)
1633 /* Since it's known at compile time, compute the number of
1634 iterations in the peeled loop (the peeling factor) for use in
1635 updating DR_MISALIGNMENT values. The peeling factor is the
1636 vectorization factor minus the misalignment as an element
1637 count. */
1638 mis = DR_MISALIGNMENT (dr0);
1639 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1640 npeel = ((negative ? mis - nelements : nelements - mis)
1641 & (nelements - 1));
1644 /* For interleaved data access every iteration accesses all the
1645 members of the group, therefore we divide the number of iterations
1646 by the group size. */
1647 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1648 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1649 npeel /= GROUP_SIZE (stmt_info);
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "Try peeling by %d\n", npeel);
1656 /* Ensure that all data refs can be vectorized after the peel. */
1657 FOR_EACH_VEC_ELT (datarefs, i, dr)
1659 int save_misalignment;
1661 if (dr == dr0)
1662 continue;
1664 stmt = DR_STMT (dr);
1665 stmt_info = vinfo_for_stmt (stmt);
1666 /* For interleaving, only the alignment of the first access
1667 matters. */
1668 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1669 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1670 continue;
1672 /* Strided accesses perform only component accesses, alignment is
1673 irrelevant for them. */
1674 if (STMT_VINFO_STRIDED_P (stmt_info)
1675 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1676 continue;
1678 save_misalignment = DR_MISALIGNMENT (dr);
1679 vect_update_misalignment_for_peel (dr, dr0, npeel);
1680 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1681 SET_DR_MISALIGNMENT (dr, save_misalignment);
1683 if (!supportable_dr_alignment)
1685 do_peeling = false;
1686 break;
1690 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1692 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1693 if (!stat)
1694 do_peeling = false;
1695 else
1697 body_cost_vec.release ();
1698 return stat;
1702 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1703 if (do_peeling)
1705 unsigned max_allowed_peel
1706 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1707 if (max_allowed_peel != (unsigned)-1)
1709 unsigned max_peel = npeel;
1710 if (max_peel == 0)
1712 gimple dr_stmt = DR_STMT (dr0);
1713 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1714 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1715 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1717 if (max_peel > max_allowed_peel)
1719 do_peeling = false;
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "Disable peeling, max peels reached: %d\n", max_peel);
1727 /* Cost model #2 - if peeling may result in a remaining loop not
1728 iterating enough to be vectorized then do not peel. */
1729 if (do_peeling
1730 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1732 unsigned max_peel
1733 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1734 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1735 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1736 do_peeling = false;
1739 if (do_peeling)
1741 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1742 If the misalignment of DR_i is identical to that of dr0 then set
1743 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1744 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1745 by the peeling factor times the element size of DR_i (MOD the
1746 vectorization factor times the size). Otherwise, the
1747 misalignment of DR_i must be set to unknown. */
1748 FOR_EACH_VEC_ELT (datarefs, i, dr)
1749 if (dr != dr0)
1750 vect_update_misalignment_for_peel (dr, dr0, npeel);
1752 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1753 if (npeel)
1754 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1755 else
1756 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1757 = DR_MISALIGNMENT (dr0);
1758 SET_DR_MISALIGNMENT (dr0, 0);
1759 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_NOTE, vect_location,
1762 "Alignment of access forced using peeling.\n");
1763 dump_printf_loc (MSG_NOTE, vect_location,
1764 "Peeling for alignment will be applied.\n");
1766 /* The inside-loop cost will be accounted for in vectorizable_load
1767 and vectorizable_store correctly with adjusted alignments.
1768 Drop the body_cst_vec on the floor here. */
1769 body_cost_vec.release ();
1771 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1772 gcc_assert (stat);
1773 return stat;
1777 body_cost_vec.release ();
1779 /* (2) Versioning to force alignment. */
1781 /* Try versioning if:
1782 1) optimize loop for speed
1783 2) there is at least one unsupported misaligned data ref with an unknown
1784 misalignment, and
1785 3) all misaligned data refs with a known misalignment are supported, and
1786 4) the number of runtime alignment checks is within reason. */
1788 do_versioning =
1789 optimize_loop_nest_for_speed_p (loop)
1790 && (!loop->inner); /* FORNOW */
1792 if (do_versioning)
1794 FOR_EACH_VEC_ELT (datarefs, i, dr)
1796 stmt = DR_STMT (dr);
1797 stmt_info = vinfo_for_stmt (stmt);
1799 /* For interleaving, only the alignment of the first access
1800 matters. */
1801 if (aligned_access_p (dr)
1802 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1803 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1804 continue;
1806 if (STMT_VINFO_STRIDED_P (stmt_info))
1808 /* Strided loads perform only component accesses, alignment is
1809 irrelevant for them. */
1810 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1811 continue;
1812 do_versioning = false;
1813 break;
1816 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1818 if (!supportable_dr_alignment)
1820 gimple stmt;
1821 int mask;
1822 tree vectype;
1824 if (known_alignment_for_access_p (dr)
1825 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1826 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1828 do_versioning = false;
1829 break;
1832 stmt = DR_STMT (dr);
1833 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1834 gcc_assert (vectype);
1836 /* The rightmost bits of an aligned address must be zeros.
1837 Construct the mask needed for this test. For example,
1838 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1839 mask must be 15 = 0xf. */
1840 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1842 /* FORNOW: use the same mask to test all potentially unaligned
1843 references in the loop. The vectorizer currently supports
1844 a single vector size, see the reference to
1845 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1846 vectorization factor is computed. */
1847 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1848 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1849 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1850 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1851 DR_STMT (dr));
1855 /* Versioning requires at least one misaligned data reference. */
1856 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1857 do_versioning = false;
1858 else if (!do_versioning)
1859 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1862 if (do_versioning)
1864 vec<gimple> may_misalign_stmts
1865 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1866 gimple stmt;
1868 /* It can now be assumed that the data references in the statements
1869 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1870 of the loop being vectorized. */
1871 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1873 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1874 dr = STMT_VINFO_DATA_REF (stmt_info);
1875 SET_DR_MISALIGNMENT (dr, 0);
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_NOTE, vect_location,
1878 "Alignment of access forced using versioning.\n");
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location,
1883 "Versioning for alignment will be applied.\n");
1885 /* Peeling and versioning can't be done together at this time. */
1886 gcc_assert (! (do_peeling && do_versioning));
1888 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1889 gcc_assert (stat);
1890 return stat;
1893 /* This point is reached if neither peeling nor versioning is being done. */
1894 gcc_assert (! (do_peeling || do_versioning));
1896 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1897 return stat;
1901 /* Function vect_find_same_alignment_drs.
1903 Update group and alignment relations according to the chosen
1904 vectorization factor. */
1906 static void
1907 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1908 loop_vec_info loop_vinfo)
1910 unsigned int i;
1911 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1912 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1913 struct data_reference *dra = DDR_A (ddr);
1914 struct data_reference *drb = DDR_B (ddr);
1915 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1916 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1917 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1918 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1919 lambda_vector dist_v;
1920 unsigned int loop_depth;
1922 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1923 return;
1925 if (dra == drb)
1926 return;
1928 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1929 return;
1931 /* Loop-based vectorization and known data dependence. */
1932 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1933 return;
1935 /* Data-dependence analysis reports a distance vector of zero
1936 for data-references that overlap only in the first iteration
1937 but have different sign step (see PR45764).
1938 So as a sanity check require equal DR_STEP. */
1939 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1940 return;
1942 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1943 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1945 int dist = dist_v[loop_depth];
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "dependence distance = %d.\n", dist);
1951 /* Same loop iteration. */
1952 if (dist == 0
1953 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1955 /* Two references with distance zero have the same alignment. */
1956 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1957 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1958 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_NOTE, vect_location,
1961 "accesses have the same alignment.\n");
1962 dump_printf (MSG_NOTE,
1963 "dependence distance modulo vf == 0 between ");
1964 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1965 dump_printf (MSG_NOTE, " and ");
1966 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1967 dump_printf (MSG_NOTE, "\n");
1974 /* Function vect_analyze_data_refs_alignment
1976 Analyze the alignment of the data-references in the loop.
1977 Return FALSE if a data reference is found that cannot be vectorized. */
1979 bool
1980 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1981 bb_vec_info bb_vinfo)
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "=== vect_analyze_data_refs_alignment ===\n");
1987 /* Mark groups of data references with same alignment using
1988 data dependence information. */
1989 if (loop_vinfo)
1991 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1992 struct data_dependence_relation *ddr;
1993 unsigned int i;
1995 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1996 vect_find_same_alignment_drs (ddr, loop_vinfo);
1999 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "not vectorized: can't calculate alignment "
2004 "for data ref.\n");
2005 return false;
2008 return true;
2012 /* Analyze groups of accesses: check that DR belongs to a group of
2013 accesses of legal size, step, etc. Detect gaps, single element
2014 interleaving, and other special cases. Set grouped access info.
2015 Collect groups of strided stores for further use in SLP analysis. */
2017 static bool
2018 vect_analyze_group_access (struct data_reference *dr)
2020 tree step = DR_STEP (dr);
2021 tree scalar_type = TREE_TYPE (DR_REF (dr));
2022 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2023 gimple stmt = DR_STMT (dr);
2024 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2025 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2026 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2027 HOST_WIDE_INT dr_step = -1;
2028 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2029 bool slp_impossible = false;
2030 struct loop *loop = NULL;
2032 if (loop_vinfo)
2033 loop = LOOP_VINFO_LOOP (loop_vinfo);
2035 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2036 size of the interleaving group (including gaps). */
2037 if (tree_fits_shwi_p (step))
2039 dr_step = tree_to_shwi (step);
2040 groupsize = absu_hwi (dr_step) / type_size;
2042 else
2043 groupsize = 0;
2045 /* Not consecutive access is possible only if it is a part of interleaving. */
2046 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2048 /* Check if it this DR is a part of interleaving, and is a single
2049 element of the group that is accessed in the loop. */
2051 /* Gaps are supported only for loads. STEP must be a multiple of the type
2052 size. The size of the group must be a power of 2. */
2053 if (DR_IS_READ (dr)
2054 && (dr_step % type_size) == 0
2055 && groupsize > 0
2056 && exact_log2 (groupsize) != -1)
2058 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2059 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2060 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "Detected single element interleaving ");
2064 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2065 dump_printf (MSG_NOTE, " step ");
2066 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2067 dump_printf (MSG_NOTE, "\n");
2070 if (loop_vinfo)
2072 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location,
2074 "Data access with gaps requires scalar "
2075 "epilogue loop\n");
2076 if (loop->inner)
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080 "Peeling for outer loop is not"
2081 " supported\n");
2082 return false;
2085 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2088 return true;
2091 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2094 "not consecutive access ");
2095 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2096 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2099 if (bb_vinfo)
2101 /* Mark the statement as unvectorizable. */
2102 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2103 return true;
2106 return false;
2109 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2111 /* First stmt in the interleaving chain. Check the chain. */
2112 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2113 struct data_reference *data_ref = dr;
2114 unsigned int count = 1;
2115 tree prev_init = DR_INIT (data_ref);
2116 gimple prev = stmt;
2117 HOST_WIDE_INT diff, gaps = 0;
2119 while (next)
2121 /* Skip same data-refs. In case that two or more stmts share
2122 data-ref (supported only for loads), we vectorize only the first
2123 stmt, and the rest get their vectorized loads from the first
2124 one. */
2125 if (!tree_int_cst_compare (DR_INIT (data_ref),
2126 DR_INIT (STMT_VINFO_DATA_REF (
2127 vinfo_for_stmt (next)))))
2129 if (DR_IS_WRITE (data_ref))
2131 if (dump_enabled_p ())
2132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133 "Two store stmts share the same dr.\n");
2134 return false;
2137 /* For load use the same data-ref load. */
2138 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2140 prev = next;
2141 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2142 continue;
2145 prev = next;
2146 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2148 /* All group members have the same STEP by construction. */
2149 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2151 /* Check that the distance between two accesses is equal to the type
2152 size. Otherwise, we have gaps. */
2153 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2154 - TREE_INT_CST_LOW (prev_init)) / type_size;
2155 if (diff != 1)
2157 /* FORNOW: SLP of accesses with gaps is not supported. */
2158 slp_impossible = true;
2159 if (DR_IS_WRITE (data_ref))
2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2163 "interleaved store with gaps\n");
2164 return false;
2167 gaps += diff - 1;
2170 last_accessed_element += diff;
2172 /* Store the gap from the previous member of the group. If there is no
2173 gap in the access, GROUP_GAP is always 1. */
2174 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2176 prev_init = DR_INIT (data_ref);
2177 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2178 /* Count the number of data-refs in the chain. */
2179 count++;
2182 if (groupsize == 0)
2183 groupsize = count + gaps;
2185 /* Check that the size of the interleaving is equal to count for stores,
2186 i.e., that there are no gaps. */
2187 if (groupsize != count
2188 && !DR_IS_READ (dr))
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "interleaved store with gaps\n");
2193 return false;
2196 /* If there is a gap after the last load in the group it is the
2197 difference between the groupsize and the last accessed
2198 element.
2199 When there is no gap, this difference should be 0. */
2200 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2202 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2203 if (dump_enabled_p ())
2205 dump_printf_loc (MSG_NOTE, vect_location,
2206 "Detected interleaving of size %d starting with ",
2207 (int)groupsize);
2208 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2209 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2210 dump_printf_loc (MSG_NOTE, vect_location,
2211 "There is a gap of %d elements after the group\n",
2212 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2215 /* SLP: create an SLP data structure for every interleaving group of
2216 stores for further analysis in vect_analyse_slp. */
2217 if (DR_IS_WRITE (dr) && !slp_impossible)
2219 if (loop_vinfo)
2220 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2221 if (bb_vinfo)
2222 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2225 /* If there is a gap in the end of the group or the group size cannot
2226 be made a multiple of the vector element count then we access excess
2227 elements in the last iteration and thus need to peel that off. */
2228 if (loop_vinfo
2229 && (groupsize - last_accessed_element > 0
2230 || exact_log2 (groupsize) == -1))
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235 "Data access with gaps requires scalar "
2236 "epilogue loop\n");
2237 if (loop->inner)
2239 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241 "Peeling for outer loop is not supported\n");
2242 return false;
2245 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2249 return true;
2253 /* Analyze the access pattern of the data-reference DR.
2254 In case of non-consecutive accesses call vect_analyze_group_access() to
2255 analyze groups of accesses. */
2257 static bool
2258 vect_analyze_data_ref_access (struct data_reference *dr)
2260 tree step = DR_STEP (dr);
2261 tree scalar_type = TREE_TYPE (DR_REF (dr));
2262 gimple stmt = DR_STMT (dr);
2263 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2264 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2265 struct loop *loop = NULL;
2267 if (loop_vinfo)
2268 loop = LOOP_VINFO_LOOP (loop_vinfo);
2270 if (loop_vinfo && !step)
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "bad data-ref access in loop\n");
2275 return false;
2278 /* Allow loads with zero step in inner-loop vectorization. */
2279 if (loop_vinfo && integer_zerop (step))
2281 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2282 if (!nested_in_vect_loop_p (loop, stmt))
2283 return DR_IS_READ (dr);
2284 /* Allow references with zero step for outer loops marked
2285 with pragma omp simd only - it guarantees absence of
2286 loop-carried dependencies between inner loop iterations. */
2287 if (!loop->force_vectorize)
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_NOTE, vect_location,
2291 "zero step in inner loop of nest\n");
2292 return false;
2296 if (loop && nested_in_vect_loop_p (loop, stmt))
2298 /* Interleaved accesses are not yet supported within outer-loop
2299 vectorization for references in the inner-loop. */
2300 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2302 /* For the rest of the analysis we use the outer-loop step. */
2303 step = STMT_VINFO_DR_STEP (stmt_info);
2304 if (integer_zerop (step))
2306 if (dump_enabled_p ())
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "zero step in outer loop.\n");
2309 if (DR_IS_READ (dr))
2310 return true;
2311 else
2312 return false;
2316 /* Consecutive? */
2317 if (TREE_CODE (step) == INTEGER_CST)
2319 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2320 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2321 || (dr_step < 0
2322 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2324 /* Mark that it is not interleaving. */
2325 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2326 return true;
2330 if (loop && nested_in_vect_loop_p (loop, stmt))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "grouped access in outer loop.\n");
2335 return false;
2339 /* Assume this is a DR handled by non-constant strided load case. */
2340 if (TREE_CODE (step) != INTEGER_CST)
2341 return (STMT_VINFO_STRIDED_P (stmt_info)
2342 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2343 || vect_analyze_group_access (dr)));
2345 /* Not consecutive access - check if it's a part of interleaving group. */
2346 return vect_analyze_group_access (dr);
2351 /* A helper function used in the comparator function to sort data
2352 references. T1 and T2 are two data references to be compared.
2353 The function returns -1, 0, or 1. */
2355 static int
2356 compare_tree (tree t1, tree t2)
2358 int i, cmp;
2359 enum tree_code code;
2360 char tclass;
2362 if (t1 == t2)
2363 return 0;
2364 if (t1 == NULL)
2365 return -1;
2366 if (t2 == NULL)
2367 return 1;
2370 if (TREE_CODE (t1) != TREE_CODE (t2))
2371 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2373 code = TREE_CODE (t1);
2374 switch (code)
2376 /* For const values, we can just use hash values for comparisons. */
2377 case INTEGER_CST:
2378 case REAL_CST:
2379 case FIXED_CST:
2380 case STRING_CST:
2381 case COMPLEX_CST:
2382 case VECTOR_CST:
2384 hashval_t h1 = iterative_hash_expr (t1, 0);
2385 hashval_t h2 = iterative_hash_expr (t2, 0);
2386 if (h1 != h2)
2387 return h1 < h2 ? -1 : 1;
2388 break;
2391 case SSA_NAME:
2392 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2393 if (cmp != 0)
2394 return cmp;
2396 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2397 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2398 break;
2400 default:
2401 tclass = TREE_CODE_CLASS (code);
2403 /* For var-decl, we could compare their UIDs. */
2404 if (tclass == tcc_declaration)
2406 if (DECL_UID (t1) != DECL_UID (t2))
2407 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2408 break;
2411 /* For expressions with operands, compare their operands recursively. */
2412 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2414 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2415 if (cmp != 0)
2416 return cmp;
2420 return 0;
2424 /* Compare two data-references DRA and DRB to group them into chunks
2425 suitable for grouping. */
2427 static int
2428 dr_group_sort_cmp (const void *dra_, const void *drb_)
2430 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2431 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2432 int cmp;
2434 /* Stabilize sort. */
2435 if (dra == drb)
2436 return 0;
2438 /* Ordering of DRs according to base. */
2439 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2441 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2442 if (cmp != 0)
2443 return cmp;
2446 /* And according to DR_OFFSET. */
2447 if (!dr_equal_offsets_p (dra, drb))
2449 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2450 if (cmp != 0)
2451 return cmp;
2454 /* Put reads before writes. */
2455 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2456 return DR_IS_READ (dra) ? -1 : 1;
2458 /* Then sort after access size. */
2459 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2460 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2462 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2463 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2464 if (cmp != 0)
2465 return cmp;
2468 /* And after step. */
2469 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2471 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2472 if (cmp != 0)
2473 return cmp;
2476 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2477 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2478 if (cmp == 0)
2479 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2480 return cmp;
2483 /* Function vect_analyze_data_ref_accesses.
2485 Analyze the access pattern of all the data references in the loop.
2487 FORNOW: the only access pattern that is considered vectorizable is a
2488 simple step 1 (consecutive) access.
2490 FORNOW: handle only arrays and pointer accesses. */
2492 bool
2493 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2495 unsigned int i;
2496 vec<data_reference_p> datarefs;
2497 struct data_reference *dr;
2499 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE, vect_location,
2501 "=== vect_analyze_data_ref_accesses ===\n");
2503 if (loop_vinfo)
2504 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2505 else
2506 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2508 if (datarefs.is_empty ())
2509 return true;
2511 /* Sort the array of datarefs to make building the interleaving chains
2512 linear. Don't modify the original vector's order, it is needed for
2513 determining what dependencies are reversed. */
2514 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2515 datarefs_copy.qsort (dr_group_sort_cmp);
2517 /* Build the interleaving chains. */
2518 for (i = 0; i < datarefs_copy.length () - 1;)
2520 data_reference_p dra = datarefs_copy[i];
2521 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2522 stmt_vec_info lastinfo = NULL;
2523 for (i = i + 1; i < datarefs_copy.length (); ++i)
2525 data_reference_p drb = datarefs_copy[i];
2526 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2528 /* ??? Imperfect sorting (non-compatible types, non-modulo
2529 accesses, same accesses) can lead to a group to be artificially
2530 split here as we don't just skip over those. If it really
2531 matters we can push those to a worklist and re-iterate
2532 over them. The we can just skip ahead to the next DR here. */
2534 /* Check that the data-refs have same first location (except init)
2535 and they are both either store or load (not load and store,
2536 not masked loads or stores). */
2537 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2538 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2539 DR_BASE_ADDRESS (drb), 0)
2540 || !dr_equal_offsets_p (dra, drb)
2541 || !gimple_assign_single_p (DR_STMT (dra))
2542 || !gimple_assign_single_p (DR_STMT (drb)))
2543 break;
2545 /* Check that the data-refs have the same constant size. */
2546 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2547 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2548 if (!tree_fits_uhwi_p (sza)
2549 || !tree_fits_uhwi_p (szb)
2550 || !tree_int_cst_equal (sza, szb))
2551 break;
2553 /* Check that the data-refs have the same step. */
2554 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2555 break;
2557 /* Do not place the same access in the interleaving chain twice. */
2558 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2559 break;
2561 /* Check the types are compatible.
2562 ??? We don't distinguish this during sorting. */
2563 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2564 TREE_TYPE (DR_REF (drb))))
2565 break;
2567 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2568 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2569 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2570 gcc_assert (init_a < init_b);
2572 /* If init_b == init_a + the size of the type * k, we have an
2573 interleaving, and DRA is accessed before DRB. */
2574 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2575 if ((init_b - init_a) % type_size_a != 0)
2576 break;
2578 /* If we have a store, the accesses are adjacent. This splits
2579 groups into chunks we support (we don't support vectorization
2580 of stores with gaps). */
2581 if (!DR_IS_READ (dra)
2582 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2583 (DR_INIT (datarefs_copy[i-1]))
2584 != type_size_a))
2585 break;
2587 /* If the step (if not zero or non-constant) is greater than the
2588 difference between data-refs' inits this splits groups into
2589 suitable sizes. */
2590 if (tree_fits_shwi_p (DR_STEP (dra)))
2592 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2593 if (step != 0 && step <= (init_b - init_a))
2594 break;
2597 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_NOTE, vect_location,
2600 "Detected interleaving ");
2601 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2602 dump_printf (MSG_NOTE, " and ");
2603 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2604 dump_printf (MSG_NOTE, "\n");
2607 /* Link the found element into the group list. */
2608 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2610 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2611 lastinfo = stmtinfo_a;
2613 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2614 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2615 lastinfo = stmtinfo_b;
2619 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2620 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2621 && !vect_analyze_data_ref_access (dr))
2623 if (dump_enabled_p ())
2624 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2625 "not vectorized: complicated access pattern.\n");
2627 if (bb_vinfo)
2629 /* Mark the statement as not vectorizable. */
2630 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2631 continue;
2633 else
2635 datarefs_copy.release ();
2636 return false;
2640 datarefs_copy.release ();
2641 return true;
2645 /* Operator == between two dr_with_seg_len objects.
2647 This equality operator is used to make sure two data refs
2648 are the same one so that we will consider to combine the
2649 aliasing checks of those two pairs of data dependent data
2650 refs. */
2652 static bool
2653 operator == (const dr_with_seg_len& d1,
2654 const dr_with_seg_len& d2)
2656 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2657 DR_BASE_ADDRESS (d2.dr), 0)
2658 && compare_tree (d1.offset, d2.offset) == 0
2659 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2662 /* Function comp_dr_with_seg_len_pair.
2664 Comparison function for sorting objects of dr_with_seg_len_pair_t
2665 so that we can combine aliasing checks in one scan. */
2667 static int
2668 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2670 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2671 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2673 const dr_with_seg_len &p11 = p1->first,
2674 &p12 = p1->second,
2675 &p21 = p2->first,
2676 &p22 = p2->second;
2678 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2679 if a and c have the same basic address snd step, and b and d have the same
2680 address and step. Therefore, if any a&c or b&d don't have the same address
2681 and step, we don't care the order of those two pairs after sorting. */
2682 int comp_res;
2684 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2685 DR_BASE_ADDRESS (p21.dr))) != 0)
2686 return comp_res;
2687 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2688 DR_BASE_ADDRESS (p22.dr))) != 0)
2689 return comp_res;
2690 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2691 return comp_res;
2692 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2693 return comp_res;
2694 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2695 return comp_res;
2696 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2697 return comp_res;
2699 return 0;
2702 /* Function vect_vfa_segment_size.
2704 Create an expression that computes the size of segment
2705 that will be accessed for a data reference. The functions takes into
2706 account that realignment loads may access one more vector.
2708 Input:
2709 DR: The data reference.
2710 LENGTH_FACTOR: segment length to consider.
2712 Return an expression whose value is the size of segment which will be
2713 accessed by DR. */
2715 static tree
2716 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2718 tree segment_length;
2720 if (integer_zerop (DR_STEP (dr)))
2721 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2722 else
2723 segment_length = size_binop (MULT_EXPR,
2724 fold_convert (sizetype, DR_STEP (dr)),
2725 fold_convert (sizetype, length_factor));
2727 if (vect_supportable_dr_alignment (dr, false)
2728 == dr_explicit_realign_optimized)
2730 tree vector_size = TYPE_SIZE_UNIT
2731 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2733 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2735 return segment_length;
2738 /* Function vect_prune_runtime_alias_test_list.
2740 Prune a list of ddrs to be tested at run-time by versioning for alias.
2741 Merge several alias checks into one if possible.
2742 Return FALSE if resulting list of ddrs is longer then allowed by
2743 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2745 bool
2746 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2748 vec<ddr_p> may_alias_ddrs =
2749 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2750 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2751 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2752 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2753 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2755 ddr_p ddr;
2756 unsigned int i;
2757 tree length_factor;
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_NOTE, vect_location,
2761 "=== vect_prune_runtime_alias_test_list ===\n");
2763 if (may_alias_ddrs.is_empty ())
2764 return true;
2766 /* Basically, for each pair of dependent data refs store_ptr_0
2767 and load_ptr_0, we create an expression:
2769 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2770 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2772 for aliasing checks. However, in some cases we can decrease
2773 the number of checks by combining two checks into one. For
2774 example, suppose we have another pair of data refs store_ptr_0
2775 and load_ptr_1, and if the following condition is satisfied:
2777 load_ptr_0 < load_ptr_1 &&
2778 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2780 (this condition means, in each iteration of vectorized loop,
2781 the accessed memory of store_ptr_0 cannot be between the memory
2782 of load_ptr_0 and load_ptr_1.)
2784 we then can use only the following expression to finish the
2785 alising checks between store_ptr_0 & load_ptr_0 and
2786 store_ptr_0 & load_ptr_1:
2788 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2789 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2791 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2792 same basic address. */
2794 comp_alias_ddrs.create (may_alias_ddrs.length ());
2796 /* First, we collect all data ref pairs for aliasing checks. */
2797 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2799 struct data_reference *dr_a, *dr_b;
2800 gimple dr_group_first_a, dr_group_first_b;
2801 tree segment_length_a, segment_length_b;
2802 gimple stmt_a, stmt_b;
2804 dr_a = DDR_A (ddr);
2805 stmt_a = DR_STMT (DDR_A (ddr));
2806 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2807 if (dr_group_first_a)
2809 stmt_a = dr_group_first_a;
2810 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2813 dr_b = DDR_B (ddr);
2814 stmt_b = DR_STMT (DDR_B (ddr));
2815 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2816 if (dr_group_first_b)
2818 stmt_b = dr_group_first_b;
2819 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2822 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2823 length_factor = scalar_loop_iters;
2824 else
2825 length_factor = size_int (vect_factor);
2826 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2827 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2829 dr_with_seg_len_pair_t dr_with_seg_len_pair
2830 (dr_with_seg_len (dr_a, segment_length_a),
2831 dr_with_seg_len (dr_b, segment_length_b));
2833 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2834 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2836 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2839 /* Second, we sort the collected data ref pairs so that we can scan
2840 them once to combine all possible aliasing checks. */
2841 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2843 /* Third, we scan the sorted dr pairs and check if we can combine
2844 alias checks of two neighbouring dr pairs. */
2845 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2847 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2848 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2849 *dr_b1 = &comp_alias_ddrs[i-1].second,
2850 *dr_a2 = &comp_alias_ddrs[i].first,
2851 *dr_b2 = &comp_alias_ddrs[i].second;
2853 /* Remove duplicate data ref pairs. */
2854 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2856 if (dump_enabled_p ())
2858 dump_printf_loc (MSG_NOTE, vect_location,
2859 "found equal ranges ");
2860 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2861 DR_REF (dr_a1->dr));
2862 dump_printf (MSG_NOTE, ", ");
2863 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2864 DR_REF (dr_b1->dr));
2865 dump_printf (MSG_NOTE, " and ");
2866 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2867 DR_REF (dr_a2->dr));
2868 dump_printf (MSG_NOTE, ", ");
2869 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2870 DR_REF (dr_b2->dr));
2871 dump_printf (MSG_NOTE, "\n");
2874 comp_alias_ddrs.ordered_remove (i--);
2875 continue;
2878 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2880 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2881 and DR_A1 and DR_A2 are two consecutive memrefs. */
2882 if (*dr_a1 == *dr_a2)
2884 std::swap (dr_a1, dr_b1);
2885 std::swap (dr_a2, dr_b2);
2888 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2889 DR_BASE_ADDRESS (dr_a2->dr),
2891 || !tree_fits_shwi_p (dr_a1->offset)
2892 || !tree_fits_shwi_p (dr_a2->offset))
2893 continue;
2895 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2896 - tree_to_shwi (dr_a1->offset));
2899 /* Now we check if the following condition is satisfied:
2901 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2903 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2904 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2905 have to make a best estimation. We can get the minimum value
2906 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2907 then either of the following two conditions can guarantee the
2908 one above:
2910 1: DIFF <= MIN_SEG_LEN_B
2911 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2915 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2916 ? tree_to_shwi (dr_b1->seg_len)
2917 : vect_factor);
2919 if (diff <= min_seg_len_b
2920 || (tree_fits_shwi_p (dr_a1->seg_len)
2921 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2923 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_NOTE, vect_location,
2926 "merging ranges for ");
2927 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2928 DR_REF (dr_a1->dr));
2929 dump_printf (MSG_NOTE, ", ");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 DR_REF (dr_b1->dr));
2932 dump_printf (MSG_NOTE, " and ");
2933 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2934 DR_REF (dr_a2->dr));
2935 dump_printf (MSG_NOTE, ", ");
2936 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2937 DR_REF (dr_b2->dr));
2938 dump_printf (MSG_NOTE, "\n");
2941 dr_a1->seg_len = size_binop (PLUS_EXPR,
2942 dr_a2->seg_len, size_int (diff));
2943 comp_alias_ddrs.ordered_remove (i--);
2948 dump_printf_loc (MSG_NOTE, vect_location,
2949 "improved number of alias checks from %d to %d\n",
2950 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2951 if ((int) comp_alias_ddrs.length () >
2952 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2953 return false;
2955 return true;
2958 /* Check whether a non-affine read in stmt is suitable for gather load
2959 and if so, return a builtin decl for that operation. */
2961 tree
2962 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2963 tree *offp, int *scalep)
2965 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2966 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2967 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2968 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2969 tree offtype = NULL_TREE;
2970 tree decl, base, off;
2971 machine_mode pmode;
2972 int punsignedp, pvolatilep;
2974 base = DR_REF (dr);
2975 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2976 see if we can use the def stmt of the address. */
2977 if (is_gimple_call (stmt)
2978 && gimple_call_internal_p (stmt)
2979 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2980 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2981 && TREE_CODE (base) == MEM_REF
2982 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2983 && integer_zerop (TREE_OPERAND (base, 1))
2984 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2986 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2987 if (is_gimple_assign (def_stmt)
2988 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2989 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2992 /* The gather builtins need address of the form
2993 loop_invariant + vector * {1, 2, 4, 8}
2995 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2996 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2997 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2998 multiplications and additions in it. To get a vector, we need
2999 a single SSA_NAME that will be defined in the loop and will
3000 contain everything that is not loop invariant and that can be
3001 vectorized. The following code attempts to find such a preexistng
3002 SSA_NAME OFF and put the loop invariants into a tree BASE
3003 that can be gimplified before the loop. */
3004 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3005 &pmode, &punsignedp, &pvolatilep, false);
3006 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3008 if (TREE_CODE (base) == MEM_REF)
3010 if (!integer_zerop (TREE_OPERAND (base, 1)))
3012 if (off == NULL_TREE)
3014 offset_int moff = mem_ref_offset (base);
3015 off = wide_int_to_tree (sizetype, moff);
3017 else
3018 off = size_binop (PLUS_EXPR, off,
3019 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3021 base = TREE_OPERAND (base, 0);
3023 else
3024 base = build_fold_addr_expr (base);
3026 if (off == NULL_TREE)
3027 off = size_zero_node;
3029 /* If base is not loop invariant, either off is 0, then we start with just
3030 the constant offset in the loop invariant BASE and continue with base
3031 as OFF, otherwise give up.
3032 We could handle that case by gimplifying the addition of base + off
3033 into some SSA_NAME and use that as off, but for now punt. */
3034 if (!expr_invariant_in_loop_p (loop, base))
3036 if (!integer_zerop (off))
3037 return NULL_TREE;
3038 off = base;
3039 base = size_int (pbitpos / BITS_PER_UNIT);
3041 /* Otherwise put base + constant offset into the loop invariant BASE
3042 and continue with OFF. */
3043 else
3045 base = fold_convert (sizetype, base);
3046 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3049 /* OFF at this point may be either a SSA_NAME or some tree expression
3050 from get_inner_reference. Try to peel off loop invariants from it
3051 into BASE as long as possible. */
3052 STRIP_NOPS (off);
3053 while (offtype == NULL_TREE)
3055 enum tree_code code;
3056 tree op0, op1, add = NULL_TREE;
3058 if (TREE_CODE (off) == SSA_NAME)
3060 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3062 if (expr_invariant_in_loop_p (loop, off))
3063 return NULL_TREE;
3065 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3066 break;
3068 op0 = gimple_assign_rhs1 (def_stmt);
3069 code = gimple_assign_rhs_code (def_stmt);
3070 op1 = gimple_assign_rhs2 (def_stmt);
3072 else
3074 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3075 return NULL_TREE;
3076 code = TREE_CODE (off);
3077 extract_ops_from_tree (off, &code, &op0, &op1);
3079 switch (code)
3081 case POINTER_PLUS_EXPR:
3082 case PLUS_EXPR:
3083 if (expr_invariant_in_loop_p (loop, op0))
3085 add = op0;
3086 off = op1;
3087 do_add:
3088 add = fold_convert (sizetype, add);
3089 if (scale != 1)
3090 add = size_binop (MULT_EXPR, add, size_int (scale));
3091 base = size_binop (PLUS_EXPR, base, add);
3092 continue;
3094 if (expr_invariant_in_loop_p (loop, op1))
3096 add = op1;
3097 off = op0;
3098 goto do_add;
3100 break;
3101 case MINUS_EXPR:
3102 if (expr_invariant_in_loop_p (loop, op1))
3104 add = fold_convert (sizetype, op1);
3105 add = size_binop (MINUS_EXPR, size_zero_node, add);
3106 off = op0;
3107 goto do_add;
3109 break;
3110 case MULT_EXPR:
3111 if (scale == 1 && tree_fits_shwi_p (op1))
3113 scale = tree_to_shwi (op1);
3114 off = op0;
3115 continue;
3117 break;
3118 case SSA_NAME:
3119 off = op0;
3120 continue;
3121 CASE_CONVERT:
3122 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3123 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3124 break;
3125 if (TYPE_PRECISION (TREE_TYPE (op0))
3126 == TYPE_PRECISION (TREE_TYPE (off)))
3128 off = op0;
3129 continue;
3131 if (TYPE_PRECISION (TREE_TYPE (op0))
3132 < TYPE_PRECISION (TREE_TYPE (off)))
3134 off = op0;
3135 offtype = TREE_TYPE (off);
3136 STRIP_NOPS (off);
3137 continue;
3139 break;
3140 default:
3141 break;
3143 break;
3146 /* If at the end OFF still isn't a SSA_NAME or isn't
3147 defined in the loop, punt. */
3148 if (TREE_CODE (off) != SSA_NAME
3149 || expr_invariant_in_loop_p (loop, off))
3150 return NULL_TREE;
3152 if (offtype == NULL_TREE)
3153 offtype = TREE_TYPE (off);
3155 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3156 offtype, scale);
3157 if (decl == NULL_TREE)
3158 return NULL_TREE;
3160 if (basep)
3161 *basep = base;
3162 if (offp)
3163 *offp = off;
3164 if (scalep)
3165 *scalep = scale;
3166 return decl;
3169 /* Function vect_analyze_data_refs.
3171 Find all the data references in the loop or basic block.
3173 The general structure of the analysis of data refs in the vectorizer is as
3174 follows:
3175 1- vect_analyze_data_refs(loop/bb): call
3176 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3177 in the loop/bb and their dependences.
3178 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3179 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3180 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3184 bool
3185 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3186 bb_vec_info bb_vinfo,
3187 int *min_vf, unsigned *n_stmts)
3189 struct loop *loop = NULL;
3190 basic_block bb = NULL;
3191 unsigned int i;
3192 vec<data_reference_p> datarefs;
3193 struct data_reference *dr;
3194 tree scalar_type;
3196 if (dump_enabled_p ())
3197 dump_printf_loc (MSG_NOTE, vect_location,
3198 "=== vect_analyze_data_refs ===\n");
3200 if (loop_vinfo)
3202 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3204 loop = LOOP_VINFO_LOOP (loop_vinfo);
3205 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3206 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3208 if (dump_enabled_p ())
3209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3210 "not vectorized: loop contains function calls"
3211 " or data references that cannot be analyzed\n");
3212 return false;
3215 for (i = 0; i < loop->num_nodes; i++)
3217 gimple_stmt_iterator gsi;
3219 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3221 gimple stmt = gsi_stmt (gsi);
3222 if (is_gimple_debug (stmt))
3223 continue;
3224 ++*n_stmts;
3225 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3227 if (is_gimple_call (stmt) && loop->safelen)
3229 tree fndecl = gimple_call_fndecl (stmt), op;
3230 if (fndecl != NULL_TREE)
3232 struct cgraph_node *node = cgraph_node::get (fndecl);
3233 if (node != NULL && node->simd_clones != NULL)
3235 unsigned int j, n = gimple_call_num_args (stmt);
3236 for (j = 0; j < n; j++)
3238 op = gimple_call_arg (stmt, j);
3239 if (DECL_P (op)
3240 || (REFERENCE_CLASS_P (op)
3241 && get_base_address (op)))
3242 break;
3244 op = gimple_call_lhs (stmt);
3245 /* Ignore #pragma omp declare simd functions
3246 if they don't have data references in the
3247 call stmt itself. */
3248 if (j == n
3249 && !(op
3250 && (DECL_P (op)
3251 || (REFERENCE_CLASS_P (op)
3252 && get_base_address (op)))))
3253 continue;
3257 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3258 if (dump_enabled_p ())
3259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3260 "not vectorized: loop contains function "
3261 "calls or data references that cannot "
3262 "be analyzed\n");
3263 return false;
3268 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3270 else
3272 gimple_stmt_iterator gsi;
3274 bb = BB_VINFO_BB (bb_vinfo);
3275 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3277 gimple stmt = gsi_stmt (gsi);
3278 if (is_gimple_debug (stmt))
3279 continue;
3280 ++*n_stmts;
3281 if (!find_data_references_in_stmt (NULL, stmt,
3282 &BB_VINFO_DATAREFS (bb_vinfo)))
3284 /* Mark the rest of the basic-block as unvectorizable. */
3285 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3287 stmt = gsi_stmt (gsi);
3288 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3290 break;
3294 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3297 /* Go through the data-refs, check that the analysis succeeded. Update
3298 pointer from stmt_vec_info struct to DR and vectype. */
3300 FOR_EACH_VEC_ELT (datarefs, i, dr)
3302 gimple stmt;
3303 stmt_vec_info stmt_info;
3304 tree base, offset, init;
3305 bool gather = false;
3306 bool simd_lane_access = false;
3307 int vf;
3309 again:
3310 if (!dr || !DR_REF (dr))
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3314 "not vectorized: unhandled data-ref\n");
3315 return false;
3318 stmt = DR_STMT (dr);
3319 stmt_info = vinfo_for_stmt (stmt);
3321 /* Discard clobbers from the dataref vector. We will remove
3322 clobber stmts during vectorization. */
3323 if (gimple_clobber_p (stmt))
3325 free_data_ref (dr);
3326 if (i == datarefs.length () - 1)
3328 datarefs.pop ();
3329 break;
3331 datarefs.ordered_remove (i);
3332 dr = datarefs[i];
3333 goto again;
3336 /* Check that analysis of the data-ref succeeded. */
3337 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3338 || !DR_STEP (dr))
3340 bool maybe_gather
3341 = DR_IS_READ (dr)
3342 && !TREE_THIS_VOLATILE (DR_REF (dr))
3343 && targetm.vectorize.builtin_gather != NULL;
3344 bool maybe_simd_lane_access
3345 = loop_vinfo && loop->simduid;
3347 /* If target supports vector gather loads, or if this might be
3348 a SIMD lane access, see if they can't be used. */
3349 if (loop_vinfo
3350 && (maybe_gather || maybe_simd_lane_access)
3351 && !nested_in_vect_loop_p (loop, stmt))
3353 struct data_reference *newdr
3354 = create_data_ref (NULL, loop_containing_stmt (stmt),
3355 DR_REF (dr), stmt, true);
3356 gcc_assert (newdr != NULL && DR_REF (newdr));
3357 if (DR_BASE_ADDRESS (newdr)
3358 && DR_OFFSET (newdr)
3359 && DR_INIT (newdr)
3360 && DR_STEP (newdr)
3361 && integer_zerop (DR_STEP (newdr)))
3363 if (maybe_simd_lane_access)
3365 tree off = DR_OFFSET (newdr);
3366 STRIP_NOPS (off);
3367 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3368 && TREE_CODE (off) == MULT_EXPR
3369 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3371 tree step = TREE_OPERAND (off, 1);
3372 off = TREE_OPERAND (off, 0);
3373 STRIP_NOPS (off);
3374 if (CONVERT_EXPR_P (off)
3375 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3376 0)))
3377 < TYPE_PRECISION (TREE_TYPE (off)))
3378 off = TREE_OPERAND (off, 0);
3379 if (TREE_CODE (off) == SSA_NAME)
3381 gimple def = SSA_NAME_DEF_STMT (off);
3382 tree reft = TREE_TYPE (DR_REF (newdr));
3383 if (is_gimple_call (def)
3384 && gimple_call_internal_p (def)
3385 && (gimple_call_internal_fn (def)
3386 == IFN_GOMP_SIMD_LANE))
3388 tree arg = gimple_call_arg (def, 0);
3389 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3390 arg = SSA_NAME_VAR (arg);
3391 if (arg == loop->simduid
3392 /* For now. */
3393 && tree_int_cst_equal
3394 (TYPE_SIZE_UNIT (reft),
3395 step))
3397 DR_OFFSET (newdr) = ssize_int (0);
3398 DR_STEP (newdr) = step;
3399 DR_ALIGNED_TO (newdr)
3400 = size_int (BIGGEST_ALIGNMENT);
3401 dr = newdr;
3402 simd_lane_access = true;
3408 if (!simd_lane_access && maybe_gather)
3410 dr = newdr;
3411 gather = true;
3414 if (!gather && !simd_lane_access)
3415 free_data_ref (newdr);
3418 if (!gather && !simd_lane_access)
3420 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3423 "not vectorized: data ref analysis "
3424 "failed ");
3425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3426 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3429 if (bb_vinfo)
3430 break;
3432 return false;
3436 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3438 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3440 "not vectorized: base addr of dr is a "
3441 "constant\n");
3443 if (bb_vinfo)
3444 break;
3446 if (gather || simd_lane_access)
3447 free_data_ref (dr);
3448 return false;
3451 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3453 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3456 "not vectorized: volatile type ");
3457 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3458 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3461 if (bb_vinfo)
3462 break;
3464 return false;
3467 if (stmt_can_throw_internal (stmt))
3469 if (dump_enabled_p ())
3471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3472 "not vectorized: statement can throw an "
3473 "exception ");
3474 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3475 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3478 if (bb_vinfo)
3479 break;
3481 if (gather || simd_lane_access)
3482 free_data_ref (dr);
3483 return false;
3486 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3487 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3489 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3492 "not vectorized: statement is bitfield "
3493 "access ");
3494 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3495 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3498 if (bb_vinfo)
3499 break;
3501 if (gather || simd_lane_access)
3502 free_data_ref (dr);
3503 return false;
3506 base = unshare_expr (DR_BASE_ADDRESS (dr));
3507 offset = unshare_expr (DR_OFFSET (dr));
3508 init = unshare_expr (DR_INIT (dr));
3510 if (is_gimple_call (stmt)
3511 && (!gimple_call_internal_p (stmt)
3512 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3513 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3515 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3518 "not vectorized: dr in a call ");
3519 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3520 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3523 if (bb_vinfo)
3524 break;
3526 if (gather || simd_lane_access)
3527 free_data_ref (dr);
3528 return false;
3531 /* Update DR field in stmt_vec_info struct. */
3533 /* If the dataref is in an inner-loop of the loop that is considered for
3534 for vectorization, we also want to analyze the access relative to
3535 the outer-loop (DR contains information only relative to the
3536 inner-most enclosing loop). We do that by building a reference to the
3537 first location accessed by the inner-loop, and analyze it relative to
3538 the outer-loop. */
3539 if (loop && nested_in_vect_loop_p (loop, stmt))
3541 tree outer_step, outer_base, outer_init;
3542 HOST_WIDE_INT pbitsize, pbitpos;
3543 tree poffset;
3544 machine_mode pmode;
3545 int punsignedp, pvolatilep;
3546 affine_iv base_iv, offset_iv;
3547 tree dinit;
3549 /* Build a reference to the first location accessed by the
3550 inner-loop: *(BASE+INIT). (The first location is actually
3551 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3552 tree inner_base = build_fold_indirect_ref
3553 (fold_build_pointer_plus (base, init));
3555 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "analyze in outer-loop: ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3560 dump_printf (MSG_NOTE, "\n");
3563 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3564 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3565 gcc_assert (outer_base != NULL_TREE);
3567 if (pbitpos % BITS_PER_UNIT != 0)
3569 if (dump_enabled_p ())
3570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3571 "failed: bit offset alignment.\n");
3572 return false;
3575 outer_base = build_fold_addr_expr (outer_base);
3576 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3577 &base_iv, false))
3579 if (dump_enabled_p ())
3580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3581 "failed: evolution of base is not affine.\n");
3582 return false;
3585 if (offset)
3587 if (poffset)
3588 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3589 poffset);
3590 else
3591 poffset = offset;
3594 if (!poffset)
3596 offset_iv.base = ssize_int (0);
3597 offset_iv.step = ssize_int (0);
3599 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3600 &offset_iv, false))
3602 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3604 "evolution of offset is not affine.\n");
3605 return false;
3608 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3609 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3610 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3611 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3612 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3614 outer_step = size_binop (PLUS_EXPR,
3615 fold_convert (ssizetype, base_iv.step),
3616 fold_convert (ssizetype, offset_iv.step));
3618 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3619 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3620 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3621 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3622 STMT_VINFO_DR_OFFSET (stmt_info) =
3623 fold_convert (ssizetype, offset_iv.base);
3624 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3625 size_int (highest_pow2_factor (offset_iv.base));
3627 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_NOTE, vect_location,
3630 "\touter base_address: ");
3631 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3632 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3633 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3634 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3635 STMT_VINFO_DR_OFFSET (stmt_info));
3636 dump_printf (MSG_NOTE,
3637 "\n\touter constant offset from base address: ");
3638 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3639 STMT_VINFO_DR_INIT (stmt_info));
3640 dump_printf (MSG_NOTE, "\n\touter step: ");
3641 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3642 STMT_VINFO_DR_STEP (stmt_info));
3643 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3644 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3645 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3646 dump_printf (MSG_NOTE, "\n");
3650 if (STMT_VINFO_DATA_REF (stmt_info))
3652 if (dump_enabled_p ())
3654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655 "not vectorized: more than one data ref "
3656 "in stmt: ");
3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3661 if (bb_vinfo)
3662 break;
3664 if (gather || simd_lane_access)
3665 free_data_ref (dr);
3666 return false;
3669 STMT_VINFO_DATA_REF (stmt_info) = dr;
3670 if (simd_lane_access)
3672 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3673 free_data_ref (datarefs[i]);
3674 datarefs[i] = dr;
3677 /* Set vectype for STMT. */
3678 scalar_type = TREE_TYPE (DR_REF (dr));
3679 STMT_VINFO_VECTYPE (stmt_info)
3680 = get_vectype_for_scalar_type (scalar_type);
3681 if (!STMT_VINFO_VECTYPE (stmt_info))
3683 if (dump_enabled_p ())
3685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3686 "not vectorized: no vectype for stmt: ");
3687 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3688 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3689 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3690 scalar_type);
3691 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3694 if (bb_vinfo)
3695 break;
3697 if (gather || simd_lane_access)
3699 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3700 if (gather)
3701 free_data_ref (dr);
3703 return false;
3705 else
3707 if (dump_enabled_p ())
3709 dump_printf_loc (MSG_NOTE, vect_location,
3710 "got vectype for stmt: ");
3711 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3712 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3713 STMT_VINFO_VECTYPE (stmt_info));
3714 dump_printf (MSG_NOTE, "\n");
3718 /* Adjust the minimal vectorization factor according to the
3719 vector type. */
3720 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3721 if (vf > *min_vf)
3722 *min_vf = vf;
3724 if (gather)
3726 tree off;
3728 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3729 if (gather
3730 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3731 gather = false;
3732 if (!gather)
3734 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3735 free_data_ref (dr);
3736 if (dump_enabled_p ())
3738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3739 "not vectorized: not suitable for gather "
3740 "load ");
3741 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3742 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3744 return false;
3747 datarefs[i] = dr;
3748 STMT_VINFO_GATHER_P (stmt_info) = true;
3750 else if (loop_vinfo
3751 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3753 if (nested_in_vect_loop_p (loop, stmt))
3755 if (dump_enabled_p ())
3757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3758 "not vectorized: not suitable for strided "
3759 "load ");
3760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3761 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3763 return false;
3765 STMT_VINFO_STRIDED_P (stmt_info) = true;
3769 /* If we stopped analysis at the first dataref we could not analyze
3770 when trying to vectorize a basic-block mark the rest of the datarefs
3771 as not vectorizable and truncate the vector of datarefs. That
3772 avoids spending useless time in analyzing their dependence. */
3773 if (i != datarefs.length ())
3775 gcc_assert (bb_vinfo != NULL);
3776 for (unsigned j = i; j < datarefs.length (); ++j)
3778 data_reference_p dr = datarefs[j];
3779 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3780 free_data_ref (dr);
3782 datarefs.truncate (i);
3785 return true;
3789 /* Function vect_get_new_vect_var.
3791 Returns a name for a new variable. The current naming scheme appends the
3792 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3793 the name of vectorizer generated variables, and appends that to NAME if
3794 provided. */
3796 tree
3797 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3799 const char *prefix;
3800 tree new_vect_var;
3802 switch (var_kind)
3804 case vect_simple_var:
3805 prefix = "vect";
3806 break;
3807 case vect_scalar_var:
3808 prefix = "stmp";
3809 break;
3810 case vect_pointer_var:
3811 prefix = "vectp";
3812 break;
3813 default:
3814 gcc_unreachable ();
3817 if (name)
3819 char* tmp = concat (prefix, "_", name, NULL);
3820 new_vect_var = create_tmp_reg (type, tmp);
3821 free (tmp);
3823 else
3824 new_vect_var = create_tmp_reg (type, prefix);
3826 return new_vect_var;
3829 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3831 static void
3832 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3833 stmt_vec_info stmt_info)
3835 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3836 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3837 int misalign = DR_MISALIGNMENT (dr);
3838 if (misalign == -1)
3839 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3840 else
3841 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3844 /* Function vect_create_addr_base_for_vector_ref.
3846 Create an expression that computes the address of the first memory location
3847 that will be accessed for a data reference.
3849 Input:
3850 STMT: The statement containing the data reference.
3851 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3852 OFFSET: Optional. If supplied, it is be added to the initial address.
3853 LOOP: Specify relative to which loop-nest should the address be computed.
3854 For example, when the dataref is in an inner-loop nested in an
3855 outer-loop that is now being vectorized, LOOP can be either the
3856 outer-loop, or the inner-loop. The first memory location accessed
3857 by the following dataref ('in' points to short):
3859 for (i=0; i<N; i++)
3860 for (j=0; j<M; j++)
3861 s += in[i+j]
3863 is as follows:
3864 if LOOP=i_loop: &in (relative to i_loop)
3865 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3866 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3867 initial address. Unlike OFFSET, which is number of elements to
3868 be added, BYTE_OFFSET is measured in bytes.
3870 Output:
3871 1. Return an SSA_NAME whose value is the address of the memory location of
3872 the first vector of the data reference.
3873 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3874 these statement(s) which define the returned SSA_NAME.
3876 FORNOW: We are only handling array accesses with step 1. */
3878 tree
3879 vect_create_addr_base_for_vector_ref (gimple stmt,
3880 gimple_seq *new_stmt_list,
3881 tree offset,
3882 struct loop *loop,
3883 tree byte_offset)
3885 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3886 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3887 tree data_ref_base;
3888 const char *base_name;
3889 tree addr_base;
3890 tree dest;
3891 gimple_seq seq = NULL;
3892 tree base_offset;
3893 tree init;
3894 tree vect_ptr_type;
3895 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3896 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3898 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3900 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3902 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3904 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3905 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3906 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3908 else
3910 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3911 base_offset = unshare_expr (DR_OFFSET (dr));
3912 init = unshare_expr (DR_INIT (dr));
3915 if (loop_vinfo)
3916 base_name = get_name (data_ref_base);
3917 else
3919 base_offset = ssize_int (0);
3920 init = ssize_int (0);
3921 base_name = get_name (DR_REF (dr));
3924 /* Create base_offset */
3925 base_offset = size_binop (PLUS_EXPR,
3926 fold_convert (sizetype, base_offset),
3927 fold_convert (sizetype, init));
3929 if (offset)
3931 offset = fold_build2 (MULT_EXPR, sizetype,
3932 fold_convert (sizetype, offset), step);
3933 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3934 base_offset, offset);
3936 if (byte_offset)
3938 byte_offset = fold_convert (sizetype, byte_offset);
3939 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3940 base_offset, byte_offset);
3943 /* base + base_offset */
3944 if (loop_vinfo)
3945 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3946 else
3948 addr_base = build1 (ADDR_EXPR,
3949 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3950 unshare_expr (DR_REF (dr)));
3953 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3954 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3955 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3956 gimple_seq_add_seq (new_stmt_list, seq);
3958 if (DR_PTR_INFO (dr)
3959 && TREE_CODE (addr_base) == SSA_NAME
3960 && !SSA_NAME_PTR_INFO (addr_base))
3962 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3963 if (offset || byte_offset)
3964 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3967 if (dump_enabled_p ())
3969 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3970 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3971 dump_printf (MSG_NOTE, "\n");
3974 return addr_base;
3978 /* Function vect_create_data_ref_ptr.
3980 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3981 location accessed in the loop by STMT, along with the def-use update
3982 chain to appropriately advance the pointer through the loop iterations.
3983 Also set aliasing information for the pointer. This pointer is used by
3984 the callers to this function to create a memory reference expression for
3985 vector load/store access.
3987 Input:
3988 1. STMT: a stmt that references memory. Expected to be of the form
3989 GIMPLE_ASSIGN <name, data-ref> or
3990 GIMPLE_ASSIGN <data-ref, name>.
3991 2. AGGR_TYPE: the type of the reference, which should be either a vector
3992 or an array.
3993 3. AT_LOOP: the loop where the vector memref is to be created.
3994 4. OFFSET (optional): an offset to be added to the initial address accessed
3995 by the data-ref in STMT.
3996 5. BSI: location where the new stmts are to be placed if there is no loop
3997 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3998 pointing to the initial address.
3999 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4000 to the initial address accessed by the data-ref in STMT. This is
4001 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4002 in bytes.
4004 Output:
4005 1. Declare a new ptr to vector_type, and have it point to the base of the
4006 data reference (initial addressed accessed by the data reference).
4007 For example, for vector of type V8HI, the following code is generated:
4009 v8hi *ap;
4010 ap = (v8hi *)initial_address;
4012 if OFFSET is not supplied:
4013 initial_address = &a[init];
4014 if OFFSET is supplied:
4015 initial_address = &a[init + OFFSET];
4016 if BYTE_OFFSET is supplied:
4017 initial_address = &a[init] + BYTE_OFFSET;
4019 Return the initial_address in INITIAL_ADDRESS.
4021 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4022 update the pointer in each iteration of the loop.
4024 Return the increment stmt that updates the pointer in PTR_INCR.
4026 3. Set INV_P to true if the access pattern of the data reference in the
4027 vectorized loop is invariant. Set it to false otherwise.
4029 4. Return the pointer. */
4031 tree
4032 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4033 tree offset, tree *initial_address,
4034 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4035 bool only_init, bool *inv_p, tree byte_offset)
4037 const char *base_name;
4038 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4039 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4040 struct loop *loop = NULL;
4041 bool nested_in_vect_loop = false;
4042 struct loop *containing_loop = NULL;
4043 tree aggr_ptr_type;
4044 tree aggr_ptr;
4045 tree new_temp;
4046 gimple_seq new_stmt_list = NULL;
4047 edge pe = NULL;
4048 basic_block new_bb;
4049 tree aggr_ptr_init;
4050 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4051 tree aptr;
4052 gimple_stmt_iterator incr_gsi;
4053 bool insert_after;
4054 tree indx_before_incr, indx_after_incr;
4055 gimple incr;
4056 tree step;
4057 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4059 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4060 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4062 if (loop_vinfo)
4064 loop = LOOP_VINFO_LOOP (loop_vinfo);
4065 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4066 containing_loop = (gimple_bb (stmt))->loop_father;
4067 pe = loop_preheader_edge (loop);
4069 else
4071 gcc_assert (bb_vinfo);
4072 only_init = true;
4073 *ptr_incr = NULL;
4076 /* Check the step (evolution) of the load in LOOP, and record
4077 whether it's invariant. */
4078 if (nested_in_vect_loop)
4079 step = STMT_VINFO_DR_STEP (stmt_info);
4080 else
4081 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4083 if (integer_zerop (step))
4084 *inv_p = true;
4085 else
4086 *inv_p = false;
4088 /* Create an expression for the first address accessed by this load
4089 in LOOP. */
4090 base_name = get_name (DR_BASE_ADDRESS (dr));
4092 if (dump_enabled_p ())
4094 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4095 dump_printf_loc (MSG_NOTE, vect_location,
4096 "create %s-pointer variable to type: ",
4097 get_tree_code_name (TREE_CODE (aggr_type)));
4098 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4099 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4100 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4101 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4102 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4103 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4104 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4105 else
4106 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4107 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4108 dump_printf (MSG_NOTE, "\n");
4111 /* (1) Create the new aggregate-pointer variable.
4112 Vector and array types inherit the alias set of their component
4113 type by default so we need to use a ref-all pointer if the data
4114 reference does not conflict with the created aggregated data
4115 reference because it is not addressable. */
4116 bool need_ref_all = false;
4117 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4118 get_alias_set (DR_REF (dr))))
4119 need_ref_all = true;
4120 /* Likewise for any of the data references in the stmt group. */
4121 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4123 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4126 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4127 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4128 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4129 get_alias_set (DR_REF (sdr))))
4131 need_ref_all = true;
4132 break;
4134 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4136 while (orig_stmt);
4138 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4139 need_ref_all);
4140 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4143 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4144 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4145 def-use update cycles for the pointer: one relative to the outer-loop
4146 (LOOP), which is what steps (3) and (4) below do. The other is relative
4147 to the inner-loop (which is the inner-most loop containing the dataref),
4148 and this is done be step (5) below.
4150 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4151 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4152 redundant. Steps (3),(4) create the following:
4154 vp0 = &base_addr;
4155 LOOP: vp1 = phi(vp0,vp2)
4158 vp2 = vp1 + step
4159 goto LOOP
4161 If there is an inner-loop nested in loop, then step (5) will also be
4162 applied, and an additional update in the inner-loop will be created:
4164 vp0 = &base_addr;
4165 LOOP: vp1 = phi(vp0,vp2)
4167 inner: vp3 = phi(vp1,vp4)
4168 vp4 = vp3 + inner_step
4169 if () goto inner
4171 vp2 = vp1 + step
4172 if () goto LOOP */
4174 /* (2) Calculate the initial address of the aggregate-pointer, and set
4175 the aggregate-pointer to point to it before the loop. */
4177 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4179 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4180 offset, loop, byte_offset);
4181 if (new_stmt_list)
4183 if (pe)
4185 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4186 gcc_assert (!new_bb);
4188 else
4189 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4192 *initial_address = new_temp;
4193 aggr_ptr_init = new_temp;
4195 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4196 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4197 inner-loop nested in LOOP (during outer-loop vectorization). */
4199 /* No update in loop is required. */
4200 if (only_init && (!loop_vinfo || at_loop == loop))
4201 aptr = aggr_ptr_init;
4202 else
4204 /* The step of the aggregate pointer is the type size. */
4205 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4206 /* One exception to the above is when the scalar step of the load in
4207 LOOP is zero. In this case the step here is also zero. */
4208 if (*inv_p)
4209 iv_step = size_zero_node;
4210 else if (tree_int_cst_sgn (step) == -1)
4211 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4213 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4215 create_iv (aggr_ptr_init,
4216 fold_convert (aggr_ptr_type, iv_step),
4217 aggr_ptr, loop, &incr_gsi, insert_after,
4218 &indx_before_incr, &indx_after_incr);
4219 incr = gsi_stmt (incr_gsi);
4220 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4222 /* Copy the points-to information if it exists. */
4223 if (DR_PTR_INFO (dr))
4225 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4226 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4228 if (ptr_incr)
4229 *ptr_incr = incr;
4231 aptr = indx_before_incr;
4234 if (!nested_in_vect_loop || only_init)
4235 return aptr;
4238 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4239 nested in LOOP, if exists. */
4241 gcc_assert (nested_in_vect_loop);
4242 if (!only_init)
4244 standard_iv_increment_position (containing_loop, &incr_gsi,
4245 &insert_after);
4246 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4247 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4248 &indx_after_incr);
4249 incr = gsi_stmt (incr_gsi);
4250 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4252 /* Copy the points-to information if it exists. */
4253 if (DR_PTR_INFO (dr))
4255 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4256 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4258 if (ptr_incr)
4259 *ptr_incr = incr;
4261 return indx_before_incr;
4263 else
4264 gcc_unreachable ();
4268 /* Function bump_vector_ptr
4270 Increment a pointer (to a vector type) by vector-size. If requested,
4271 i.e. if PTR-INCR is given, then also connect the new increment stmt
4272 to the existing def-use update-chain of the pointer, by modifying
4273 the PTR_INCR as illustrated below:
4275 The pointer def-use update-chain before this function:
4276 DATAREF_PTR = phi (p_0, p_2)
4277 ....
4278 PTR_INCR: p_2 = DATAREF_PTR + step
4280 The pointer def-use update-chain after this function:
4281 DATAREF_PTR = phi (p_0, p_2)
4282 ....
4283 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4284 ....
4285 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4287 Input:
4288 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4289 in the loop.
4290 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4291 the loop. The increment amount across iterations is expected
4292 to be vector_size.
4293 BSI - location where the new update stmt is to be placed.
4294 STMT - the original scalar memory-access stmt that is being vectorized.
4295 BUMP - optional. The offset by which to bump the pointer. If not given,
4296 the offset is assumed to be vector_size.
4298 Output: Return NEW_DATAREF_PTR as illustrated above.
4302 tree
4303 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4304 gimple stmt, tree bump)
4306 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4307 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4308 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4309 tree update = TYPE_SIZE_UNIT (vectype);
4310 gassign *incr_stmt;
4311 ssa_op_iter iter;
4312 use_operand_p use_p;
4313 tree new_dataref_ptr;
4315 if (bump)
4316 update = bump;
4318 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4319 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4320 else
4321 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4322 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4323 dataref_ptr, update);
4324 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4326 /* Copy the points-to information if it exists. */
4327 if (DR_PTR_INFO (dr))
4329 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4330 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4333 if (!ptr_incr)
4334 return new_dataref_ptr;
4336 /* Update the vector-pointer's cross-iteration increment. */
4337 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4339 tree use = USE_FROM_PTR (use_p);
4341 if (use == dataref_ptr)
4342 SET_USE (use_p, new_dataref_ptr);
4343 else
4344 gcc_assert (tree_int_cst_compare (use, update) == 0);
4347 return new_dataref_ptr;
4351 /* Function vect_create_destination_var.
4353 Create a new temporary of type VECTYPE. */
4355 tree
4356 vect_create_destination_var (tree scalar_dest, tree vectype)
4358 tree vec_dest;
4359 const char *name;
4360 char *new_name;
4361 tree type;
4362 enum vect_var_kind kind;
4364 kind = vectype ? vect_simple_var : vect_scalar_var;
4365 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4367 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4369 name = get_name (scalar_dest);
4370 if (name)
4371 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4372 else
4373 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4374 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4375 free (new_name);
4377 return vec_dest;
4380 /* Function vect_grouped_store_supported.
4382 Returns TRUE if interleave high and interleave low permutations
4383 are supported, and FALSE otherwise. */
4385 bool
4386 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4388 machine_mode mode = TYPE_MODE (vectype);
4390 /* vect_permute_store_chain requires the group size to be equal to 3 or
4391 be a power of two. */
4392 if (count != 3 && exact_log2 (count) == -1)
4394 if (dump_enabled_p ())
4395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4396 "the size of the group of accesses"
4397 " is not a power of 2 or not eqaul to 3\n");
4398 return false;
4401 /* Check that the permutation is supported. */
4402 if (VECTOR_MODE_P (mode))
4404 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4405 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4407 if (count == 3)
4409 unsigned int j0 = 0, j1 = 0, j2 = 0;
4410 unsigned int i, j;
4412 for (j = 0; j < 3; j++)
4414 int nelt0 = ((3 - j) * nelt) % 3;
4415 int nelt1 = ((3 - j) * nelt + 1) % 3;
4416 int nelt2 = ((3 - j) * nelt + 2) % 3;
4417 for (i = 0; i < nelt; i++)
4419 if (3 * i + nelt0 < nelt)
4420 sel[3 * i + nelt0] = j0++;
4421 if (3 * i + nelt1 < nelt)
4422 sel[3 * i + nelt1] = nelt + j1++;
4423 if (3 * i + nelt2 < nelt)
4424 sel[3 * i + nelt2] = 0;
4426 if (!can_vec_perm_p (mode, false, sel))
4428 if (dump_enabled_p ())
4429 dump_printf (MSG_MISSED_OPTIMIZATION,
4430 "permutaion op not supported by target.\n");
4431 return false;
4434 for (i = 0; i < nelt; i++)
4436 if (3 * i + nelt0 < nelt)
4437 sel[3 * i + nelt0] = 3 * i + nelt0;
4438 if (3 * i + nelt1 < nelt)
4439 sel[3 * i + nelt1] = 3 * i + nelt1;
4440 if (3 * i + nelt2 < nelt)
4441 sel[3 * i + nelt2] = nelt + j2++;
4443 if (!can_vec_perm_p (mode, false, sel))
4445 if (dump_enabled_p ())
4446 dump_printf (MSG_MISSED_OPTIMIZATION,
4447 "permutaion op not supported by target.\n");
4448 return false;
4451 return true;
4453 else
4455 /* If length is not equal to 3 then only power of 2 is supported. */
4456 gcc_assert (exact_log2 (count) != -1);
4458 for (i = 0; i < nelt / 2; i++)
4460 sel[i * 2] = i;
4461 sel[i * 2 + 1] = i + nelt;
4463 if (can_vec_perm_p (mode, false, sel))
4465 for (i = 0; i < nelt; i++)
4466 sel[i] += nelt / 2;
4467 if (can_vec_perm_p (mode, false, sel))
4468 return true;
4473 if (dump_enabled_p ())
4474 dump_printf (MSG_MISSED_OPTIMIZATION,
4475 "permutaion op not supported by target.\n");
4476 return false;
4480 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4481 type VECTYPE. */
4483 bool
4484 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4486 return vect_lanes_optab_supported_p ("vec_store_lanes",
4487 vec_store_lanes_optab,
4488 vectype, count);
4492 /* Function vect_permute_store_chain.
4494 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4495 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4496 the data correctly for the stores. Return the final references for stores
4497 in RESULT_CHAIN.
4499 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4500 The input is 4 vectors each containing 8 elements. We assign a number to
4501 each element, the input sequence is:
4503 1st vec: 0 1 2 3 4 5 6 7
4504 2nd vec: 8 9 10 11 12 13 14 15
4505 3rd vec: 16 17 18 19 20 21 22 23
4506 4th vec: 24 25 26 27 28 29 30 31
4508 The output sequence should be:
4510 1st vec: 0 8 16 24 1 9 17 25
4511 2nd vec: 2 10 18 26 3 11 19 27
4512 3rd vec: 4 12 20 28 5 13 21 30
4513 4th vec: 6 14 22 30 7 15 23 31
4515 i.e., we interleave the contents of the four vectors in their order.
4517 We use interleave_high/low instructions to create such output. The input of
4518 each interleave_high/low operation is two vectors:
4519 1st vec 2nd vec
4520 0 1 2 3 4 5 6 7
4521 the even elements of the result vector are obtained left-to-right from the
4522 high/low elements of the first vector. The odd elements of the result are
4523 obtained left-to-right from the high/low elements of the second vector.
4524 The output of interleave_high will be: 0 4 1 5
4525 and of interleave_low: 2 6 3 7
4528 The permutation is done in log LENGTH stages. In each stage interleave_high
4529 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4530 where the first argument is taken from the first half of DR_CHAIN and the
4531 second argument from it's second half.
4532 In our example,
4534 I1: interleave_high (1st vec, 3rd vec)
4535 I2: interleave_low (1st vec, 3rd vec)
4536 I3: interleave_high (2nd vec, 4th vec)
4537 I4: interleave_low (2nd vec, 4th vec)
4539 The output for the first stage is:
4541 I1: 0 16 1 17 2 18 3 19
4542 I2: 4 20 5 21 6 22 7 23
4543 I3: 8 24 9 25 10 26 11 27
4544 I4: 12 28 13 29 14 30 15 31
4546 The output of the second stage, i.e. the final result is:
4548 I1: 0 8 16 24 1 9 17 25
4549 I2: 2 10 18 26 3 11 19 27
4550 I3: 4 12 20 28 5 13 21 30
4551 I4: 6 14 22 30 7 15 23 31. */
4553 void
4554 vect_permute_store_chain (vec<tree> dr_chain,
4555 unsigned int length,
4556 gimple stmt,
4557 gimple_stmt_iterator *gsi,
4558 vec<tree> *result_chain)
4560 tree vect1, vect2, high, low;
4561 gimple perm_stmt;
4562 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4563 tree perm_mask_low, perm_mask_high;
4564 tree data_ref;
4565 tree perm3_mask_low, perm3_mask_high;
4566 unsigned int i, n, log_length = exact_log2 (length);
4567 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4568 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4570 result_chain->quick_grow (length);
4571 memcpy (result_chain->address (), dr_chain.address (),
4572 length * sizeof (tree));
4574 if (length == 3)
4576 unsigned int j0 = 0, j1 = 0, j2 = 0;
4578 for (j = 0; j < 3; j++)
4580 int nelt0 = ((3 - j) * nelt) % 3;
4581 int nelt1 = ((3 - j) * nelt + 1) % 3;
4582 int nelt2 = ((3 - j) * nelt + 2) % 3;
4584 for (i = 0; i < nelt; i++)
4586 if (3 * i + nelt0 < nelt)
4587 sel[3 * i + nelt0] = j0++;
4588 if (3 * i + nelt1 < nelt)
4589 sel[3 * i + nelt1] = nelt + j1++;
4590 if (3 * i + nelt2 < nelt)
4591 sel[3 * i + nelt2] = 0;
4593 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4595 for (i = 0; i < nelt; i++)
4597 if (3 * i + nelt0 < nelt)
4598 sel[3 * i + nelt0] = 3 * i + nelt0;
4599 if (3 * i + nelt1 < nelt)
4600 sel[3 * i + nelt1] = 3 * i + nelt1;
4601 if (3 * i + nelt2 < nelt)
4602 sel[3 * i + nelt2] = nelt + j2++;
4604 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4606 vect1 = dr_chain[0];
4607 vect2 = dr_chain[1];
4609 /* Create interleaving stmt:
4610 low = VEC_PERM_EXPR <vect1, vect2,
4611 {j, nelt, *, j + 1, nelt + j + 1, *,
4612 j + 2, nelt + j + 2, *, ...}> */
4613 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4614 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4615 vect2, perm3_mask_low);
4616 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4618 vect1 = data_ref;
4619 vect2 = dr_chain[2];
4620 /* Create interleaving stmt:
4621 low = VEC_PERM_EXPR <vect1, vect2,
4622 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4623 6, 7, nelt + j + 2, ...}> */
4624 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4625 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4626 vect2, perm3_mask_high);
4627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4628 (*result_chain)[j] = data_ref;
4631 else
4633 /* If length is not equal to 3 then only power of 2 is supported. */
4634 gcc_assert (exact_log2 (length) != -1);
4636 for (i = 0, n = nelt / 2; i < n; i++)
4638 sel[i * 2] = i;
4639 sel[i * 2 + 1] = i + nelt;
4641 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4643 for (i = 0; i < nelt; i++)
4644 sel[i] += nelt / 2;
4645 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4647 for (i = 0, n = log_length; i < n; i++)
4649 for (j = 0; j < length/2; j++)
4651 vect1 = dr_chain[j];
4652 vect2 = dr_chain[j+length/2];
4654 /* Create interleaving stmt:
4655 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4656 ...}> */
4657 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4658 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4659 vect2, perm_mask_high);
4660 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4661 (*result_chain)[2*j] = high;
4663 /* Create interleaving stmt:
4664 low = VEC_PERM_EXPR <vect1, vect2,
4665 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4666 ...}> */
4667 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4668 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4669 vect2, perm_mask_low);
4670 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4671 (*result_chain)[2*j+1] = low;
4673 memcpy (dr_chain.address (), result_chain->address (),
4674 length * sizeof (tree));
4679 /* Function vect_setup_realignment
4681 This function is called when vectorizing an unaligned load using
4682 the dr_explicit_realign[_optimized] scheme.
4683 This function generates the following code at the loop prolog:
4685 p = initial_addr;
4686 x msq_init = *(floor(p)); # prolog load
4687 realignment_token = call target_builtin;
4688 loop:
4689 x msq = phi (msq_init, ---)
4691 The stmts marked with x are generated only for the case of
4692 dr_explicit_realign_optimized.
4694 The code above sets up a new (vector) pointer, pointing to the first
4695 location accessed by STMT, and a "floor-aligned" load using that pointer.
4696 It also generates code to compute the "realignment-token" (if the relevant
4697 target hook was defined), and creates a phi-node at the loop-header bb
4698 whose arguments are the result of the prolog-load (created by this
4699 function) and the result of a load that takes place in the loop (to be
4700 created by the caller to this function).
4702 For the case of dr_explicit_realign_optimized:
4703 The caller to this function uses the phi-result (msq) to create the
4704 realignment code inside the loop, and sets up the missing phi argument,
4705 as follows:
4706 loop:
4707 msq = phi (msq_init, lsq)
4708 lsq = *(floor(p')); # load in loop
4709 result = realign_load (msq, lsq, realignment_token);
4711 For the case of dr_explicit_realign:
4712 loop:
4713 msq = *(floor(p)); # load in loop
4714 p' = p + (VS-1);
4715 lsq = *(floor(p')); # load in loop
4716 result = realign_load (msq, lsq, realignment_token);
4718 Input:
4719 STMT - (scalar) load stmt to be vectorized. This load accesses
4720 a memory location that may be unaligned.
4721 BSI - place where new code is to be inserted.
4722 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4723 is used.
4725 Output:
4726 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4727 target hook, if defined.
4728 Return value - the result of the loop-header phi node. */
4730 tree
4731 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4732 tree *realignment_token,
4733 enum dr_alignment_support alignment_support_scheme,
4734 tree init_addr,
4735 struct loop **at_loop)
4737 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4738 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4739 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4740 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4741 struct loop *loop = NULL;
4742 edge pe = NULL;
4743 tree scalar_dest = gimple_assign_lhs (stmt);
4744 tree vec_dest;
4745 gimple inc;
4746 tree ptr;
4747 tree data_ref;
4748 basic_block new_bb;
4749 tree msq_init = NULL_TREE;
4750 tree new_temp;
4751 gphi *phi_stmt;
4752 tree msq = NULL_TREE;
4753 gimple_seq stmts = NULL;
4754 bool inv_p;
4755 bool compute_in_loop = false;
4756 bool nested_in_vect_loop = false;
4757 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4758 struct loop *loop_for_initial_load = NULL;
4760 if (loop_vinfo)
4762 loop = LOOP_VINFO_LOOP (loop_vinfo);
4763 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4766 gcc_assert (alignment_support_scheme == dr_explicit_realign
4767 || alignment_support_scheme == dr_explicit_realign_optimized);
4769 /* We need to generate three things:
4770 1. the misalignment computation
4771 2. the extra vector load (for the optimized realignment scheme).
4772 3. the phi node for the two vectors from which the realignment is
4773 done (for the optimized realignment scheme). */
4775 /* 1. Determine where to generate the misalignment computation.
4777 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4778 calculation will be generated by this function, outside the loop (in the
4779 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4780 caller, inside the loop.
4782 Background: If the misalignment remains fixed throughout the iterations of
4783 the loop, then both realignment schemes are applicable, and also the
4784 misalignment computation can be done outside LOOP. This is because we are
4785 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4786 are a multiple of VS (the Vector Size), and therefore the misalignment in
4787 different vectorized LOOP iterations is always the same.
4788 The problem arises only if the memory access is in an inner-loop nested
4789 inside LOOP, which is now being vectorized using outer-loop vectorization.
4790 This is the only case when the misalignment of the memory access may not
4791 remain fixed throughout the iterations of the inner-loop (as explained in
4792 detail in vect_supportable_dr_alignment). In this case, not only is the
4793 optimized realignment scheme not applicable, but also the misalignment
4794 computation (and generation of the realignment token that is passed to
4795 REALIGN_LOAD) have to be done inside the loop.
4797 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4798 or not, which in turn determines if the misalignment is computed inside
4799 the inner-loop, or outside LOOP. */
4801 if (init_addr != NULL_TREE || !loop_vinfo)
4803 compute_in_loop = true;
4804 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4808 /* 2. Determine where to generate the extra vector load.
4810 For the optimized realignment scheme, instead of generating two vector
4811 loads in each iteration, we generate a single extra vector load in the
4812 preheader of the loop, and in each iteration reuse the result of the
4813 vector load from the previous iteration. In case the memory access is in
4814 an inner-loop nested inside LOOP, which is now being vectorized using
4815 outer-loop vectorization, we need to determine whether this initial vector
4816 load should be generated at the preheader of the inner-loop, or can be
4817 generated at the preheader of LOOP. If the memory access has no evolution
4818 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4819 to be generated inside LOOP (in the preheader of the inner-loop). */
4821 if (nested_in_vect_loop)
4823 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4824 bool invariant_in_outerloop =
4825 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4826 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4828 else
4829 loop_for_initial_load = loop;
4830 if (at_loop)
4831 *at_loop = loop_for_initial_load;
4833 if (loop_for_initial_load)
4834 pe = loop_preheader_edge (loop_for_initial_load);
4836 /* 3. For the case of the optimized realignment, create the first vector
4837 load at the loop preheader. */
4839 if (alignment_support_scheme == dr_explicit_realign_optimized)
4841 /* Create msq_init = *(floor(p1)) in the loop preheader */
4842 gassign *new_stmt;
4844 gcc_assert (!compute_in_loop);
4845 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4846 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4847 NULL_TREE, &init_addr, NULL, &inc,
4848 true, &inv_p);
4849 if (TREE_CODE (ptr) == SSA_NAME)
4850 new_temp = copy_ssa_name (ptr);
4851 else
4852 new_temp = make_ssa_name (TREE_TYPE (ptr));
4853 new_stmt = gimple_build_assign
4854 (new_temp, BIT_AND_EXPR, ptr,
4855 build_int_cst (TREE_TYPE (ptr),
4856 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4857 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4858 gcc_assert (!new_bb);
4859 data_ref
4860 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4861 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4862 new_stmt = gimple_build_assign (vec_dest, data_ref);
4863 new_temp = make_ssa_name (vec_dest, new_stmt);
4864 gimple_assign_set_lhs (new_stmt, new_temp);
4865 if (pe)
4867 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4868 gcc_assert (!new_bb);
4870 else
4871 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4873 msq_init = gimple_assign_lhs (new_stmt);
4876 /* 4. Create realignment token using a target builtin, if available.
4877 It is done either inside the containing loop, or before LOOP (as
4878 determined above). */
4880 if (targetm.vectorize.builtin_mask_for_load)
4882 gcall *new_stmt;
4883 tree builtin_decl;
4885 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4886 if (!init_addr)
4888 /* Generate the INIT_ADDR computation outside LOOP. */
4889 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4890 NULL_TREE, loop);
4891 if (loop)
4893 pe = loop_preheader_edge (loop);
4894 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4895 gcc_assert (!new_bb);
4897 else
4898 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4901 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4902 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4903 vec_dest =
4904 vect_create_destination_var (scalar_dest,
4905 gimple_call_return_type (new_stmt));
4906 new_temp = make_ssa_name (vec_dest, new_stmt);
4907 gimple_call_set_lhs (new_stmt, new_temp);
4909 if (compute_in_loop)
4910 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4911 else
4913 /* Generate the misalignment computation outside LOOP. */
4914 pe = loop_preheader_edge (loop);
4915 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4916 gcc_assert (!new_bb);
4919 *realignment_token = gimple_call_lhs (new_stmt);
4921 /* The result of the CALL_EXPR to this builtin is determined from
4922 the value of the parameter and no global variables are touched
4923 which makes the builtin a "const" function. Requiring the
4924 builtin to have the "const" attribute makes it unnecessary
4925 to call mark_call_clobbered. */
4926 gcc_assert (TREE_READONLY (builtin_decl));
4929 if (alignment_support_scheme == dr_explicit_realign)
4930 return msq;
4932 gcc_assert (!compute_in_loop);
4933 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4936 /* 5. Create msq = phi <msq_init, lsq> in loop */
4938 pe = loop_preheader_edge (containing_loop);
4939 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4940 msq = make_ssa_name (vec_dest);
4941 phi_stmt = create_phi_node (msq, containing_loop->header);
4942 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4944 return msq;
4948 /* Function vect_grouped_load_supported.
4950 Returns TRUE if even and odd permutations are supported,
4951 and FALSE otherwise. */
4953 bool
4954 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4956 machine_mode mode = TYPE_MODE (vectype);
4958 /* vect_permute_load_chain requires the group size to be equal to 3 or
4959 be a power of two. */
4960 if (count != 3 && exact_log2 (count) == -1)
4962 if (dump_enabled_p ())
4963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4964 "the size of the group of accesses"
4965 " is not a power of 2 or not equal to 3\n");
4966 return false;
4969 /* Check that the permutation is supported. */
4970 if (VECTOR_MODE_P (mode))
4972 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4973 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4975 if (count == 3)
4977 unsigned int k;
4978 for (k = 0; k < 3; k++)
4980 for (i = 0; i < nelt; i++)
4981 if (3 * i + k < 2 * nelt)
4982 sel[i] = 3 * i + k;
4983 else
4984 sel[i] = 0;
4985 if (!can_vec_perm_p (mode, false, sel))
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4989 "shuffle of 3 loads is not supported by"
4990 " target\n");
4991 return false;
4993 for (i = 0, j = 0; i < nelt; i++)
4994 if (3 * i + k < 2 * nelt)
4995 sel[i] = i;
4996 else
4997 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4998 if (!can_vec_perm_p (mode, false, sel))
5000 if (dump_enabled_p ())
5001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5002 "shuffle of 3 loads is not supported by"
5003 " target\n");
5004 return false;
5007 return true;
5009 else
5011 /* If length is not equal to 3 then only power of 2 is supported. */
5012 gcc_assert (exact_log2 (count) != -1);
5013 for (i = 0; i < nelt; i++)
5014 sel[i] = i * 2;
5015 if (can_vec_perm_p (mode, false, sel))
5017 for (i = 0; i < nelt; i++)
5018 sel[i] = i * 2 + 1;
5019 if (can_vec_perm_p (mode, false, sel))
5020 return true;
5025 if (dump_enabled_p ())
5026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5027 "extract even/odd not supported by target\n");
5028 return false;
5031 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5032 type VECTYPE. */
5034 bool
5035 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5037 return vect_lanes_optab_supported_p ("vec_load_lanes",
5038 vec_load_lanes_optab,
5039 vectype, count);
5042 /* Function vect_permute_load_chain.
5044 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5045 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5046 the input data correctly. Return the final references for loads in
5047 RESULT_CHAIN.
5049 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5050 The input is 4 vectors each containing 8 elements. We assign a number to each
5051 element, the input sequence is:
5053 1st vec: 0 1 2 3 4 5 6 7
5054 2nd vec: 8 9 10 11 12 13 14 15
5055 3rd vec: 16 17 18 19 20 21 22 23
5056 4th vec: 24 25 26 27 28 29 30 31
5058 The output sequence should be:
5060 1st vec: 0 4 8 12 16 20 24 28
5061 2nd vec: 1 5 9 13 17 21 25 29
5062 3rd vec: 2 6 10 14 18 22 26 30
5063 4th vec: 3 7 11 15 19 23 27 31
5065 i.e., the first output vector should contain the first elements of each
5066 interleaving group, etc.
5068 We use extract_even/odd instructions to create such output. The input of
5069 each extract_even/odd operation is two vectors
5070 1st vec 2nd vec
5071 0 1 2 3 4 5 6 7
5073 and the output is the vector of extracted even/odd elements. The output of
5074 extract_even will be: 0 2 4 6
5075 and of extract_odd: 1 3 5 7
5078 The permutation is done in log LENGTH stages. In each stage extract_even
5079 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5080 their order. In our example,
5082 E1: extract_even (1st vec, 2nd vec)
5083 E2: extract_odd (1st vec, 2nd vec)
5084 E3: extract_even (3rd vec, 4th vec)
5085 E4: extract_odd (3rd vec, 4th vec)
5087 The output for the first stage will be:
5089 E1: 0 2 4 6 8 10 12 14
5090 E2: 1 3 5 7 9 11 13 15
5091 E3: 16 18 20 22 24 26 28 30
5092 E4: 17 19 21 23 25 27 29 31
5094 In order to proceed and create the correct sequence for the next stage (or
5095 for the correct output, if the second stage is the last one, as in our
5096 example), we first put the output of extract_even operation and then the
5097 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5098 The input for the second stage is:
5100 1st vec (E1): 0 2 4 6 8 10 12 14
5101 2nd vec (E3): 16 18 20 22 24 26 28 30
5102 3rd vec (E2): 1 3 5 7 9 11 13 15
5103 4th vec (E4): 17 19 21 23 25 27 29 31
5105 The output of the second stage:
5107 E1: 0 4 8 12 16 20 24 28
5108 E2: 2 6 10 14 18 22 26 30
5109 E3: 1 5 9 13 17 21 25 29
5110 E4: 3 7 11 15 19 23 27 31
5112 And RESULT_CHAIN after reordering:
5114 1st vec (E1): 0 4 8 12 16 20 24 28
5115 2nd vec (E3): 1 5 9 13 17 21 25 29
5116 3rd vec (E2): 2 6 10 14 18 22 26 30
5117 4th vec (E4): 3 7 11 15 19 23 27 31. */
5119 static void
5120 vect_permute_load_chain (vec<tree> dr_chain,
5121 unsigned int length,
5122 gimple stmt,
5123 gimple_stmt_iterator *gsi,
5124 vec<tree> *result_chain)
5126 tree data_ref, first_vect, second_vect;
5127 tree perm_mask_even, perm_mask_odd;
5128 tree perm3_mask_low, perm3_mask_high;
5129 gimple perm_stmt;
5130 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5131 unsigned int i, j, log_length = exact_log2 (length);
5132 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5133 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5135 result_chain->quick_grow (length);
5136 memcpy (result_chain->address (), dr_chain.address (),
5137 length * sizeof (tree));
5139 if (length == 3)
5141 unsigned int k;
5143 for (k = 0; k < 3; k++)
5145 for (i = 0; i < nelt; i++)
5146 if (3 * i + k < 2 * nelt)
5147 sel[i] = 3 * i + k;
5148 else
5149 sel[i] = 0;
5150 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5152 for (i = 0, j = 0; i < nelt; i++)
5153 if (3 * i + k < 2 * nelt)
5154 sel[i] = i;
5155 else
5156 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5158 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5160 first_vect = dr_chain[0];
5161 second_vect = dr_chain[1];
5163 /* Create interleaving stmt (low part of):
5164 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5165 ...}> */
5166 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5167 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5168 second_vect, perm3_mask_low);
5169 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5171 /* Create interleaving stmt (high part of):
5172 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5173 ...}> */
5174 first_vect = data_ref;
5175 second_vect = dr_chain[2];
5176 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5177 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5178 second_vect, perm3_mask_high);
5179 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5180 (*result_chain)[k] = data_ref;
5183 else
5185 /* If length is not equal to 3 then only power of 2 is supported. */
5186 gcc_assert (exact_log2 (length) != -1);
5188 for (i = 0; i < nelt; ++i)
5189 sel[i] = i * 2;
5190 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5192 for (i = 0; i < nelt; ++i)
5193 sel[i] = i * 2 + 1;
5194 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5196 for (i = 0; i < log_length; i++)
5198 for (j = 0; j < length; j += 2)
5200 first_vect = dr_chain[j];
5201 second_vect = dr_chain[j+1];
5203 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5206 first_vect, second_vect,
5207 perm_mask_even);
5208 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5209 (*result_chain)[j/2] = data_ref;
5211 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5212 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5213 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5214 first_vect, second_vect,
5215 perm_mask_odd);
5216 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5217 (*result_chain)[j/2+length/2] = data_ref;
5219 memcpy (dr_chain.address (), result_chain->address (),
5220 length * sizeof (tree));
5225 /* Function vect_shift_permute_load_chain.
5227 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5228 sequence of stmts to reorder the input data accordingly.
5229 Return the final references for loads in RESULT_CHAIN.
5230 Return true if successed, false otherwise.
5232 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5233 The input is 3 vectors each containing 8 elements. We assign a
5234 number to each element, the input sequence is:
5236 1st vec: 0 1 2 3 4 5 6 7
5237 2nd vec: 8 9 10 11 12 13 14 15
5238 3rd vec: 16 17 18 19 20 21 22 23
5240 The output sequence should be:
5242 1st vec: 0 3 6 9 12 15 18 21
5243 2nd vec: 1 4 7 10 13 16 19 22
5244 3rd vec: 2 5 8 11 14 17 20 23
5246 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5248 First we shuffle all 3 vectors to get correct elements order:
5250 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5251 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5252 3rd vec: (16 19 22) (17 20 23) (18 21)
5254 Next we unite and shift vector 3 times:
5256 1st step:
5257 shift right by 6 the concatenation of:
5258 "1st vec" and "2nd vec"
5259 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5260 "2nd vec" and "3rd vec"
5261 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5262 "3rd vec" and "1st vec"
5263 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5264 | New vectors |
5266 So that now new vectors are:
5268 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5269 2nd vec: (10 13) (16 19 22) (17 20 23)
5270 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5272 2nd step:
5273 shift right by 5 the concatenation of:
5274 "1st vec" and "3rd vec"
5275 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5276 "2nd vec" and "1st vec"
5277 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5278 "3rd vec" and "2nd vec"
5279 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5280 | New vectors |
5282 So that now new vectors are:
5284 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5285 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5286 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5288 3rd step:
5289 shift right by 5 the concatenation of:
5290 "1st vec" and "1st vec"
5291 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5292 shift right by 3 the concatenation of:
5293 "2nd vec" and "2nd vec"
5294 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5295 | New vectors |
5297 So that now all vectors are READY:
5298 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5299 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5300 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5302 This algorithm is faster than one in vect_permute_load_chain if:
5303 1. "shift of a concatination" is faster than general permutation.
5304 This is usually so.
5305 2. The TARGET machine can't execute vector instructions in parallel.
5306 This is because each step of the algorithm depends on previous.
5307 The algorithm in vect_permute_load_chain is much more parallel.
5309 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5312 static bool
5313 vect_shift_permute_load_chain (vec<tree> dr_chain,
5314 unsigned int length,
5315 gimple stmt,
5316 gimple_stmt_iterator *gsi,
5317 vec<tree> *result_chain)
5319 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5320 tree perm2_mask1, perm2_mask2, perm3_mask;
5321 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5322 gimple perm_stmt;
5324 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5325 unsigned int i;
5326 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5327 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5328 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5329 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5331 result_chain->quick_grow (length);
5332 memcpy (result_chain->address (), dr_chain.address (),
5333 length * sizeof (tree));
5335 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5337 unsigned int j, log_length = exact_log2 (length);
5338 for (i = 0; i < nelt / 2; ++i)
5339 sel[i] = i * 2;
5340 for (i = 0; i < nelt / 2; ++i)
5341 sel[nelt / 2 + i] = i * 2 + 1;
5342 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5344 if (dump_enabled_p ())
5345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5346 "shuffle of 2 fields structure is not \
5347 supported by target\n");
5348 return false;
5350 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5352 for (i = 0; i < nelt / 2; ++i)
5353 sel[i] = i * 2 + 1;
5354 for (i = 0; i < nelt / 2; ++i)
5355 sel[nelt / 2 + i] = i * 2;
5356 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5358 if (dump_enabled_p ())
5359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5360 "shuffle of 2 fields structure is not \
5361 supported by target\n");
5362 return false;
5364 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5366 /* Generating permutation constant to shift all elements.
5367 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5368 for (i = 0; i < nelt; i++)
5369 sel[i] = nelt / 2 + i;
5370 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5372 if (dump_enabled_p ())
5373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5374 "shift permutation is not supported by target\n");
5375 return false;
5377 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5379 /* Generating permutation constant to select vector from 2.
5380 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5381 for (i = 0; i < nelt / 2; i++)
5382 sel[i] = i;
5383 for (i = nelt / 2; i < nelt; i++)
5384 sel[i] = nelt + i;
5385 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5387 if (dump_enabled_p ())
5388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5389 "select is not supported by target\n");
5390 return false;
5392 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5394 for (i = 0; i < log_length; i++)
5396 for (j = 0; j < length; j += 2)
5398 first_vect = dr_chain[j];
5399 second_vect = dr_chain[j + 1];
5401 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5402 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5403 first_vect, first_vect,
5404 perm2_mask1);
5405 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5406 vect[0] = data_ref;
5408 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5409 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5410 second_vect, second_vect,
5411 perm2_mask2);
5412 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5413 vect[1] = data_ref;
5415 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5416 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5417 vect[0], vect[1], shift1_mask);
5418 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5419 (*result_chain)[j/2 + length/2] = data_ref;
5421 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5422 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5423 vect[0], vect[1], select_mask);
5424 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5425 (*result_chain)[j/2] = data_ref;
5427 memcpy (dr_chain.address (), result_chain->address (),
5428 length * sizeof (tree));
5430 return true;
5432 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5434 unsigned int k = 0, l = 0;
5436 /* Generating permutation constant to get all elements in rigth order.
5437 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5438 for (i = 0; i < nelt; i++)
5440 if (3 * k + (l % 3) >= nelt)
5442 k = 0;
5443 l += (3 - (nelt % 3));
5445 sel[i] = 3 * k + (l % 3);
5446 k++;
5448 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5450 if (dump_enabled_p ())
5451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5452 "shuffle of 3 fields structure is not \
5453 supported by target\n");
5454 return false;
5456 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5458 /* Generating permutation constant to shift all elements.
5459 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5460 for (i = 0; i < nelt; i++)
5461 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5462 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5464 if (dump_enabled_p ())
5465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5466 "shift permutation is not supported by target\n");
5467 return false;
5469 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5471 /* Generating permutation constant to shift all elements.
5472 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5473 for (i = 0; i < nelt; i++)
5474 sel[i] = 2 * (nelt / 3) + 1 + i;
5475 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5477 if (dump_enabled_p ())
5478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5479 "shift permutation is not supported by target\n");
5480 return false;
5482 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5484 /* Generating permutation constant to shift all elements.
5485 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5486 for (i = 0; i < nelt; i++)
5487 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5488 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5490 if (dump_enabled_p ())
5491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5492 "shift permutation is not supported by target\n");
5493 return false;
5495 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5497 /* Generating permutation constant to shift all elements.
5498 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5499 for (i = 0; i < nelt; i++)
5500 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5501 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5503 if (dump_enabled_p ())
5504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5505 "shift permutation is not supported by target\n");
5506 return false;
5508 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5510 for (k = 0; k < 3; k++)
5512 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5513 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5514 dr_chain[k], dr_chain[k],
5515 perm3_mask);
5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517 vect[k] = data_ref;
5520 for (k = 0; k < 3; k++)
5522 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5523 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5524 vect[k % 3], vect[(k + 1) % 3],
5525 shift1_mask);
5526 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5527 vect_shift[k] = data_ref;
5530 for (k = 0; k < 3; k++)
5532 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5533 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5534 vect_shift[(4 - k) % 3],
5535 vect_shift[(3 - k) % 3],
5536 shift2_mask);
5537 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5538 vect[k] = data_ref;
5541 (*result_chain)[3 - (nelt % 3)] = vect[2];
5543 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5544 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5545 vect[0], shift3_mask);
5546 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5547 (*result_chain)[nelt % 3] = data_ref;
5549 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5550 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5551 vect[1], shift4_mask);
5552 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5553 (*result_chain)[0] = data_ref;
5554 return true;
5556 return false;
5559 /* Function vect_transform_grouped_load.
5561 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5562 to perform their permutation and ascribe the result vectorized statements to
5563 the scalar statements.
5566 void
5567 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5568 gimple_stmt_iterator *gsi)
5570 machine_mode mode;
5571 vec<tree> result_chain = vNULL;
5573 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5574 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5575 vectors, that are ready for vector computation. */
5576 result_chain.create (size);
5578 /* If reassociation width for vector type is 2 or greater target machine can
5579 execute 2 or more vector instructions in parallel. Otherwise try to
5580 get chain for loads group using vect_shift_permute_load_chain. */
5581 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5582 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5583 || exact_log2 (size) != -1
5584 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5585 gsi, &result_chain))
5586 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5587 vect_record_grouped_load_vectors (stmt, result_chain);
5588 result_chain.release ();
5591 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5592 generated as part of the vectorization of STMT. Assign the statement
5593 for each vector to the associated scalar statement. */
5595 void
5596 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5598 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5599 gimple next_stmt, new_stmt;
5600 unsigned int i, gap_count;
5601 tree tmp_data_ref;
5603 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5604 Since we scan the chain starting from it's first node, their order
5605 corresponds the order of data-refs in RESULT_CHAIN. */
5606 next_stmt = first_stmt;
5607 gap_count = 1;
5608 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5610 if (!next_stmt)
5611 break;
5613 /* Skip the gaps. Loads created for the gaps will be removed by dead
5614 code elimination pass later. No need to check for the first stmt in
5615 the group, since it always exists.
5616 GROUP_GAP is the number of steps in elements from the previous
5617 access (if there is no gap GROUP_GAP is 1). We skip loads that
5618 correspond to the gaps. */
5619 if (next_stmt != first_stmt
5620 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5622 gap_count++;
5623 continue;
5626 while (next_stmt)
5628 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5629 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5630 copies, and we put the new vector statement in the first available
5631 RELATED_STMT. */
5632 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5633 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5634 else
5636 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5638 gimple prev_stmt =
5639 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5640 gimple rel_stmt =
5641 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5642 while (rel_stmt)
5644 prev_stmt = rel_stmt;
5645 rel_stmt =
5646 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5649 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5650 new_stmt;
5654 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5655 gap_count = 1;
5656 /* If NEXT_STMT accesses the same DR as the previous statement,
5657 put the same TMP_DATA_REF as its vectorized statement; otherwise
5658 get the next data-ref from RESULT_CHAIN. */
5659 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5660 break;
5665 /* Function vect_force_dr_alignment_p.
5667 Returns whether the alignment of a DECL can be forced to be aligned
5668 on ALIGNMENT bit boundary. */
5670 bool
5671 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5673 if (TREE_CODE (decl) != VAR_DECL)
5674 return false;
5676 if (decl_in_symtab_p (decl)
5677 && !symtab_node::get (decl)->can_increase_alignment_p ())
5678 return false;
5680 if (TREE_STATIC (decl))
5681 return (alignment <= MAX_OFILE_ALIGNMENT);
5682 else
5683 return (alignment <= MAX_STACK_ALIGNMENT);
5687 /* Return whether the data reference DR is supported with respect to its
5688 alignment.
5689 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5690 it is aligned, i.e., check if it is possible to vectorize it with different
5691 alignment. */
5693 enum dr_alignment_support
5694 vect_supportable_dr_alignment (struct data_reference *dr,
5695 bool check_aligned_accesses)
5697 gimple stmt = DR_STMT (dr);
5698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5700 machine_mode mode = TYPE_MODE (vectype);
5701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5702 struct loop *vect_loop = NULL;
5703 bool nested_in_vect_loop = false;
5705 if (aligned_access_p (dr) && !check_aligned_accesses)
5706 return dr_aligned;
5708 /* For now assume all conditional loads/stores support unaligned
5709 access without any special code. */
5710 if (is_gimple_call (stmt)
5711 && gimple_call_internal_p (stmt)
5712 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5713 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5714 return dr_unaligned_supported;
5716 if (loop_vinfo)
5718 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5719 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5722 /* Possibly unaligned access. */
5724 /* We can choose between using the implicit realignment scheme (generating
5725 a misaligned_move stmt) and the explicit realignment scheme (generating
5726 aligned loads with a REALIGN_LOAD). There are two variants to the
5727 explicit realignment scheme: optimized, and unoptimized.
5728 We can optimize the realignment only if the step between consecutive
5729 vector loads is equal to the vector size. Since the vector memory
5730 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5731 is guaranteed that the misalignment amount remains the same throughout the
5732 execution of the vectorized loop. Therefore, we can create the
5733 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5734 at the loop preheader.
5736 However, in the case of outer-loop vectorization, when vectorizing a
5737 memory access in the inner-loop nested within the LOOP that is now being
5738 vectorized, while it is guaranteed that the misalignment of the
5739 vectorized memory access will remain the same in different outer-loop
5740 iterations, it is *not* guaranteed that is will remain the same throughout
5741 the execution of the inner-loop. This is because the inner-loop advances
5742 with the original scalar step (and not in steps of VS). If the inner-loop
5743 step happens to be a multiple of VS, then the misalignment remains fixed
5744 and we can use the optimized realignment scheme. For example:
5746 for (i=0; i<N; i++)
5747 for (j=0; j<M; j++)
5748 s += a[i+j];
5750 When vectorizing the i-loop in the above example, the step between
5751 consecutive vector loads is 1, and so the misalignment does not remain
5752 fixed across the execution of the inner-loop, and the realignment cannot
5753 be optimized (as illustrated in the following pseudo vectorized loop):
5755 for (i=0; i<N; i+=4)
5756 for (j=0; j<M; j++){
5757 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5758 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5759 // (assuming that we start from an aligned address).
5762 We therefore have to use the unoptimized realignment scheme:
5764 for (i=0; i<N; i+=4)
5765 for (j=k; j<M; j+=4)
5766 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5767 // that the misalignment of the initial address is
5768 // 0).
5770 The loop can then be vectorized as follows:
5772 for (k=0; k<4; k++){
5773 rt = get_realignment_token (&vp[k]);
5774 for (i=0; i<N; i+=4){
5775 v1 = vp[i+k];
5776 for (j=k; j<M; j+=4){
5777 v2 = vp[i+j+VS-1];
5778 va = REALIGN_LOAD <v1,v2,rt>;
5779 vs += va;
5780 v1 = v2;
5783 } */
5785 if (DR_IS_READ (dr))
5787 bool is_packed = false;
5788 tree type = (TREE_TYPE (DR_REF (dr)));
5790 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5791 && (!targetm.vectorize.builtin_mask_for_load
5792 || targetm.vectorize.builtin_mask_for_load ()))
5794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5795 if ((nested_in_vect_loop
5796 && (TREE_INT_CST_LOW (DR_STEP (dr))
5797 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5798 || !loop_vinfo)
5799 return dr_explicit_realign;
5800 else
5801 return dr_explicit_realign_optimized;
5803 if (!known_alignment_for_access_p (dr))
5804 is_packed = not_size_aligned (DR_REF (dr));
5806 if ((TYPE_USER_ALIGN (type) && !is_packed)
5807 || targetm.vectorize.
5808 support_vector_misalignment (mode, type,
5809 DR_MISALIGNMENT (dr), is_packed))
5810 /* Can't software pipeline the loads, but can at least do them. */
5811 return dr_unaligned_supported;
5813 else
5815 bool is_packed = false;
5816 tree type = (TREE_TYPE (DR_REF (dr)));
5818 if (!known_alignment_for_access_p (dr))
5819 is_packed = not_size_aligned (DR_REF (dr));
5821 if ((TYPE_USER_ALIGN (type) && !is_packed)
5822 || targetm.vectorize.
5823 support_vector_misalignment (mode, type,
5824 DR_MISALIGNMENT (dr), is_packed))
5825 return dr_unaligned_supported;
5828 /* Unsupported. */
5829 return dr_unaligned_unsupported;