gcc/
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob2439bd6390bad01ca57d2a090277f35480acc7d5
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "predict.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "tm_p.h"
36 #include "target.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
51 #include "cgraph.h"
52 /* Need to include rtl.h, expr.h, etc. for optabs. */
53 #include "flags.h"
54 #include "insn-config.h"
55 #include "expmed.h"
56 #include "dojump.h"
57 #include "explow.h"
58 #include "calls.h"
59 #include "emit-rtl.h"
60 #include "varasm.h"
61 #include "stmt.h"
62 #include "expr.h"
63 #include "insn-codes.h"
64 #include "optabs.h"
65 #include "builtins.h"
66 #include "params.h"
68 /* Return true if load- or store-lanes optab OPTAB is implemented for
69 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
71 static bool
72 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
73 tree vectype, unsigned HOST_WIDE_INT count)
75 machine_mode mode, array_mode;
76 bool limit_p;
78 mode = TYPE_MODE (vectype);
79 limit_p = !targetm.array_mode_supported_p (mode, count);
80 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
81 MODE_INT, limit_p);
83 if (array_mode == BLKmode)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
88 GET_MODE_NAME (mode), count);
89 return false;
92 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
96 "cannot use %s<%s><%s>\n", name,
97 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
98 return false;
101 if (dump_enabled_p ())
102 dump_printf_loc (MSG_NOTE, vect_location,
103 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
104 GET_MODE_NAME (mode));
106 return true;
110 /* Return the smallest scalar part of STMT.
111 This is used to determine the vectype of the stmt. We generally set the
112 vectype according to the type of the result (lhs). For stmts whose
113 result-type is different than the type of the arguments (e.g., demotion,
114 promotion), vectype will be reset appropriately (later). Note that we have
115 to visit the smallest datatype in this function, because that determines the
116 VF. If the smallest datatype in the loop is present only as the rhs of a
117 promotion operation - we'd miss it.
118 Such a case, where a variable of this datatype does not appear in the lhs
119 anywhere in the loop, can only occur if it's an invariant: e.g.:
120 'int_x = (int) short_inv', which we'd expect to have been optimized away by
121 invariant motion. However, we cannot rely on invariant motion to always
122 take invariants out of the loop, and so in the case of promotion we also
123 have to check the rhs.
124 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
125 types. */
127 tree
128 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
129 HOST_WIDE_INT *rhs_size_unit)
131 tree scalar_type = gimple_expr_type (stmt);
132 HOST_WIDE_INT lhs, rhs;
134 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
136 if (is_gimple_assign (stmt)
137 && (gimple_assign_cast_p (stmt)
138 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
159 static bool
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return false;
167 if (dump_enabled_p ())
169 dump_printf_loc (MSG_NOTE, vect_location,
170 "mark for run-time aliasing test between ");
171 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
172 dump_printf (MSG_NOTE, " and ");
173 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
174 dump_printf (MSG_NOTE, "\n");
177 if (optimize_loop_nest_for_size_p (loop))
179 if (dump_enabled_p ())
180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
181 "versioning not supported when optimizing"
182 " for size.\n");
183 return false;
186 /* FORNOW: We don't support versioning with outer-loop vectorization. */
187 if (loop->inner)
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
191 "versioning not yet supported for outer-loops.\n");
192 return false;
195 /* FORNOW: We don't support creating runtime alias tests for non-constant
196 step. */
197 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
198 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
202 "versioning not yet supported for non-constant "
203 "step\n");
204 return false;
207 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
208 return true;
212 /* Function vect_analyze_data_ref_dependence.
214 Return TRUE if there (might) exist a dependence between a memory-reference
215 DRA and a memory-reference DRB. When versioning for alias may check a
216 dependence at run-time, return FALSE. Adjust *MAX_VF according to
217 the data dependence. */
219 static bool
220 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
221 loop_vec_info loop_vinfo, int *max_vf)
223 unsigned int i;
224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
225 struct data_reference *dra = DDR_A (ddr);
226 struct data_reference *drb = DDR_B (ddr);
227 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
228 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
229 lambda_vector dist_v;
230 unsigned int loop_depth;
232 /* In loop analysis all data references should be vectorizable. */
233 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
234 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
235 gcc_unreachable ();
237 /* Independent data accesses. */
238 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
239 return false;
241 if (dra == drb
242 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
243 return false;
245 /* Even if we have an anti-dependence then, as the vectorized loop covers at
246 least two scalar iterations, there is always also a true dependence.
247 As the vectorizer does not re-order loads and stores we can ignore
248 the anti-dependence if TBAA can disambiguate both DRs similar to the
249 case with known negative distance anti-dependences (positive
250 distance anti-dependences would violate TBAA constraints). */
251 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
252 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
253 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
254 get_alias_set (DR_REF (drb))))
255 return false;
257 /* Unknown data dependence. */
258 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
260 /* If user asserted safelen consecutive iterations can be
261 executed concurrently, assume independence. */
262 if (loop->safelen >= 2)
264 if (loop->safelen < *max_vf)
265 *max_vf = loop->safelen;
266 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
267 return false;
270 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
271 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias not supported for: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
285 return true;
288 if (dump_enabled_p ())
290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
291 "versioning for alias required: "
292 "can't determine dependence between ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (dra));
295 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (drb));
298 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
301 /* Add to list of ddrs that need to be tested at run-time. */
302 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
305 /* Known data dependence. */
306 if (DDR_NUM_DIST_VECTS (ddr) == 0)
308 /* If user asserted safelen consecutive iterations can be
309 executed concurrently, assume independence. */
310 if (loop->safelen >= 2)
312 if (loop->safelen < *max_vf)
313 *max_vf = loop->safelen;
314 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
315 return false;
318 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
319 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias not supported for: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
327 DR_REF (dra));
328 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
329 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
330 DR_REF (drb));
331 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
333 return true;
336 if (dump_enabled_p ())
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias required: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
342 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
343 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 /* Add to list of ddrs that need to be tested at run-time. */
347 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
350 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
351 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
353 int dist = dist_v[loop_depth];
355 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "dependence distance = %d.\n", dist);
359 if (dist == 0)
361 if (dump_enabled_p ())
363 dump_printf_loc (MSG_NOTE, vect_location,
364 "dependence distance == 0 between ");
365 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
366 dump_printf (MSG_NOTE, " and ");
367 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
368 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371 /* When we perform grouped accesses and perform implicit CSE
372 by detecting equal accesses and doing disambiguation with
373 runtime alias tests like for
374 .. = a[i];
375 .. = a[i+1];
376 a[i] = ..;
377 a[i+1] = ..;
378 *p = ..;
379 .. = a[i];
380 .. = a[i+1];
381 where we will end up loading { a[i], a[i+1] } once, make
382 sure that inserting group loads before the first load and
383 stores after the last store will do the right thing.
384 Similar for groups like
385 a[i] = ...;
386 ... = a[i];
387 a[i+1] = ...;
388 where loads from the group interleave with the store. */
389 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
390 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
392 gimple earlier_stmt;
393 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
394 if (DR_IS_WRITE
395 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "READ_WRITE dependence in interleaving."
400 "\n");
401 return true;
405 continue;
408 if (dist > 0 && DDR_REVERSED_P (ddr))
410 /* If DDR_REVERSED_P the order of the data-refs in DDR was
411 reversed (to make distance vector positive), and the actual
412 distance is negative. */
413 if (dump_enabled_p ())
414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
415 "dependence distance negative.\n");
416 /* Record a negative dependence distance to later limit the
417 amount of stmt copying / unrolling we can perform.
418 Only need to handle read-after-write dependence. */
419 if (DR_IS_READ (drb)
420 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
421 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
422 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
423 continue;
426 if (abs (dist) >= 2
427 && abs (dist) < *max_vf)
429 /* The dependence distance requires reduction of the maximal
430 vectorization factor. */
431 *max_vf = abs (dist);
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "adjusting maximal vectorization factor to %i\n",
435 *max_vf);
438 if (abs (dist) >= *max_vf)
440 /* Dependence distance does not create dependence, as far as
441 vectorization is concerned, in this case. */
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_NOTE, vect_location,
444 "dependence distance >= VF.\n");
445 continue;
448 if (dump_enabled_p ())
450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
451 "not vectorized, possible dependence "
452 "between data-refs ");
453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
454 dump_printf (MSG_NOTE, " and ");
455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
456 dump_printf (MSG_NOTE, "\n");
459 return true;
462 return false;
465 /* Function vect_analyze_data_ref_dependences.
467 Examine all the data references in the loop, and make sure there do not
468 exist any data dependences between them. Set *MAX_VF according to
469 the maximum vectorization factor the data dependences allow. */
471 bool
472 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
474 unsigned int i;
475 struct data_dependence_relation *ddr;
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE, vect_location,
479 "=== vect_analyze_data_ref_dependences ===\n");
481 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
482 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
483 &LOOP_VINFO_DDRS (loop_vinfo),
484 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
485 return false;
487 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
488 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
489 return false;
491 return true;
495 /* Function vect_slp_analyze_data_ref_dependence.
497 Return TRUE if there (might) exist a dependence between a memory-reference
498 DRA and a memory-reference DRB. When versioning for alias may check a
499 dependence at run-time, return FALSE. Adjust *MAX_VF according to
500 the data dependence. */
502 static bool
503 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
505 struct data_reference *dra = DDR_A (ddr);
506 struct data_reference *drb = DDR_B (ddr);
508 /* We need to check dependences of statements marked as unvectorizable
509 as well, they still can prohibit vectorization. */
511 /* Independent data accesses. */
512 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
513 return false;
515 if (dra == drb)
516 return false;
518 /* Read-read is OK. */
519 if (DR_IS_READ (dra) && DR_IS_READ (drb))
520 return false;
522 /* If dra and drb are part of the same interleaving chain consider
523 them independent. */
524 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
525 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
526 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
527 return false;
529 /* Unknown data dependence. */
530 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "can't determine dependence between ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
537 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
539 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
542 else if (dump_enabled_p ())
544 dump_printf_loc (MSG_NOTE, vect_location,
545 "determined dependence between ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
547 dump_printf (MSG_NOTE, " and ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
549 dump_printf (MSG_NOTE, "\n");
552 /* We do not vectorize basic blocks with write-write dependencies. */
553 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
554 return true;
556 /* If we have a read-write dependence check that the load is before the store.
557 When we vectorize basic blocks, vector load can be only before
558 corresponding scalar load, and vector store can be only after its
559 corresponding scalar store. So the order of the acceses is preserved in
560 case the load is before the store. */
561 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
562 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
564 /* That only holds for load-store pairs taking part in vectorization. */
565 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
566 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
567 return false;
570 return true;
574 /* Function vect_analyze_data_ref_dependences.
576 Examine all the data references in the basic-block, and make sure there
577 do not exist any data dependences between them. Set *MAX_VF according to
578 the maximum vectorization factor the data dependences allow. */
580 bool
581 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
583 struct data_dependence_relation *ddr;
584 unsigned int i;
586 if (dump_enabled_p ())
587 dump_printf_loc (MSG_NOTE, vect_location,
588 "=== vect_slp_analyze_data_ref_dependences ===\n");
590 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
591 &BB_VINFO_DDRS (bb_vinfo),
592 vNULL, true))
593 return false;
595 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
596 if (vect_slp_analyze_data_ref_dependence (ddr))
597 return false;
599 return true;
603 /* Function vect_compute_data_ref_alignment
605 Compute the misalignment of the data reference DR.
607 Output:
608 1. If during the misalignment computation it is found that the data reference
609 cannot be vectorized then false is returned.
610 2. DR_MISALIGNMENT (DR) is defined.
612 FOR NOW: No analysis is actually performed. Misalignment is calculated
613 only for trivial cases. TODO. */
615 static bool
616 vect_compute_data_ref_alignment (struct data_reference *dr)
618 gimple stmt = DR_STMT (dr);
619 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
620 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
621 struct loop *loop = NULL;
622 tree ref = DR_REF (dr);
623 tree vectype;
624 tree base, base_addr;
625 tree misalign = NULL_TREE;
626 tree aligned_to;
627 unsigned HOST_WIDE_INT alignment;
629 if (dump_enabled_p ())
630 dump_printf_loc (MSG_NOTE, vect_location,
631 "vect_compute_data_ref_alignment:\n");
633 if (loop_vinfo)
634 loop = LOOP_VINFO_LOOP (loop_vinfo);
636 /* Initialize misalignment to unknown. */
637 SET_DR_MISALIGNMENT (dr, -1);
639 /* Strided accesses perform only component accesses, misalignment information
640 is irrelevant for them. */
641 if (STMT_VINFO_STRIDED_P (stmt_info)
642 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
643 return true;
645 if (tree_fits_shwi_p (DR_STEP (dr)))
646 misalign = DR_INIT (dr);
647 aligned_to = DR_ALIGNED_TO (dr);
648 base_addr = DR_BASE_ADDRESS (dr);
649 vectype = STMT_VINFO_VECTYPE (stmt_info);
651 /* In case the dataref is in an inner-loop of the loop that is being
652 vectorized (LOOP), we use the base and misalignment information
653 relative to the outer-loop (LOOP). This is ok only if the misalignment
654 stays the same throughout the execution of the inner-loop, which is why
655 we have to check that the stride of the dataref in the inner-loop evenly
656 divides by the vector size. */
657 if (loop && nested_in_vect_loop_p (loop, stmt))
659 tree step = DR_STEP (dr);
661 if (tree_fits_shwi_p (step)
662 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "inner step divides the vector-size.\n");
667 misalign = STMT_VINFO_DR_INIT (stmt_info);
668 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
669 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
671 else
673 if (dump_enabled_p ())
674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
675 "inner step doesn't divide the vector-size.\n");
676 misalign = NULL_TREE;
680 /* Similarly we can only use base and misalignment information relative to
681 an innermost loop if the misalignment stays the same throughout the
682 execution of the loop. As above, this is the case if the stride of
683 the dataref evenly divides by the vector size. */
684 else
686 tree step = DR_STEP (dr);
687 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
689 if (tree_fits_shwi_p (step)
690 && ((tree_to_shwi (step) * vf)
691 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "step doesn't divide the vector-size.\n");
696 misalign = NULL_TREE;
700 /* To look at alignment of the base we have to preserve an inner MEM_REF
701 as that carries alignment information of the actual access. */
702 base = ref;
703 while (handled_component_p (base))
704 base = TREE_OPERAND (base, 0);
705 if (TREE_CODE (base) == MEM_REF)
706 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
707 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
708 unsigned int base_alignment = get_object_alignment (base);
710 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
711 DR_VECT_AUX (dr)->base_element_aligned = true;
713 alignment = TYPE_ALIGN_UNIT (vectype);
715 if ((compare_tree_int (aligned_to, alignment) < 0)
716 || !misalign)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
723 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
725 return true;
728 if (base_alignment < TYPE_ALIGN (vectype))
730 /* Strip an inner MEM_REF to a bare decl if possible. */
731 if (TREE_CODE (base) == MEM_REF
732 && integer_zerop (TREE_OPERAND (base, 1))
733 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
734 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
736 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "can't force alignment of ref: ");
742 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
743 dump_printf (MSG_NOTE, "\n");
745 return true;
748 /* Force the alignment of the decl.
749 NOTE: This is the only change to the code we make during
750 the analysis phase, before deciding to vectorize the loop. */
751 if (dump_enabled_p ())
753 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
754 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
755 dump_printf (MSG_NOTE, "\n");
758 DR_VECT_AUX (dr)->base_decl = base;
759 DR_VECT_AUX (dr)->base_misaligned = true;
760 DR_VECT_AUX (dr)->base_element_aligned = true;
763 /* If this is a backward running DR then first access in the larger
764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
768 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
772 /* PLUS because DR_STEP was negative. */
773 misalign = size_binop (PLUS_EXPR, misalign, offset);
776 SET_DR_MISALIGNMENT (dr,
777 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
779 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
782 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
783 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
784 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
791 /* Function vect_compute_data_refs_alignment
793 Compute the misalignment of data references in the loop.
794 Return FALSE if a data reference is found that cannot be vectorized. */
796 static bool
797 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
798 bb_vec_info bb_vinfo)
800 vec<data_reference_p> datarefs;
801 struct data_reference *dr;
802 unsigned int i;
804 if (loop_vinfo)
805 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
806 else
807 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
809 FOR_EACH_VEC_ELT (datarefs, i, dr)
810 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
811 && !vect_compute_data_ref_alignment (dr))
813 if (bb_vinfo)
815 /* Mark unsupported statement as unvectorizable. */
816 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
817 continue;
819 else
820 return false;
823 return true;
827 /* Function vect_update_misalignment_for_peel
829 DR - the data reference whose misalignment is to be adjusted.
830 DR_PEEL - the data reference whose misalignment is being made
831 zero in the vector loop by the peel.
832 NPEEL - the number of iterations in the peel loop if the misalignment
833 of DR_PEEL is known at compile time. */
835 static void
836 vect_update_misalignment_for_peel (struct data_reference *dr,
837 struct data_reference *dr_peel, int npeel)
839 unsigned int i;
840 vec<dr_p> same_align_drs;
841 struct data_reference *current_dr;
842 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
843 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
844 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
845 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
847 /* For interleaved data accesses the step in the loop must be multiplied by
848 the size of the interleaving group. */
849 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
850 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
851 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
852 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
854 /* It can be assumed that the data refs with the same alignment as dr_peel
855 are aligned in the vector loop. */
856 same_align_drs
857 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
858 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
860 if (current_dr != dr)
861 continue;
862 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
863 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
864 SET_DR_MISALIGNMENT (dr, 0);
865 return;
868 if (known_alignment_for_access_p (dr)
869 && known_alignment_for_access_p (dr_peel))
871 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
872 int misal = DR_MISALIGNMENT (dr);
873 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
874 misal += negative ? -npeel * dr_size : npeel * dr_size;
875 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
876 SET_DR_MISALIGNMENT (dr, misal);
877 return;
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
882 SET_DR_MISALIGNMENT (dr, -1);
886 /* Function vect_verify_datarefs_alignment
888 Return TRUE if all data references in the loop can be
889 handled with respect to alignment. */
891 bool
892 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
894 vec<data_reference_p> datarefs;
895 struct data_reference *dr;
896 enum dr_alignment_support supportable_dr_alignment;
897 unsigned int i;
899 if (loop_vinfo)
900 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
901 else
902 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
904 FOR_EACH_VEC_ELT (datarefs, i, dr)
906 gimple stmt = DR_STMT (dr);
907 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
909 if (!STMT_VINFO_RELEVANT_P (stmt_info))
910 continue;
912 /* For interleaving, only the alignment of the first access matters.
913 Skip statements marked as not vectorizable. */
914 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
915 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
916 || !STMT_VINFO_VECTORIZABLE (stmt_info))
917 continue;
919 /* Strided accesses perform only component accesses, alignment is
920 irrelevant for them. */
921 if (STMT_VINFO_STRIDED_P (stmt_info)
922 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
923 continue;
925 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
926 if (!supportable_dr_alignment)
928 if (dump_enabled_p ())
930 if (DR_IS_READ (dr))
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "not vectorized: unsupported unaligned load.");
933 else
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
935 "not vectorized: unsupported unaligned "
936 "store.");
938 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
939 DR_REF (dr));
940 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
942 return false;
944 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE, vect_location,
946 "Vectorizing an unaligned access.\n");
948 return true;
951 /* Given an memory reference EXP return whether its alignment is less
952 than its size. */
954 static bool
955 not_size_aligned (tree exp)
957 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
958 return true;
960 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
961 > get_object_alignment (exp));
964 /* Function vector_alignment_reachable_p
966 Return true if vector alignment for DR is reachable by peeling
967 a few loop iterations. Return false otherwise. */
969 static bool
970 vector_alignment_reachable_p (struct data_reference *dr)
972 gimple stmt = DR_STMT (dr);
973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
978 /* For interleaved access we peel only if number of iterations in
979 the prolog loop ({VF - misalignment}), is a multiple of the
980 number of the interleaved accesses. */
981 int elem_size, mis_in_elements;
982 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
984 /* FORNOW: handle only known alignment. */
985 if (!known_alignment_for_access_p (dr))
986 return false;
988 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
989 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
991 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
992 return false;
995 /* If misalignment is known at the compile time then allow peeling
996 only if natural alignment is reachable through peeling. */
997 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
999 HOST_WIDE_INT elmsize =
1000 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location,
1004 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1005 dump_printf (MSG_NOTE,
1006 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1008 if (DR_MISALIGNMENT (dr) % elmsize)
1010 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 "data size does not divide the misalignment.\n");
1013 return false;
1017 if (!known_alignment_for_access_p (dr))
1019 tree type = TREE_TYPE (DR_REF (dr));
1020 bool is_packed = not_size_aligned (DR_REF (dr));
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023 "Unknown misalignment, is_packed = %d\n",is_packed);
1024 if ((TYPE_USER_ALIGN (type) && !is_packed)
1025 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1026 return true;
1027 else
1028 return false;
1031 return true;
1035 /* Calculate the cost of the memory access represented by DR. */
1037 static void
1038 vect_get_data_access_cost (struct data_reference *dr,
1039 unsigned int *inside_cost,
1040 unsigned int *outside_cost,
1041 stmt_vector_for_cost *body_cost_vec)
1043 gimple stmt = DR_STMT (dr);
1044 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1045 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1047 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1048 int ncopies = vf / nunits;
1050 if (DR_IS_READ (dr))
1051 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1052 NULL, body_cost_vec, false);
1053 else
1054 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_NOTE, vect_location,
1058 "vect_get_data_access_cost: inside_cost = %d, "
1059 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1063 /* Insert DR into peeling hash table with NPEEL as key. */
1065 static void
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1067 int npeel)
1069 struct _vect_peel_info elem, *slot;
1070 _vect_peel_info **new_slot;
1071 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1073 elem.npeel = npeel;
1074 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1075 if (slot)
1076 slot->count++;
1077 else
1079 slot = XNEW (struct _vect_peel_info);
1080 slot->npeel = npeel;
1081 slot->dr = dr;
1082 slot->count = 1;
1083 new_slot
1084 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1085 *new_slot = slot;
1088 if (!supportable_dr_alignment
1089 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1090 slot->count += VECT_MAX_COST;
1094 /* Traverse peeling hash table to find peeling option that aligns maximum
1095 number of data accesses. */
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099 _vect_peel_extended_info *max)
1101 vect_peel_info elem = *slot;
1103 if (elem->count > max->peel_info.count
1104 || (elem->count == max->peel_info.count
1105 && max->peel_info.npeel > elem->npeel))
1107 max->peel_info.npeel = elem->npeel;
1108 max->peel_info.count = elem->count;
1109 max->peel_info.dr = elem->dr;
1112 return 1;
1116 /* Traverse peeling hash table and calculate cost for each peeling option.
1117 Find the one with the lowest cost. */
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121 _vect_peel_extended_info *min)
1123 vect_peel_info elem = *slot;
1124 int save_misalignment, dummy;
1125 unsigned int inside_cost = 0, outside_cost = 0, i;
1126 gimple stmt = DR_STMT (elem->dr);
1127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130 struct data_reference *dr;
1131 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1133 prologue_cost_vec.create (2);
1134 body_cost_vec.create (2);
1135 epilogue_cost_vec.create (2);
1137 FOR_EACH_VEC_ELT (datarefs, i, dr)
1139 stmt = DR_STMT (dr);
1140 stmt_info = vinfo_for_stmt (stmt);
1141 /* For interleaving, only the alignment of the first access
1142 matters. */
1143 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1144 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1145 continue;
1147 save_misalignment = DR_MISALIGNMENT (dr);
1148 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1149 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1150 &body_cost_vec);
1151 SET_DR_MISALIGNMENT (dr, save_misalignment);
1154 outside_cost += vect_get_known_peeling_cost
1155 (loop_vinfo, elem->npeel, &dummy,
1156 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1157 &prologue_cost_vec, &epilogue_cost_vec);
1159 /* Prologue and epilogue costs are added to the target model later.
1160 These costs depend only on the scalar iteration cost, the
1161 number of peeling iterations finally chosen, and the number of
1162 misaligned statements. So discard the information found here. */
1163 prologue_cost_vec.release ();
1164 epilogue_cost_vec.release ();
1166 if (inside_cost < min->inside_cost
1167 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1169 min->inside_cost = inside_cost;
1170 min->outside_cost = outside_cost;
1171 min->body_cost_vec.release ();
1172 min->body_cost_vec = body_cost_vec;
1173 min->peel_info.dr = elem->dr;
1174 min->peel_info.npeel = elem->npeel;
1176 else
1177 body_cost_vec.release ();
1179 return 1;
1183 /* Choose best peeling option by traversing peeling hash table and either
1184 choosing an option with the lowest cost (if cost model is enabled) or the
1185 option that aligns as many accesses as possible. */
1187 static struct data_reference *
1188 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1189 unsigned int *npeel,
1190 stmt_vector_for_cost *body_cost_vec)
1192 struct _vect_peel_extended_info res;
1194 res.peel_info.dr = NULL;
1195 res.body_cost_vec = stmt_vector_for_cost ();
1197 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1199 res.inside_cost = INT_MAX;
1200 res.outside_cost = INT_MAX;
1201 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1202 ->traverse <_vect_peel_extended_info *,
1203 vect_peeling_hash_get_lowest_cost> (&res);
1205 else
1207 res.peel_info.count = 0;
1208 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1209 ->traverse <_vect_peel_extended_info *,
1210 vect_peeling_hash_get_most_frequent> (&res);
1213 *npeel = res.peel_info.npeel;
1214 *body_cost_vec = res.body_cost_vec;
1215 return res.peel_info.dr;
1219 /* Function vect_enhance_data_refs_alignment
1221 This pass will use loop versioning and loop peeling in order to enhance
1222 the alignment of data references in the loop.
1224 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1225 original loop is to be vectorized. Any other loops that are created by
1226 the transformations performed in this pass - are not supposed to be
1227 vectorized. This restriction will be relaxed.
1229 This pass will require a cost model to guide it whether to apply peeling
1230 or versioning or a combination of the two. For example, the scheme that
1231 intel uses when given a loop with several memory accesses, is as follows:
1232 choose one memory access ('p') which alignment you want to force by doing
1233 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1234 other accesses are not necessarily aligned, or (2) use loop versioning to
1235 generate one loop in which all accesses are aligned, and another loop in
1236 which only 'p' is necessarily aligned.
1238 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1239 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1240 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1242 Devising a cost model is the most critical aspect of this work. It will
1243 guide us on which access to peel for, whether to use loop versioning, how
1244 many versions to create, etc. The cost model will probably consist of
1245 generic considerations as well as target specific considerations (on
1246 powerpc for example, misaligned stores are more painful than misaligned
1247 loads).
1249 Here are the general steps involved in alignment enhancements:
1251 -- original loop, before alignment analysis:
1252 for (i=0; i<N; i++){
1253 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1254 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1257 -- After vect_compute_data_refs_alignment:
1258 for (i=0; i<N; i++){
1259 x = q[i]; # DR_MISALIGNMENT(q) = 3
1260 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1263 -- Possibility 1: we do loop versioning:
1264 if (p is aligned) {
1265 for (i=0; i<N; i++){ # loop 1A
1266 x = q[i]; # DR_MISALIGNMENT(q) = 3
1267 p[i] = y; # DR_MISALIGNMENT(p) = 0
1270 else {
1271 for (i=0; i<N; i++){ # loop 1B
1272 x = q[i]; # DR_MISALIGNMENT(q) = 3
1273 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1277 -- Possibility 2: we do loop peeling:
1278 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1279 x = q[i];
1280 p[i] = y;
1282 for (i = 3; i < N; i++){ # loop 2A
1283 x = q[i]; # DR_MISALIGNMENT(q) = 0
1284 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1287 -- Possibility 3: combination of loop peeling and versioning:
1288 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1289 x = q[i];
1290 p[i] = y;
1292 if (p is aligned) {
1293 for (i = 3; i<N; i++){ # loop 3A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 0
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1298 else {
1299 for (i = 3; i<N; i++){ # loop 3B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 0
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1305 These loops are later passed to loop_transform to be vectorized. The
1306 vectorizer will use the alignment information to guide the transformation
1307 (whether to generate regular loads/stores, or with special handling for
1308 misalignment). */
1310 bool
1311 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1313 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1315 enum dr_alignment_support supportable_dr_alignment;
1316 struct data_reference *dr0 = NULL, *first_store = NULL;
1317 struct data_reference *dr;
1318 unsigned int i, j;
1319 bool do_peeling = false;
1320 bool do_versioning = false;
1321 bool stat;
1322 gimple stmt;
1323 stmt_vec_info stmt_info;
1324 unsigned int npeel = 0;
1325 bool all_misalignments_unknown = true;
1326 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1327 unsigned possible_npeel_number = 1;
1328 tree vectype;
1329 unsigned int nelements, mis, same_align_drs_max = 0;
1330 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_NOTE, vect_location,
1334 "=== vect_enhance_data_refs_alignment ===\n");
1336 /* While cost model enhancements are expected in the future, the high level
1337 view of the code at this time is as follows:
1339 A) If there is a misaligned access then see if peeling to align
1340 this access can make all data references satisfy
1341 vect_supportable_dr_alignment. If so, update data structures
1342 as needed and return true.
1344 B) If peeling wasn't possible and there is a data reference with an
1345 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1346 then see if loop versioning checks can be used to make all data
1347 references satisfy vect_supportable_dr_alignment. If so, update
1348 data structures as needed and return true.
1350 C) If neither peeling nor versioning were successful then return false if
1351 any data reference does not satisfy vect_supportable_dr_alignment.
1353 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1355 Note, Possibility 3 above (which is peeling and versioning together) is not
1356 being done at this time. */
1358 /* (1) Peeling to force alignment. */
1360 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1361 Considerations:
1362 + How many accesses will become aligned due to the peeling
1363 - How many accesses will become unaligned due to the peeling,
1364 and the cost of misaligned accesses.
1365 - The cost of peeling (the extra runtime checks, the increase
1366 in code size). */
1368 FOR_EACH_VEC_ELT (datarefs, i, dr)
1370 stmt = DR_STMT (dr);
1371 stmt_info = vinfo_for_stmt (stmt);
1373 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1374 continue;
1376 /* For interleaving, only the alignment of the first access
1377 matters. */
1378 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1379 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1380 continue;
1382 /* For invariant accesses there is nothing to enhance. */
1383 if (integer_zerop (DR_STEP (dr)))
1384 continue;
1386 /* Strided accesses perform only component accesses, alignment is
1387 irrelevant for them. */
1388 if (STMT_VINFO_STRIDED_P (stmt_info)
1389 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1390 continue;
1392 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1393 do_peeling = vector_alignment_reachable_p (dr);
1394 if (do_peeling)
1396 if (known_alignment_for_access_p (dr))
1398 unsigned int npeel_tmp;
1399 bool negative = tree_int_cst_compare (DR_STEP (dr),
1400 size_zero_node) < 0;
1402 /* Save info about DR in the hash table. */
1403 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1404 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1405 = new hash_table<peel_info_hasher> (1);
1407 vectype = STMT_VINFO_VECTYPE (stmt_info);
1408 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1409 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1410 TREE_TYPE (DR_REF (dr))));
1411 npeel_tmp = (negative
1412 ? (mis - nelements) : (nelements - mis))
1413 & (nelements - 1);
1415 /* For multiple types, it is possible that the bigger type access
1416 will have more than one peeling option. E.g., a loop with two
1417 types: one of size (vector size / 4), and the other one of
1418 size (vector size / 8). Vectorization factor will 8. If both
1419 access are misaligned by 3, the first one needs one scalar
1420 iteration to be aligned, and the second one needs 5. But the
1421 the first one will be aligned also by peeling 5 scalar
1422 iterations, and in that case both accesses will be aligned.
1423 Hence, except for the immediate peeling amount, we also want
1424 to try to add full vector size, while we don't exceed
1425 vectorization factor.
1426 We do this automtically for cost model, since we calculate cost
1427 for every peeling option. */
1428 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430 if (STMT_SLP_TYPE (stmt_info))
1431 possible_npeel_number
1432 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1433 else
1434 possible_npeel_number = vf / nelements;
1437 /* Handle the aligned case. We may decide to align some other
1438 access, making DR unaligned. */
1439 if (DR_MISALIGNMENT (dr) == 0)
1441 npeel_tmp = 0;
1442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443 possible_npeel_number++;
1446 for (j = 0; j < possible_npeel_number; j++)
1448 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1449 npeel_tmp += nelements;
1452 all_misalignments_unknown = false;
1453 /* Data-ref that was chosen for the case that all the
1454 misalignments are unknown is not relevant anymore, since we
1455 have a data-ref with known alignment. */
1456 dr0 = NULL;
1458 else
1460 /* If we don't know any misalignment values, we prefer
1461 peeling for data-ref that has the maximum number of data-refs
1462 with the same alignment, unless the target prefers to align
1463 stores over load. */
1464 if (all_misalignments_unknown)
1466 unsigned same_align_drs
1467 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1468 if (!dr0
1469 || same_align_drs_max < same_align_drs)
1471 same_align_drs_max = same_align_drs;
1472 dr0 = dr;
1474 /* For data-refs with the same number of related
1475 accesses prefer the one where the misalign
1476 computation will be invariant in the outermost loop. */
1477 else if (same_align_drs_max == same_align_drs)
1479 struct loop *ivloop0, *ivloop;
1480 ivloop0 = outermost_invariant_loop_for_expr
1481 (loop, DR_BASE_ADDRESS (dr0));
1482 ivloop = outermost_invariant_loop_for_expr
1483 (loop, DR_BASE_ADDRESS (dr));
1484 if ((ivloop && !ivloop0)
1485 || (ivloop && ivloop0
1486 && flow_loop_nested_p (ivloop, ivloop0)))
1487 dr0 = dr;
1490 if (!first_store && DR_IS_WRITE (dr))
1491 first_store = dr;
1494 /* If there are both known and unknown misaligned accesses in the
1495 loop, we choose peeling amount according to the known
1496 accesses. */
1497 if (!supportable_dr_alignment)
1499 dr0 = dr;
1500 if (!first_store && DR_IS_WRITE (dr))
1501 first_store = dr;
1505 else
1507 if (!aligned_access_p (dr))
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1511 "vector alignment may not be reachable\n");
1512 break;
1517 /* Check if we can possibly peel the loop. */
1518 if (!vect_can_advance_ivs_p (loop_vinfo)
1519 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1520 || loop->inner)
1521 do_peeling = false;
1523 if (do_peeling
1524 && all_misalignments_unknown
1525 && vect_supportable_dr_alignment (dr0, false))
1527 /* Check if the target requires to prefer stores over loads, i.e., if
1528 misaligned stores are more expensive than misaligned loads (taking
1529 drs with same alignment into account). */
1530 if (first_store && DR_IS_READ (dr0))
1532 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1533 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1534 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1535 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1536 stmt_vector_for_cost dummy;
1537 dummy.create (2);
1539 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1540 &dummy);
1541 vect_get_data_access_cost (first_store, &store_inside_cost,
1542 &store_outside_cost, &dummy);
1544 dummy.release ();
1546 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1547 aligning the load DR0). */
1548 load_inside_penalty = store_inside_cost;
1549 load_outside_penalty = store_outside_cost;
1550 for (i = 0;
1551 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1552 DR_STMT (first_store))).iterate (i, &dr);
1553 i++)
1554 if (DR_IS_READ (dr))
1556 load_inside_penalty += load_inside_cost;
1557 load_outside_penalty += load_outside_cost;
1559 else
1561 load_inside_penalty += store_inside_cost;
1562 load_outside_penalty += store_outside_cost;
1565 /* Calculate the penalty for leaving DR0 unaligned (by
1566 aligning the FIRST_STORE). */
1567 store_inside_penalty = load_inside_cost;
1568 store_outside_penalty = load_outside_cost;
1569 for (i = 0;
1570 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1571 DR_STMT (dr0))).iterate (i, &dr);
1572 i++)
1573 if (DR_IS_READ (dr))
1575 store_inside_penalty += load_inside_cost;
1576 store_outside_penalty += load_outside_cost;
1578 else
1580 store_inside_penalty += store_inside_cost;
1581 store_outside_penalty += store_outside_cost;
1584 if (load_inside_penalty > store_inside_penalty
1585 || (load_inside_penalty == store_inside_penalty
1586 && load_outside_penalty > store_outside_penalty))
1587 dr0 = first_store;
1590 /* In case there are only loads with different unknown misalignments, use
1591 peeling only if it may help to align other accesses in the loop or
1592 if it may help improving load bandwith when we'd end up using
1593 unaligned loads. */
1594 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1595 if (!first_store
1596 && !STMT_VINFO_SAME_ALIGN_REFS (
1597 vinfo_for_stmt (DR_STMT (dr0))).length ()
1598 && (vect_supportable_dr_alignment (dr0, false)
1599 != dr_unaligned_supported
1600 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1601 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1602 do_peeling = false;
1605 if (do_peeling && !dr0)
1607 /* Peeling is possible, but there is no data access that is not supported
1608 unless aligned. So we try to choose the best possible peeling. */
1610 /* We should get here only if there are drs with known misalignment. */
1611 gcc_assert (!all_misalignments_unknown);
1613 /* Choose the best peeling from the hash table. */
1614 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1615 &body_cost_vec);
1616 if (!dr0 || !npeel)
1617 do_peeling = false;
1620 if (do_peeling)
1622 stmt = DR_STMT (dr0);
1623 stmt_info = vinfo_for_stmt (stmt);
1624 vectype = STMT_VINFO_VECTYPE (stmt_info);
1625 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1627 if (known_alignment_for_access_p (dr0))
1629 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1630 size_zero_node) < 0;
1631 if (!npeel)
1633 /* Since it's known at compile time, compute the number of
1634 iterations in the peeled loop (the peeling factor) for use in
1635 updating DR_MISALIGNMENT values. The peeling factor is the
1636 vectorization factor minus the misalignment as an element
1637 count. */
1638 mis = DR_MISALIGNMENT (dr0);
1639 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1640 npeel = ((negative ? mis - nelements : nelements - mis)
1641 & (nelements - 1));
1644 /* For interleaved data access every iteration accesses all the
1645 members of the group, therefore we divide the number of iterations
1646 by the group size. */
1647 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1648 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1649 npeel /= GROUP_SIZE (stmt_info);
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "Try peeling by %d\n", npeel);
1656 /* Ensure that all data refs can be vectorized after the peel. */
1657 FOR_EACH_VEC_ELT (datarefs, i, dr)
1659 int save_misalignment;
1661 if (dr == dr0)
1662 continue;
1664 stmt = DR_STMT (dr);
1665 stmt_info = vinfo_for_stmt (stmt);
1666 /* For interleaving, only the alignment of the first access
1667 matters. */
1668 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1669 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1670 continue;
1672 /* Strided accesses perform only component accesses, alignment is
1673 irrelevant for them. */
1674 if (STMT_VINFO_STRIDED_P (stmt_info)
1675 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1676 continue;
1678 save_misalignment = DR_MISALIGNMENT (dr);
1679 vect_update_misalignment_for_peel (dr, dr0, npeel);
1680 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1681 SET_DR_MISALIGNMENT (dr, save_misalignment);
1683 if (!supportable_dr_alignment)
1685 do_peeling = false;
1686 break;
1690 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1692 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1693 if (!stat)
1694 do_peeling = false;
1695 else
1697 body_cost_vec.release ();
1698 return stat;
1702 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1703 if (do_peeling)
1705 unsigned max_allowed_peel
1706 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1707 if (max_allowed_peel != (unsigned)-1)
1709 unsigned max_peel = npeel;
1710 if (max_peel == 0)
1712 gimple dr_stmt = DR_STMT (dr0);
1713 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1714 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1715 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1717 if (max_peel > max_allowed_peel)
1719 do_peeling = false;
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "Disable peeling, max peels reached: %d\n", max_peel);
1727 /* Cost model #2 - if peeling may result in a remaining loop not
1728 iterating enough to be vectorized then do not peel. */
1729 if (do_peeling
1730 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1732 unsigned max_peel
1733 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1734 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1735 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1736 do_peeling = false;
1739 if (do_peeling)
1741 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1742 If the misalignment of DR_i is identical to that of dr0 then set
1743 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1744 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1745 by the peeling factor times the element size of DR_i (MOD the
1746 vectorization factor times the size). Otherwise, the
1747 misalignment of DR_i must be set to unknown. */
1748 FOR_EACH_VEC_ELT (datarefs, i, dr)
1749 if (dr != dr0)
1750 vect_update_misalignment_for_peel (dr, dr0, npeel);
1752 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1753 if (npeel)
1754 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1755 else
1756 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1757 = DR_MISALIGNMENT (dr0);
1758 SET_DR_MISALIGNMENT (dr0, 0);
1759 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_NOTE, vect_location,
1762 "Alignment of access forced using peeling.\n");
1763 dump_printf_loc (MSG_NOTE, vect_location,
1764 "Peeling for alignment will be applied.\n");
1766 /* The inside-loop cost will be accounted for in vectorizable_load
1767 and vectorizable_store correctly with adjusted alignments.
1768 Drop the body_cst_vec on the floor here. */
1769 body_cost_vec.release ();
1771 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1772 gcc_assert (stat);
1773 return stat;
1777 body_cost_vec.release ();
1779 /* (2) Versioning to force alignment. */
1781 /* Try versioning if:
1782 1) optimize loop for speed
1783 2) there is at least one unsupported misaligned data ref with an unknown
1784 misalignment, and
1785 3) all misaligned data refs with a known misalignment are supported, and
1786 4) the number of runtime alignment checks is within reason. */
1788 do_versioning =
1789 optimize_loop_nest_for_speed_p (loop)
1790 && (!loop->inner); /* FORNOW */
1792 if (do_versioning)
1794 FOR_EACH_VEC_ELT (datarefs, i, dr)
1796 stmt = DR_STMT (dr);
1797 stmt_info = vinfo_for_stmt (stmt);
1799 /* For interleaving, only the alignment of the first access
1800 matters. */
1801 if (aligned_access_p (dr)
1802 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1803 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1804 continue;
1806 if (STMT_VINFO_STRIDED_P (stmt_info))
1808 /* Strided loads perform only component accesses, alignment is
1809 irrelevant for them. */
1810 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1811 continue;
1812 do_versioning = false;
1813 break;
1816 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1818 if (!supportable_dr_alignment)
1820 gimple stmt;
1821 int mask;
1822 tree vectype;
1824 if (known_alignment_for_access_p (dr)
1825 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1826 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1828 do_versioning = false;
1829 break;
1832 stmt = DR_STMT (dr);
1833 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1834 gcc_assert (vectype);
1836 /* The rightmost bits of an aligned address must be zeros.
1837 Construct the mask needed for this test. For example,
1838 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1839 mask must be 15 = 0xf. */
1840 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1842 /* FORNOW: use the same mask to test all potentially unaligned
1843 references in the loop. The vectorizer currently supports
1844 a single vector size, see the reference to
1845 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1846 vectorization factor is computed. */
1847 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1848 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1849 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1850 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1851 DR_STMT (dr));
1855 /* Versioning requires at least one misaligned data reference. */
1856 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1857 do_versioning = false;
1858 else if (!do_versioning)
1859 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1862 if (do_versioning)
1864 vec<gimple> may_misalign_stmts
1865 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1866 gimple stmt;
1868 /* It can now be assumed that the data references in the statements
1869 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1870 of the loop being vectorized. */
1871 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1873 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1874 dr = STMT_VINFO_DATA_REF (stmt_info);
1875 SET_DR_MISALIGNMENT (dr, 0);
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_NOTE, vect_location,
1878 "Alignment of access forced using versioning.\n");
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location,
1883 "Versioning for alignment will be applied.\n");
1885 /* Peeling and versioning can't be done together at this time. */
1886 gcc_assert (! (do_peeling && do_versioning));
1888 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1889 gcc_assert (stat);
1890 return stat;
1893 /* This point is reached if neither peeling nor versioning is being done. */
1894 gcc_assert (! (do_peeling || do_versioning));
1896 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1897 return stat;
1901 /* Function vect_find_same_alignment_drs.
1903 Update group and alignment relations according to the chosen
1904 vectorization factor. */
1906 static void
1907 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1908 loop_vec_info loop_vinfo)
1910 unsigned int i;
1911 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1912 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1913 struct data_reference *dra = DDR_A (ddr);
1914 struct data_reference *drb = DDR_B (ddr);
1915 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1916 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1917 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1918 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1919 lambda_vector dist_v;
1920 unsigned int loop_depth;
1922 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1923 return;
1925 if (dra == drb)
1926 return;
1928 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1929 return;
1931 /* Loop-based vectorization and known data dependence. */
1932 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1933 return;
1935 /* Data-dependence analysis reports a distance vector of zero
1936 for data-references that overlap only in the first iteration
1937 but have different sign step (see PR45764).
1938 So as a sanity check require equal DR_STEP. */
1939 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1940 return;
1942 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1943 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1945 int dist = dist_v[loop_depth];
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "dependence distance = %d.\n", dist);
1951 /* Same loop iteration. */
1952 if (dist == 0
1953 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1955 /* Two references with distance zero have the same alignment. */
1956 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1957 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1958 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_NOTE, vect_location,
1961 "accesses have the same alignment.\n");
1962 dump_printf (MSG_NOTE,
1963 "dependence distance modulo vf == 0 between ");
1964 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1965 dump_printf (MSG_NOTE, " and ");
1966 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1967 dump_printf (MSG_NOTE, "\n");
1974 /* Function vect_analyze_data_refs_alignment
1976 Analyze the alignment of the data-references in the loop.
1977 Return FALSE if a data reference is found that cannot be vectorized. */
1979 bool
1980 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1981 bb_vec_info bb_vinfo)
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "=== vect_analyze_data_refs_alignment ===\n");
1987 /* Mark groups of data references with same alignment using
1988 data dependence information. */
1989 if (loop_vinfo)
1991 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1992 struct data_dependence_relation *ddr;
1993 unsigned int i;
1995 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1996 vect_find_same_alignment_drs (ddr, loop_vinfo);
1999 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "not vectorized: can't calculate alignment "
2004 "for data ref.\n");
2005 return false;
2008 return true;
2012 /* Analyze groups of accesses: check that DR belongs to a group of
2013 accesses of legal size, step, etc. Detect gaps, single element
2014 interleaving, and other special cases. Set grouped access info.
2015 Collect groups of strided stores for further use in SLP analysis.
2016 Worker for vect_analyze_group_access. */
2018 static bool
2019 vect_analyze_group_access_1 (struct data_reference *dr)
2021 tree step = DR_STEP (dr);
2022 tree scalar_type = TREE_TYPE (DR_REF (dr));
2023 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2024 gimple stmt = DR_STMT (dr);
2025 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2026 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2027 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2028 HOST_WIDE_INT dr_step = -1;
2029 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2030 bool slp_impossible = false;
2031 struct loop *loop = NULL;
2033 if (loop_vinfo)
2034 loop = LOOP_VINFO_LOOP (loop_vinfo);
2036 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2037 size of the interleaving group (including gaps). */
2038 if (tree_fits_shwi_p (step))
2040 dr_step = tree_to_shwi (step);
2041 groupsize = absu_hwi (dr_step) / type_size;
2043 else
2044 groupsize = 0;
2046 /* Not consecutive access is possible only if it is a part of interleaving. */
2047 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2049 /* Check if it this DR is a part of interleaving, and is a single
2050 element of the group that is accessed in the loop. */
2052 /* Gaps are supported only for loads. STEP must be a multiple of the type
2053 size. The size of the group must be a power of 2. */
2054 if (DR_IS_READ (dr)
2055 && (dr_step % type_size) == 0
2056 && groupsize > 0
2057 && exact_log2 (groupsize) != -1)
2059 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2060 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2061 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "Detected single element interleaving ");
2065 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2066 dump_printf (MSG_NOTE, " step ");
2067 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2068 dump_printf (MSG_NOTE, "\n");
2071 if (loop_vinfo)
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "Data access with gaps requires scalar "
2076 "epilogue loop\n");
2077 if (loop->inner)
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "Peeling for outer loop is not"
2082 " supported\n");
2083 return false;
2086 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2089 return true;
2092 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095 "not consecutive access ");
2096 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2097 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2100 if (bb_vinfo)
2102 /* Mark the statement as unvectorizable. */
2103 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2104 return true;
2107 return false;
2110 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2112 /* First stmt in the interleaving chain. Check the chain. */
2113 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2114 struct data_reference *data_ref = dr;
2115 unsigned int count = 1;
2116 tree prev_init = DR_INIT (data_ref);
2117 gimple prev = stmt;
2118 HOST_WIDE_INT diff, gaps = 0;
2120 while (next)
2122 /* Skip same data-refs. In case that two or more stmts share
2123 data-ref (supported only for loads), we vectorize only the first
2124 stmt, and the rest get their vectorized loads from the first
2125 one. */
2126 if (!tree_int_cst_compare (DR_INIT (data_ref),
2127 DR_INIT (STMT_VINFO_DATA_REF (
2128 vinfo_for_stmt (next)))))
2130 if (DR_IS_WRITE (data_ref))
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 "Two store stmts share the same dr.\n");
2135 return false;
2138 /* For load use the same data-ref load. */
2139 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2141 prev = next;
2142 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2143 continue;
2146 prev = next;
2147 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2149 /* All group members have the same STEP by construction. */
2150 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2152 /* Check that the distance between two accesses is equal to the type
2153 size. Otherwise, we have gaps. */
2154 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2155 - TREE_INT_CST_LOW (prev_init)) / type_size;
2156 if (diff != 1)
2158 /* FORNOW: SLP of accesses with gaps is not supported. */
2159 slp_impossible = true;
2160 if (DR_IS_WRITE (data_ref))
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2164 "interleaved store with gaps\n");
2165 return false;
2168 gaps += diff - 1;
2171 last_accessed_element += diff;
2173 /* Store the gap from the previous member of the group. If there is no
2174 gap in the access, GROUP_GAP is always 1. */
2175 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2177 prev_init = DR_INIT (data_ref);
2178 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2179 /* Count the number of data-refs in the chain. */
2180 count++;
2183 if (groupsize == 0)
2184 groupsize = count + gaps;
2186 if (groupsize > UINT_MAX)
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2190 "group is too large\n");
2191 return false;
2194 /* Check that the size of the interleaving is equal to count for stores,
2195 i.e., that there are no gaps. */
2196 if (groupsize != count
2197 && !DR_IS_READ (dr))
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2201 "interleaved store with gaps\n");
2202 return false;
2205 /* If there is a gap after the last load in the group it is the
2206 difference between the groupsize and the last accessed
2207 element.
2208 When there is no gap, this difference should be 0. */
2209 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2211 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2212 if (dump_enabled_p ())
2214 dump_printf_loc (MSG_NOTE, vect_location,
2215 "Detected interleaving ");
2216 if (DR_IS_READ (dr))
2217 dump_printf (MSG_NOTE, "load ");
2218 else
2219 dump_printf (MSG_NOTE, "store ");
2220 dump_printf (MSG_NOTE, "of size %u starting with ",
2221 (unsigned)groupsize);
2222 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2223 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2224 dump_printf_loc (MSG_NOTE, vect_location,
2225 "There is a gap of %u elements after the group\n",
2226 GROUP_GAP (vinfo_for_stmt (stmt)));
2229 /* SLP: create an SLP data structure for every interleaving group of
2230 stores for further analysis in vect_analyse_slp. */
2231 if (DR_IS_WRITE (dr) && !slp_impossible)
2233 if (loop_vinfo)
2234 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2235 if (bb_vinfo)
2236 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2239 /* If there is a gap in the end of the group or the group size cannot
2240 be made a multiple of the vector element count then we access excess
2241 elements in the last iteration and thus need to peel that off. */
2242 if (loop_vinfo
2243 && (groupsize - last_accessed_element > 0
2244 || exact_log2 (groupsize) == -1))
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "Data access with gaps requires scalar "
2250 "epilogue loop\n");
2251 if (loop->inner)
2253 if (dump_enabled_p ())
2254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2255 "Peeling for outer loop is not supported\n");
2256 return false;
2259 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2263 return true;
2266 /* Analyze groups of accesses: check that DR belongs to a group of
2267 accesses of legal size, step, etc. Detect gaps, single element
2268 interleaving, and other special cases. Set grouped access info.
2269 Collect groups of strided stores for further use in SLP analysis. */
2271 static bool
2272 vect_analyze_group_access (struct data_reference *dr)
2274 if (!vect_analyze_group_access_1 (dr))
2276 /* Dissolve the group if present. */
2277 gimple next, stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2278 while (stmt)
2280 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2281 next = GROUP_NEXT_ELEMENT (vinfo);
2282 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2283 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2284 stmt = next;
2286 return false;
2288 return true;
2291 /* Analyze the access pattern of the data-reference DR.
2292 In case of non-consecutive accesses call vect_analyze_group_access() to
2293 analyze groups of accesses. */
2295 static bool
2296 vect_analyze_data_ref_access (struct data_reference *dr)
2298 tree step = DR_STEP (dr);
2299 tree scalar_type = TREE_TYPE (DR_REF (dr));
2300 gimple stmt = DR_STMT (dr);
2301 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2302 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2303 struct loop *loop = NULL;
2305 if (loop_vinfo)
2306 loop = LOOP_VINFO_LOOP (loop_vinfo);
2308 if (loop_vinfo && !step)
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2312 "bad data-ref access in loop\n");
2313 return false;
2316 /* Allow loads with zero step in inner-loop vectorization. */
2317 if (loop_vinfo && integer_zerop (step))
2319 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2320 if (!nested_in_vect_loop_p (loop, stmt))
2321 return DR_IS_READ (dr);
2322 /* Allow references with zero step for outer loops marked
2323 with pragma omp simd only - it guarantees absence of
2324 loop-carried dependencies between inner loop iterations. */
2325 if (!loop->force_vectorize)
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_NOTE, vect_location,
2329 "zero step in inner loop of nest\n");
2330 return false;
2334 if (loop && nested_in_vect_loop_p (loop, stmt))
2336 /* Interleaved accesses are not yet supported within outer-loop
2337 vectorization for references in the inner-loop. */
2338 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2340 /* For the rest of the analysis we use the outer-loop step. */
2341 step = STMT_VINFO_DR_STEP (stmt_info);
2342 if (integer_zerop (step))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "zero step in outer loop.\n");
2347 return DR_IS_READ (dr);
2351 /* Consecutive? */
2352 if (TREE_CODE (step) == INTEGER_CST)
2354 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2355 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2356 || (dr_step < 0
2357 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2359 /* Mark that it is not interleaving. */
2360 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2361 return true;
2365 if (loop && nested_in_vect_loop_p (loop, stmt))
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "grouped access in outer loop.\n");
2370 return false;
2374 /* Assume this is a DR handled by non-constant strided load case. */
2375 if (TREE_CODE (step) != INTEGER_CST)
2376 return (STMT_VINFO_STRIDED_P (stmt_info)
2377 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2378 || vect_analyze_group_access (dr)));
2380 /* Not consecutive access - check if it's a part of interleaving group. */
2381 return vect_analyze_group_access (dr);
2386 /* A helper function used in the comparator function to sort data
2387 references. T1 and T2 are two data references to be compared.
2388 The function returns -1, 0, or 1. */
2390 static int
2391 compare_tree (tree t1, tree t2)
2393 int i, cmp;
2394 enum tree_code code;
2395 char tclass;
2397 if (t1 == t2)
2398 return 0;
2399 if (t1 == NULL)
2400 return -1;
2401 if (t2 == NULL)
2402 return 1;
2405 if (TREE_CODE (t1) != TREE_CODE (t2))
2406 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2408 code = TREE_CODE (t1);
2409 switch (code)
2411 /* For const values, we can just use hash values for comparisons. */
2412 case INTEGER_CST:
2413 case REAL_CST:
2414 case FIXED_CST:
2415 case STRING_CST:
2416 case COMPLEX_CST:
2417 case VECTOR_CST:
2419 hashval_t h1 = iterative_hash_expr (t1, 0);
2420 hashval_t h2 = iterative_hash_expr (t2, 0);
2421 if (h1 != h2)
2422 return h1 < h2 ? -1 : 1;
2423 break;
2426 case SSA_NAME:
2427 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2428 if (cmp != 0)
2429 return cmp;
2431 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2432 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2433 break;
2435 default:
2436 tclass = TREE_CODE_CLASS (code);
2438 /* For var-decl, we could compare their UIDs. */
2439 if (tclass == tcc_declaration)
2441 if (DECL_UID (t1) != DECL_UID (t2))
2442 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2443 break;
2446 /* For expressions with operands, compare their operands recursively. */
2447 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2449 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2450 if (cmp != 0)
2451 return cmp;
2455 return 0;
2459 /* Compare two data-references DRA and DRB to group them into chunks
2460 suitable for grouping. */
2462 static int
2463 dr_group_sort_cmp (const void *dra_, const void *drb_)
2465 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2466 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2467 int cmp;
2469 /* Stabilize sort. */
2470 if (dra == drb)
2471 return 0;
2473 /* Ordering of DRs according to base. */
2474 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2476 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2477 if (cmp != 0)
2478 return cmp;
2481 /* And according to DR_OFFSET. */
2482 if (!dr_equal_offsets_p (dra, drb))
2484 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2485 if (cmp != 0)
2486 return cmp;
2489 /* Put reads before writes. */
2490 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2491 return DR_IS_READ (dra) ? -1 : 1;
2493 /* Then sort after access size. */
2494 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2495 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2497 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2498 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2499 if (cmp != 0)
2500 return cmp;
2503 /* And after step. */
2504 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2506 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2507 if (cmp != 0)
2508 return cmp;
2511 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2512 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2513 if (cmp == 0)
2514 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2515 return cmp;
2518 /* Function vect_analyze_data_ref_accesses.
2520 Analyze the access pattern of all the data references in the loop.
2522 FORNOW: the only access pattern that is considered vectorizable is a
2523 simple step 1 (consecutive) access.
2525 FORNOW: handle only arrays and pointer accesses. */
2527 bool
2528 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2530 unsigned int i;
2531 vec<data_reference_p> datarefs;
2532 struct data_reference *dr;
2534 if (dump_enabled_p ())
2535 dump_printf_loc (MSG_NOTE, vect_location,
2536 "=== vect_analyze_data_ref_accesses ===\n");
2538 if (loop_vinfo)
2539 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2540 else
2541 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2543 if (datarefs.is_empty ())
2544 return true;
2546 /* Sort the array of datarefs to make building the interleaving chains
2547 linear. Don't modify the original vector's order, it is needed for
2548 determining what dependencies are reversed. */
2549 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2550 datarefs_copy.qsort (dr_group_sort_cmp);
2552 /* Build the interleaving chains. */
2553 for (i = 0; i < datarefs_copy.length () - 1;)
2555 data_reference_p dra = datarefs_copy[i];
2556 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2557 stmt_vec_info lastinfo = NULL;
2558 for (i = i + 1; i < datarefs_copy.length (); ++i)
2560 data_reference_p drb = datarefs_copy[i];
2561 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2563 /* ??? Imperfect sorting (non-compatible types, non-modulo
2564 accesses, same accesses) can lead to a group to be artificially
2565 split here as we don't just skip over those. If it really
2566 matters we can push those to a worklist and re-iterate
2567 over them. The we can just skip ahead to the next DR here. */
2569 /* Check that the data-refs have same first location (except init)
2570 and they are both either store or load (not load and store,
2571 not masked loads or stores). */
2572 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2573 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2574 DR_BASE_ADDRESS (drb), 0)
2575 || !dr_equal_offsets_p (dra, drb)
2576 || !gimple_assign_single_p (DR_STMT (dra))
2577 || !gimple_assign_single_p (DR_STMT (drb)))
2578 break;
2580 /* Check that the data-refs have the same constant size. */
2581 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2582 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2583 if (!tree_fits_uhwi_p (sza)
2584 || !tree_fits_uhwi_p (szb)
2585 || !tree_int_cst_equal (sza, szb))
2586 break;
2588 /* Check that the data-refs have the same step. */
2589 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2590 break;
2592 /* Do not place the same access in the interleaving chain twice. */
2593 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2594 break;
2596 /* Check the types are compatible.
2597 ??? We don't distinguish this during sorting. */
2598 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2599 TREE_TYPE (DR_REF (drb))))
2600 break;
2602 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2603 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2604 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2605 gcc_assert (init_a < init_b);
2607 /* If init_b == init_a + the size of the type * k, we have an
2608 interleaving, and DRA is accessed before DRB. */
2609 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2610 if ((init_b - init_a) % type_size_a != 0)
2611 break;
2613 /* If we have a store, the accesses are adjacent. This splits
2614 groups into chunks we support (we don't support vectorization
2615 of stores with gaps). */
2616 if (!DR_IS_READ (dra)
2617 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2618 (DR_INIT (datarefs_copy[i-1]))
2619 != type_size_a))
2620 break;
2622 /* If the step (if not zero or non-constant) is greater than the
2623 difference between data-refs' inits this splits groups into
2624 suitable sizes. */
2625 if (tree_fits_shwi_p (DR_STEP (dra)))
2627 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2628 if (step != 0 && step <= (init_b - init_a))
2629 break;
2632 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_NOTE, vect_location,
2635 "Detected interleaving ");
2636 if (DR_IS_READ (dra))
2637 dump_printf (MSG_NOTE, "load ");
2638 else
2639 dump_printf (MSG_NOTE, "store ");
2640 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2641 dump_printf (MSG_NOTE, " and ");
2642 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2643 dump_printf (MSG_NOTE, "\n");
2646 /* Link the found element into the group list. */
2647 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2649 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2650 lastinfo = stmtinfo_a;
2652 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2653 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2654 lastinfo = stmtinfo_b;
2658 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2659 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2660 && !vect_analyze_data_ref_access (dr))
2662 if (dump_enabled_p ())
2663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2664 "not vectorized: complicated access pattern.\n");
2666 if (bb_vinfo)
2668 /* Mark the statement as not vectorizable. */
2669 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2670 continue;
2672 else
2674 datarefs_copy.release ();
2675 return false;
2679 datarefs_copy.release ();
2680 return true;
2684 /* Operator == between two dr_with_seg_len objects.
2686 This equality operator is used to make sure two data refs
2687 are the same one so that we will consider to combine the
2688 aliasing checks of those two pairs of data dependent data
2689 refs. */
2691 static bool
2692 operator == (const dr_with_seg_len& d1,
2693 const dr_with_seg_len& d2)
2695 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2696 DR_BASE_ADDRESS (d2.dr), 0)
2697 && compare_tree (d1.offset, d2.offset) == 0
2698 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2701 /* Function comp_dr_with_seg_len_pair.
2703 Comparison function for sorting objects of dr_with_seg_len_pair_t
2704 so that we can combine aliasing checks in one scan. */
2706 static int
2707 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2709 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2710 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2712 const dr_with_seg_len &p11 = p1->first,
2713 &p12 = p1->second,
2714 &p21 = p2->first,
2715 &p22 = p2->second;
2717 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2718 if a and c have the same basic address snd step, and b and d have the same
2719 address and step. Therefore, if any a&c or b&d don't have the same address
2720 and step, we don't care the order of those two pairs after sorting. */
2721 int comp_res;
2723 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2724 DR_BASE_ADDRESS (p21.dr))) != 0)
2725 return comp_res;
2726 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2727 DR_BASE_ADDRESS (p22.dr))) != 0)
2728 return comp_res;
2729 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2730 return comp_res;
2731 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2732 return comp_res;
2733 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2734 return comp_res;
2735 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2736 return comp_res;
2738 return 0;
2741 /* Function vect_vfa_segment_size.
2743 Create an expression that computes the size of segment
2744 that will be accessed for a data reference. The functions takes into
2745 account that realignment loads may access one more vector.
2747 Input:
2748 DR: The data reference.
2749 LENGTH_FACTOR: segment length to consider.
2751 Return an expression whose value is the size of segment which will be
2752 accessed by DR. */
2754 static tree
2755 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2757 tree segment_length;
2759 if (integer_zerop (DR_STEP (dr)))
2760 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2761 else
2762 segment_length = size_binop (MULT_EXPR,
2763 fold_convert (sizetype, DR_STEP (dr)),
2764 fold_convert (sizetype, length_factor));
2766 if (vect_supportable_dr_alignment (dr, false)
2767 == dr_explicit_realign_optimized)
2769 tree vector_size = TYPE_SIZE_UNIT
2770 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2772 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2774 return segment_length;
2777 /* Function vect_prune_runtime_alias_test_list.
2779 Prune a list of ddrs to be tested at run-time by versioning for alias.
2780 Merge several alias checks into one if possible.
2781 Return FALSE if resulting list of ddrs is longer then allowed by
2782 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2784 bool
2785 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2787 vec<ddr_p> may_alias_ddrs =
2788 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2789 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2790 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2791 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2792 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2794 ddr_p ddr;
2795 unsigned int i;
2796 tree length_factor;
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_NOTE, vect_location,
2800 "=== vect_prune_runtime_alias_test_list ===\n");
2802 if (may_alias_ddrs.is_empty ())
2803 return true;
2805 /* Basically, for each pair of dependent data refs store_ptr_0
2806 and load_ptr_0, we create an expression:
2808 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2809 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2811 for aliasing checks. However, in some cases we can decrease
2812 the number of checks by combining two checks into one. For
2813 example, suppose we have another pair of data refs store_ptr_0
2814 and load_ptr_1, and if the following condition is satisfied:
2816 load_ptr_0 < load_ptr_1 &&
2817 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2819 (this condition means, in each iteration of vectorized loop,
2820 the accessed memory of store_ptr_0 cannot be between the memory
2821 of load_ptr_0 and load_ptr_1.)
2823 we then can use only the following expression to finish the
2824 alising checks between store_ptr_0 & load_ptr_0 and
2825 store_ptr_0 & load_ptr_1:
2827 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2828 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2830 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2831 same basic address. */
2833 comp_alias_ddrs.create (may_alias_ddrs.length ());
2835 /* First, we collect all data ref pairs for aliasing checks. */
2836 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2838 struct data_reference *dr_a, *dr_b;
2839 gimple dr_group_first_a, dr_group_first_b;
2840 tree segment_length_a, segment_length_b;
2841 gimple stmt_a, stmt_b;
2843 dr_a = DDR_A (ddr);
2844 stmt_a = DR_STMT (DDR_A (ddr));
2845 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2846 if (dr_group_first_a)
2848 stmt_a = dr_group_first_a;
2849 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2852 dr_b = DDR_B (ddr);
2853 stmt_b = DR_STMT (DDR_B (ddr));
2854 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2855 if (dr_group_first_b)
2857 stmt_b = dr_group_first_b;
2858 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2861 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2862 length_factor = scalar_loop_iters;
2863 else
2864 length_factor = size_int (vect_factor);
2865 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2866 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2868 dr_with_seg_len_pair_t dr_with_seg_len_pair
2869 (dr_with_seg_len (dr_a, segment_length_a),
2870 dr_with_seg_len (dr_b, segment_length_b));
2872 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2873 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2875 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2878 /* Second, we sort the collected data ref pairs so that we can scan
2879 them once to combine all possible aliasing checks. */
2880 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2882 /* Third, we scan the sorted dr pairs and check if we can combine
2883 alias checks of two neighbouring dr pairs. */
2884 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2886 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2887 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2888 *dr_b1 = &comp_alias_ddrs[i-1].second,
2889 *dr_a2 = &comp_alias_ddrs[i].first,
2890 *dr_b2 = &comp_alias_ddrs[i].second;
2892 /* Remove duplicate data ref pairs. */
2893 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2895 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location,
2898 "found equal ranges ");
2899 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2900 DR_REF (dr_a1->dr));
2901 dump_printf (MSG_NOTE, ", ");
2902 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2903 DR_REF (dr_b1->dr));
2904 dump_printf (MSG_NOTE, " and ");
2905 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2906 DR_REF (dr_a2->dr));
2907 dump_printf (MSG_NOTE, ", ");
2908 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2909 DR_REF (dr_b2->dr));
2910 dump_printf (MSG_NOTE, "\n");
2913 comp_alias_ddrs.ordered_remove (i--);
2914 continue;
2917 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2919 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2920 and DR_A1 and DR_A2 are two consecutive memrefs. */
2921 if (*dr_a1 == *dr_a2)
2923 std::swap (dr_a1, dr_b1);
2924 std::swap (dr_a2, dr_b2);
2927 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2928 DR_BASE_ADDRESS (dr_a2->dr),
2930 || !tree_fits_shwi_p (dr_a1->offset)
2931 || !tree_fits_shwi_p (dr_a2->offset))
2932 continue;
2934 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2935 - tree_to_shwi (dr_a1->offset));
2938 /* Now we check if the following condition is satisfied:
2940 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2942 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2943 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2944 have to make a best estimation. We can get the minimum value
2945 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2946 then either of the following two conditions can guarantee the
2947 one above:
2949 1: DIFF <= MIN_SEG_LEN_B
2950 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2954 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2955 ? tree_to_shwi (dr_b1->seg_len)
2956 : vect_factor);
2958 if (diff <= min_seg_len_b
2959 || (tree_fits_shwi_p (dr_a1->seg_len)
2960 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2962 if (dump_enabled_p ())
2964 dump_printf_loc (MSG_NOTE, vect_location,
2965 "merging ranges for ");
2966 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2967 DR_REF (dr_a1->dr));
2968 dump_printf (MSG_NOTE, ", ");
2969 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2970 DR_REF (dr_b1->dr));
2971 dump_printf (MSG_NOTE, " and ");
2972 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2973 DR_REF (dr_a2->dr));
2974 dump_printf (MSG_NOTE, ", ");
2975 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2976 DR_REF (dr_b2->dr));
2977 dump_printf (MSG_NOTE, "\n");
2980 dr_a1->seg_len = size_binop (PLUS_EXPR,
2981 dr_a2->seg_len, size_int (diff));
2982 comp_alias_ddrs.ordered_remove (i--);
2987 dump_printf_loc (MSG_NOTE, vect_location,
2988 "improved number of alias checks from %d to %d\n",
2989 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2990 if ((int) comp_alias_ddrs.length () >
2991 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2992 return false;
2994 return true;
2997 /* Check whether a non-affine read or write in stmt is suitable for gather load
2998 or scatter store and if so, return a builtin decl for that operation. */
3000 tree
3001 vect_check_gather_scatter (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
3002 tree *offp, int *scalep)
3004 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3005 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3007 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3008 tree offtype = NULL_TREE;
3009 tree decl, base, off;
3010 machine_mode pmode;
3011 int punsignedp, pvolatilep;
3013 base = DR_REF (dr);
3014 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3015 see if we can use the def stmt of the address. */
3016 if (is_gimple_call (stmt)
3017 && gimple_call_internal_p (stmt)
3018 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3019 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3020 && TREE_CODE (base) == MEM_REF
3021 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3022 && integer_zerop (TREE_OPERAND (base, 1))
3023 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3025 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3026 if (is_gimple_assign (def_stmt)
3027 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3028 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3031 /* The gather and scatter builtins need address of the form
3032 loop_invariant + vector * {1, 2, 4, 8}
3034 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3035 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3036 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3037 multiplications and additions in it. To get a vector, we need
3038 a single SSA_NAME that will be defined in the loop and will
3039 contain everything that is not loop invariant and that can be
3040 vectorized. The following code attempts to find such a preexistng
3041 SSA_NAME OFF and put the loop invariants into a tree BASE
3042 that can be gimplified before the loop. */
3043 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3044 &pmode, &punsignedp, &pvolatilep, false);
3045 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3047 if (TREE_CODE (base) == MEM_REF)
3049 if (!integer_zerop (TREE_OPERAND (base, 1)))
3051 if (off == NULL_TREE)
3053 offset_int moff = mem_ref_offset (base);
3054 off = wide_int_to_tree (sizetype, moff);
3056 else
3057 off = size_binop (PLUS_EXPR, off,
3058 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3060 base = TREE_OPERAND (base, 0);
3062 else
3063 base = build_fold_addr_expr (base);
3065 if (off == NULL_TREE)
3066 off = size_zero_node;
3068 /* If base is not loop invariant, either off is 0, then we start with just
3069 the constant offset in the loop invariant BASE and continue with base
3070 as OFF, otherwise give up.
3071 We could handle that case by gimplifying the addition of base + off
3072 into some SSA_NAME and use that as off, but for now punt. */
3073 if (!expr_invariant_in_loop_p (loop, base))
3075 if (!integer_zerop (off))
3076 return NULL_TREE;
3077 off = base;
3078 base = size_int (pbitpos / BITS_PER_UNIT);
3080 /* Otherwise put base + constant offset into the loop invariant BASE
3081 and continue with OFF. */
3082 else
3084 base = fold_convert (sizetype, base);
3085 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3088 /* OFF at this point may be either a SSA_NAME or some tree expression
3089 from get_inner_reference. Try to peel off loop invariants from it
3090 into BASE as long as possible. */
3091 STRIP_NOPS (off);
3092 while (offtype == NULL_TREE)
3094 enum tree_code code;
3095 tree op0, op1, add = NULL_TREE;
3097 if (TREE_CODE (off) == SSA_NAME)
3099 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3101 if (expr_invariant_in_loop_p (loop, off))
3102 return NULL_TREE;
3104 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3105 break;
3107 op0 = gimple_assign_rhs1 (def_stmt);
3108 code = gimple_assign_rhs_code (def_stmt);
3109 op1 = gimple_assign_rhs2 (def_stmt);
3111 else
3113 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3114 return NULL_TREE;
3115 code = TREE_CODE (off);
3116 extract_ops_from_tree (off, &code, &op0, &op1);
3118 switch (code)
3120 case POINTER_PLUS_EXPR:
3121 case PLUS_EXPR:
3122 if (expr_invariant_in_loop_p (loop, op0))
3124 add = op0;
3125 off = op1;
3126 do_add:
3127 add = fold_convert (sizetype, add);
3128 if (scale != 1)
3129 add = size_binop (MULT_EXPR, add, size_int (scale));
3130 base = size_binop (PLUS_EXPR, base, add);
3131 continue;
3133 if (expr_invariant_in_loop_p (loop, op1))
3135 add = op1;
3136 off = op0;
3137 goto do_add;
3139 break;
3140 case MINUS_EXPR:
3141 if (expr_invariant_in_loop_p (loop, op1))
3143 add = fold_convert (sizetype, op1);
3144 add = size_binop (MINUS_EXPR, size_zero_node, add);
3145 off = op0;
3146 goto do_add;
3148 break;
3149 case MULT_EXPR:
3150 if (scale == 1 && tree_fits_shwi_p (op1))
3152 scale = tree_to_shwi (op1);
3153 off = op0;
3154 continue;
3156 break;
3157 case SSA_NAME:
3158 off = op0;
3159 continue;
3160 CASE_CONVERT:
3161 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3162 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3163 break;
3164 if (TYPE_PRECISION (TREE_TYPE (op0))
3165 == TYPE_PRECISION (TREE_TYPE (off)))
3167 off = op0;
3168 continue;
3170 if (TYPE_PRECISION (TREE_TYPE (op0))
3171 < TYPE_PRECISION (TREE_TYPE (off)))
3173 off = op0;
3174 offtype = TREE_TYPE (off);
3175 STRIP_NOPS (off);
3176 continue;
3178 break;
3179 default:
3180 break;
3182 break;
3185 /* If at the end OFF still isn't a SSA_NAME or isn't
3186 defined in the loop, punt. */
3187 if (TREE_CODE (off) != SSA_NAME
3188 || expr_invariant_in_loop_p (loop, off))
3189 return NULL_TREE;
3191 if (offtype == NULL_TREE)
3192 offtype = TREE_TYPE (off);
3194 if (DR_IS_READ (dr))
3195 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3196 offtype, scale);
3197 else
3198 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3199 offtype, scale);
3201 if (decl == NULL_TREE)
3202 return NULL_TREE;
3204 if (basep)
3205 *basep = base;
3206 if (offp)
3207 *offp = off;
3208 if (scalep)
3209 *scalep = scale;
3210 return decl;
3213 /* Function vect_analyze_data_refs.
3215 Find all the data references in the loop or basic block.
3217 The general structure of the analysis of data refs in the vectorizer is as
3218 follows:
3219 1- vect_analyze_data_refs(loop/bb): call
3220 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3221 in the loop/bb and their dependences.
3222 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3223 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3224 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3228 bool
3229 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3230 bb_vec_info bb_vinfo,
3231 int *min_vf, unsigned *n_stmts)
3233 struct loop *loop = NULL;
3234 basic_block bb = NULL;
3235 unsigned int i;
3236 vec<data_reference_p> datarefs;
3237 struct data_reference *dr;
3238 tree scalar_type;
3240 if (dump_enabled_p ())
3241 dump_printf_loc (MSG_NOTE, vect_location,
3242 "=== vect_analyze_data_refs ===\n");
3244 if (loop_vinfo)
3246 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3248 loop = LOOP_VINFO_LOOP (loop_vinfo);
3249 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3250 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3252 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3254 "not vectorized: loop contains function calls"
3255 " or data references that cannot be analyzed\n");
3256 return false;
3259 for (i = 0; i < loop->num_nodes; i++)
3261 gimple_stmt_iterator gsi;
3263 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3265 gimple stmt = gsi_stmt (gsi);
3266 if (is_gimple_debug (stmt))
3267 continue;
3268 ++*n_stmts;
3269 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3271 if (is_gimple_call (stmt) && loop->safelen)
3273 tree fndecl = gimple_call_fndecl (stmt), op;
3274 if (fndecl != NULL_TREE)
3276 struct cgraph_node *node = cgraph_node::get (fndecl);
3277 if (node != NULL && node->simd_clones != NULL)
3279 unsigned int j, n = gimple_call_num_args (stmt);
3280 for (j = 0; j < n; j++)
3282 op = gimple_call_arg (stmt, j);
3283 if (DECL_P (op)
3284 || (REFERENCE_CLASS_P (op)
3285 && get_base_address (op)))
3286 break;
3288 op = gimple_call_lhs (stmt);
3289 /* Ignore #pragma omp declare simd functions
3290 if they don't have data references in the
3291 call stmt itself. */
3292 if (j == n
3293 && !(op
3294 && (DECL_P (op)
3295 || (REFERENCE_CLASS_P (op)
3296 && get_base_address (op)))))
3297 continue;
3301 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3304 "not vectorized: loop contains function "
3305 "calls or data references that cannot "
3306 "be analyzed\n");
3307 return false;
3312 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3314 else
3316 gimple_stmt_iterator gsi;
3318 bb = BB_VINFO_BB (bb_vinfo);
3319 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3321 gimple stmt = gsi_stmt (gsi);
3322 if (is_gimple_debug (stmt))
3323 continue;
3324 ++*n_stmts;
3325 if (!find_data_references_in_stmt (NULL, stmt,
3326 &BB_VINFO_DATAREFS (bb_vinfo)))
3328 /* Mark the rest of the basic-block as unvectorizable. */
3329 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3331 stmt = gsi_stmt (gsi);
3332 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3334 break;
3338 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3341 /* Go through the data-refs, check that the analysis succeeded. Update
3342 pointer from stmt_vec_info struct to DR and vectype. */
3344 FOR_EACH_VEC_ELT (datarefs, i, dr)
3346 gimple stmt;
3347 stmt_vec_info stmt_info;
3348 tree base, offset, init;
3349 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3350 bool simd_lane_access = false;
3351 int vf;
3353 again:
3354 if (!dr || !DR_REF (dr))
3356 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3358 "not vectorized: unhandled data-ref\n");
3359 return false;
3362 stmt = DR_STMT (dr);
3363 stmt_info = vinfo_for_stmt (stmt);
3365 /* Discard clobbers from the dataref vector. We will remove
3366 clobber stmts during vectorization. */
3367 if (gimple_clobber_p (stmt))
3369 free_data_ref (dr);
3370 if (i == datarefs.length () - 1)
3372 datarefs.pop ();
3373 break;
3375 datarefs.ordered_remove (i);
3376 dr = datarefs[i];
3377 goto again;
3380 /* Check that analysis of the data-ref succeeded. */
3381 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3382 || !DR_STEP (dr))
3384 bool maybe_gather
3385 = DR_IS_READ (dr)
3386 && !TREE_THIS_VOLATILE (DR_REF (dr))
3387 && targetm.vectorize.builtin_gather != NULL;
3388 bool maybe_scatter
3389 = DR_IS_WRITE (dr)
3390 && !TREE_THIS_VOLATILE (DR_REF (dr))
3391 && targetm.vectorize.builtin_scatter != NULL;
3392 bool maybe_simd_lane_access
3393 = loop_vinfo && loop->simduid;
3395 /* If target supports vector gather loads or scatter stores, or if
3396 this might be a SIMD lane access, see if they can't be used. */
3397 if (loop_vinfo
3398 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3399 && !nested_in_vect_loop_p (loop, stmt))
3401 struct data_reference *newdr
3402 = create_data_ref (NULL, loop_containing_stmt (stmt),
3403 DR_REF (dr), stmt, maybe_scatter ? false : true);
3404 gcc_assert (newdr != NULL && DR_REF (newdr));
3405 if (DR_BASE_ADDRESS (newdr)
3406 && DR_OFFSET (newdr)
3407 && DR_INIT (newdr)
3408 && DR_STEP (newdr)
3409 && integer_zerop (DR_STEP (newdr)))
3411 if (maybe_simd_lane_access)
3413 tree off = DR_OFFSET (newdr);
3414 STRIP_NOPS (off);
3415 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3416 && TREE_CODE (off) == MULT_EXPR
3417 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3419 tree step = TREE_OPERAND (off, 1);
3420 off = TREE_OPERAND (off, 0);
3421 STRIP_NOPS (off);
3422 if (CONVERT_EXPR_P (off)
3423 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3424 0)))
3425 < TYPE_PRECISION (TREE_TYPE (off)))
3426 off = TREE_OPERAND (off, 0);
3427 if (TREE_CODE (off) == SSA_NAME)
3429 gimple def = SSA_NAME_DEF_STMT (off);
3430 tree reft = TREE_TYPE (DR_REF (newdr));
3431 if (is_gimple_call (def)
3432 && gimple_call_internal_p (def)
3433 && (gimple_call_internal_fn (def)
3434 == IFN_GOMP_SIMD_LANE))
3436 tree arg = gimple_call_arg (def, 0);
3437 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3438 arg = SSA_NAME_VAR (arg);
3439 if (arg == loop->simduid
3440 /* For now. */
3441 && tree_int_cst_equal
3442 (TYPE_SIZE_UNIT (reft),
3443 step))
3445 DR_OFFSET (newdr) = ssize_int (0);
3446 DR_STEP (newdr) = step;
3447 DR_ALIGNED_TO (newdr)
3448 = size_int (BIGGEST_ALIGNMENT);
3449 dr = newdr;
3450 simd_lane_access = true;
3456 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3458 dr = newdr;
3459 if (maybe_gather)
3460 gatherscatter = GATHER;
3461 else
3462 gatherscatter = SCATTER;
3465 if (gatherscatter == SG_NONE && !simd_lane_access)
3466 free_data_ref (newdr);
3469 if (gatherscatter == SG_NONE && !simd_lane_access)
3471 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3474 "not vectorized: data ref analysis "
3475 "failed ");
3476 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3477 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3480 if (bb_vinfo)
3481 break;
3483 return false;
3487 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3489 if (dump_enabled_p ())
3490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3491 "not vectorized: base addr of dr is a "
3492 "constant\n");
3494 if (bb_vinfo)
3495 break;
3497 if (gatherscatter != SG_NONE || simd_lane_access)
3498 free_data_ref (dr);
3499 return false;
3502 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3504 if (dump_enabled_p ())
3506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3507 "not vectorized: volatile type ");
3508 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3509 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3512 if (bb_vinfo)
3513 break;
3515 return false;
3518 if (stmt_can_throw_internal (stmt))
3520 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3523 "not vectorized: statement can throw an "
3524 "exception ");
3525 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3529 if (bb_vinfo)
3530 break;
3532 if (gatherscatter != SG_NONE || simd_lane_access)
3533 free_data_ref (dr);
3534 return false;
3537 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3538 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3540 if (dump_enabled_p ())
3542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3543 "not vectorized: statement is bitfield "
3544 "access ");
3545 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3546 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3549 if (bb_vinfo)
3550 break;
3552 if (gatherscatter != SG_NONE || simd_lane_access)
3553 free_data_ref (dr);
3554 return false;
3557 base = unshare_expr (DR_BASE_ADDRESS (dr));
3558 offset = unshare_expr (DR_OFFSET (dr));
3559 init = unshare_expr (DR_INIT (dr));
3561 if (is_gimple_call (stmt)
3562 && (!gimple_call_internal_p (stmt)
3563 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3564 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3566 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3569 "not vectorized: dr in a call ");
3570 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3571 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3574 if (bb_vinfo)
3575 break;
3577 if (gatherscatter != SG_NONE || simd_lane_access)
3578 free_data_ref (dr);
3579 return false;
3582 /* Update DR field in stmt_vec_info struct. */
3584 /* If the dataref is in an inner-loop of the loop that is considered for
3585 for vectorization, we also want to analyze the access relative to
3586 the outer-loop (DR contains information only relative to the
3587 inner-most enclosing loop). We do that by building a reference to the
3588 first location accessed by the inner-loop, and analyze it relative to
3589 the outer-loop. */
3590 if (loop && nested_in_vect_loop_p (loop, stmt))
3592 tree outer_step, outer_base, outer_init;
3593 HOST_WIDE_INT pbitsize, pbitpos;
3594 tree poffset;
3595 machine_mode pmode;
3596 int punsignedp, pvolatilep;
3597 affine_iv base_iv, offset_iv;
3598 tree dinit;
3600 /* Build a reference to the first location accessed by the
3601 inner-loop: *(BASE+INIT). (The first location is actually
3602 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3603 tree inner_base = build_fold_indirect_ref
3604 (fold_build_pointer_plus (base, init));
3606 if (dump_enabled_p ())
3608 dump_printf_loc (MSG_NOTE, vect_location,
3609 "analyze in outer-loop: ");
3610 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3611 dump_printf (MSG_NOTE, "\n");
3614 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3615 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3616 gcc_assert (outer_base != NULL_TREE);
3618 if (pbitpos % BITS_PER_UNIT != 0)
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "failed: bit offset alignment.\n");
3623 return false;
3626 outer_base = build_fold_addr_expr (outer_base);
3627 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3628 &base_iv, false))
3630 if (dump_enabled_p ())
3631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3632 "failed: evolution of base is not affine.\n");
3633 return false;
3636 if (offset)
3638 if (poffset)
3639 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3640 poffset);
3641 else
3642 poffset = offset;
3645 if (!poffset)
3647 offset_iv.base = ssize_int (0);
3648 offset_iv.step = ssize_int (0);
3650 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3651 &offset_iv, false))
3653 if (dump_enabled_p ())
3654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655 "evolution of offset is not affine.\n");
3656 return false;
3659 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3660 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3661 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3662 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3663 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3665 outer_step = size_binop (PLUS_EXPR,
3666 fold_convert (ssizetype, base_iv.step),
3667 fold_convert (ssizetype, offset_iv.step));
3669 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3670 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3671 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3672 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3673 STMT_VINFO_DR_OFFSET (stmt_info) =
3674 fold_convert (ssizetype, offset_iv.base);
3675 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3676 size_int (highest_pow2_factor (offset_iv.base));
3678 if (dump_enabled_p ())
3680 dump_printf_loc (MSG_NOTE, vect_location,
3681 "\touter base_address: ");
3682 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3683 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3684 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3685 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3686 STMT_VINFO_DR_OFFSET (stmt_info));
3687 dump_printf (MSG_NOTE,
3688 "\n\touter constant offset from base address: ");
3689 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3690 STMT_VINFO_DR_INIT (stmt_info));
3691 dump_printf (MSG_NOTE, "\n\touter step: ");
3692 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3693 STMT_VINFO_DR_STEP (stmt_info));
3694 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3695 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3696 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3697 dump_printf (MSG_NOTE, "\n");
3701 if (STMT_VINFO_DATA_REF (stmt_info))
3703 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 "not vectorized: more than one data ref "
3707 "in stmt: ");
3708 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3712 if (bb_vinfo)
3713 break;
3715 if (gatherscatter != SG_NONE || simd_lane_access)
3716 free_data_ref (dr);
3717 return false;
3720 STMT_VINFO_DATA_REF (stmt_info) = dr;
3721 if (simd_lane_access)
3723 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3724 free_data_ref (datarefs[i]);
3725 datarefs[i] = dr;
3728 /* Set vectype for STMT. */
3729 scalar_type = TREE_TYPE (DR_REF (dr));
3730 STMT_VINFO_VECTYPE (stmt_info)
3731 = get_vectype_for_scalar_type (scalar_type);
3732 if (!STMT_VINFO_VECTYPE (stmt_info))
3734 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "not vectorized: no vectype for stmt: ");
3738 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3739 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3740 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3741 scalar_type);
3742 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3745 if (bb_vinfo)
3746 break;
3748 if (gatherscatter != SG_NONE || simd_lane_access)
3750 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3751 if (gatherscatter != SG_NONE)
3752 free_data_ref (dr);
3754 return false;
3756 else
3758 if (dump_enabled_p ())
3760 dump_printf_loc (MSG_NOTE, vect_location,
3761 "got vectype for stmt: ");
3762 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3763 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3764 STMT_VINFO_VECTYPE (stmt_info));
3765 dump_printf (MSG_NOTE, "\n");
3769 /* Adjust the minimal vectorization factor according to the
3770 vector type. */
3771 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3772 if (vf > *min_vf)
3773 *min_vf = vf;
3775 if (gatherscatter != SG_NONE)
3777 tree off;
3778 if (!vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL)
3779 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3781 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3782 free_data_ref (dr);
3783 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3786 (gatherscatter == GATHER) ?
3787 "not vectorized: not suitable for gather "
3788 "load " :
3789 "not vectorized: not suitable for scatter "
3790 "store ");
3791 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3792 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3794 return false;
3797 datarefs[i] = dr;
3798 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3801 else if (loop_vinfo
3802 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3804 if (nested_in_vect_loop_p (loop, stmt))
3806 if (dump_enabled_p ())
3808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3809 "not vectorized: not suitable for strided "
3810 "load ");
3811 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3812 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3814 return false;
3816 STMT_VINFO_STRIDED_P (stmt_info) = true;
3820 /* If we stopped analysis at the first dataref we could not analyze
3821 when trying to vectorize a basic-block mark the rest of the datarefs
3822 as not vectorizable and truncate the vector of datarefs. That
3823 avoids spending useless time in analyzing their dependence. */
3824 if (i != datarefs.length ())
3826 gcc_assert (bb_vinfo != NULL);
3827 for (unsigned j = i; j < datarefs.length (); ++j)
3829 data_reference_p dr = datarefs[j];
3830 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3831 free_data_ref (dr);
3833 datarefs.truncate (i);
3836 return true;
3840 /* Function vect_get_new_vect_var.
3842 Returns a name for a new variable. The current naming scheme appends the
3843 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3844 the name of vectorizer generated variables, and appends that to NAME if
3845 provided. */
3847 tree
3848 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3850 const char *prefix;
3851 tree new_vect_var;
3853 switch (var_kind)
3855 case vect_simple_var:
3856 prefix = "vect";
3857 break;
3858 case vect_scalar_var:
3859 prefix = "stmp";
3860 break;
3861 case vect_pointer_var:
3862 prefix = "vectp";
3863 break;
3864 default:
3865 gcc_unreachable ();
3868 if (name)
3870 char* tmp = concat (prefix, "_", name, NULL);
3871 new_vect_var = create_tmp_reg (type, tmp);
3872 free (tmp);
3874 else
3875 new_vect_var = create_tmp_reg (type, prefix);
3877 return new_vect_var;
3880 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3882 static void
3883 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3884 stmt_vec_info stmt_info)
3886 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3887 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3888 int misalign = DR_MISALIGNMENT (dr);
3889 if (misalign == -1)
3890 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3891 else
3892 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3895 /* Function vect_create_addr_base_for_vector_ref.
3897 Create an expression that computes the address of the first memory location
3898 that will be accessed for a data reference.
3900 Input:
3901 STMT: The statement containing the data reference.
3902 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3903 OFFSET: Optional. If supplied, it is be added to the initial address.
3904 LOOP: Specify relative to which loop-nest should the address be computed.
3905 For example, when the dataref is in an inner-loop nested in an
3906 outer-loop that is now being vectorized, LOOP can be either the
3907 outer-loop, or the inner-loop. The first memory location accessed
3908 by the following dataref ('in' points to short):
3910 for (i=0; i<N; i++)
3911 for (j=0; j<M; j++)
3912 s += in[i+j]
3914 is as follows:
3915 if LOOP=i_loop: &in (relative to i_loop)
3916 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3917 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3918 initial address. Unlike OFFSET, which is number of elements to
3919 be added, BYTE_OFFSET is measured in bytes.
3921 Output:
3922 1. Return an SSA_NAME whose value is the address of the memory location of
3923 the first vector of the data reference.
3924 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3925 these statement(s) which define the returned SSA_NAME.
3927 FORNOW: We are only handling array accesses with step 1. */
3929 tree
3930 vect_create_addr_base_for_vector_ref (gimple stmt,
3931 gimple_seq *new_stmt_list,
3932 tree offset,
3933 struct loop *loop,
3934 tree byte_offset)
3936 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3937 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3938 tree data_ref_base;
3939 const char *base_name;
3940 tree addr_base;
3941 tree dest;
3942 gimple_seq seq = NULL;
3943 tree base_offset;
3944 tree init;
3945 tree vect_ptr_type;
3946 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3947 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3949 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3951 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3953 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3955 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3956 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3957 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3959 else
3961 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3962 base_offset = unshare_expr (DR_OFFSET (dr));
3963 init = unshare_expr (DR_INIT (dr));
3966 if (loop_vinfo)
3967 base_name = get_name (data_ref_base);
3968 else
3970 base_offset = ssize_int (0);
3971 init = ssize_int (0);
3972 base_name = get_name (DR_REF (dr));
3975 /* Create base_offset */
3976 base_offset = size_binop (PLUS_EXPR,
3977 fold_convert (sizetype, base_offset),
3978 fold_convert (sizetype, init));
3980 if (offset)
3982 offset = fold_build2 (MULT_EXPR, sizetype,
3983 fold_convert (sizetype, offset), step);
3984 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3985 base_offset, offset);
3987 if (byte_offset)
3989 byte_offset = fold_convert (sizetype, byte_offset);
3990 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3991 base_offset, byte_offset);
3994 /* base + base_offset */
3995 if (loop_vinfo)
3996 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3997 else
3999 addr_base = build1 (ADDR_EXPR,
4000 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4001 unshare_expr (DR_REF (dr)));
4004 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4005 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4006 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4007 gimple_seq_add_seq (new_stmt_list, seq);
4009 if (DR_PTR_INFO (dr)
4010 && TREE_CODE (addr_base) == SSA_NAME
4011 && !SSA_NAME_PTR_INFO (addr_base))
4013 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4014 if (offset || byte_offset)
4015 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4018 if (dump_enabled_p ())
4020 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4021 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4022 dump_printf (MSG_NOTE, "\n");
4025 return addr_base;
4029 /* Function vect_create_data_ref_ptr.
4031 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4032 location accessed in the loop by STMT, along with the def-use update
4033 chain to appropriately advance the pointer through the loop iterations.
4034 Also set aliasing information for the pointer. This pointer is used by
4035 the callers to this function to create a memory reference expression for
4036 vector load/store access.
4038 Input:
4039 1. STMT: a stmt that references memory. Expected to be of the form
4040 GIMPLE_ASSIGN <name, data-ref> or
4041 GIMPLE_ASSIGN <data-ref, name>.
4042 2. AGGR_TYPE: the type of the reference, which should be either a vector
4043 or an array.
4044 3. AT_LOOP: the loop where the vector memref is to be created.
4045 4. OFFSET (optional): an offset to be added to the initial address accessed
4046 by the data-ref in STMT.
4047 5. BSI: location where the new stmts are to be placed if there is no loop
4048 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4049 pointing to the initial address.
4050 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4051 to the initial address accessed by the data-ref in STMT. This is
4052 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4053 in bytes.
4055 Output:
4056 1. Declare a new ptr to vector_type, and have it point to the base of the
4057 data reference (initial addressed accessed by the data reference).
4058 For example, for vector of type V8HI, the following code is generated:
4060 v8hi *ap;
4061 ap = (v8hi *)initial_address;
4063 if OFFSET is not supplied:
4064 initial_address = &a[init];
4065 if OFFSET is supplied:
4066 initial_address = &a[init + OFFSET];
4067 if BYTE_OFFSET is supplied:
4068 initial_address = &a[init] + BYTE_OFFSET;
4070 Return the initial_address in INITIAL_ADDRESS.
4072 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4073 update the pointer in each iteration of the loop.
4075 Return the increment stmt that updates the pointer in PTR_INCR.
4077 3. Set INV_P to true if the access pattern of the data reference in the
4078 vectorized loop is invariant. Set it to false otherwise.
4080 4. Return the pointer. */
4082 tree
4083 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4084 tree offset, tree *initial_address,
4085 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4086 bool only_init, bool *inv_p, tree byte_offset)
4088 const char *base_name;
4089 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4090 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4091 struct loop *loop = NULL;
4092 bool nested_in_vect_loop = false;
4093 struct loop *containing_loop = NULL;
4094 tree aggr_ptr_type;
4095 tree aggr_ptr;
4096 tree new_temp;
4097 gimple_seq new_stmt_list = NULL;
4098 edge pe = NULL;
4099 basic_block new_bb;
4100 tree aggr_ptr_init;
4101 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4102 tree aptr;
4103 gimple_stmt_iterator incr_gsi;
4104 bool insert_after;
4105 tree indx_before_incr, indx_after_incr;
4106 gimple incr;
4107 tree step;
4108 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4110 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4111 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4113 if (loop_vinfo)
4115 loop = LOOP_VINFO_LOOP (loop_vinfo);
4116 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4117 containing_loop = (gimple_bb (stmt))->loop_father;
4118 pe = loop_preheader_edge (loop);
4120 else
4122 gcc_assert (bb_vinfo);
4123 only_init = true;
4124 *ptr_incr = NULL;
4127 /* Check the step (evolution) of the load in LOOP, and record
4128 whether it's invariant. */
4129 if (nested_in_vect_loop)
4130 step = STMT_VINFO_DR_STEP (stmt_info);
4131 else
4132 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4134 if (integer_zerop (step))
4135 *inv_p = true;
4136 else
4137 *inv_p = false;
4139 /* Create an expression for the first address accessed by this load
4140 in LOOP. */
4141 base_name = get_name (DR_BASE_ADDRESS (dr));
4143 if (dump_enabled_p ())
4145 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4146 dump_printf_loc (MSG_NOTE, vect_location,
4147 "create %s-pointer variable to type: ",
4148 get_tree_code_name (TREE_CODE (aggr_type)));
4149 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4150 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4151 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4152 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4153 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4154 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4155 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4156 else
4157 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4159 dump_printf (MSG_NOTE, "\n");
4162 /* (1) Create the new aggregate-pointer variable.
4163 Vector and array types inherit the alias set of their component
4164 type by default so we need to use a ref-all pointer if the data
4165 reference does not conflict with the created aggregated data
4166 reference because it is not addressable. */
4167 bool need_ref_all = false;
4168 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4169 get_alias_set (DR_REF (dr))))
4170 need_ref_all = true;
4171 /* Likewise for any of the data references in the stmt group. */
4172 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4174 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4177 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4178 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4179 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4180 get_alias_set (DR_REF (sdr))))
4182 need_ref_all = true;
4183 break;
4185 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4187 while (orig_stmt);
4189 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4190 need_ref_all);
4191 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4194 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4195 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4196 def-use update cycles for the pointer: one relative to the outer-loop
4197 (LOOP), which is what steps (3) and (4) below do. The other is relative
4198 to the inner-loop (which is the inner-most loop containing the dataref),
4199 and this is done be step (5) below.
4201 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4202 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4203 redundant. Steps (3),(4) create the following:
4205 vp0 = &base_addr;
4206 LOOP: vp1 = phi(vp0,vp2)
4209 vp2 = vp1 + step
4210 goto LOOP
4212 If there is an inner-loop nested in loop, then step (5) will also be
4213 applied, and an additional update in the inner-loop will be created:
4215 vp0 = &base_addr;
4216 LOOP: vp1 = phi(vp0,vp2)
4218 inner: vp3 = phi(vp1,vp4)
4219 vp4 = vp3 + inner_step
4220 if () goto inner
4222 vp2 = vp1 + step
4223 if () goto LOOP */
4225 /* (2) Calculate the initial address of the aggregate-pointer, and set
4226 the aggregate-pointer to point to it before the loop. */
4228 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4230 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4231 offset, loop, byte_offset);
4232 if (new_stmt_list)
4234 if (pe)
4236 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4237 gcc_assert (!new_bb);
4239 else
4240 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4243 *initial_address = new_temp;
4244 aggr_ptr_init = new_temp;
4246 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4247 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4248 inner-loop nested in LOOP (during outer-loop vectorization). */
4250 /* No update in loop is required. */
4251 if (only_init && (!loop_vinfo || at_loop == loop))
4252 aptr = aggr_ptr_init;
4253 else
4255 /* The step of the aggregate pointer is the type size. */
4256 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4257 /* One exception to the above is when the scalar step of the load in
4258 LOOP is zero. In this case the step here is also zero. */
4259 if (*inv_p)
4260 iv_step = size_zero_node;
4261 else if (tree_int_cst_sgn (step) == -1)
4262 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4264 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4266 create_iv (aggr_ptr_init,
4267 fold_convert (aggr_ptr_type, iv_step),
4268 aggr_ptr, loop, &incr_gsi, insert_after,
4269 &indx_before_incr, &indx_after_incr);
4270 incr = gsi_stmt (incr_gsi);
4271 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4273 /* Copy the points-to information if it exists. */
4274 if (DR_PTR_INFO (dr))
4276 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4277 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4279 if (ptr_incr)
4280 *ptr_incr = incr;
4282 aptr = indx_before_incr;
4285 if (!nested_in_vect_loop || only_init)
4286 return aptr;
4289 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4290 nested in LOOP, if exists. */
4292 gcc_assert (nested_in_vect_loop);
4293 if (!only_init)
4295 standard_iv_increment_position (containing_loop, &incr_gsi,
4296 &insert_after);
4297 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4298 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4299 &indx_after_incr);
4300 incr = gsi_stmt (incr_gsi);
4301 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4303 /* Copy the points-to information if it exists. */
4304 if (DR_PTR_INFO (dr))
4306 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4307 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4309 if (ptr_incr)
4310 *ptr_incr = incr;
4312 return indx_before_incr;
4314 else
4315 gcc_unreachable ();
4319 /* Function bump_vector_ptr
4321 Increment a pointer (to a vector type) by vector-size. If requested,
4322 i.e. if PTR-INCR is given, then also connect the new increment stmt
4323 to the existing def-use update-chain of the pointer, by modifying
4324 the PTR_INCR as illustrated below:
4326 The pointer def-use update-chain before this function:
4327 DATAREF_PTR = phi (p_0, p_2)
4328 ....
4329 PTR_INCR: p_2 = DATAREF_PTR + step
4331 The pointer def-use update-chain after this function:
4332 DATAREF_PTR = phi (p_0, p_2)
4333 ....
4334 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4335 ....
4336 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4338 Input:
4339 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4340 in the loop.
4341 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4342 the loop. The increment amount across iterations is expected
4343 to be vector_size.
4344 BSI - location where the new update stmt is to be placed.
4345 STMT - the original scalar memory-access stmt that is being vectorized.
4346 BUMP - optional. The offset by which to bump the pointer. If not given,
4347 the offset is assumed to be vector_size.
4349 Output: Return NEW_DATAREF_PTR as illustrated above.
4353 tree
4354 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4355 gimple stmt, tree bump)
4357 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4358 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4359 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4360 tree update = TYPE_SIZE_UNIT (vectype);
4361 gassign *incr_stmt;
4362 ssa_op_iter iter;
4363 use_operand_p use_p;
4364 tree new_dataref_ptr;
4366 if (bump)
4367 update = bump;
4369 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4370 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4371 else
4372 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4373 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4374 dataref_ptr, update);
4375 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4377 /* Copy the points-to information if it exists. */
4378 if (DR_PTR_INFO (dr))
4380 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4381 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4384 if (!ptr_incr)
4385 return new_dataref_ptr;
4387 /* Update the vector-pointer's cross-iteration increment. */
4388 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4390 tree use = USE_FROM_PTR (use_p);
4392 if (use == dataref_ptr)
4393 SET_USE (use_p, new_dataref_ptr);
4394 else
4395 gcc_assert (tree_int_cst_compare (use, update) == 0);
4398 return new_dataref_ptr;
4402 /* Function vect_create_destination_var.
4404 Create a new temporary of type VECTYPE. */
4406 tree
4407 vect_create_destination_var (tree scalar_dest, tree vectype)
4409 tree vec_dest;
4410 const char *name;
4411 char *new_name;
4412 tree type;
4413 enum vect_var_kind kind;
4415 kind = vectype ? vect_simple_var : vect_scalar_var;
4416 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4418 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4420 name = get_name (scalar_dest);
4421 if (name)
4422 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4423 else
4424 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4425 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4426 free (new_name);
4428 return vec_dest;
4431 /* Function vect_grouped_store_supported.
4433 Returns TRUE if interleave high and interleave low permutations
4434 are supported, and FALSE otherwise. */
4436 bool
4437 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4439 machine_mode mode = TYPE_MODE (vectype);
4441 /* vect_permute_store_chain requires the group size to be equal to 3 or
4442 be a power of two. */
4443 if (count != 3 && exact_log2 (count) == -1)
4445 if (dump_enabled_p ())
4446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4447 "the size of the group of accesses"
4448 " is not a power of 2 or not eqaul to 3\n");
4449 return false;
4452 /* Check that the permutation is supported. */
4453 if (VECTOR_MODE_P (mode))
4455 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4456 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4458 if (count == 3)
4460 unsigned int j0 = 0, j1 = 0, j2 = 0;
4461 unsigned int i, j;
4463 for (j = 0; j < 3; j++)
4465 int nelt0 = ((3 - j) * nelt) % 3;
4466 int nelt1 = ((3 - j) * nelt + 1) % 3;
4467 int nelt2 = ((3 - j) * nelt + 2) % 3;
4468 for (i = 0; i < nelt; i++)
4470 if (3 * i + nelt0 < nelt)
4471 sel[3 * i + nelt0] = j0++;
4472 if (3 * i + nelt1 < nelt)
4473 sel[3 * i + nelt1] = nelt + j1++;
4474 if (3 * i + nelt2 < nelt)
4475 sel[3 * i + nelt2] = 0;
4477 if (!can_vec_perm_p (mode, false, sel))
4479 if (dump_enabled_p ())
4480 dump_printf (MSG_MISSED_OPTIMIZATION,
4481 "permutaion op not supported by target.\n");
4482 return false;
4485 for (i = 0; i < nelt; i++)
4487 if (3 * i + nelt0 < nelt)
4488 sel[3 * i + nelt0] = 3 * i + nelt0;
4489 if (3 * i + nelt1 < nelt)
4490 sel[3 * i + nelt1] = 3 * i + nelt1;
4491 if (3 * i + nelt2 < nelt)
4492 sel[3 * i + nelt2] = nelt + j2++;
4494 if (!can_vec_perm_p (mode, false, sel))
4496 if (dump_enabled_p ())
4497 dump_printf (MSG_MISSED_OPTIMIZATION,
4498 "permutaion op not supported by target.\n");
4499 return false;
4502 return true;
4504 else
4506 /* If length is not equal to 3 then only power of 2 is supported. */
4507 gcc_assert (exact_log2 (count) != -1);
4509 for (i = 0; i < nelt / 2; i++)
4511 sel[i * 2] = i;
4512 sel[i * 2 + 1] = i + nelt;
4514 if (can_vec_perm_p (mode, false, sel))
4516 for (i = 0; i < nelt; i++)
4517 sel[i] += nelt / 2;
4518 if (can_vec_perm_p (mode, false, sel))
4519 return true;
4524 if (dump_enabled_p ())
4525 dump_printf (MSG_MISSED_OPTIMIZATION,
4526 "permutaion op not supported by target.\n");
4527 return false;
4531 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4532 type VECTYPE. */
4534 bool
4535 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4537 return vect_lanes_optab_supported_p ("vec_store_lanes",
4538 vec_store_lanes_optab,
4539 vectype, count);
4543 /* Function vect_permute_store_chain.
4545 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4546 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4547 the data correctly for the stores. Return the final references for stores
4548 in RESULT_CHAIN.
4550 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4551 The input is 4 vectors each containing 8 elements. We assign a number to
4552 each element, the input sequence is:
4554 1st vec: 0 1 2 3 4 5 6 7
4555 2nd vec: 8 9 10 11 12 13 14 15
4556 3rd vec: 16 17 18 19 20 21 22 23
4557 4th vec: 24 25 26 27 28 29 30 31
4559 The output sequence should be:
4561 1st vec: 0 8 16 24 1 9 17 25
4562 2nd vec: 2 10 18 26 3 11 19 27
4563 3rd vec: 4 12 20 28 5 13 21 30
4564 4th vec: 6 14 22 30 7 15 23 31
4566 i.e., we interleave the contents of the four vectors in their order.
4568 We use interleave_high/low instructions to create such output. The input of
4569 each interleave_high/low operation is two vectors:
4570 1st vec 2nd vec
4571 0 1 2 3 4 5 6 7
4572 the even elements of the result vector are obtained left-to-right from the
4573 high/low elements of the first vector. The odd elements of the result are
4574 obtained left-to-right from the high/low elements of the second vector.
4575 The output of interleave_high will be: 0 4 1 5
4576 and of interleave_low: 2 6 3 7
4579 The permutation is done in log LENGTH stages. In each stage interleave_high
4580 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4581 where the first argument is taken from the first half of DR_CHAIN and the
4582 second argument from it's second half.
4583 In our example,
4585 I1: interleave_high (1st vec, 3rd vec)
4586 I2: interleave_low (1st vec, 3rd vec)
4587 I3: interleave_high (2nd vec, 4th vec)
4588 I4: interleave_low (2nd vec, 4th vec)
4590 The output for the first stage is:
4592 I1: 0 16 1 17 2 18 3 19
4593 I2: 4 20 5 21 6 22 7 23
4594 I3: 8 24 9 25 10 26 11 27
4595 I4: 12 28 13 29 14 30 15 31
4597 The output of the second stage, i.e. the final result is:
4599 I1: 0 8 16 24 1 9 17 25
4600 I2: 2 10 18 26 3 11 19 27
4601 I3: 4 12 20 28 5 13 21 30
4602 I4: 6 14 22 30 7 15 23 31. */
4604 void
4605 vect_permute_store_chain (vec<tree> dr_chain,
4606 unsigned int length,
4607 gimple stmt,
4608 gimple_stmt_iterator *gsi,
4609 vec<tree> *result_chain)
4611 tree vect1, vect2, high, low;
4612 gimple perm_stmt;
4613 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4614 tree perm_mask_low, perm_mask_high;
4615 tree data_ref;
4616 tree perm3_mask_low, perm3_mask_high;
4617 unsigned int i, n, log_length = exact_log2 (length);
4618 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4619 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4621 result_chain->quick_grow (length);
4622 memcpy (result_chain->address (), dr_chain.address (),
4623 length * sizeof (tree));
4625 if (length == 3)
4627 unsigned int j0 = 0, j1 = 0, j2 = 0;
4629 for (j = 0; j < 3; j++)
4631 int nelt0 = ((3 - j) * nelt) % 3;
4632 int nelt1 = ((3 - j) * nelt + 1) % 3;
4633 int nelt2 = ((3 - j) * nelt + 2) % 3;
4635 for (i = 0; i < nelt; i++)
4637 if (3 * i + nelt0 < nelt)
4638 sel[3 * i + nelt0] = j0++;
4639 if (3 * i + nelt1 < nelt)
4640 sel[3 * i + nelt1] = nelt + j1++;
4641 if (3 * i + nelt2 < nelt)
4642 sel[3 * i + nelt2] = 0;
4644 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4646 for (i = 0; i < nelt; i++)
4648 if (3 * i + nelt0 < nelt)
4649 sel[3 * i + nelt0] = 3 * i + nelt0;
4650 if (3 * i + nelt1 < nelt)
4651 sel[3 * i + nelt1] = 3 * i + nelt1;
4652 if (3 * i + nelt2 < nelt)
4653 sel[3 * i + nelt2] = nelt + j2++;
4655 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4657 vect1 = dr_chain[0];
4658 vect2 = dr_chain[1];
4660 /* Create interleaving stmt:
4661 low = VEC_PERM_EXPR <vect1, vect2,
4662 {j, nelt, *, j + 1, nelt + j + 1, *,
4663 j + 2, nelt + j + 2, *, ...}> */
4664 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4665 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4666 vect2, perm3_mask_low);
4667 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4669 vect1 = data_ref;
4670 vect2 = dr_chain[2];
4671 /* Create interleaving stmt:
4672 low = VEC_PERM_EXPR <vect1, vect2,
4673 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4674 6, 7, nelt + j + 2, ...}> */
4675 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4676 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4677 vect2, perm3_mask_high);
4678 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4679 (*result_chain)[j] = data_ref;
4682 else
4684 /* If length is not equal to 3 then only power of 2 is supported. */
4685 gcc_assert (exact_log2 (length) != -1);
4687 for (i = 0, n = nelt / 2; i < n; i++)
4689 sel[i * 2] = i;
4690 sel[i * 2 + 1] = i + nelt;
4692 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4694 for (i = 0; i < nelt; i++)
4695 sel[i] += nelt / 2;
4696 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4698 for (i = 0, n = log_length; i < n; i++)
4700 for (j = 0; j < length/2; j++)
4702 vect1 = dr_chain[j];
4703 vect2 = dr_chain[j+length/2];
4705 /* Create interleaving stmt:
4706 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4707 ...}> */
4708 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4709 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4710 vect2, perm_mask_high);
4711 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4712 (*result_chain)[2*j] = high;
4714 /* Create interleaving stmt:
4715 low = VEC_PERM_EXPR <vect1, vect2,
4716 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4717 ...}> */
4718 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4719 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4720 vect2, perm_mask_low);
4721 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4722 (*result_chain)[2*j+1] = low;
4724 memcpy (dr_chain.address (), result_chain->address (),
4725 length * sizeof (tree));
4730 /* Function vect_setup_realignment
4732 This function is called when vectorizing an unaligned load using
4733 the dr_explicit_realign[_optimized] scheme.
4734 This function generates the following code at the loop prolog:
4736 p = initial_addr;
4737 x msq_init = *(floor(p)); # prolog load
4738 realignment_token = call target_builtin;
4739 loop:
4740 x msq = phi (msq_init, ---)
4742 The stmts marked with x are generated only for the case of
4743 dr_explicit_realign_optimized.
4745 The code above sets up a new (vector) pointer, pointing to the first
4746 location accessed by STMT, and a "floor-aligned" load using that pointer.
4747 It also generates code to compute the "realignment-token" (if the relevant
4748 target hook was defined), and creates a phi-node at the loop-header bb
4749 whose arguments are the result of the prolog-load (created by this
4750 function) and the result of a load that takes place in the loop (to be
4751 created by the caller to this function).
4753 For the case of dr_explicit_realign_optimized:
4754 The caller to this function uses the phi-result (msq) to create the
4755 realignment code inside the loop, and sets up the missing phi argument,
4756 as follows:
4757 loop:
4758 msq = phi (msq_init, lsq)
4759 lsq = *(floor(p')); # load in loop
4760 result = realign_load (msq, lsq, realignment_token);
4762 For the case of dr_explicit_realign:
4763 loop:
4764 msq = *(floor(p)); # load in loop
4765 p' = p + (VS-1);
4766 lsq = *(floor(p')); # load in loop
4767 result = realign_load (msq, lsq, realignment_token);
4769 Input:
4770 STMT - (scalar) load stmt to be vectorized. This load accesses
4771 a memory location that may be unaligned.
4772 BSI - place where new code is to be inserted.
4773 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4774 is used.
4776 Output:
4777 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4778 target hook, if defined.
4779 Return value - the result of the loop-header phi node. */
4781 tree
4782 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4783 tree *realignment_token,
4784 enum dr_alignment_support alignment_support_scheme,
4785 tree init_addr,
4786 struct loop **at_loop)
4788 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4789 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4790 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4791 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4792 struct loop *loop = NULL;
4793 edge pe = NULL;
4794 tree scalar_dest = gimple_assign_lhs (stmt);
4795 tree vec_dest;
4796 gimple inc;
4797 tree ptr;
4798 tree data_ref;
4799 basic_block new_bb;
4800 tree msq_init = NULL_TREE;
4801 tree new_temp;
4802 gphi *phi_stmt;
4803 tree msq = NULL_TREE;
4804 gimple_seq stmts = NULL;
4805 bool inv_p;
4806 bool compute_in_loop = false;
4807 bool nested_in_vect_loop = false;
4808 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4809 struct loop *loop_for_initial_load = NULL;
4811 if (loop_vinfo)
4813 loop = LOOP_VINFO_LOOP (loop_vinfo);
4814 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4817 gcc_assert (alignment_support_scheme == dr_explicit_realign
4818 || alignment_support_scheme == dr_explicit_realign_optimized);
4820 /* We need to generate three things:
4821 1. the misalignment computation
4822 2. the extra vector load (for the optimized realignment scheme).
4823 3. the phi node for the two vectors from which the realignment is
4824 done (for the optimized realignment scheme). */
4826 /* 1. Determine where to generate the misalignment computation.
4828 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4829 calculation will be generated by this function, outside the loop (in the
4830 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4831 caller, inside the loop.
4833 Background: If the misalignment remains fixed throughout the iterations of
4834 the loop, then both realignment schemes are applicable, and also the
4835 misalignment computation can be done outside LOOP. This is because we are
4836 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4837 are a multiple of VS (the Vector Size), and therefore the misalignment in
4838 different vectorized LOOP iterations is always the same.
4839 The problem arises only if the memory access is in an inner-loop nested
4840 inside LOOP, which is now being vectorized using outer-loop vectorization.
4841 This is the only case when the misalignment of the memory access may not
4842 remain fixed throughout the iterations of the inner-loop (as explained in
4843 detail in vect_supportable_dr_alignment). In this case, not only is the
4844 optimized realignment scheme not applicable, but also the misalignment
4845 computation (and generation of the realignment token that is passed to
4846 REALIGN_LOAD) have to be done inside the loop.
4848 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4849 or not, which in turn determines if the misalignment is computed inside
4850 the inner-loop, or outside LOOP. */
4852 if (init_addr != NULL_TREE || !loop_vinfo)
4854 compute_in_loop = true;
4855 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4859 /* 2. Determine where to generate the extra vector load.
4861 For the optimized realignment scheme, instead of generating two vector
4862 loads in each iteration, we generate a single extra vector load in the
4863 preheader of the loop, and in each iteration reuse the result of the
4864 vector load from the previous iteration. In case the memory access is in
4865 an inner-loop nested inside LOOP, which is now being vectorized using
4866 outer-loop vectorization, we need to determine whether this initial vector
4867 load should be generated at the preheader of the inner-loop, or can be
4868 generated at the preheader of LOOP. If the memory access has no evolution
4869 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4870 to be generated inside LOOP (in the preheader of the inner-loop). */
4872 if (nested_in_vect_loop)
4874 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4875 bool invariant_in_outerloop =
4876 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4877 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4879 else
4880 loop_for_initial_load = loop;
4881 if (at_loop)
4882 *at_loop = loop_for_initial_load;
4884 if (loop_for_initial_load)
4885 pe = loop_preheader_edge (loop_for_initial_load);
4887 /* 3. For the case of the optimized realignment, create the first vector
4888 load at the loop preheader. */
4890 if (alignment_support_scheme == dr_explicit_realign_optimized)
4892 /* Create msq_init = *(floor(p1)) in the loop preheader */
4893 gassign *new_stmt;
4895 gcc_assert (!compute_in_loop);
4896 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4897 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4898 NULL_TREE, &init_addr, NULL, &inc,
4899 true, &inv_p);
4900 if (TREE_CODE (ptr) == SSA_NAME)
4901 new_temp = copy_ssa_name (ptr);
4902 else
4903 new_temp = make_ssa_name (TREE_TYPE (ptr));
4904 new_stmt = gimple_build_assign
4905 (new_temp, BIT_AND_EXPR, ptr,
4906 build_int_cst (TREE_TYPE (ptr),
4907 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4908 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4909 gcc_assert (!new_bb);
4910 data_ref
4911 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4912 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4913 new_stmt = gimple_build_assign (vec_dest, data_ref);
4914 new_temp = make_ssa_name (vec_dest, new_stmt);
4915 gimple_assign_set_lhs (new_stmt, new_temp);
4916 if (pe)
4918 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4919 gcc_assert (!new_bb);
4921 else
4922 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4924 msq_init = gimple_assign_lhs (new_stmt);
4927 /* 4. Create realignment token using a target builtin, if available.
4928 It is done either inside the containing loop, or before LOOP (as
4929 determined above). */
4931 if (targetm.vectorize.builtin_mask_for_load)
4933 gcall *new_stmt;
4934 tree builtin_decl;
4936 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4937 if (!init_addr)
4939 /* Generate the INIT_ADDR computation outside LOOP. */
4940 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4941 NULL_TREE, loop);
4942 if (loop)
4944 pe = loop_preheader_edge (loop);
4945 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4946 gcc_assert (!new_bb);
4948 else
4949 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4952 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4953 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4954 vec_dest =
4955 vect_create_destination_var (scalar_dest,
4956 gimple_call_return_type (new_stmt));
4957 new_temp = make_ssa_name (vec_dest, new_stmt);
4958 gimple_call_set_lhs (new_stmt, new_temp);
4960 if (compute_in_loop)
4961 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4962 else
4964 /* Generate the misalignment computation outside LOOP. */
4965 pe = loop_preheader_edge (loop);
4966 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4967 gcc_assert (!new_bb);
4970 *realignment_token = gimple_call_lhs (new_stmt);
4972 /* The result of the CALL_EXPR to this builtin is determined from
4973 the value of the parameter and no global variables are touched
4974 which makes the builtin a "const" function. Requiring the
4975 builtin to have the "const" attribute makes it unnecessary
4976 to call mark_call_clobbered. */
4977 gcc_assert (TREE_READONLY (builtin_decl));
4980 if (alignment_support_scheme == dr_explicit_realign)
4981 return msq;
4983 gcc_assert (!compute_in_loop);
4984 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4987 /* 5. Create msq = phi <msq_init, lsq> in loop */
4989 pe = loop_preheader_edge (containing_loop);
4990 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4991 msq = make_ssa_name (vec_dest);
4992 phi_stmt = create_phi_node (msq, containing_loop->header);
4993 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4995 return msq;
4999 /* Function vect_grouped_load_supported.
5001 Returns TRUE if even and odd permutations are supported,
5002 and FALSE otherwise. */
5004 bool
5005 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5007 machine_mode mode = TYPE_MODE (vectype);
5009 /* vect_permute_load_chain requires the group size to be equal to 3 or
5010 be a power of two. */
5011 if (count != 3 && exact_log2 (count) == -1)
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015 "the size of the group of accesses"
5016 " is not a power of 2 or not equal to 3\n");
5017 return false;
5020 /* Check that the permutation is supported. */
5021 if (VECTOR_MODE_P (mode))
5023 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5024 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5026 if (count == 3)
5028 unsigned int k;
5029 for (k = 0; k < 3; k++)
5031 for (i = 0; i < nelt; i++)
5032 if (3 * i + k < 2 * nelt)
5033 sel[i] = 3 * i + k;
5034 else
5035 sel[i] = 0;
5036 if (!can_vec_perm_p (mode, false, sel))
5038 if (dump_enabled_p ())
5039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5040 "shuffle of 3 loads is not supported by"
5041 " target\n");
5042 return false;
5044 for (i = 0, j = 0; i < nelt; i++)
5045 if (3 * i + k < 2 * nelt)
5046 sel[i] = i;
5047 else
5048 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5049 if (!can_vec_perm_p (mode, false, sel))
5051 if (dump_enabled_p ())
5052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053 "shuffle of 3 loads is not supported by"
5054 " target\n");
5055 return false;
5058 return true;
5060 else
5062 /* If length is not equal to 3 then only power of 2 is supported. */
5063 gcc_assert (exact_log2 (count) != -1);
5064 for (i = 0; i < nelt; i++)
5065 sel[i] = i * 2;
5066 if (can_vec_perm_p (mode, false, sel))
5068 for (i = 0; i < nelt; i++)
5069 sel[i] = i * 2 + 1;
5070 if (can_vec_perm_p (mode, false, sel))
5071 return true;
5076 if (dump_enabled_p ())
5077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5078 "extract even/odd not supported by target\n");
5079 return false;
5082 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5083 type VECTYPE. */
5085 bool
5086 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5088 return vect_lanes_optab_supported_p ("vec_load_lanes",
5089 vec_load_lanes_optab,
5090 vectype, count);
5093 /* Function vect_permute_load_chain.
5095 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5096 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5097 the input data correctly. Return the final references for loads in
5098 RESULT_CHAIN.
5100 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5101 The input is 4 vectors each containing 8 elements. We assign a number to each
5102 element, the input sequence is:
5104 1st vec: 0 1 2 3 4 5 6 7
5105 2nd vec: 8 9 10 11 12 13 14 15
5106 3rd vec: 16 17 18 19 20 21 22 23
5107 4th vec: 24 25 26 27 28 29 30 31
5109 The output sequence should be:
5111 1st vec: 0 4 8 12 16 20 24 28
5112 2nd vec: 1 5 9 13 17 21 25 29
5113 3rd vec: 2 6 10 14 18 22 26 30
5114 4th vec: 3 7 11 15 19 23 27 31
5116 i.e., the first output vector should contain the first elements of each
5117 interleaving group, etc.
5119 We use extract_even/odd instructions to create such output. The input of
5120 each extract_even/odd operation is two vectors
5121 1st vec 2nd vec
5122 0 1 2 3 4 5 6 7
5124 and the output is the vector of extracted even/odd elements. The output of
5125 extract_even will be: 0 2 4 6
5126 and of extract_odd: 1 3 5 7
5129 The permutation is done in log LENGTH stages. In each stage extract_even
5130 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5131 their order. In our example,
5133 E1: extract_even (1st vec, 2nd vec)
5134 E2: extract_odd (1st vec, 2nd vec)
5135 E3: extract_even (3rd vec, 4th vec)
5136 E4: extract_odd (3rd vec, 4th vec)
5138 The output for the first stage will be:
5140 E1: 0 2 4 6 8 10 12 14
5141 E2: 1 3 5 7 9 11 13 15
5142 E3: 16 18 20 22 24 26 28 30
5143 E4: 17 19 21 23 25 27 29 31
5145 In order to proceed and create the correct sequence for the next stage (or
5146 for the correct output, if the second stage is the last one, as in our
5147 example), we first put the output of extract_even operation and then the
5148 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5149 The input for the second stage is:
5151 1st vec (E1): 0 2 4 6 8 10 12 14
5152 2nd vec (E3): 16 18 20 22 24 26 28 30
5153 3rd vec (E2): 1 3 5 7 9 11 13 15
5154 4th vec (E4): 17 19 21 23 25 27 29 31
5156 The output of the second stage:
5158 E1: 0 4 8 12 16 20 24 28
5159 E2: 2 6 10 14 18 22 26 30
5160 E3: 1 5 9 13 17 21 25 29
5161 E4: 3 7 11 15 19 23 27 31
5163 And RESULT_CHAIN after reordering:
5165 1st vec (E1): 0 4 8 12 16 20 24 28
5166 2nd vec (E3): 1 5 9 13 17 21 25 29
5167 3rd vec (E2): 2 6 10 14 18 22 26 30
5168 4th vec (E4): 3 7 11 15 19 23 27 31. */
5170 static void
5171 vect_permute_load_chain (vec<tree> dr_chain,
5172 unsigned int length,
5173 gimple stmt,
5174 gimple_stmt_iterator *gsi,
5175 vec<tree> *result_chain)
5177 tree data_ref, first_vect, second_vect;
5178 tree perm_mask_even, perm_mask_odd;
5179 tree perm3_mask_low, perm3_mask_high;
5180 gimple perm_stmt;
5181 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5182 unsigned int i, j, log_length = exact_log2 (length);
5183 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5184 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5186 result_chain->quick_grow (length);
5187 memcpy (result_chain->address (), dr_chain.address (),
5188 length * sizeof (tree));
5190 if (length == 3)
5192 unsigned int k;
5194 for (k = 0; k < 3; k++)
5196 for (i = 0; i < nelt; i++)
5197 if (3 * i + k < 2 * nelt)
5198 sel[i] = 3 * i + k;
5199 else
5200 sel[i] = 0;
5201 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5203 for (i = 0, j = 0; i < nelt; i++)
5204 if (3 * i + k < 2 * nelt)
5205 sel[i] = i;
5206 else
5207 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5209 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5211 first_vect = dr_chain[0];
5212 second_vect = dr_chain[1];
5214 /* Create interleaving stmt (low part of):
5215 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5216 ...}> */
5217 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5218 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5219 second_vect, perm3_mask_low);
5220 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5222 /* Create interleaving stmt (high part of):
5223 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5224 ...}> */
5225 first_vect = data_ref;
5226 second_vect = dr_chain[2];
5227 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5228 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5229 second_vect, perm3_mask_high);
5230 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5231 (*result_chain)[k] = data_ref;
5234 else
5236 /* If length is not equal to 3 then only power of 2 is supported. */
5237 gcc_assert (exact_log2 (length) != -1);
5239 for (i = 0; i < nelt; ++i)
5240 sel[i] = i * 2;
5241 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5243 for (i = 0; i < nelt; ++i)
5244 sel[i] = i * 2 + 1;
5245 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5247 for (i = 0; i < log_length; i++)
5249 for (j = 0; j < length; j += 2)
5251 first_vect = dr_chain[j];
5252 second_vect = dr_chain[j+1];
5254 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5255 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5256 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5257 first_vect, second_vect,
5258 perm_mask_even);
5259 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5260 (*result_chain)[j/2] = data_ref;
5262 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5263 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5264 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5265 first_vect, second_vect,
5266 perm_mask_odd);
5267 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5268 (*result_chain)[j/2+length/2] = data_ref;
5270 memcpy (dr_chain.address (), result_chain->address (),
5271 length * sizeof (tree));
5276 /* Function vect_shift_permute_load_chain.
5278 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5279 sequence of stmts to reorder the input data accordingly.
5280 Return the final references for loads in RESULT_CHAIN.
5281 Return true if successed, false otherwise.
5283 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5284 The input is 3 vectors each containing 8 elements. We assign a
5285 number to each element, the input sequence is:
5287 1st vec: 0 1 2 3 4 5 6 7
5288 2nd vec: 8 9 10 11 12 13 14 15
5289 3rd vec: 16 17 18 19 20 21 22 23
5291 The output sequence should be:
5293 1st vec: 0 3 6 9 12 15 18 21
5294 2nd vec: 1 4 7 10 13 16 19 22
5295 3rd vec: 2 5 8 11 14 17 20 23
5297 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5299 First we shuffle all 3 vectors to get correct elements order:
5301 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5302 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5303 3rd vec: (16 19 22) (17 20 23) (18 21)
5305 Next we unite and shift vector 3 times:
5307 1st step:
5308 shift right by 6 the concatenation of:
5309 "1st vec" and "2nd vec"
5310 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5311 "2nd vec" and "3rd vec"
5312 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5313 "3rd vec" and "1st vec"
5314 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5315 | New vectors |
5317 So that now new vectors are:
5319 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5320 2nd vec: (10 13) (16 19 22) (17 20 23)
5321 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5323 2nd step:
5324 shift right by 5 the concatenation of:
5325 "1st vec" and "3rd vec"
5326 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5327 "2nd vec" and "1st vec"
5328 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5329 "3rd vec" and "2nd vec"
5330 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5331 | New vectors |
5333 So that now new vectors are:
5335 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5336 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5337 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5339 3rd step:
5340 shift right by 5 the concatenation of:
5341 "1st vec" and "1st vec"
5342 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5343 shift right by 3 the concatenation of:
5344 "2nd vec" and "2nd vec"
5345 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5346 | New vectors |
5348 So that now all vectors are READY:
5349 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5350 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5351 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5353 This algorithm is faster than one in vect_permute_load_chain if:
5354 1. "shift of a concatination" is faster than general permutation.
5355 This is usually so.
5356 2. The TARGET machine can't execute vector instructions in parallel.
5357 This is because each step of the algorithm depends on previous.
5358 The algorithm in vect_permute_load_chain is much more parallel.
5360 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5363 static bool
5364 vect_shift_permute_load_chain (vec<tree> dr_chain,
5365 unsigned int length,
5366 gimple stmt,
5367 gimple_stmt_iterator *gsi,
5368 vec<tree> *result_chain)
5370 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5371 tree perm2_mask1, perm2_mask2, perm3_mask;
5372 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5373 gimple perm_stmt;
5375 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5376 unsigned int i;
5377 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5378 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5382 result_chain->quick_grow (length);
5383 memcpy (result_chain->address (), dr_chain.address (),
5384 length * sizeof (tree));
5386 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5388 unsigned int j, log_length = exact_log2 (length);
5389 for (i = 0; i < nelt / 2; ++i)
5390 sel[i] = i * 2;
5391 for (i = 0; i < nelt / 2; ++i)
5392 sel[nelt / 2 + i] = i * 2 + 1;
5393 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5397 "shuffle of 2 fields structure is not \
5398 supported by target\n");
5399 return false;
5401 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5403 for (i = 0; i < nelt / 2; ++i)
5404 sel[i] = i * 2 + 1;
5405 for (i = 0; i < nelt / 2; ++i)
5406 sel[nelt / 2 + i] = i * 2;
5407 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5409 if (dump_enabled_p ())
5410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5411 "shuffle of 2 fields structure is not \
5412 supported by target\n");
5413 return false;
5415 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5417 /* Generating permutation constant to shift all elements.
5418 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5419 for (i = 0; i < nelt; i++)
5420 sel[i] = nelt / 2 + i;
5421 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5423 if (dump_enabled_p ())
5424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5425 "shift permutation is not supported by target\n");
5426 return false;
5428 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5430 /* Generating permutation constant to select vector from 2.
5431 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5432 for (i = 0; i < nelt / 2; i++)
5433 sel[i] = i;
5434 for (i = nelt / 2; i < nelt; i++)
5435 sel[i] = nelt + i;
5436 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5438 if (dump_enabled_p ())
5439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5440 "select is not supported by target\n");
5441 return false;
5443 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5445 for (i = 0; i < log_length; i++)
5447 for (j = 0; j < length; j += 2)
5449 first_vect = dr_chain[j];
5450 second_vect = dr_chain[j + 1];
5452 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5453 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5454 first_vect, first_vect,
5455 perm2_mask1);
5456 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5457 vect[0] = data_ref;
5459 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5460 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5461 second_vect, second_vect,
5462 perm2_mask2);
5463 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5464 vect[1] = data_ref;
5466 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5467 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5468 vect[0], vect[1], shift1_mask);
5469 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5470 (*result_chain)[j/2 + length/2] = data_ref;
5472 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5473 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5474 vect[0], vect[1], select_mask);
5475 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5476 (*result_chain)[j/2] = data_ref;
5478 memcpy (dr_chain.address (), result_chain->address (),
5479 length * sizeof (tree));
5481 return true;
5483 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5485 unsigned int k = 0, l = 0;
5487 /* Generating permutation constant to get all elements in rigth order.
5488 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5489 for (i = 0; i < nelt; i++)
5491 if (3 * k + (l % 3) >= nelt)
5493 k = 0;
5494 l += (3 - (nelt % 3));
5496 sel[i] = 3 * k + (l % 3);
5497 k++;
5499 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5501 if (dump_enabled_p ())
5502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5503 "shuffle of 3 fields structure is not \
5504 supported by target\n");
5505 return false;
5507 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5509 /* Generating permutation constant to shift all elements.
5510 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5511 for (i = 0; i < nelt; i++)
5512 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5513 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5515 if (dump_enabled_p ())
5516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5517 "shift permutation is not supported by target\n");
5518 return false;
5520 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5522 /* Generating permutation constant to shift all elements.
5523 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5524 for (i = 0; i < nelt; i++)
5525 sel[i] = 2 * (nelt / 3) + 1 + i;
5526 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5528 if (dump_enabled_p ())
5529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5530 "shift permutation is not supported by target\n");
5531 return false;
5533 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5535 /* Generating permutation constant to shift all elements.
5536 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5537 for (i = 0; i < nelt; i++)
5538 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5539 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5541 if (dump_enabled_p ())
5542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5543 "shift permutation is not supported by target\n");
5544 return false;
5546 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5548 /* Generating permutation constant to shift all elements.
5549 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5550 for (i = 0; i < nelt; i++)
5551 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5552 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5554 if (dump_enabled_p ())
5555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5556 "shift permutation is not supported by target\n");
5557 return false;
5559 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5561 for (k = 0; k < 3; k++)
5563 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5564 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5565 dr_chain[k], dr_chain[k],
5566 perm3_mask);
5567 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5568 vect[k] = data_ref;
5571 for (k = 0; k < 3; k++)
5573 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5574 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5575 vect[k % 3], vect[(k + 1) % 3],
5576 shift1_mask);
5577 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5578 vect_shift[k] = data_ref;
5581 for (k = 0; k < 3; k++)
5583 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5584 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5585 vect_shift[(4 - k) % 3],
5586 vect_shift[(3 - k) % 3],
5587 shift2_mask);
5588 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5589 vect[k] = data_ref;
5592 (*result_chain)[3 - (nelt % 3)] = vect[2];
5594 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5595 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5596 vect[0], shift3_mask);
5597 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5598 (*result_chain)[nelt % 3] = data_ref;
5600 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5601 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5602 vect[1], shift4_mask);
5603 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5604 (*result_chain)[0] = data_ref;
5605 return true;
5607 return false;
5610 /* Function vect_transform_grouped_load.
5612 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5613 to perform their permutation and ascribe the result vectorized statements to
5614 the scalar statements.
5617 void
5618 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5619 gimple_stmt_iterator *gsi)
5621 machine_mode mode;
5622 vec<tree> result_chain = vNULL;
5624 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5625 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5626 vectors, that are ready for vector computation. */
5627 result_chain.create (size);
5629 /* If reassociation width for vector type is 2 or greater target machine can
5630 execute 2 or more vector instructions in parallel. Otherwise try to
5631 get chain for loads group using vect_shift_permute_load_chain. */
5632 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5633 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5634 || exact_log2 (size) != -1
5635 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5636 gsi, &result_chain))
5637 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5638 vect_record_grouped_load_vectors (stmt, result_chain);
5639 result_chain.release ();
5642 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5643 generated as part of the vectorization of STMT. Assign the statement
5644 for each vector to the associated scalar statement. */
5646 void
5647 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5649 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5650 gimple next_stmt, new_stmt;
5651 unsigned int i, gap_count;
5652 tree tmp_data_ref;
5654 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5655 Since we scan the chain starting from it's first node, their order
5656 corresponds the order of data-refs in RESULT_CHAIN. */
5657 next_stmt = first_stmt;
5658 gap_count = 1;
5659 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5661 if (!next_stmt)
5662 break;
5664 /* Skip the gaps. Loads created for the gaps will be removed by dead
5665 code elimination pass later. No need to check for the first stmt in
5666 the group, since it always exists.
5667 GROUP_GAP is the number of steps in elements from the previous
5668 access (if there is no gap GROUP_GAP is 1). We skip loads that
5669 correspond to the gaps. */
5670 if (next_stmt != first_stmt
5671 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5673 gap_count++;
5674 continue;
5677 while (next_stmt)
5679 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5680 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5681 copies, and we put the new vector statement in the first available
5682 RELATED_STMT. */
5683 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5684 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5685 else
5687 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5689 gimple prev_stmt =
5690 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5691 gimple rel_stmt =
5692 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5693 while (rel_stmt)
5695 prev_stmt = rel_stmt;
5696 rel_stmt =
5697 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5700 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5701 new_stmt;
5705 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5706 gap_count = 1;
5707 /* If NEXT_STMT accesses the same DR as the previous statement,
5708 put the same TMP_DATA_REF as its vectorized statement; otherwise
5709 get the next data-ref from RESULT_CHAIN. */
5710 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5711 break;
5716 /* Function vect_force_dr_alignment_p.
5718 Returns whether the alignment of a DECL can be forced to be aligned
5719 on ALIGNMENT bit boundary. */
5721 bool
5722 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5724 if (TREE_CODE (decl) != VAR_DECL)
5725 return false;
5727 if (decl_in_symtab_p (decl)
5728 && !symtab_node::get (decl)->can_increase_alignment_p ())
5729 return false;
5731 if (TREE_STATIC (decl))
5732 return (alignment <= MAX_OFILE_ALIGNMENT);
5733 else
5734 return (alignment <= MAX_STACK_ALIGNMENT);
5738 /* Return whether the data reference DR is supported with respect to its
5739 alignment.
5740 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5741 it is aligned, i.e., check if it is possible to vectorize it with different
5742 alignment. */
5744 enum dr_alignment_support
5745 vect_supportable_dr_alignment (struct data_reference *dr,
5746 bool check_aligned_accesses)
5748 gimple stmt = DR_STMT (dr);
5749 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5750 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5751 machine_mode mode = TYPE_MODE (vectype);
5752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5753 struct loop *vect_loop = NULL;
5754 bool nested_in_vect_loop = false;
5756 if (aligned_access_p (dr) && !check_aligned_accesses)
5757 return dr_aligned;
5759 /* For now assume all conditional loads/stores support unaligned
5760 access without any special code. */
5761 if (is_gimple_call (stmt)
5762 && gimple_call_internal_p (stmt)
5763 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5764 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5765 return dr_unaligned_supported;
5767 if (loop_vinfo)
5769 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5770 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5773 /* Possibly unaligned access. */
5775 /* We can choose between using the implicit realignment scheme (generating
5776 a misaligned_move stmt) and the explicit realignment scheme (generating
5777 aligned loads with a REALIGN_LOAD). There are two variants to the
5778 explicit realignment scheme: optimized, and unoptimized.
5779 We can optimize the realignment only if the step between consecutive
5780 vector loads is equal to the vector size. Since the vector memory
5781 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5782 is guaranteed that the misalignment amount remains the same throughout the
5783 execution of the vectorized loop. Therefore, we can create the
5784 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5785 at the loop preheader.
5787 However, in the case of outer-loop vectorization, when vectorizing a
5788 memory access in the inner-loop nested within the LOOP that is now being
5789 vectorized, while it is guaranteed that the misalignment of the
5790 vectorized memory access will remain the same in different outer-loop
5791 iterations, it is *not* guaranteed that is will remain the same throughout
5792 the execution of the inner-loop. This is because the inner-loop advances
5793 with the original scalar step (and not in steps of VS). If the inner-loop
5794 step happens to be a multiple of VS, then the misalignment remains fixed
5795 and we can use the optimized realignment scheme. For example:
5797 for (i=0; i<N; i++)
5798 for (j=0; j<M; j++)
5799 s += a[i+j];
5801 When vectorizing the i-loop in the above example, the step between
5802 consecutive vector loads is 1, and so the misalignment does not remain
5803 fixed across the execution of the inner-loop, and the realignment cannot
5804 be optimized (as illustrated in the following pseudo vectorized loop):
5806 for (i=0; i<N; i+=4)
5807 for (j=0; j<M; j++){
5808 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5809 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5810 // (assuming that we start from an aligned address).
5813 We therefore have to use the unoptimized realignment scheme:
5815 for (i=0; i<N; i+=4)
5816 for (j=k; j<M; j+=4)
5817 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5818 // that the misalignment of the initial address is
5819 // 0).
5821 The loop can then be vectorized as follows:
5823 for (k=0; k<4; k++){
5824 rt = get_realignment_token (&vp[k]);
5825 for (i=0; i<N; i+=4){
5826 v1 = vp[i+k];
5827 for (j=k; j<M; j+=4){
5828 v2 = vp[i+j+VS-1];
5829 va = REALIGN_LOAD <v1,v2,rt>;
5830 vs += va;
5831 v1 = v2;
5834 } */
5836 if (DR_IS_READ (dr))
5838 bool is_packed = false;
5839 tree type = (TREE_TYPE (DR_REF (dr)));
5841 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5842 && (!targetm.vectorize.builtin_mask_for_load
5843 || targetm.vectorize.builtin_mask_for_load ()))
5845 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5846 if ((nested_in_vect_loop
5847 && (TREE_INT_CST_LOW (DR_STEP (dr))
5848 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5849 || !loop_vinfo)
5850 return dr_explicit_realign;
5851 else
5852 return dr_explicit_realign_optimized;
5854 if (!known_alignment_for_access_p (dr))
5855 is_packed = not_size_aligned (DR_REF (dr));
5857 if ((TYPE_USER_ALIGN (type) && !is_packed)
5858 || targetm.vectorize.
5859 support_vector_misalignment (mode, type,
5860 DR_MISALIGNMENT (dr), is_packed))
5861 /* Can't software pipeline the loads, but can at least do them. */
5862 return dr_unaligned_supported;
5864 else
5866 bool is_packed = false;
5867 tree type = (TREE_TYPE (DR_REF (dr)));
5869 if (!known_alignment_for_access_p (dr))
5870 is_packed = not_size_aligned (DR_REF (dr));
5872 if ((TYPE_USER_ALIGN (type) && !is_packed)
5873 || targetm.vectorize.
5874 support_vector_misalignment (mode, type,
5875 DR_MISALIGNMENT (dr), is_packed))
5876 return dr_unaligned_supported;
5879 /* Unsupported. */
5880 return dr_unaligned_unsupported;