gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tm_p.h"
40 #include "target.h"
41 #include "predict.h"
42 #include "hard-reg-set.h"
43 #include "function.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "basic-block.h"
47 #include "gimple-pretty-print.h"
48 #include "tree-ssa-alias.h"
49 #include "internal-fn.h"
50 #include "tree-eh.h"
51 #include "gimple-expr.h"
52 #include "is-a.h"
53 #include "gimple.h"
54 #include "gimplify.h"
55 #include "gimple-iterator.h"
56 #include "gimplify-me.h"
57 #include "gimple-ssa.h"
58 #include "tree-phinodes.h"
59 #include "ssa-iterators.h"
60 #include "stringpool.h"
61 #include "tree-ssanames.h"
62 #include "tree-ssa-loop-ivopts.h"
63 #include "tree-ssa-loop-manip.h"
64 #include "tree-ssa-loop.h"
65 #include "cfgloop.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
69 #include "diagnostic-core.h"
70 #include "hash-map.h"
71 #include "plugin-api.h"
72 #include "ipa-ref.h"
73 #include "cgraph.h"
74 /* Need to include rtl.h, expr.h, etc. for optabs. */
75 #include "hashtab.h"
76 #include "rtl.h"
77 #include "flags.h"
78 #include "statistics.h"
79 #include "real.h"
80 #include "fixed-value.h"
81 #include "insn-config.h"
82 #include "expmed.h"
83 #include "dojump.h"
84 #include "explow.h"
85 #include "calls.h"
86 #include "emit-rtl.h"
87 #include "varasm.h"
88 #include "stmt.h"
89 #include "expr.h"
90 #include "insn-codes.h"
91 #include "optabs.h"
92 #include "builtins.h"
94 /* Return true if load- or store-lanes optab OPTAB is implemented for
95 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
97 static bool
98 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
99 tree vectype, unsigned HOST_WIDE_INT count)
101 machine_mode mode, array_mode;
102 bool limit_p;
104 mode = TYPE_MODE (vectype);
105 limit_p = !targetm.array_mode_supported_p (mode, count);
106 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
107 MODE_INT, limit_p);
109 if (array_mode == BLKmode)
111 if (dump_enabled_p ())
112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
113 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
114 GET_MODE_NAME (mode), count);
115 return false;
118 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
120 if (dump_enabled_p ())
121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
122 "cannot use %s<%s><%s>\n", name,
123 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
124 return false;
127 if (dump_enabled_p ())
128 dump_printf_loc (MSG_NOTE, vect_location,
129 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
130 GET_MODE_NAME (mode));
132 return true;
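/* A minimal usage sketch (illustrative only; the group size of 4 is an
   assumption, not taken from a particular caller):

     if (vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab, vectype, 4))
       ... use one load-lanes instruction for the whole group ...

   Roughly, an integer mode wide enough to hold the four vectors must exist
   and the target must provide the matching vec_load_lanes pattern for it.  */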
136 /* Return the smallest scalar part of STMT.
137 This is used to determine the vectype of the stmt. We generally set the
138 vectype according to the type of the result (lhs). For stmts whose
139 result-type is different than the type of the arguments (e.g., demotion,
140 promotion), vectype will be reset appropriately (later). Note that we have
141 to visit the smallest datatype in this function, because that determines the
142 VF. If the smallest datatype in the loop is present only as the rhs of a
143 promotion operation - we'd miss it.
144 Such a case, where a variable of this datatype does not appear in the lhs
145 anywhere in the loop, can only occur if it's an invariant: e.g.:
146 'int_x = (int) short_inv', which we'd expect to have been optimized away by
147 invariant motion. However, we cannot rely on invariant motion to always
148 take invariants out of the loop, and so in the case of promotion we also
149 have to check the rhs.
150 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
151 types. */
153 tree
154 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
155 HOST_WIDE_INT *rhs_size_unit)
157 tree scalar_type = gimple_expr_type (stmt);
158 HOST_WIDE_INT lhs, rhs;
160 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
162 if (is_gimple_assign (stmt)
163 && (gimple_assign_cast_p (stmt)
164 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
165 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
166 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
168 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
170 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
171 if (rhs < lhs)
172 scalar_type = rhs_type;
175 *lhs_size_unit = lhs;
176 *rhs_size_unit = rhs;
177 return scalar_type;
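/* Worked example (illustrative): for the widening statement

     int_x = (int) short_y;

   the lhs type is 'int' (4 bytes) and the rhs type is 'short' (2 bytes),
   so *LHS_SIZE_UNIT = 4, *RHS_SIZE_UNIT = 2 and the 'short' type is
   returned.  Basing the vectorization factor on the 2-byte type gives,
   e.g., VF = 8 for 16-byte vectors instead of VF = 4.  */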
181 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
182 tested at run-time. Return TRUE if DDR was successfully inserted.
183 Return false if versioning is not supported. */
185 static bool
186 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
190 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
191 return false;
193 if (dump_enabled_p ())
195 dump_printf_loc (MSG_NOTE, vect_location,
196 "mark for run-time aliasing test between ");
197 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
198 dump_printf (MSG_NOTE, " and ");
199 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
200 dump_printf (MSG_NOTE, "\n");
203 if (optimize_loop_nest_for_size_p (loop))
205 if (dump_enabled_p ())
206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
207 "versioning not supported when optimizing"
208 " for size.\n");
209 return false;
212 /* FORNOW: We don't support versioning with outer-loop vectorization. */
213 if (loop->inner)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not yet supported for outer-loops.\n");
218 return false;
221 /* FORNOW: We don't support creating runtime alias tests for non-constant
222 step. */
223 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
224 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
226 if (dump_enabled_p ())
227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228 "versioning not yet supported for non-constant "
229 "step\n");
230 return false;
233 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
234 return true;
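/* For illustration: a DDR recorded here later becomes a run-time check that
   the two access ranges of the vectorized loop do not overlap, conceptually

     (addr_a + segment_len_a <= addr_b) || (addr_b + segment_len_b <= addr_a)

   with the segment lengths derived from DR_STEP and the number of
   iterations; see vect_create_cond_for_alias_checks.  */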
238 /* Function vect_analyze_data_ref_dependence.
240 Return TRUE if there (might) exist a dependence between a memory-reference
241 DRA and a memory-reference DRB. When versioning for alias can check the
242 dependence at run time instead, return FALSE. Adjust *MAX_VF according to
243 the data dependence. */
245 static bool
246 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
247 loop_vec_info loop_vinfo, int *max_vf)
249 unsigned int i;
250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
251 struct data_reference *dra = DDR_A (ddr);
252 struct data_reference *drb = DDR_B (ddr);
253 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
254 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
255 lambda_vector dist_v;
256 unsigned int loop_depth;
258 /* In loop analysis all data references should be vectorizable. */
259 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
260 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
261 gcc_unreachable ();
263 /* Independent data accesses. */
264 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
265 return false;
267 if (dra == drb
268 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
269 return false;
271 /* Even if we have an anti-dependence then, as the vectorized loop covers at
272 least two scalar iterations, there is always also a true dependence.
273 As the vectorizer does not re-order loads and stores we can ignore
274 the anti-dependence if TBAA can disambiguate both DRs similar to the
275 case with known negative distance anti-dependences (positive
276 distance anti-dependences would violate TBAA constraints). */
277 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
278 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
279 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
280 get_alias_set (DR_REF (drb))))
281 return false;
283 /* Unknown data dependence. */
284 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
286 /* If user asserted safelen consecutive iterations can be
287 executed concurrently, assume independence. */
288 if (loop->safelen >= 2)
290 if (loop->safelen < *max_vf)
291 *max_vf = loop->safelen;
292 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
293 return false;
296 if (STMT_VINFO_GATHER_P (stmtinfo_a)
297 || STMT_VINFO_GATHER_P (stmtinfo_b))
299 if (dump_enabled_p ())
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias not supported for: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
311 return true;
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias required: "
318 "can't determine dependence between ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
324 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
327 /* Add to list of ddrs that need to be tested at run-time. */
328 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
331 /* Known data dependence. */
332 if (DDR_NUM_DIST_VECTS (ddr) == 0)
334 /* If user asserted safelen consecutive iterations can be
335 executed concurrently, assume independence. */
336 if (loop->safelen >= 2)
338 if (loop->safelen < *max_vf)
339 *max_vf = loop->safelen;
340 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
341 return false;
344 if (STMT_VINFO_GATHER_P (stmtinfo_a)
345 || STMT_VINFO_GATHER_P (stmtinfo_b))
347 if (dump_enabled_p ())
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias not supported for: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (dra));
354 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
356 DR_REF (drb));
357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
359 return true;
362 if (dump_enabled_p ())
364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
365 "versioning for alias required: "
366 "bad dist vector for ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
368 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
369 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
370 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
376 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
377 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
379 int dist = dist_v[loop_depth];
381 if (dump_enabled_p ())
382 dump_printf_loc (MSG_NOTE, vect_location,
383 "dependence distance = %d.\n", dist);
385 if (dist == 0)
387 if (dump_enabled_p ())
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "dependence distance == 0 between ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
392 dump_printf (MSG_NOTE, " and ");
393 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
394 dump_printf (MSG_NOTE, "\n");
397 /* When we perform grouped accesses and perform implicit CSE
398 by detecting equal accesses and doing disambiguation with
399 runtime alias tests like for
400 .. = a[i];
401 .. = a[i+1];
402 a[i] = ..;
403 a[i+1] = ..;
404 *p = ..;
405 .. = a[i];
406 .. = a[i+1];
407 where we will end up loading { a[i], a[i+1] } once, make
408 sure that inserting group loads before the first load and
409 stores after the last store will do the right thing.
410 Similar for groups like
411 a[i] = ...;
412 ... = a[i];
413 a[i+1] = ...;
414 where loads from the group interleave with the store. */
415 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
416 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
418 gimple earlier_stmt;
419 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
420 if (DR_IS_WRITE
421 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 "READ_WRITE dependence in interleaving."
426 "\n");
427 return true;
431 continue;
434 if (dist > 0 && DDR_REVERSED_P (ddr))
436 /* If DDR_REVERSED_P the order of the data-refs in DDR was
437 reversed (to make distance vector positive), and the actual
438 distance is negative. */
439 if (dump_enabled_p ())
440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 "dependence distance negative.\n");
442 /* Record a negative dependence distance to later limit the
443 amount of stmt copying / unrolling we can perform.
444 Only need to handle read-after-write dependence. */
445 if (DR_IS_READ (drb)
446 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
447 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
448 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
449 continue;
452 if (abs (dist) >= 2
453 && abs (dist) < *max_vf)
455 /* The dependence distance requires reduction of the maximal
456 vectorization factor. */
457 *max_vf = abs (dist);
458 if (dump_enabled_p ())
459 dump_printf_loc (MSG_NOTE, vect_location,
460 "adjusting maximal vectorization factor to %i\n",
461 *max_vf);
464 if (abs (dist) >= *max_vf)
466 /* Dependence distance does not create dependence, as far as
467 vectorization is concerned, in this case. */
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance >= VF.\n");
471 continue;
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized, possible dependence "
478 "between data-refs ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
480 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
482 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
485 return true;
488 return false;
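/* Worked example (illustrative numbers): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3.  If *MAX_VF is 4 at this point, the
   "abs (dist) >= 2 && abs (dist) < *max_vf" case reduces *MAX_VF to 3,
   after which "abs (dist) >= *max_vf" holds and the dependence no longer
   blocks vectorization with a factor of at most 3.  */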
491 /* Function vect_analyze_data_ref_dependences.
493 Examine all the data references in the loop, and make sure there do not
494 exist any data dependences between them. Set *MAX_VF according to
495 the maximum vectorization factor the data dependences allow. */
497 bool
498 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
500 unsigned int i;
501 struct data_dependence_relation *ddr;
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location,
505 "=== vect_analyze_data_ref_dependences ===\n");
507 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
513 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
514 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
515 return false;
517 return true;
521 /* Function vect_slp_analyze_data_ref_dependence.
523 Return TRUE if there (might) exist a dependence between a memory-reference
524 DRA and a memory-reference DRB. Unlike the loop analysis there is no
525 run-time alias fallback here and no *MAX_VF to adjust. */
528 static bool
529 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
531 struct data_reference *dra = DDR_A (ddr);
532 struct data_reference *drb = DDR_B (ddr);
534 /* We need to check dependences of statements marked as unvectorizable
535 as well, they still can prohibit vectorization. */
537 /* Independent data accesses. */
538 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
539 return false;
541 if (dra == drb)
542 return false;
544 /* Read-read is OK. */
545 if (DR_IS_READ (dra) && DR_IS_READ (drb))
546 return false;
548 /* If dra and drb are part of the same interleaving chain consider
549 them independent. */
550 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
551 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
552 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
553 return false;
555 /* Unknown data dependence. */
556 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
558 if (dump_enabled_p ())
560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
561 "can't determine dependence between ");
562 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
563 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
565 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
568 else if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "determined dependence between ");
572 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
573 dump_printf (MSG_NOTE, " and ");
574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
575 dump_printf (MSG_NOTE, "\n");
578 /* We do not vectorize basic blocks with write-write dependencies. */
579 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
580 return true;
582 /* If we have a read-write dependence, check that the load is before the store.
583 When we vectorize basic blocks, a vector load can only be placed before the
584 corresponding scalar load, and a vector store only after its
585 corresponding scalar store. So the order of the accesses is preserved in
586 case the load is before the store. */
587 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
588 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
590 /* That only holds for load-store pairs taking part in vectorization. */
591 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
592 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
593 return false;
596 return true;
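/* Illustrative basic-block case for the ordering check above:

     x = a[0];      (scalar load first)
     a[0] = y;      (scalar store after it)

   The vector load can only be emitted before its scalar load and the vector
   store only after its scalar store, so a load that already precedes the
   store keeps its ordering and the dependence is harmless; if the store came
   first the function returns true instead.  */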
600 /* Function vect_slp_analyze_data_ref_dependences.
602 Examine all the data references in the basic-block, and make sure there
603 do not exist any data dependences between them. Return FALSE as soon as
604 a prohibiting dependence is found. */
606 bool
607 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
609 struct data_dependence_relation *ddr;
610 unsigned int i;
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_NOTE, vect_location,
614 "=== vect_slp_analyze_data_ref_dependences ===\n");
616 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
617 &BB_VINFO_DDRS (bb_vinfo),
618 vNULL, true))
619 return false;
621 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
622 if (vect_slp_analyze_data_ref_dependence (ddr))
623 return false;
625 return true;
629 /* Function vect_compute_data_ref_alignment
631 Compute the misalignment of the data reference DR.
633 Output:
634 1. If during the misalignment computation it is found that the data reference
635 cannot be vectorized then false is returned.
636 2. DR_MISALIGNMENT (DR) is defined.
638 FOR NOW: No analysis is actually performed. Misalignment is calculated
639 only for trivial cases. TODO. */
641 static bool
642 vect_compute_data_ref_alignment (struct data_reference *dr)
644 gimple stmt = DR_STMT (dr);
645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
647 struct loop *loop = NULL;
648 tree ref = DR_REF (dr);
649 tree vectype;
650 tree base, base_addr;
651 bool base_aligned;
652 tree misalign;
653 tree aligned_to;
654 unsigned HOST_WIDE_INT alignment;
656 if (dump_enabled_p ())
657 dump_printf_loc (MSG_NOTE, vect_location,
658 "vect_compute_data_ref_alignment:\n");
660 if (loop_vinfo)
661 loop = LOOP_VINFO_LOOP (loop_vinfo);
663 /* Initialize misalignment to unknown. */
664 SET_DR_MISALIGNMENT (dr, -1);
666 /* Strided loads perform only component accesses, misalignment information
667 is irrelevant for them. */
668 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
669 return true;
671 misalign = DR_INIT (dr);
672 aligned_to = DR_ALIGNED_TO (dr);
673 base_addr = DR_BASE_ADDRESS (dr);
674 vectype = STMT_VINFO_VECTYPE (stmt_info);
676 /* In case the dataref is in an inner-loop of the loop that is being
677 vectorized (LOOP), we use the base and misalignment information
678 relative to the outer-loop (LOOP). This is ok only if the misalignment
679 stays the same throughout the execution of the inner-loop, which is why
680 we have to check that the stride of the dataref in the inner-loop is
681 evenly divisible by the vector size. */
682 if (loop && nested_in_vect_loop_p (loop, stmt))
684 tree step = DR_STEP (dr);
685 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
687 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
689 if (dump_enabled_p ())
690 dump_printf_loc (MSG_NOTE, vect_location,
691 "inner step divides the vector-size.\n");
692 misalign = STMT_VINFO_DR_INIT (stmt_info);
693 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
694 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
696 else
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700 "inner step doesn't divide the vector-size.\n");
701 misalign = NULL_TREE;
705 /* Similarly, if we're doing basic-block vectorization, we can only use
706 base and misalignment information relative to an innermost loop if the
707 misalignment stays the same throughout the execution of the loop.
708 As above, this is the case if the stride of the dataref is evenly
709 divisible by the vector size. */
710 if (!loop)
712 tree step = DR_STEP (dr);
713 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
715 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "SLP: step doesn't divide the vector-size.\n");
720 misalign = NULL_TREE;
724 alignment = TYPE_ALIGN_UNIT (vectype);
726 if ((compare_tree_int (aligned_to, alignment) < 0)
727 || !misalign)
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
732 "Unknown alignment for access: ");
733 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
734 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
736 return true;
739 /* To look at alignment of the base we have to preserve an inner MEM_REF
740 as that carries alignment information of the actual access. */
741 base = ref;
742 while (handled_component_p (base))
743 base = TREE_OPERAND (base, 0);
744 if (TREE_CODE (base) == MEM_REF)
745 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
746 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
748 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
749 base_aligned = true;
750 else
751 base_aligned = false;
753 if (!base_aligned)
755 /* Strip an inner MEM_REF to a bare decl if possible. */
756 if (TREE_CODE (base) == MEM_REF
757 && integer_zerop (TREE_OPERAND (base, 1))
758 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
759 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
761 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
763 if (dump_enabled_p ())
765 dump_printf_loc (MSG_NOTE, vect_location,
766 "can't force alignment of ref: ");
767 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
768 dump_printf (MSG_NOTE, "\n");
770 return true;
773 /* Force the alignment of the decl.
774 NOTE: This is the only change to the code we make during
775 the analysis phase, before deciding to vectorize the loop. */
776 if (dump_enabled_p ())
778 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
779 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
780 dump_printf (MSG_NOTE, "\n");
783 ((dataref_aux *)dr->aux)->base_decl = base;
784 ((dataref_aux *)dr->aux)->base_misaligned = true;
787 /* If this is a backward running DR then first access in the larger
788 vectype actually is N-1 elements before the address in the DR.
789 Adjust misalign accordingly. */
790 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
792 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
793 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
794 otherwise we wouldn't be here. */
795 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
796 /* PLUS because DR_STEP was negative. */
797 misalign = size_binop (PLUS_EXPR, misalign, offset);
800 SET_DR_MISALIGNMENT (dr,
801 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
803 if (dump_enabled_p ())
805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
806 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
807 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
808 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
811 return true;
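/* Worked example (illustrative): assume V4SI vectors, i.e. a 16-byte
   TYPE_ALIGN_UNIT, a DR_BASE_ADDRESS known to be 16-byte aligned and
   DR_INIT == 20.  Then DR_MISALIGNMENT becomes 20 mod 16 == 4.  If in
   addition DR_STEP is -4 (a backward access), the first vector access
   really starts 3 elements earlier, so 3 * -4 == -12 is added first and
   DR_MISALIGNMENT becomes (20 - 12) mod 16 == 8.  */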
815 /* Function vect_compute_data_refs_alignment
817 Compute the misalignment of data references in the loop.
818 Return FALSE if a data reference is found that cannot be vectorized. */
820 static bool
821 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
822 bb_vec_info bb_vinfo)
824 vec<data_reference_p> datarefs;
825 struct data_reference *dr;
826 unsigned int i;
828 if (loop_vinfo)
829 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
830 else
831 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
833 FOR_EACH_VEC_ELT (datarefs, i, dr)
834 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
835 && !vect_compute_data_ref_alignment (dr))
837 if (bb_vinfo)
839 /* Mark unsupported statement as unvectorizable. */
840 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
841 continue;
843 else
844 return false;
847 return true;
851 /* Function vect_update_misalignment_for_peel
853 DR - the data reference whose misalignment is to be adjusted.
854 DR_PEEL - the data reference whose misalignment is being made
855 zero in the vector loop by the peel.
856 NPEEL - the number of iterations in the peel loop if the misalignment
857 of DR_PEEL is known at compile time. */
859 static void
860 vect_update_misalignment_for_peel (struct data_reference *dr,
861 struct data_reference *dr_peel, int npeel)
863 unsigned int i;
864 vec<dr_p> same_align_drs;
865 struct data_reference *current_dr;
866 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
867 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
868 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
869 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
871 /* For interleaved data accesses the step in the loop must be multiplied by
872 the size of the interleaving group. */
873 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
874 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
875 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
876 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
878 /* It can be assumed that the data refs with the same alignment as dr_peel
879 are aligned in the vector loop. */
880 same_align_drs
881 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
882 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
884 if (current_dr != dr)
885 continue;
886 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
887 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
888 SET_DR_MISALIGNMENT (dr, 0);
889 return;
892 if (known_alignment_for_access_p (dr)
893 && known_alignment_for_access_p (dr_peel))
895 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
896 int misal = DR_MISALIGNMENT (dr);
897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
898 misal += negative ? -npeel * dr_size : npeel * dr_size;
899 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
900 SET_DR_MISALIGNMENT (dr, misal);
901 return;
904 if (dump_enabled_p ())
905 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
906 SET_DR_MISALIGNMENT (dr, -1);
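/* Worked example (illustrative): DR has DR_MISALIGNMENT == 4, 4-byte
   elements, a forward step and a 16-byte vector type, and the peel loop
   runs NPEEL == 3 iterations:

     misal = 4 + 3 * 4 == 16;   misal &= 16 - 1;   =>  new misalignment 0

   so this DR happens to become aligned as well; with NPEEL == 2 the new
   misalignment would be 12.  When the alignment of DR or DR_PEEL is not
   known at compile time the misalignment is reset to -1 (unknown).  */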
910 /* Function vect_verify_datarefs_alignment
912 Return TRUE if all data references in the loop can be
913 handled with respect to alignment. */
915 bool
916 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
918 vec<data_reference_p> datarefs;
919 struct data_reference *dr;
920 enum dr_alignment_support supportable_dr_alignment;
921 unsigned int i;
923 if (loop_vinfo)
924 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
925 else
926 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
928 FOR_EACH_VEC_ELT (datarefs, i, dr)
930 gimple stmt = DR_STMT (dr);
931 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
933 if (!STMT_VINFO_RELEVANT_P (stmt_info))
934 continue;
936 /* For interleaving, only the alignment of the first access matters.
937 Skip statements marked as not vectorizable. */
938 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
939 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
940 || !STMT_VINFO_VECTORIZABLE (stmt_info))
941 continue;
943 /* Strided loads perform only component accesses, alignment is
944 irrelevant for them. */
945 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
946 continue;
948 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
949 if (!supportable_dr_alignment)
951 if (dump_enabled_p ())
953 if (DR_IS_READ (dr))
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "not vectorized: unsupported unaligned load.");
956 else
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "not vectorized: unsupported unaligned "
959 "store.");
961 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
962 DR_REF (dr));
963 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
965 return false;
967 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
968 dump_printf_loc (MSG_NOTE, vect_location,
969 "Vectorizing an unaligned access.\n");
971 return true;
974 Given a memory reference EXP, return whether its alignment is less
975 than its size. */
977 static bool
978 not_size_aligned (tree exp)
980 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
981 return true;
983 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
984 > get_object_alignment (exp));
987 /* Function vector_alignment_reachable_p
989 Return true if vector alignment for DR is reachable by peeling
990 a few loop iterations. Return false otherwise. */
992 static bool
993 vector_alignment_reachable_p (struct data_reference *dr)
995 gimple stmt = DR_STMT (dr);
996 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
997 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1001 /* For interleaved accesses we peel only if the number of iterations in
1002 the prolog loop ({VF - misalignment}) is a multiple of the
1003 number of interleaved accesses. */
1004 int elem_size, mis_in_elements;
1005 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1007 /* FORNOW: handle only known alignment. */
1008 if (!known_alignment_for_access_p (dr))
1009 return false;
1011 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1012 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1014 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1015 return false;
1018 /* If misalignment is known at the compile time then allow peeling
1019 only if natural alignment is reachable through peeling. */
1020 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1022 HOST_WIDE_INT elmsize =
1023 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1024 if (dump_enabled_p ())
1026 dump_printf_loc (MSG_NOTE, vect_location,
1027 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1028 dump_printf (MSG_NOTE,
1029 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1031 if (DR_MISALIGNMENT (dr) % elmsize)
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "data size does not divide the misalignment.\n");
1036 return false;
1040 if (!known_alignment_for_access_p (dr))
1042 tree type = TREE_TYPE (DR_REF (dr));
1043 bool is_packed = not_size_aligned (DR_REF (dr));
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "Unknown misalignment, is_packed = %d\n",is_packed);
1047 if ((TYPE_USER_ALIGN (type) && !is_packed)
1048 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1049 return true;
1050 else
1051 return false;
1054 return true;
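/* Illustrative arithmetic for the grouped case above: with V4SI vectors
   (nelements == 4, elem_size == 4) and DR_MISALIGNMENT == 8 we get
   mis_in_elements == 2, so the prologue would peel 4 - 2 == 2 scalar
   iterations.  That is acceptable only if 2 is a multiple of GROUP_SIZE:
   peeling works for a group of 2 interleaved accesses but not for a
   group of 3.  */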
1058 /* Calculate the cost of the memory access represented by DR. */
1060 static void
1061 vect_get_data_access_cost (struct data_reference *dr,
1062 unsigned int *inside_cost,
1063 unsigned int *outside_cost,
1064 stmt_vector_for_cost *body_cost_vec)
1066 gimple stmt = DR_STMT (dr);
1067 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1068 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1070 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1071 int ncopies = vf / nunits;
1073 if (DR_IS_READ (dr))
1074 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1075 NULL, body_cost_vec, false);
1076 else
1077 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_NOTE, vect_location,
1081 "vect_get_data_access_cost: inside_cost = %d, "
1082 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1086 /* Insert DR into peeling hash table with NPEEL as key. */
1088 static void
1089 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1090 int npeel)
1092 struct _vect_peel_info elem, *slot;
1093 _vect_peel_info **new_slot;
1094 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1096 elem.npeel = npeel;
1097 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1098 if (slot)
1099 slot->count++;
1100 else
1102 slot = XNEW (struct _vect_peel_info);
1103 slot->npeel = npeel;
1104 slot->dr = dr;
1105 slot->count = 1;
1106 new_slot
1107 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1108 *new_slot = slot;
1111 if (!supportable_dr_alignment
1112 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1113 slot->count += VECT_MAX_COST;
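/* Illustrative hash contents after inserting three data-refs that need
   1, 1 and 3 peeled iterations respectively:

     npeel == 1  ->  count == 2        npeel == 3  ->  count == 1

   The "most frequent" strategy below would then choose npeel == 1.  */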
1117 /* Traverse peeling hash table to find peeling option that aligns maximum
1118 number of data accesses. */
1121 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1122 _vect_peel_extended_info *max)
1124 vect_peel_info elem = *slot;
1126 if (elem->count > max->peel_info.count
1127 || (elem->count == max->peel_info.count
1128 && max->peel_info.npeel > elem->npeel))
1130 max->peel_info.npeel = elem->npeel;
1131 max->peel_info.count = elem->count;
1132 max->peel_info.dr = elem->dr;
1135 return 1;
1139 /* Traverse peeling hash table and calculate cost for each peeling option.
1140 Find the one with the lowest cost. */
1143 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1144 _vect_peel_extended_info *min)
1146 vect_peel_info elem = *slot;
1147 int save_misalignment, dummy;
1148 unsigned int inside_cost = 0, outside_cost = 0, i;
1149 gimple stmt = DR_STMT (elem->dr);
1150 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1151 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1152 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1153 struct data_reference *dr;
1154 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1155 int single_iter_cost;
1157 prologue_cost_vec.create (2);
1158 body_cost_vec.create (2);
1159 epilogue_cost_vec.create (2);
1161 FOR_EACH_VEC_ELT (datarefs, i, dr)
1163 stmt = DR_STMT (dr);
1164 stmt_info = vinfo_for_stmt (stmt);
1165 /* For interleaving, only the alignment of the first access
1166 matters. */
1167 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1169 continue;
1171 save_misalignment = DR_MISALIGNMENT (dr);
1172 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1173 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1174 &body_cost_vec);
1175 SET_DR_MISALIGNMENT (dr, save_misalignment);
1178 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1179 outside_cost += vect_get_known_peeling_cost
1180 (loop_vinfo, elem->npeel, &dummy,
1181 /* ??? We use this cost as number of stmts with scalar_stmt cost,
1182 thus divide by that. This introduces rounding errors, thus better
1183 introduce a new cost kind (raw_cost? scalar_iter_cost?). */
1184 single_iter_cost / vect_get_stmt_cost (scalar_stmt),
1185 &prologue_cost_vec, &epilogue_cost_vec);
1187 /* Prologue and epilogue costs are added to the target model later.
1188 These costs depend only on the scalar iteration cost, the
1189 number of peeling iterations finally chosen, and the number of
1190 misaligned statements. So discard the information found here. */
1191 prologue_cost_vec.release ();
1192 epilogue_cost_vec.release ();
1194 if (inside_cost < min->inside_cost
1195 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1197 min->inside_cost = inside_cost;
1198 min->outside_cost = outside_cost;
1199 min->body_cost_vec.release ();
1200 min->body_cost_vec = body_cost_vec;
1201 min->peel_info.dr = elem->dr;
1202 min->peel_info.npeel = elem->npeel;
1204 else
1205 body_cost_vec.release ();
1207 return 1;
1211 /* Choose best peeling option by traversing peeling hash table and either
1212 choosing an option with the lowest cost (if cost model is enabled) or the
1213 option that aligns as many accesses as possible. */
1215 static struct data_reference *
1216 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1217 unsigned int *npeel,
1218 stmt_vector_for_cost *body_cost_vec)
1220 struct _vect_peel_extended_info res;
1222 res.peel_info.dr = NULL;
1223 res.body_cost_vec = stmt_vector_for_cost ();
1225 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1227 res.inside_cost = INT_MAX;
1228 res.outside_cost = INT_MAX;
1229 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1230 ->traverse <_vect_peel_extended_info *,
1231 vect_peeling_hash_get_lowest_cost> (&res);
1233 else
1235 res.peel_info.count = 0;
1236 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1237 ->traverse <_vect_peel_extended_info *,
1238 vect_peeling_hash_get_most_frequent> (&res);
1241 *npeel = res.peel_info.npeel;
1242 *body_cost_vec = res.body_cost_vec;
1243 return res.peel_info.dr;
1247 /* Function vect_enhance_data_refs_alignment
1249 This pass will use loop versioning and loop peeling in order to enhance
1250 the alignment of data references in the loop.
1252 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1253 original loop is to be vectorized. Any other loops that are created by
1254 the transformations performed in this pass - are not supposed to be
1255 vectorized. This restriction will be relaxed.
1257 This pass will require a cost model to guide it whether to apply peeling
1258 or versioning or a combination of the two. For example, the scheme that
1259 Intel uses when given a loop with several memory accesses is as follows:
1260 choose one memory access ('p') whose alignment you want to force by doing
1261 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1262 other accesses are not necessarily aligned, or (2) use loop versioning to
1263 generate one loop in which all accesses are aligned, and another loop in
1264 which only 'p' is necessarily aligned.
1266 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1267 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1268 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1270 Devising a cost model is the most critical aspect of this work. It will
1271 guide us on which access to peel for, whether to use loop versioning, how
1272 many versions to create, etc. The cost model will probably consist of
1273 generic considerations as well as target specific considerations (on
1274 powerpc for example, misaligned stores are more painful than misaligned
1275 loads).
1277 Here are the general steps involved in alignment enhancements:
1279 -- original loop, before alignment analysis:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1285 -- After vect_compute_data_refs_alignment:
1286 for (i=0; i<N; i++){
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1291 -- Possibility 1: we do loop versioning:
1292 if (p is aligned) {
1293 for (i=0; i<N; i++){ # loop 1A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1298 else {
1299 for (i=0; i<N; i++){ # loop 1B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 3
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1305 -- Possibility 2: we do loop peeling:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1310 for (i = 3; i < N; i++){ # loop 2A
1311 x = q[i]; # DR_MISALIGNMENT(q) = 0
1312 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1315 -- Possibility 3: combination of loop peeling and versioning:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1320 if (p is aligned) {
1321 for (i = 3; i<N; i++){ # loop 3A
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = 0
1326 else {
1327 for (i = 3; i<N; i++){ # loop 3B
1328 x = q[i]; # DR_MISALIGNMENT(q) = 0
1329 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1333 These loops are later passed to loop_transform to be vectorized. The
1334 vectorizer will use the alignment information to guide the transformation
1335 (whether to generate regular loads/stores, or with special handling for
1336 misalignment). */
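/* Illustrative arithmetic for the peeling factor computed below:

     mis   = DR_MISALIGNMENT (dr0) / element_size;
     npeel = (nelements - mis) & (nelements - 1);     (forward step)

   e.g. 16-byte vectors of 4-byte ints (nelements == 4) with
   DR_MISALIGNMENT == 8 give mis == 2 and npeel == 2 peeled scalar
   iterations; the 3-iteration prologue in the example above corresponds
   to a misalignment of 3 elements.  */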
1338 bool
1339 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1341 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1343 enum dr_alignment_support supportable_dr_alignment;
1344 struct data_reference *dr0 = NULL, *first_store = NULL;
1345 struct data_reference *dr;
1346 unsigned int i, j;
1347 bool do_peeling = false;
1348 bool do_versioning = false;
1349 bool stat;
1350 gimple stmt;
1351 stmt_vec_info stmt_info;
1352 unsigned int npeel = 0;
1353 bool all_misalignments_unknown = true;
1354 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1355 unsigned possible_npeel_number = 1;
1356 tree vectype;
1357 unsigned int nelements, mis, same_align_drs_max = 0;
1358 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1360 if (dump_enabled_p ())
1361 dump_printf_loc (MSG_NOTE, vect_location,
1362 "=== vect_enhance_data_refs_alignment ===\n");
1364 /* While cost model enhancements are expected in the future, the high level
1365 view of the code at this time is as follows:
1367 A) If there is a misaligned access then see if peeling to align
1368 this access can make all data references satisfy
1369 vect_supportable_dr_alignment. If so, update data structures
1370 as needed and return true.
1372 B) If peeling wasn't possible and there is a data reference with an
1373 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1374 then see if loop versioning checks can be used to make all data
1375 references satisfy vect_supportable_dr_alignment. If so, update
1376 data structures as needed and return true.
1378 C) If neither peeling nor versioning were successful then return false if
1379 any data reference does not satisfy vect_supportable_dr_alignment.
1381 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1383 Note, Possibility 3 above (which is peeling and versioning together) is not
1384 being done at this time. */
1386 /* (1) Peeling to force alignment. */
1388 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1389 Considerations:
1390 + How many accesses will become aligned due to the peeling
1391 - How many accesses will become unaligned due to the peeling,
1392 and the cost of misaligned accesses.
1393 - The cost of peeling (the extra runtime checks, the increase
1394 in code size). */
1396 FOR_EACH_VEC_ELT (datarefs, i, dr)
1398 stmt = DR_STMT (dr);
1399 stmt_info = vinfo_for_stmt (stmt);
1401 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1402 continue;
1404 /* For interleaving, only the alignment of the first access
1405 matters. */
1406 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1407 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1408 continue;
1410 /* For invariant accesses there is nothing to enhance. */
1411 if (integer_zerop (DR_STEP (dr)))
1412 continue;
1414 /* Strided loads perform only component accesses, alignment is
1415 irrelevant for them. */
1416 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1417 continue;
1419 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1420 do_peeling = vector_alignment_reachable_p (dr);
1421 if (do_peeling)
1423 if (known_alignment_for_access_p (dr))
1425 unsigned int npeel_tmp;
1426 bool negative = tree_int_cst_compare (DR_STEP (dr),
1427 size_zero_node) < 0;
1429 /* Save info about DR in the hash table. */
1430 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1431 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1432 = new hash_table<peel_info_hasher> (1);
1434 vectype = STMT_VINFO_VECTYPE (stmt_info);
1435 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1436 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1437 TREE_TYPE (DR_REF (dr))));
1438 npeel_tmp = (negative
1439 ? (mis - nelements) : (nelements - mis))
1440 & (nelements - 1);
1442 /* For multiple types, it is possible that the bigger type access
1443 will have more than one peeling option. E.g., a loop with two
1444 types: one of size (vector size / 4), and the other one of
1445 size (vector size / 8). The vectorization factor will be 8. If both
1446 accesses are misaligned by 3, the first one needs one scalar
1447 iteration to be aligned, and the second one needs 5. But the
1448 first one will also be aligned by peeling 5 scalar
1449 iterations, and in that case both accesses will be aligned.
1450 Hence, except for the immediate peeling amount, we also want
1451 to try to add a full vector size, as long as we don't exceed the
1452 vectorization factor.
1453 We do this automatically for the cost model, since we calculate the cost
1454 for every peeling option. */
1455 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1456 possible_npeel_number = vf / nelements;
1458 /* Handle the aligned case. We may decide to align some other
1459 access, making DR unaligned. */
1460 if (DR_MISALIGNMENT (dr) == 0)
1462 npeel_tmp = 0;
1463 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1464 possible_npeel_number++;
1467 for (j = 0; j < possible_npeel_number; j++)
1469 gcc_assert (npeel_tmp <= vf);
1470 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1471 npeel_tmp += nelements;
1474 all_misalignments_unknown = false;
1475 /* Data-ref that was chosen for the case that all the
1476 misalignments are unknown is not relevant anymore, since we
1477 have a data-ref with known alignment. */
1478 dr0 = NULL;
1480 else
1482 /* If we don't know any misalignment values, we prefer
1483 peeling for the data-ref that has the maximum number of data-refs
1484 with the same alignment, unless the target prefers to align
1485 stores over loads. */
1486 if (all_misalignments_unknown)
1488 unsigned same_align_drs
1489 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1490 if (!dr0
1491 || same_align_drs_max < same_align_drs)
1493 same_align_drs_max = same_align_drs;
1494 dr0 = dr;
1496 /* For data-refs with the same number of related
1497 accesses prefer the one where the misalign
1498 computation will be invariant in the outermost loop. */
1499 else if (same_align_drs_max == same_align_drs)
1501 struct loop *ivloop0, *ivloop;
1502 ivloop0 = outermost_invariant_loop_for_expr
1503 (loop, DR_BASE_ADDRESS (dr0));
1504 ivloop = outermost_invariant_loop_for_expr
1505 (loop, DR_BASE_ADDRESS (dr));
1506 if ((ivloop && !ivloop0)
1507 || (ivloop && ivloop0
1508 && flow_loop_nested_p (ivloop, ivloop0)))
1509 dr0 = dr;
1512 if (!first_store && DR_IS_WRITE (dr))
1513 first_store = dr;
1516 /* If there are both known and unknown misaligned accesses in the
1517 loop, we choose peeling amount according to the known
1518 accesses. */
1519 if (!supportable_dr_alignment)
1521 dr0 = dr;
1522 if (!first_store && DR_IS_WRITE (dr))
1523 first_store = dr;
1527 else
1529 if (!aligned_access_p (dr))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "vector alignment may not be reachable\n");
1534 break;
1539 /* Check if we can possibly peel the loop. */
1540 if (!vect_can_advance_ivs_p (loop_vinfo)
1541 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1542 do_peeling = false;
1544 /* If we don't know how many times the peeling loop will run
1545 assume it will run VF-1 times and disable peeling if the remaining
1546 iters are less than the vectorization factor. */
1547 if (do_peeling
1548 && all_misalignments_unknown
1549 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1550 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1551 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1552 do_peeling = false;
1554 if (do_peeling
1555 && all_misalignments_unknown
1556 && vect_supportable_dr_alignment (dr0, false))
1558 /* Check if the target prefers to align stores over loads, i.e., if
1559 misaligned stores are more expensive than misaligned loads (taking
1560 drs with same alignment into account). */
1561 if (first_store && DR_IS_READ (dr0))
1563 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1564 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1565 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1566 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1567 stmt_vector_for_cost dummy;
1568 dummy.create (2);
1570 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1571 &dummy);
1572 vect_get_data_access_cost (first_store, &store_inside_cost,
1573 &store_outside_cost, &dummy);
1575 dummy.release ();
1577 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1578 aligning the load DR0). */
1579 load_inside_penalty = store_inside_cost;
1580 load_outside_penalty = store_outside_cost;
1581 for (i = 0;
1582 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1583 DR_STMT (first_store))).iterate (i, &dr);
1584 i++)
1585 if (DR_IS_READ (dr))
1587 load_inside_penalty += load_inside_cost;
1588 load_outside_penalty += load_outside_cost;
1590 else
1592 load_inside_penalty += store_inside_cost;
1593 load_outside_penalty += store_outside_cost;
1596 /* Calculate the penalty for leaving DR0 unaligned (by
1597 aligning the FIRST_STORE). */
1598 store_inside_penalty = load_inside_cost;
1599 store_outside_penalty = load_outside_cost;
1600 for (i = 0;
1601 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1602 DR_STMT (dr0))).iterate (i, &dr);
1603 i++)
1604 if (DR_IS_READ (dr))
1606 store_inside_penalty += load_inside_cost;
1607 store_outside_penalty += load_outside_cost;
1609 else
1611 store_inside_penalty += store_inside_cost;
1612 store_outside_penalty += store_outside_cost;
1615 if (load_inside_penalty > store_inside_penalty
1616 || (load_inside_penalty == store_inside_penalty
1617 && load_outside_penalty > store_outside_penalty))
1618 dr0 = first_store;
1621 /* In case there are only loads with different unknown misalignments, use
1622 peeling only if it may help to align other accesses in the loop. */
1623 if (!first_store
1624 && !STMT_VINFO_SAME_ALIGN_REFS (
1625 vinfo_for_stmt (DR_STMT (dr0))).length ()
1626 && vect_supportable_dr_alignment (dr0, false)
1627 != dr_unaligned_supported)
1628 do_peeling = false;
1631 if (do_peeling && !dr0)
1633 /* Peeling is possible, but there is no data access that is not supported
1634 unless aligned. So we try to choose the best possible peeling. */
1636 /* We should get here only if there are drs with known misalignment. */
1637 gcc_assert (!all_misalignments_unknown);
1639 /* Choose the best peeling from the hash table. */
1640 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1641 &body_cost_vec);
1642 if (!dr0 || !npeel)
1643 do_peeling = false;
1645 /* If peeling by npeel will result in a remaining loop not iterating
1646 enough to be vectorized then do not peel. */
1647 if (do_peeling
1648 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1649 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1650 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1651 do_peeling = false;
1654 if (do_peeling)
1656 stmt = DR_STMT (dr0);
1657 stmt_info = vinfo_for_stmt (stmt);
1658 vectype = STMT_VINFO_VECTYPE (stmt_info);
1659 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1661 if (known_alignment_for_access_p (dr0))
1663 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1664 size_zero_node) < 0;
1665 if (!npeel)
1667 /* Since it's known at compile time, compute the number of
1668 iterations in the peeled loop (the peeling factor) for use in
1669 updating DR_MISALIGNMENT values. The peeling factor is the
1670 vectorization factor minus the misalignment as an element
1671 count. */
1672 mis = DR_MISALIGNMENT (dr0);
1673 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1674 npeel = ((negative ? mis - nelements : nelements - mis)
1675 & (nelements - 1));
1678 /* For interleaved data access every iteration accesses all the
1679 members of the group, therefore we divide the number of iterations
1680 by the group size. */
1681 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1682 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1683 npeel /= GROUP_SIZE (stmt_info);
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "Try peeling by %d\n", npeel);
1690 /* Ensure that all data refs can be vectorized after the peel. */
1691 FOR_EACH_VEC_ELT (datarefs, i, dr)
1693 int save_misalignment;
1695 if (dr == dr0)
1696 continue;
1698 stmt = DR_STMT (dr);
1699 stmt_info = vinfo_for_stmt (stmt);
1700 /* For interleaving, only the alignment of the first access
1701 matters. */
1702 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1703 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1704 continue;
1706 /* Strided loads perform only component accesses, alignment is
1707 irrelevant for them. */
1708 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1709 continue;
1711 save_misalignment = DR_MISALIGNMENT (dr);
1712 vect_update_misalignment_for_peel (dr, dr0, npeel);
1713 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1714 SET_DR_MISALIGNMENT (dr, save_misalignment);
1716 if (!supportable_dr_alignment)
1718 do_peeling = false;
1719 break;
1723 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1725 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1726 if (!stat)
1727 do_peeling = false;
1728 else
1730 body_cost_vec.release ();
1731 return stat;
1735 if (do_peeling)
1737 unsigned max_allowed_peel
1738 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1739 if (max_allowed_peel != (unsigned)-1)
1741 unsigned max_peel = npeel;
1742 if (max_peel == 0)
1744 gimple dr_stmt = DR_STMT (dr0);
1745 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1746 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1747 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1749 if (max_peel > max_allowed_peel)
1751 do_peeling = false;
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_NOTE, vect_location,
1754 "Disable peeling, max peels reached: %d\n", max_peel);
1759 if (do_peeling)
1761 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1762 If the misalignment of DR_i is identical to that of dr0 then set
1763 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1764 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1765 by the peeling factor times the element size of DR_i (MOD the
1766 vectorization factor times the size). Otherwise, the
1767 misalignment of DR_i must be set to unknown. */
1768 FOR_EACH_VEC_ELT (datarefs, i, dr)
1769 if (dr != dr0)
1770 vect_update_misalignment_for_peel (dr, dr0, npeel);
1772 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1773 if (npeel)
1774 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1775 else
1776 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1777 = DR_MISALIGNMENT (dr0);
1778 SET_DR_MISALIGNMENT (dr0, 0);
1779 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Alignment of access forced using peeling.\n");
1783 dump_printf_loc (MSG_NOTE, vect_location,
1784 "Peeling for alignment will be applied.\n");
1786 /* The inside-loop cost will be accounted for in vectorizable_load
1787 and vectorizable_store correctly with adjusted alignments.
1788 Drop body_cost_vec on the floor here. */
1789 body_cost_vec.release ();
1791 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1792 gcc_assert (stat);
1793 return stat;
1797 body_cost_vec.release ();
1799 /* (2) Versioning to force alignment. */
1801 /* Try versioning if:
1802 1) optimize loop for speed
1803 2) there is at least one unsupported misaligned data ref with an unknown
1804 misalignment, and
1805 3) all misaligned data refs with a known misalignment are supported, and
1806 4) the number of runtime alignment checks is within reason. */
1808 do_versioning =
1809 optimize_loop_nest_for_speed_p (loop)
1810 && (!loop->inner); /* FORNOW */
1812 if (do_versioning)
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1816 stmt = DR_STMT (dr);
1817 stmt_info = vinfo_for_stmt (stmt);
1819 /* For interleaving, only the alignment of the first access
1820 matters. */
1821 if (aligned_access_p (dr)
1822 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1823 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1824 continue;
1826 /* Strided loads perform only component accesses; alignment is
1827 irrelevant for them. */
1828 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1829 continue;
1831 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1833 if (!supportable_dr_alignment)
1835 gimple stmt;
1836 int mask;
1837 tree vectype;
1839 if (known_alignment_for_access_p (dr)
1840 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1841 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1843 do_versioning = false;
1844 break;
1847 stmt = DR_STMT (dr);
1848 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1849 gcc_assert (vectype);
1851 /* The rightmost bits of an aligned address must be zeros.
1852 Construct the mask needed for this test. For example,
1853 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1854 mask must be 15 = 0xf. */
1855 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
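      /* Sketch of the resulting runtime test (hypothetical addresses):
         the vectorized version is taken only if each recorded address
         effectively satisfies (addr & mask) == 0, e.g. 0x1010 & 0xf == 0
         passes while 0x1014 & 0xf == 4 selects the unvectorized copy.  */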
1857 /* FORNOW: use the same mask to test all potentially unaligned
1858 references in the loop. The vectorizer currently supports
1859 a single vector size, see the reference to
1860 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1861 vectorization factor is computed. */
1862 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1863 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1864 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1865 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1866 DR_STMT (dr));
1870 /* Versioning requires at least one misaligned data reference. */
1871 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1872 do_versioning = false;
1873 else if (!do_versioning)
1874 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1877 if (do_versioning)
1879 vec<gimple> may_misalign_stmts
1880 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1881 gimple stmt;
1883 /* It can now be assumed that the data references in the statements
1884 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1885 of the loop being vectorized. */
1886 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1888 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1889 dr = STMT_VINFO_DATA_REF (stmt_info);
1890 SET_DR_MISALIGNMENT (dr, 0);
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_NOTE, vect_location,
1893 "Alignment of access forced using versioning.\n");
1896 if (dump_enabled_p ())
1897 dump_printf_loc (MSG_NOTE, vect_location,
1898 "Versioning for alignment will be applied.\n");
1900 /* Peeling and versioning can't be done together at this time. */
1901 gcc_assert (! (do_peeling && do_versioning));
1903 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1904 gcc_assert (stat);
1905 return stat;
1908 /* This point is reached if neither peeling nor versioning is being done. */
1909 gcc_assert (! (do_peeling || do_versioning));
1911 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1912 return stat;
1916 /* Function vect_find_same_alignment_drs.
1918 Update group and alignment relations according to the chosen
1919 vectorization factor. */
1921 static void
1922 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1923 loop_vec_info loop_vinfo)
1925 unsigned int i;
1926 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1927 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1928 struct data_reference *dra = DDR_A (ddr);
1929 struct data_reference *drb = DDR_B (ddr);
1930 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1931 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1932 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1933 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1934 lambda_vector dist_v;
1935 unsigned int loop_depth;
1937 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1938 return;
1940 if (dra == drb)
1941 return;
1943 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1944 return;
1946 /* Loop-based vectorization and known data dependence. */
1947 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1948 return;
1950 /* Data-dependence analysis reports a distance vector of zero
1951 for data-references that overlap only in the first iteration
1952 but have steps of different sign (see PR45764).
1953 So as a sanity check require equal DR_STEP. */
1954 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1955 return;
1957 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1958 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1960 int dist = dist_v[loop_depth];
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "dependence distance = %d.\n", dist);
1966 /* Same loop iteration. */
1967 if (dist == 0
1968 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1970 /* Two references with distance zero have the same alignment. */
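      /* The modulo case is similar; for instance (illustrative), with a
         vectorization factor of 4 and matching unit-stride 4-byte
         accesses, a dependence distance of 4 iterations puts the two
         references exactly one 16-byte vector apart, so their
         alignments coincide.  */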
1971 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1972 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1973 if (dump_enabled_p ())
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "accesses have the same alignment.\n");
1977 dump_printf (MSG_NOTE,
1978 "dependence distance modulo vf == 0 between ");
1979 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1980 dump_printf (MSG_NOTE, " and ");
1981 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1982 dump_printf (MSG_NOTE, "\n");
1989 /* Function vect_analyze_data_refs_alignment
1991 Analyze the alignment of the data-references in the loop.
1992 Return FALSE if a data reference is found that cannot be vectorized. */
1994 bool
1995 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1996 bb_vec_info bb_vinfo)
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "=== vect_analyze_data_refs_alignment ===\n");
2002 /* Mark groups of data references with the same alignment using
2003 data dependence information. */
2004 if (loop_vinfo)
2006 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2007 struct data_dependence_relation *ddr;
2008 unsigned int i;
2010 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2011 vect_find_same_alignment_drs (ddr, loop_vinfo);
2014 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: can't calculate alignment "
2019 "for data ref.\n");
2020 return false;
2023 return true;
2027 /* Analyze groups of accesses: check that DR belongs to a group of
2028 accesses of legal size, step, etc. Detect gaps, single element
2029 interleaving, and other special cases. Set grouped access info.
2030 Collect groups of strided stores for further use in SLP analysis. */
2032 static bool
2033 vect_analyze_group_access (struct data_reference *dr)
2035 tree step = DR_STEP (dr);
2036 tree scalar_type = TREE_TYPE (DR_REF (dr));
2037 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2038 gimple stmt = DR_STMT (dr);
2039 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2040 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2041 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2042 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2043 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2044 bool slp_impossible = false;
2045 struct loop *loop = NULL;
2047 if (loop_vinfo)
2048 loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2051 size of the interleaving group (including gaps). */
2052 groupsize = absu_hwi (dr_step) / type_size;
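  /* E.g. (illustrative): a 4-byte element accessed with a step of 32
     bytes gives groupsize == 8, i.e. the interleaving group spans 8
     elements per scalar iteration, gaps included.  */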
2054 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2055 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2057 /* Check whether this DR is part of an interleaving group, and is a single
2058 element of the group that is accessed in the loop. */
2060 /* Gaps are supported only for loads. STEP must be a multiple of the type
2061 size. The size of the group must be a power of 2. */
2062 if (DR_IS_READ (dr)
2063 && (dr_step % type_size) == 0
2064 && groupsize > 0
2065 && exact_log2 (groupsize) != -1)
2067 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2068 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2069 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "Detected single element interleaving ");
2073 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2074 dump_printf (MSG_NOTE, " step ");
2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2076 dump_printf (MSG_NOTE, "\n");
2079 if (loop_vinfo)
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "Data access with gaps requires scalar "
2084 "epilogue loop\n");
2085 if (loop->inner)
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "Peeling for outer loop is not"
2090 " supported\n");
2091 return false;
2094 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2097 return true;
2100 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "not consecutive access ");
2104 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2105 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2108 if (bb_vinfo)
2110 /* Mark the statement as unvectorizable. */
2111 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2112 return true;
2115 return false;
2118 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2120 /* First stmt in the interleaving chain. Check the chain. */
2121 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2122 struct data_reference *data_ref = dr;
2123 unsigned int count = 1;
2124 tree prev_init = DR_INIT (data_ref);
2125 gimple prev = stmt;
2126 HOST_WIDE_INT diff, gaps = 0;
2127 unsigned HOST_WIDE_INT count_in_bytes;
2129 while (next)
2131 /* Skip identical data-refs. If two or more stmts share a
2132 data-ref (supported only for loads), we vectorize only the first
2133 stmt, and the rest get their vectorized loads from the first
2134 one. */
2135 if (!tree_int_cst_compare (DR_INIT (data_ref),
2136 DR_INIT (STMT_VINFO_DATA_REF (
2137 vinfo_for_stmt (next)))))
2139 if (DR_IS_WRITE (data_ref))
2141 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143 "Two store stmts share the same dr.\n");
2144 return false;
2147 /* For load use the same data-ref load. */
2148 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2150 prev = next;
2151 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2152 continue;
2155 prev = next;
2156 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2158 /* All group members have the same STEP by construction. */
2159 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2161 /* Check that the distance between two accesses is equal to the type
2162 size. Otherwise, we have gaps. */
2163 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2164 - TREE_INT_CST_LOW (prev_init)) / type_size;
2165 if (diff != 1)
2167 /* FORNOW: SLP of accesses with gaps is not supported. */
2168 slp_impossible = true;
2169 if (DR_IS_WRITE (data_ref))
2171 if (dump_enabled_p ())
2172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173 "interleaved store with gaps\n");
2174 return false;
2177 gaps += diff - 1;
2180 last_accessed_element += diff;
2182 /* Store the gap from the previous member of the group. If there is no
2183 gap in the access, GROUP_GAP is always 1. */
2184 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2186 prev_init = DR_INIT (data_ref);
2187 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2188 /* Count the number of data-refs in the chain. */
2189 count++;
2192 /* COUNT is the number of accesses found; we multiply it by the size of
2193 the type to get COUNT_IN_BYTES. */
2194 count_in_bytes = type_size * count;
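      /* Illustration of the two checks below (made-up numbers): three
         4-byte members (count == 3) with one missing element (gaps == 1)
         occupy 3*4 + 1*4 = 16 bytes, which must not exceed the step;
         a store group would additionally need count_in_bytes equal to
         the step, i.e. no gaps at all.  */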
2196 /* Check that the size of the interleaving (including gaps) is not
2197 greater than STEP. */
2198 if (dr_step != 0
2199 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2201 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "interleaving size is greater than step for ");
2205 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2206 DR_REF (dr));
2207 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2209 return false;
2212 /* Check that the size of the interleaving is equal to STEP for stores,
2213 i.e., that there are no gaps. */
2214 if (dr_step != 0
2215 && absu_hwi (dr_step) != count_in_bytes)
2217 if (DR_IS_READ (dr))
2219 slp_impossible = true;
2220 /* There is a gap after the last load in the group. This gap is the
2221 difference between the groupsize and the number of elements.
2222 When there is no gap, this difference should be 0. */
2223 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2225 else
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "interleaved store with gaps\n");
2230 return false;
2234 /* Check that STEP is a multiple of type size. */
2235 if (dr_step != 0
2236 && (dr_step % type_size) != 0)
2238 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241 "step is not a multiple of type size: step ");
2242 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2243 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2244 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2245 TYPE_SIZE_UNIT (scalar_type));
2246 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2248 return false;
2251 if (groupsize == 0)
2252 groupsize = count;
2254 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Detected interleaving of size %d\n", (int)groupsize);
2259 /* SLP: create an SLP data structure for every interleaving group of
2260 stores for further analysis in vect_analyze_slp. */
2261 if (DR_IS_WRITE (dr) && !slp_impossible)
2263 if (loop_vinfo)
2264 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2265 if (bb_vinfo)
2266 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2269 /* There is a gap at the end of the group. */
2270 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "Data access with gaps requires scalar "
2275 "epilogue loop\n");
2276 if (loop->inner)
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "Peeling for outer loop is not supported\n");
2281 return false;
2284 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2288 return true;
2292 /* Analyze the access pattern of the data-reference DR.
2293 In case of non-consecutive accesses call vect_analyze_group_access() to
2294 analyze groups of accesses. */
2296 static bool
2297 vect_analyze_data_ref_access (struct data_reference *dr)
2299 tree step = DR_STEP (dr);
2300 tree scalar_type = TREE_TYPE (DR_REF (dr));
2301 gimple stmt = DR_STMT (dr);
2302 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2303 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2304 struct loop *loop = NULL;
2306 if (loop_vinfo)
2307 loop = LOOP_VINFO_LOOP (loop_vinfo);
2309 if (loop_vinfo && !step)
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313 "bad data-ref access in loop\n");
2314 return false;
2317 /* Allow invariant loads in loops that are not nested. */
2318 if (loop_vinfo && integer_zerop (step))
2320 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2321 if (nested_in_vect_loop_p (loop, stmt))
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "zero step in inner loop of nest\n");
2326 return false;
2328 return DR_IS_READ (dr);
2331 if (loop && nested_in_vect_loop_p (loop, stmt))
2333 /* Interleaved accesses are not yet supported within outer-loop
2334 vectorization for references in the inner-loop. */
2335 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2337 /* For the rest of the analysis we use the outer-loop step. */
2338 step = STMT_VINFO_DR_STEP (stmt_info);
2339 if (integer_zerop (step))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_NOTE, vect_location,
2343 "zero step in outer loop.\n");
2344 if (DR_IS_READ (dr))
2345 return true;
2346 else
2347 return false;
2351 /* Consecutive? */
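  /* E.g. a 4-byte int accessed with DR_STEP 4 is consecutive, and so is
     a reverse access with DR_STEP -4 (the negative-step case tested
     below).  */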
2352 if (TREE_CODE (step) == INTEGER_CST)
2354 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2355 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2356 || (dr_step < 0
2357 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2359 /* Mark that it is not interleaving. */
2360 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2361 return true;
2365 if (loop && nested_in_vect_loop_p (loop, stmt))
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "grouped access in outer loop.\n");
2370 return false;
2373 /* Assume this is a DR handled by the non-constant strided load case. */
2374 if (TREE_CODE (step) != INTEGER_CST)
2375 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2377 /* Not a consecutive access - check whether it is part of an interleaving group. */
2378 return vect_analyze_group_access (dr);
2383 /* A helper function used in the comparator function that sorts data
2384 references. T1 and T2 are the two trees to be compared.
2385 The function returns -1, 0, or 1. */
2387 static int
2388 compare_tree (tree t1, tree t2)
2390 int i, cmp;
2391 enum tree_code code;
2392 char tclass;
2394 if (t1 == t2)
2395 return 0;
2396 if (t1 == NULL)
2397 return -1;
2398 if (t2 == NULL)
2399 return 1;
2402 if (TREE_CODE (t1) != TREE_CODE (t2))
2403 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2405 code = TREE_CODE (t1);
2406 switch (code)
2408 /* For const values, we can just use hash values for comparisons. */
2409 case INTEGER_CST:
2410 case REAL_CST:
2411 case FIXED_CST:
2412 case STRING_CST:
2413 case COMPLEX_CST:
2414 case VECTOR_CST:
2416 hashval_t h1 = iterative_hash_expr (t1, 0);
2417 hashval_t h2 = iterative_hash_expr (t2, 0);
2418 if (h1 != h2)
2419 return h1 < h2 ? -1 : 1;
2420 break;
2423 case SSA_NAME:
2424 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2425 if (cmp != 0)
2426 return cmp;
2428 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2429 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2430 break;
2432 default:
2433 tclass = TREE_CODE_CLASS (code);
2435 /* For declarations, compare their UIDs. */
2436 if (tclass == tcc_declaration)
2438 if (DECL_UID (t1) != DECL_UID (t2))
2439 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2440 break;
2443 /* For expressions with operands, compare their operands recursively. */
2444 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2446 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2447 if (cmp != 0)
2448 return cmp;
2452 return 0;
2456 /* Compare two data-references DRA and DRB so that they can be sorted into
2457 chunks suitable for grouping. */
2459 static int
2460 dr_group_sort_cmp (const void *dra_, const void *drb_)
2462 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2463 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2464 int cmp;
2466 /* Stabilize sort. */
2467 if (dra == drb)
2468 return 0;
2470 /* Ordering of DRs according to base. */
2471 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2473 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2474 if (cmp != 0)
2475 return cmp;
2478 /* And according to DR_OFFSET. */
2479 if (!dr_equal_offsets_p (dra, drb))
2481 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2482 if (cmp != 0)
2483 return cmp;
2486 /* Put reads before writes. */
2487 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2488 return DR_IS_READ (dra) ? -1 : 1;
2490 /* Then sort after access size. */
2491 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2492 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2494 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2495 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2496 if (cmp != 0)
2497 return cmp;
2500 /* And after step. */
2501 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2503 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2504 if (cmp != 0)
2505 return cmp;
2508 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2509 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2510 if (cmp == 0)
2511 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2512 return cmp;
2515 /* Function vect_analyze_data_ref_accesses.
2517 Analyze the access pattern of all the data references in the loop.
2519 FORNOW: the only access pattern that is considered vectorizable is a
2520 simple step 1 (consecutive) access.
2522 FORNOW: handle only arrays and pointer accesses. */
2524 bool
2525 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2527 unsigned int i;
2528 vec<data_reference_p> datarefs;
2529 struct data_reference *dr;
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_NOTE, vect_location,
2533 "=== vect_analyze_data_ref_accesses ===\n");
2535 if (loop_vinfo)
2536 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2537 else
2538 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2540 if (datarefs.is_empty ())
2541 return true;
2543 /* Sort the array of datarefs to make building the interleaving chains
2544 linear. Don't modify the original vector's order; it is needed for
2545 determining what dependencies are reversed. */
2546 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2547 datarefs_copy.qsort (dr_group_sort_cmp);
2549 /* Build the interleaving chains. */
2550 for (i = 0; i < datarefs_copy.length () - 1;)
2552 data_reference_p dra = datarefs_copy[i];
2553 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2554 stmt_vec_info lastinfo = NULL;
2555 for (i = i + 1; i < datarefs_copy.length (); ++i)
2557 data_reference_p drb = datarefs_copy[i];
2558 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2560 /* ??? Imperfect sorting (non-compatible types, non-modulo
2561 accesses, same accesses) can lead to a group being artificially
2562 split here, as we don't just skip over those. If it really
2563 matters we can push those to a worklist and re-iterate
2564 over them. Then we can just skip ahead to the next DR here. */
2566 /* Check that the data-refs have the same first location (except init)
2567 and that they are both either stores or loads (not one load and one
2568 store, and not masked loads or stores). */
2569 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2570 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2571 DR_BASE_ADDRESS (drb), 0)
2572 || !dr_equal_offsets_p (dra, drb)
2573 || !gimple_assign_single_p (DR_STMT (dra))
2574 || !gimple_assign_single_p (DR_STMT (drb)))
2575 break;
2577 /* Check that the data-refs have the same constant size and step. */
2578 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2579 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2580 if (!tree_fits_uhwi_p (sza)
2581 || !tree_fits_uhwi_p (szb)
2582 || !tree_int_cst_equal (sza, szb)
2583 || !tree_fits_shwi_p (DR_STEP (dra))
2584 || !tree_fits_shwi_p (DR_STEP (drb))
2585 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2586 break;
2588 /* Do not place the same access in the interleaving chain twice. */
2589 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2590 break;
2592 /* Check the types are compatible.
2593 ??? We don't distinguish this during sorting. */
2594 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2595 TREE_TYPE (DR_REF (drb))))
2596 break;
2598 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2599 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2600 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2601 gcc_assert (init_a < init_b);
2603 /* If init_b == init_a + the size of the type * k, we have an
2604 interleaving, and DRA is accessed before DRB. */
2605 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2606 if ((init_b - init_a) % type_size_a != 0)
2607 break;
2609 /* The step (if not zero) must be greater than the difference between
2610 the data-refs' inits; otherwise the group is split here into suitable sizes. */
2611 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2612 if (step != 0 && step <= (init_b - init_a))
2613 break;
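      /* Worked example (hypothetical layout): 4-byte accesses with
         inits 0 and 8 satisfy (8 - 0) % 4 == 0, and a step of 16
         (greater than 8) keeps them in one interleaving group; a
         step of 8 or less would split the group here instead.  */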
2615 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "Detected interleaving ");
2619 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2620 dump_printf (MSG_NOTE, " and ");
2621 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2622 dump_printf (MSG_NOTE, "\n");
2625 /* Link the found element into the group list. */
2626 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2628 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2629 lastinfo = stmtinfo_a;
2631 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2632 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2633 lastinfo = stmtinfo_b;
2637 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2638 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2639 && !vect_analyze_data_ref_access (dr))
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2643 "not vectorized: complicated access pattern.\n");
2645 if (bb_vinfo)
2647 /* Mark the statement as not vectorizable. */
2648 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2649 continue;
2651 else
2653 datarefs_copy.release ();
2654 return false;
2658 datarefs_copy.release ();
2659 return true;
2663 /* Operator == between two dr_with_seg_len objects.
2665 This equality operator is used to make sure two data refs
2666 are the same, so that we will consider combining the
2667 aliasing checks of the two pairs of data-dependent data
2668 refs. */
2670 static bool
2671 operator == (const dr_with_seg_len& d1,
2672 const dr_with_seg_len& d2)
2674 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2675 DR_BASE_ADDRESS (d2.dr), 0)
2676 && compare_tree (d1.offset, d2.offset) == 0
2677 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2680 /* Function comp_dr_with_seg_len_pair.
2682 Comparison function for sorting objects of dr_with_seg_len_pair_t
2683 so that we can combine aliasing checks in one scan. */
2685 static int
2686 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2688 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2689 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2691 const dr_with_seg_len &p11 = p1->first,
2692 &p12 = p1->second,
2693 &p21 = p2->first,
2694 &p22 = p2->second;
2696 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2697 if a and c have the same base address and step, and b and d have the same
2698 address and step. Therefore, if either a&c or b&d don't have the same address
2699 and step, we don't care about the order of those two pairs after sorting. */
2700 int comp_res;
2702 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2703 DR_BASE_ADDRESS (p21.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2706 DR_BASE_ADDRESS (p22.dr))) != 0)
2707 return comp_res;
2708 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2709 return comp_res;
2710 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2711 return comp_res;
2712 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2713 return comp_res;
2714 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2715 return comp_res;
2717 return 0;
2720 /* Function vect_vfa_segment_size.
2722 Create an expression that computes the size of the segment
2723 that will be accessed for a data reference. The function takes into
2724 account that realignment loads may access one more vector.
2726 Input:
2727 DR: The data reference.
2728 LENGTH_FACTOR: segment length to consider.
2730 Return an expression whose value is the size of segment which will be
2731 accessed by DR. */
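/* For instance (illustrative): a unit-stride 4-byte access with
   LENGTH_FACTOR equal to a vectorization factor of 4 gives a 16-byte
   segment; an access using the optimized explicit-realignment scheme
   gets one extra vector (e.g. 16 more bytes) added on top.  */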
2733 static tree
2734 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2736 tree segment_length;
2738 if (integer_zerop (DR_STEP (dr)))
2739 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2740 else
2741 segment_length = size_binop (MULT_EXPR,
2742 fold_convert (sizetype, DR_STEP (dr)),
2743 fold_convert (sizetype, length_factor));
2745 if (vect_supportable_dr_alignment (dr, false)
2746 == dr_explicit_realign_optimized)
2748 tree vector_size = TYPE_SIZE_UNIT
2749 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2751 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2753 return segment_length;
2756 /* Function vect_prune_runtime_alias_test_list.
2758 Prune a list of ddrs to be tested at run-time by versioning for alias.
2759 Merge several alias checks into one if possible.
2760 Return FALSE if the resulting list of ddrs is longer than allowed by
2761 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2763 bool
2764 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2766 vec<ddr_p> may_alias_ddrs =
2767 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2768 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2769 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2770 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2771 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2773 ddr_p ddr;
2774 unsigned int i;
2775 tree length_factor;
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE, vect_location,
2779 "=== vect_prune_runtime_alias_test_list ===\n");
2781 if (may_alias_ddrs.is_empty ())
2782 return true;
2784 /* Basically, for each pair of dependent data refs store_ptr_0
2785 and load_ptr_0, we create an expression:
2787 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2788 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2790 for aliasing checks. However, in some cases we can decrease
2791 the number of checks by combining two checks into one. For
2792 example, suppose we have another pair of data refs store_ptr_0
2793 and load_ptr_1, and if the following condition is satisfied:
2795 load_ptr_0 < load_ptr_1 &&
2796 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2798 (this condition means that, in each iteration of the vectorized loop,
2799 the memory accessed through store_ptr_0 cannot lie between the memory
2800 of load_ptr_0 and load_ptr_1.)
2802 we can then use only the following expression to finish the
2803 aliasing checks between store_ptr_0 & load_ptr_0 and
2804 store_ptr_0 & load_ptr_1:
2806 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2807 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2809 Note that we only consider the case in which load_ptr_0 and load_ptr_1
2810 have the same base address. */
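  /* A small numeric sketch (hypothetical values): with load_ptr_1 ==
     load_ptr_0 + 4 and all segment lengths equal to 16 bytes, the
     condition above reduces to 4 - 16 < 16, which holds, so a single
     test that the store segment lies entirely below load_ptr_0 or
     entirely above load_ptr_1 + load_segment_length_1 covers both of
     the original checks.  */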
2812 comp_alias_ddrs.create (may_alias_ddrs.length ());
2814 /* First, we collect all data ref pairs for aliasing checks. */
2815 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2817 struct data_reference *dr_a, *dr_b;
2818 gimple dr_group_first_a, dr_group_first_b;
2819 tree segment_length_a, segment_length_b;
2820 gimple stmt_a, stmt_b;
2822 dr_a = DDR_A (ddr);
2823 stmt_a = DR_STMT (DDR_A (ddr));
2824 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2825 if (dr_group_first_a)
2827 stmt_a = dr_group_first_a;
2828 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2831 dr_b = DDR_B (ddr);
2832 stmt_b = DR_STMT (DDR_B (ddr));
2833 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2834 if (dr_group_first_b)
2836 stmt_b = dr_group_first_b;
2837 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2840 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2841 length_factor = scalar_loop_iters;
2842 else
2843 length_factor = size_int (vect_factor);
2844 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2845 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2847 dr_with_seg_len_pair_t dr_with_seg_len_pair
2848 (dr_with_seg_len (dr_a, segment_length_a),
2849 dr_with_seg_len (dr_b, segment_length_b));
2851 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2852 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2854 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2857 /* Second, we sort the collected data ref pairs so that we can scan
2858 them once to combine all possible aliasing checks. */
2859 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2861 /* Third, we scan the sorted dr pairs and check if we can combine
2862 alias checks of two neighbouring dr pairs. */
2863 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2865 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2866 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2867 *dr_b1 = &comp_alias_ddrs[i-1].second,
2868 *dr_a2 = &comp_alias_ddrs[i].first,
2869 *dr_b2 = &comp_alias_ddrs[i].second;
2871 /* Remove duplicate data ref pairs. */
2872 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2874 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_NOTE, vect_location,
2877 "found equal ranges ");
2878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2879 DR_REF (dr_a1->dr));
2880 dump_printf (MSG_NOTE, ", ");
2881 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2882 DR_REF (dr_b1->dr));
2883 dump_printf (MSG_NOTE, " and ");
2884 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2885 DR_REF (dr_a2->dr));
2886 dump_printf (MSG_NOTE, ", ");
2887 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2888 DR_REF (dr_b2->dr));
2889 dump_printf (MSG_NOTE, "\n");
2892 comp_alias_ddrs.ordered_remove (i--);
2893 continue;
2896 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2898 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2899 and DR_A1 and DR_A2 are two consecutive memrefs. */
2900 if (*dr_a1 == *dr_a2)
2902 std::swap (dr_a1, dr_b1);
2903 std::swap (dr_a2, dr_b2);
2906 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2907 DR_BASE_ADDRESS (dr_a2->dr),
2909 || !tree_fits_shwi_p (dr_a1->offset)
2910 || !tree_fits_shwi_p (dr_a2->offset))
2911 continue;
2913 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2914 - tree_to_shwi (dr_a1->offset));
2917 /* Now we check if the following condition is satisfied:
2919 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2921 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2922 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
2923 have to make a best estimate. We can get the minimum value
2924 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2925 then either of the following two conditions can guarantee the
2926 one above:
2928 1: DIFF <= MIN_SEG_LEN_B
2929 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2933 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2934 ? tree_to_shwi (dr_b1->seg_len)
2935 : vect_factor);
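      /* E.g. (made-up values): with DIFF == 8 and MIN_SEG_LEN_B == 16
         condition 1 above already holds, so the two pairs are merged and
         dr_a1's segment length becomes dr_a2's segment length plus DIFF;
         when dr_b1->seg_len is not a compile-time constant the
         vectorization factor serves as the minimum estimate instead.  */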
2937 if (diff <= min_seg_len_b
2938 || (tree_fits_shwi_p (dr_a1->seg_len)
2939 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2941 if (dump_enabled_p ())
2943 dump_printf_loc (MSG_NOTE, vect_location,
2944 "merging ranges for ");
2945 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2946 DR_REF (dr_a1->dr));
2947 dump_printf (MSG_NOTE, ", ");
2948 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2949 DR_REF (dr_b1->dr));
2950 dump_printf (MSG_NOTE, " and ");
2951 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2952 DR_REF (dr_a2->dr));
2953 dump_printf (MSG_NOTE, ", ");
2954 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2955 DR_REF (dr_b2->dr));
2956 dump_printf (MSG_NOTE, "\n");
2959 dr_a1->seg_len = size_binop (PLUS_EXPR,
2960 dr_a2->seg_len, size_int (diff));
2961 comp_alias_ddrs.ordered_remove (i--);
2966 dump_printf_loc (MSG_NOTE, vect_location,
2967 "improved number of alias checks from %d to %d\n",
2968 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2969 if ((int) comp_alias_ddrs.length () >
2970 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2971 return false;
2973 return true;
2976 /* Check whether a non-affine read in STMT is suitable for a gather load
2977 and if so, return a builtin decl for that operation. */
2979 tree
2980 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2981 tree *offp, int *scalep)
2983 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2986 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2987 tree offtype = NULL_TREE;
2988 tree decl, base, off;
2989 machine_mode pmode;
2990 int punsignedp, pvolatilep;
2992 base = DR_REF (dr);
2993 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2994 see if we can use the def stmt of the address. */
2995 if (is_gimple_call (stmt)
2996 && gimple_call_internal_p (stmt)
2997 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2998 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2999 && TREE_CODE (base) == MEM_REF
3000 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3001 && integer_zerop (TREE_OPERAND (base, 1))
3002 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3004 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3005 if (is_gimple_assign (def_stmt)
3006 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3007 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3010 /* The gather builtins need an address of the form
3011 loop_invariant + vector * {1, 2, 4, 8}
3013 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3014 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3015 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3016 multiplications and additions in it. To get a vector, we need
3017 a single SSA_NAME that will be defined in the loop and will
3018 contain everything that is not loop invariant and that can be
3019 vectorized. The following code attempts to find such a preexisting
3020 SSA_NAME OFF and put the loop invariants into a tree BASE
3021 that can be gimplified before the loop. */
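  /* Sketch (hypothetical source): for a read like a[b[i]] with 4-byte
     elements, BASE would end up as the loop-invariant &a, OFF as the
     SSA_NAME holding b[i], and the scale as 4, matching the
     loop_invariant + vector * {1, 2, 4, 8} form above.  */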
3022 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3023 &pmode, &punsignedp, &pvolatilep, false);
3024 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3026 if (TREE_CODE (base) == MEM_REF)
3028 if (!integer_zerop (TREE_OPERAND (base, 1)))
3030 if (off == NULL_TREE)
3032 offset_int moff = mem_ref_offset (base);
3033 off = wide_int_to_tree (sizetype, moff);
3035 else
3036 off = size_binop (PLUS_EXPR, off,
3037 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3039 base = TREE_OPERAND (base, 0);
3041 else
3042 base = build_fold_addr_expr (base);
3044 if (off == NULL_TREE)
3045 off = size_zero_node;
3047 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3048 the constant offset in the loop-invariant BASE and continue with BASE
3049 as OFF; otherwise give up.
3050 We could handle that case by gimplifying the addition of BASE + OFF
3051 into some SSA_NAME and using that as OFF, but for now punt. */
3052 if (!expr_invariant_in_loop_p (loop, base))
3054 if (!integer_zerop (off))
3055 return NULL_TREE;
3056 off = base;
3057 base = size_int (pbitpos / BITS_PER_UNIT);
3059 /* Otherwise put base + constant offset into the loop invariant BASE
3060 and continue with OFF. */
3061 else
3063 base = fold_convert (sizetype, base);
3064 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3067 /* OFF at this point may be either a SSA_NAME or some tree expression
3068 from get_inner_reference. Try to peel off loop invariants from it
3069 into BASE as long as possible. */
3070 STRIP_NOPS (off);
3071 while (offtype == NULL_TREE)
3073 enum tree_code code;
3074 tree op0, op1, add = NULL_TREE;
3076 if (TREE_CODE (off) == SSA_NAME)
3078 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3080 if (expr_invariant_in_loop_p (loop, off))
3081 return NULL_TREE;
3083 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3084 break;
3086 op0 = gimple_assign_rhs1 (def_stmt);
3087 code = gimple_assign_rhs_code (def_stmt);
3088 op1 = gimple_assign_rhs2 (def_stmt);
3090 else
3092 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3093 return NULL_TREE;
3094 code = TREE_CODE (off);
3095 extract_ops_from_tree (off, &code, &op0, &op1);
3097 switch (code)
3099 case POINTER_PLUS_EXPR:
3100 case PLUS_EXPR:
3101 if (expr_invariant_in_loop_p (loop, op0))
3103 add = op0;
3104 off = op1;
3105 do_add:
3106 add = fold_convert (sizetype, add);
3107 if (scale != 1)
3108 add = size_binop (MULT_EXPR, add, size_int (scale));
3109 base = size_binop (PLUS_EXPR, base, add);
3110 continue;
3112 if (expr_invariant_in_loop_p (loop, op1))
3114 add = op1;
3115 off = op0;
3116 goto do_add;
3118 break;
3119 case MINUS_EXPR:
3120 if (expr_invariant_in_loop_p (loop, op1))
3122 add = fold_convert (sizetype, op1);
3123 add = size_binop (MINUS_EXPR, size_zero_node, add);
3124 off = op0;
3125 goto do_add;
3127 break;
3128 case MULT_EXPR:
3129 if (scale == 1 && tree_fits_shwi_p (op1))
3131 scale = tree_to_shwi (op1);
3132 off = op0;
3133 continue;
3135 break;
3136 case SSA_NAME:
3137 off = op0;
3138 continue;
3139 CASE_CONVERT:
3140 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3141 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3142 break;
3143 if (TYPE_PRECISION (TREE_TYPE (op0))
3144 == TYPE_PRECISION (TREE_TYPE (off)))
3146 off = op0;
3147 continue;
3149 if (TYPE_PRECISION (TREE_TYPE (op0))
3150 < TYPE_PRECISION (TREE_TYPE (off)))
3152 off = op0;
3153 offtype = TREE_TYPE (off);
3154 STRIP_NOPS (off);
3155 continue;
3157 break;
3158 default:
3159 break;
3161 break;
3164 /* If at the end OFF still isn't a SSA_NAME or isn't
3165 defined in the loop, punt. */
3166 if (TREE_CODE (off) != SSA_NAME
3167 || expr_invariant_in_loop_p (loop, off))
3168 return NULL_TREE;
3170 if (offtype == NULL_TREE)
3171 offtype = TREE_TYPE (off);
3173 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3174 offtype, scale);
3175 if (decl == NULL_TREE)
3176 return NULL_TREE;
3178 if (basep)
3179 *basep = base;
3180 if (offp)
3181 *offp = off;
3182 if (scalep)
3183 *scalep = scale;
3184 return decl;
3187 /* Function vect_analyze_data_refs.
3189 Find all the data references in the loop or basic block.
3191 The general structure of the analysis of data refs in the vectorizer is as
3192 follows:
3193 1- vect_analyze_data_refs(loop/bb): call
3194 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3195 in the loop/bb and their dependences.
3196 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3197 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3198 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3202 bool
3203 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3204 bb_vec_info bb_vinfo,
3205 int *min_vf, unsigned *n_stmts)
3207 struct loop *loop = NULL;
3208 basic_block bb = NULL;
3209 unsigned int i;
3210 vec<data_reference_p> datarefs;
3211 struct data_reference *dr;
3212 tree scalar_type;
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "=== vect_analyze_data_refs ===\n");
3218 if (loop_vinfo)
3220 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3222 loop = LOOP_VINFO_LOOP (loop_vinfo);
3223 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3224 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3226 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3228 "not vectorized: loop contains function calls"
3229 " or data references that cannot be analyzed\n");
3230 return false;
3233 for (i = 0; i < loop->num_nodes; i++)
3235 gimple_stmt_iterator gsi;
3237 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3239 gimple stmt = gsi_stmt (gsi);
3240 if (is_gimple_debug (stmt))
3241 continue;
3242 ++*n_stmts;
3243 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3245 if (is_gimple_call (stmt) && loop->safelen)
3247 tree fndecl = gimple_call_fndecl (stmt), op;
3248 if (fndecl != NULL_TREE)
3250 struct cgraph_node *node = cgraph_node::get (fndecl);
3251 if (node != NULL && node->simd_clones != NULL)
3253 unsigned int j, n = gimple_call_num_args (stmt);
3254 for (j = 0; j < n; j++)
3256 op = gimple_call_arg (stmt, j);
3257 if (DECL_P (op)
3258 || (REFERENCE_CLASS_P (op)
3259 && get_base_address (op)))
3260 break;
3262 op = gimple_call_lhs (stmt);
3263 /* Ignore #pragma omp declare simd functions
3264 if they don't have data references in the
3265 call stmt itself. */
3266 if (j == n
3267 && !(op
3268 && (DECL_P (op)
3269 || (REFERENCE_CLASS_P (op)
3270 && get_base_address (op)))))
3271 continue;
3275 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "not vectorized: loop contains function "
3279 "calls or data references that cannot "
3280 "be analyzed\n");
3281 return false;
3286 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3288 else
3290 gimple_stmt_iterator gsi;
3292 bb = BB_VINFO_BB (bb_vinfo);
3293 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3295 gimple stmt = gsi_stmt (gsi);
3296 if (is_gimple_debug (stmt))
3297 continue;
3298 ++*n_stmts;
3299 if (!find_data_references_in_stmt (NULL, stmt,
3300 &BB_VINFO_DATAREFS (bb_vinfo)))
3302 /* Mark the rest of the basic-block as unvectorizable. */
3303 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3305 stmt = gsi_stmt (gsi);
3306 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3308 break;
3312 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3315 /* Go through the data-refs and check that the analysis succeeded. Update
3316 the pointer from the stmt_vec_info struct to the DR and vectype. */
3318 FOR_EACH_VEC_ELT (datarefs, i, dr)
3320 gimple stmt;
3321 stmt_vec_info stmt_info;
3322 tree base, offset, init;
3323 bool gather = false;
3324 bool simd_lane_access = false;
3325 int vf;
3327 again:
3328 if (!dr || !DR_REF (dr))
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "not vectorized: unhandled data-ref\n");
3333 return false;
3336 stmt = DR_STMT (dr);
3337 stmt_info = vinfo_for_stmt (stmt);
3339 /* Discard clobbers from the dataref vector. We will remove
3340 clobber stmts during vectorization. */
3341 if (gimple_clobber_p (stmt))
3343 free_data_ref (dr);
3344 if (i == datarefs.length () - 1)
3346 datarefs.pop ();
3347 break;
3349 datarefs.ordered_remove (i);
3350 dr = datarefs[i];
3351 goto again;
3354 /* Check that analysis of the data-ref succeeded. */
3355 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3356 || !DR_STEP (dr))
3358 bool maybe_gather
3359 = DR_IS_READ (dr)
3360 && !TREE_THIS_VOLATILE (DR_REF (dr))
3361 && targetm.vectorize.builtin_gather != NULL;
3362 bool maybe_simd_lane_access
3363 = loop_vinfo && loop->simduid;
3365 /* If the target supports vector gather loads, or if this might be
3366 a SIMD lane access, see if they can be used. */
3367 if (loop_vinfo
3368 && (maybe_gather || maybe_simd_lane_access)
3369 && !nested_in_vect_loop_p (loop, stmt))
3371 struct data_reference *newdr
3372 = create_data_ref (NULL, loop_containing_stmt (stmt),
3373 DR_REF (dr), stmt, true);
3374 gcc_assert (newdr != NULL && DR_REF (newdr));
3375 if (DR_BASE_ADDRESS (newdr)
3376 && DR_OFFSET (newdr)
3377 && DR_INIT (newdr)
3378 && DR_STEP (newdr)
3379 && integer_zerop (DR_STEP (newdr)))
3381 if (maybe_simd_lane_access)
3383 tree off = DR_OFFSET (newdr);
3384 STRIP_NOPS (off);
3385 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3386 && TREE_CODE (off) == MULT_EXPR
3387 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3389 tree step = TREE_OPERAND (off, 1);
3390 off = TREE_OPERAND (off, 0);
3391 STRIP_NOPS (off);
3392 if (CONVERT_EXPR_P (off)
3393 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3394 0)))
3395 < TYPE_PRECISION (TREE_TYPE (off)))
3396 off = TREE_OPERAND (off, 0);
3397 if (TREE_CODE (off) == SSA_NAME)
3399 gimple def = SSA_NAME_DEF_STMT (off);
3400 tree reft = TREE_TYPE (DR_REF (newdr));
3401 if (is_gimple_call (def)
3402 && gimple_call_internal_p (def)
3403 && (gimple_call_internal_fn (def)
3404 == IFN_GOMP_SIMD_LANE))
3406 tree arg = gimple_call_arg (def, 0);
3407 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3408 arg = SSA_NAME_VAR (arg);
3409 if (arg == loop->simduid
3410 /* For now. */
3411 && tree_int_cst_equal
3412 (TYPE_SIZE_UNIT (reft),
3413 step))
3415 DR_OFFSET (newdr) = ssize_int (0);
3416 DR_STEP (newdr) = step;
3417 DR_ALIGNED_TO (newdr)
3418 = size_int (BIGGEST_ALIGNMENT);
3419 dr = newdr;
3420 simd_lane_access = true;
3426 if (!simd_lane_access && maybe_gather)
3428 dr = newdr;
3429 gather = true;
3432 if (!gather && !simd_lane_access)
3433 free_data_ref (newdr);
3436 if (!gather && !simd_lane_access)
3438 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3441 "not vectorized: data ref analysis "
3442 "failed ");
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3444 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3447 if (bb_vinfo)
3448 break;
3450 return false;
3454 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3456 if (dump_enabled_p ())
3457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3458 "not vectorized: base addr of dr is a "
3459 "constant\n");
3461 if (bb_vinfo)
3462 break;
3464 if (gather || simd_lane_access)
3465 free_data_ref (dr);
3466 return false;
3469 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3471 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3474 "not vectorized: volatile type ");
3475 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3476 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3479 if (bb_vinfo)
3480 break;
3482 return false;
3485 if (stmt_can_throw_internal (stmt))
3487 if (dump_enabled_p ())
3489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3490 "not vectorized: statement can throw an "
3491 "exception ");
3492 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3493 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3496 if (bb_vinfo)
3497 break;
3499 if (gather || simd_lane_access)
3500 free_data_ref (dr);
3501 return false;
3504 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3505 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3507 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3510 "not vectorized: statement is bitfield "
3511 "access ");
3512 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3513 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3516 if (bb_vinfo)
3517 break;
3519 if (gather || simd_lane_access)
3520 free_data_ref (dr);
3521 return false;
3524 base = unshare_expr (DR_BASE_ADDRESS (dr));
3525 offset = unshare_expr (DR_OFFSET (dr));
3526 init = unshare_expr (DR_INIT (dr));
3528 if (is_gimple_call (stmt)
3529 && (!gimple_call_internal_p (stmt)
3530 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3531 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3533 if (dump_enabled_p ())
3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3536 "not vectorized: dr in a call ");
3537 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3541 if (bb_vinfo)
3542 break;
3544 if (gather || simd_lane_access)
3545 free_data_ref (dr);
3546 return false;
3549 /* Update DR field in stmt_vec_info struct. */
3551 /* If the dataref is in an inner-loop of the loop that is considered
3552 for vectorization, we also want to analyze the access relative to
3553 the outer-loop (DR contains information only relative to the
3554 inner-most enclosing loop). We do that by building a reference to the
3555 first location accessed by the inner-loop, and analyzing it relative to
3556 the outer-loop. */
3557 if (loop && nested_in_vect_loop_p (loop, stmt))
3559 tree outer_step, outer_base, outer_init;
3560 HOST_WIDE_INT pbitsize, pbitpos;
3561 tree poffset;
3562 machine_mode pmode;
3563 int punsignedp, pvolatilep;
3564 affine_iv base_iv, offset_iv;
3565 tree dinit;
3567 /* Build a reference to the first location accessed by the
3568 inner-loop: *(BASE+INIT). (The first location is actually
3569 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3570 tree inner_base = build_fold_indirect_ref
3571 (fold_build_pointer_plus (base, init));
3573 if (dump_enabled_p ())
3575 dump_printf_loc (MSG_NOTE, vect_location,
3576 "analyze in outer-loop: ");
3577 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3578 dump_printf (MSG_NOTE, "\n");
3581 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3582 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3583 gcc_assert (outer_base != NULL_TREE);
3585 if (pbitpos % BITS_PER_UNIT != 0)
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "failed: bit offset alignment.\n");
3590 return false;
3593 outer_base = build_fold_addr_expr (outer_base);
3594 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3595 &base_iv, false))
3597 if (dump_enabled_p ())
3598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3599 "failed: evolution of base is not affine.\n");
3600 return false;
3603 if (offset)
3605 if (poffset)
3606 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3607 poffset);
3608 else
3609 poffset = offset;
3612 if (!poffset)
3614 offset_iv.base = ssize_int (0);
3615 offset_iv.step = ssize_int (0);
3617 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3618 &offset_iv, false))
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "evolution of offset is not affine.\n");
3623 return false;
3626 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3627 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3628 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3629 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3630 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3632 outer_step = size_binop (PLUS_EXPR,
3633 fold_convert (ssizetype, base_iv.step),
3634 fold_convert (ssizetype, offset_iv.step));
3636 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3637 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3638 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3639 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3640 STMT_VINFO_DR_OFFSET (stmt_info) =
3641 fold_convert (ssizetype, offset_iv.base);
3642 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3643 size_int (highest_pow2_factor (offset_iv.base));
3645 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "\touter base_address: ");
3649 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3651 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_DR_OFFSET (stmt_info));
3654 dump_printf (MSG_NOTE,
3655 "\n\touter constant offset from base address: ");
3656 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657 STMT_VINFO_DR_INIT (stmt_info));
3658 dump_printf (MSG_NOTE, "\n\touter step: ");
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_DR_STEP (stmt_info));
3661 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3662 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3663 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3664 dump_printf (MSG_NOTE, "\n");
3668 if (STMT_VINFO_DATA_REF (stmt_info))
3670 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "not vectorized: more than one data ref "
3674 "in stmt: ");
3675 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3676 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3679 if (bb_vinfo)
3680 break;
3682 if (gather || simd_lane_access)
3683 free_data_ref (dr);
3684 return false;
3687 STMT_VINFO_DATA_REF (stmt_info) = dr;
3688 if (simd_lane_access)
3690 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3691 free_data_ref (datarefs[i]);
3692 datarefs[i] = dr;
3695 /* Set vectype for STMT. */
3696 scalar_type = TREE_TYPE (DR_REF (dr));
3697 STMT_VINFO_VECTYPE (stmt_info)
3698 = get_vectype_for_scalar_type (scalar_type);
3699 if (!STMT_VINFO_VECTYPE (stmt_info))
3701 if (dump_enabled_p ())
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3704 "not vectorized: no vectype for stmt: ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3706 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3707 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3708 scalar_type);
3709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3712 if (bb_vinfo)
3713 break;
3715 if (gather || simd_lane_access)
3717 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3718 if (gather)
3719 free_data_ref (dr);
3721 return false;
3723 else
3725 if (dump_enabled_p ())
3727 dump_printf_loc (MSG_NOTE, vect_location,
3728 "got vectype for stmt: ");
3729 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3730 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3731 STMT_VINFO_VECTYPE (stmt_info));
3732 dump_printf (MSG_NOTE, "\n");
3736 /* Adjust the minimal vectorization factor according to the
3737 vector type. */
3738 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3739 if (vf > *min_vf)
3740 *min_vf = vf;
3742 if (gather)
3744 tree off;
3746 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3747 if (gather
3748 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3749 gather = false;
3750 if (!gather)
3752 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3753 free_data_ref (dr);
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 "not vectorized: not suitable for gather "
3758 "load ");
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3762 return false;
3765 datarefs[i] = dr;
3766 STMT_VINFO_GATHER_P (stmt_info) = true;
3768 else if (loop_vinfo
3769 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3771 if (nested_in_vect_loop_p (loop, stmt)
3772 || !DR_IS_READ (dr))
3774 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3777 "not vectorized: not suitable for strided "
3778 "load ");
3779 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3780 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3782 return false;
3784 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3788 /* If, when trying to vectorize a basic block, we stopped the analysis at
3789 the first dataref we could not analyze, mark the rest of the datarefs as
3790 not vectorizable and truncate the vector of datarefs. That avoids
3791 spending useless time analyzing their dependences. */
3792 if (i != datarefs.length ())
3794 gcc_assert (bb_vinfo != NULL);
3795 for (unsigned j = i; j < datarefs.length (); ++j)
3797 data_reference_p dr = datarefs[j];
3798 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3799 free_data_ref (dr);
3801 datarefs.truncate (i);
3804 return true;
3808 /* Function vect_get_new_vect_var.
3810 Returns a name for a new variable. The current naming scheme uses the
3811 prefix "vect", "stmp", or "vectp" (depending on the value of VAR_KIND)
3812 for vectorizer-generated variables, and appends "_" and NAME to it when
3813 NAME is provided. */
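
/* Editor's sketch (not part of GCC); sketch_vect_var_name is an illustrative
   helper only.  With plain C string handling, the names requested from
   create_tmp_reg look like "vect_x", "stmp_x" or "vectp_x" for NAME == "x",
   or just the bare prefix when NAME is NULL.  */
#include <stdio.h>

static void
sketch_vect_var_name (const char *prefix, const char *name,
		      char *buf, size_t buflen)
{
  if (name)
    snprintf (buf, buflen, "%s_%s", prefix, name);   /* e.g. "vectp_x" */
  else
    snprintf (buf, buflen, "%s", prefix);
}
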
3815 tree
3816 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3818 const char *prefix;
3819 tree new_vect_var;
3821 switch (var_kind)
3823 case vect_simple_var:
3824 prefix = "vect";
3825 break;
3826 case vect_scalar_var:
3827 prefix = "stmp";
3828 break;
3829 case vect_pointer_var:
3830 prefix = "vectp";
3831 break;
3832 default:
3833 gcc_unreachable ();
3836 if (name)
3838 char* tmp = concat (prefix, "_", name, NULL);
3839 new_vect_var = create_tmp_reg (type, tmp);
3840 free (tmp);
3842 else
3843 new_vect_var = create_tmp_reg (type, prefix);
3845 return new_vect_var;
3848 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3850 static void
3851 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3852 stmt_vec_info stmt_info)
3854 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3855 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3856 int misalign = DR_MISALIGNMENT (dr);
3857 if (misalign == -1)
3858 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3859 else
3860 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3863 /* Function vect_create_addr_base_for_vector_ref.
3865 Create an expression that computes the address of the first memory location
3866 that will be accessed for a data reference.
3868 Input:
3869 STMT: The statement containing the data reference.
3870 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3871 OFFSET: Optional. If supplied, it is added to the initial address.
3872 LOOP: Specifies the loop nest relative to which the address should be computed.
3873 For example, when the dataref is in an inner-loop nested in an
3874 outer-loop that is now being vectorized, LOOP can be either the
3875 outer-loop, or the inner-loop. The first memory location accessed
3876 by the following dataref ('in' points to short):
3878 for (i=0; i<N; i++)
3879 for (j=0; j<M; j++)
3880 s += in[i+j]
3882 is as follows:
3883 if LOOP=i_loop: &in (relative to i_loop)
3884 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3885 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3886 initial address. Unlike OFFSET, which is number of elements to
3887 be added, BYTE_OFFSET is measured in bytes.
3889 Output:
3890 1. Return an SSA_NAME whose value is the address of the memory location of
3891 the first vector of the data reference.
3892 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3893 these statement(s) which define the returned SSA_NAME.
3895 FORNOW: We are only handling array accesses with step 1. */
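
/* Editor's sketch (not part of GCC); sketch_addr_base and its parameters are
   illustrative only.  It spells out, with plain byte pointers, the address
   arithmetic the function below builds as trees:
   addr = base + (init + OFFSET * elem_size + BYTE_OFFSET).  */
#include <stddef.h>

static inline char *
sketch_addr_base (char *base, ptrdiff_t init, ptrdiff_t offset,
		  size_t elem_size, ptrdiff_t byte_offset)
{
  /* OFFSET counts elements, BYTE_OFFSET counts bytes.  */
  ptrdiff_t base_offset = init + offset * (ptrdiff_t) elem_size + byte_offset;
  return base + base_offset;	/* the fold_build_pointer_plus step */
}
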
3897 tree
3898 vect_create_addr_base_for_vector_ref (gimple stmt,
3899 gimple_seq *new_stmt_list,
3900 tree offset,
3901 struct loop *loop,
3902 tree byte_offset)
3904 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3905 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3906 tree data_ref_base;
3907 const char *base_name;
3908 tree addr_base;
3909 tree dest;
3910 gimple_seq seq = NULL;
3911 tree base_offset;
3912 tree init;
3913 tree vect_ptr_type;
3914 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3917 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3919 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3921 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3923 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3924 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3925 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3927 else
3929 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3930 base_offset = unshare_expr (DR_OFFSET (dr));
3931 init = unshare_expr (DR_INIT (dr));
3934 if (loop_vinfo)
3935 base_name = get_name (data_ref_base);
3936 else
3938 base_offset = ssize_int (0);
3939 init = ssize_int (0);
3940 base_name = get_name (DR_REF (dr));
3943 /* Create base_offset */
3944 base_offset = size_binop (PLUS_EXPR,
3945 fold_convert (sizetype, base_offset),
3946 fold_convert (sizetype, init));
3948 if (offset)
3950 offset = fold_build2 (MULT_EXPR, sizetype,
3951 fold_convert (sizetype, offset), step);
3952 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3953 base_offset, offset);
3955 if (byte_offset)
3957 byte_offset = fold_convert (sizetype, byte_offset);
3958 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3959 base_offset, byte_offset);
3962 /* base + base_offset */
3963 if (loop_vinfo)
3964 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3965 else
3967 addr_base = build1 (ADDR_EXPR,
3968 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3969 unshare_expr (DR_REF (dr)));
3972 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3973 addr_base = fold_convert (vect_ptr_type, addr_base);
3974 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3975 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3976 gimple_seq_add_seq (new_stmt_list, seq);
3978 if (DR_PTR_INFO (dr)
3979 && TREE_CODE (addr_base) == SSA_NAME)
3981 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3982 if (offset || byte_offset)
3983 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3986 if (dump_enabled_p ())
3988 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3989 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3990 dump_printf (MSG_NOTE, "\n");
3993 return addr_base;
3997 /* Function vect_create_data_ref_ptr.
3999 Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
4000 location accessed in the loop by STMT, along with the def-use update
4001 chain to appropriately advance the pointer through the loop iterations.
4002 Also set aliasing information for the pointer. This pointer is used by
4003 the callers to this function to create a memory reference expression for
4004 vector load/store access.
4006 Input:
4007 1. STMT: a stmt that references memory. Expected to be of the form
4008 GIMPLE_ASSIGN <name, data-ref> or
4009 GIMPLE_ASSIGN <data-ref, name>.
4010 2. AGGR_TYPE: the type of the reference, which should be either a vector
4011 or an array.
4012 3. AT_LOOP: the loop where the vector memref is to be created.
4013 4. OFFSET (optional): an offset to be added to the initial address accessed
4014 by the data-ref in STMT.
4015 5. BSI: location where the new stmts are to be placed if there is no loop
4016 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or is to
4017 remain pointing to the initial address.
4018 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4019 to the initial address accessed by the data-ref in STMT. This is
4020 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET is
4021 counted in bytes.
4023 Output:
4024 1. Declare a new ptr to vector_type, and have it point to the base of the
4025 data reference (the initial address accessed by the data reference).
4026 For example, for vector of type V8HI, the following code is generated:
4028 v8hi *ap;
4029 ap = (v8hi *)initial_address;
4031 if OFFSET is not supplied:
4032 initial_address = &a[init];
4033 if OFFSET is supplied:
4034 initial_address = &a[init + OFFSET];
4035 if BYTE_OFFSET is supplied:
4036 initial_address = &a[init] + BYTE_OFFSET;
4038 Return the initial_address in INITIAL_ADDRESS.
4040 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4041 update the pointer in each iteration of the loop.
4043 Return the increment stmt that updates the pointer in PTR_INCR.
4045 3. Set INV_P to true if the access pattern of the data reference in the
4046 vectorized loop is invariant. Set it to false otherwise.
4048 4. Return the pointer. */
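
/* Editor's sketch (not part of GCC); sketch_data_ref_ptr and its parameters
   are illustrative only.  It shows, with a plain char *, the shape of the
   code the function below arranges for the vectorized loop: point the
   aggregate pointer at the initial address, then bump it by one vector per
   iteration; the bump is omitted for ONLY_INIT or invariant accesses.  */
#include <stddef.h>

static void
sketch_data_ref_ptr (char *initial_address, size_t vector_size,
		     unsigned niters, void (*use_vector) (void *))
{
  char *ap = initial_address;		/* ap = (aggr_type *) initial_address */
  for (unsigned i = 0; i < niters; i++)
    {
      use_vector (ap);			/* vector load/store through ap */
      ap += vector_size;		/* cross-iteration increment     */
    }
}
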
4050 tree
4051 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4052 tree offset, tree *initial_address,
4053 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4054 bool only_init, bool *inv_p, tree byte_offset)
4056 const char *base_name;
4057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4059 struct loop *loop = NULL;
4060 bool nested_in_vect_loop = false;
4061 struct loop *containing_loop = NULL;
4062 tree aggr_ptr_type;
4063 tree aggr_ptr;
4064 tree new_temp;
4065 gimple vec_stmt;
4066 gimple_seq new_stmt_list = NULL;
4067 edge pe = NULL;
4068 basic_block new_bb;
4069 tree aggr_ptr_init;
4070 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4071 tree aptr;
4072 gimple_stmt_iterator incr_gsi;
4073 bool insert_after;
4074 tree indx_before_incr, indx_after_incr;
4075 gimple incr;
4076 tree step;
4077 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4079 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4080 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4082 if (loop_vinfo)
4084 loop = LOOP_VINFO_LOOP (loop_vinfo);
4085 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4086 containing_loop = (gimple_bb (stmt))->loop_father;
4087 pe = loop_preheader_edge (loop);
4089 else
4091 gcc_assert (bb_vinfo);
4092 only_init = true;
4093 *ptr_incr = NULL;
4096 /* Check the step (evolution) of the load in LOOP, and record
4097 whether it's invariant. */
4098 if (nested_in_vect_loop)
4099 step = STMT_VINFO_DR_STEP (stmt_info);
4100 else
4101 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4103 if (integer_zerop (step))
4104 *inv_p = true;
4105 else
4106 *inv_p = false;
4108 /* Create an expression for the first address accessed by this load
4109 in LOOP. */
4110 base_name = get_name (DR_BASE_ADDRESS (dr));
4112 if (dump_enabled_p ())
4114 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4115 dump_printf_loc (MSG_NOTE, vect_location,
4116 "create %s-pointer variable to type: ",
4117 get_tree_code_name (TREE_CODE (aggr_type)));
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4119 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4120 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4121 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4122 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4123 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4124 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4125 else
4126 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4127 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4128 dump_printf (MSG_NOTE, "\n");
4131 /* (1) Create the new aggregate-pointer variable.
4132 Vector and array types inherit the alias set of their component
4133 type by default so we need to use a ref-all pointer if the data
4134 reference does not conflict with the created aggregated data
4135 reference because it is not addressable. */
4136 bool need_ref_all = false;
4137 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4138 get_alias_set (DR_REF (dr))))
4139 need_ref_all = true;
4140 /* Likewise for any of the data references in the stmt group. */
4141 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4143 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4146 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4147 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4148 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4149 get_alias_set (DR_REF (sdr))))
4151 need_ref_all = true;
4152 break;
4154 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4156 while (orig_stmt);
4158 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4159 need_ref_all);
4160 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4163 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4164 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4165 def-use update cycles for the pointer: one relative to the outer-loop
4166 (LOOP), which is what steps (3) and (4) below do. The other is relative
4167 to the inner-loop (which is the inner-most loop containing the dataref),
4168 and this is done by step (5) below.
4170 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4171 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4172 redundant. Steps (3),(4) create the following:
4174 vp0 = &base_addr;
4175 LOOP: vp1 = phi(vp0,vp2)
4178 vp2 = vp1 + step
4179 goto LOOP
4181 If there is an inner-loop nested in loop, then step (5) will also be
4182 applied, and an additional update in the inner-loop will be created:
4184 vp0 = &base_addr;
4185 LOOP: vp1 = phi(vp0,vp2)
4187 inner: vp3 = phi(vp1,vp4)
4188 vp4 = vp3 + inner_step
4189 if () goto inner
4191 vp2 = vp1 + step
4192 if () goto LOOP */
4194 /* (2) Calculate the initial address of the aggregate-pointer, and set
4195 the aggregate-pointer to point to it before the loop. */
4197 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4199 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4200 offset, loop, byte_offset);
4201 if (new_stmt_list)
4203 if (pe)
4205 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4206 gcc_assert (!new_bb);
4208 else
4209 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4212 *initial_address = new_temp;
4214 /* Create: p = (aggr_type *) initial_base */
4215 if (TREE_CODE (new_temp) != SSA_NAME
4216 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4218 vec_stmt = gimple_build_assign (aggr_ptr,
4219 fold_convert (aggr_ptr_type, new_temp));
4220 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4221 /* Copy the points-to information if it exists. */
4222 if (DR_PTR_INFO (dr))
4223 vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4224 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4225 if (pe)
4227 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4228 gcc_assert (!new_bb);
4230 else
4231 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4233 else
4234 aggr_ptr_init = new_temp;
4236 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4237 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4238 inner-loop nested in LOOP (during outer-loop vectorization). */
4240 /* No update in loop is required. */
4241 if (only_init && (!loop_vinfo || at_loop == loop))
4242 aptr = aggr_ptr_init;
4243 else
4245 /* The step of the aggregate pointer is the type size. */
4246 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4247 /* One exception to the above is when the scalar step of the load in
4248 LOOP is zero. In this case the step here is also zero. */
4249 if (*inv_p)
4250 iv_step = size_zero_node;
4251 else if (tree_int_cst_sgn (step) == -1)
4252 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4254 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4256 create_iv (aggr_ptr_init,
4257 fold_convert (aggr_ptr_type, iv_step),
4258 aggr_ptr, loop, &incr_gsi, insert_after,
4259 &indx_before_incr, &indx_after_incr);
4260 incr = gsi_stmt (incr_gsi);
4261 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4266 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4267 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4269 if (ptr_incr)
4270 *ptr_incr = incr;
4272 aptr = indx_before_incr;
4275 if (!nested_in_vect_loop || only_init)
4276 return aptr;
4279 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4280 nested in LOOP, if exists. */
4282 gcc_assert (nested_in_vect_loop);
4283 if (!only_init)
4285 standard_iv_increment_position (containing_loop, &incr_gsi,
4286 &insert_after);
4287 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4288 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4289 &indx_after_incr);
4290 incr = gsi_stmt (incr_gsi);
4291 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4293 /* Copy the points-to information if it exists. */
4294 if (DR_PTR_INFO (dr))
4296 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4297 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4299 if (ptr_incr)
4300 *ptr_incr = incr;
4302 return indx_before_incr;
4304 else
4305 gcc_unreachable ();
4309 /* Function bump_vector_ptr
4311 Increment a pointer (to a vector type) by vector-size. If requested,
4312 i.e. if PTR_INCR is given, then also connect the new increment stmt
4313 to the existing def-use update-chain of the pointer, by modifying
4314 the PTR_INCR as illustrated below:
4316 The pointer def-use update-chain before this function:
4317 DATAREF_PTR = phi (p_0, p_2)
4318 ....
4319 PTR_INCR: p_2 = DATAREF_PTR + step
4321 The pointer def-use update-chain after this function:
4322 DATAREF_PTR = phi (p_0, p_2)
4323 ....
4324 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4325 ....
4326 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4328 Input:
4329 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4330 in the loop.
4331 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4332 the loop. The increment amount across iterations is expected
4333 to be vector_size.
4334 BSI - location where the new update stmt is to be placed.
4335 STMT - the original scalar memory-access stmt that is being vectorized.
4336 BUMP - optional. The offset by which to bump the pointer. If not given,
4337 the offset is assumed to be vector_size.
4339 Output: Return NEW_DATAREF_PTR as illustrated above.
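
/* Editor's sketch (not part of GCC); sketch_bump_vector_ptr is an
   illustrative name.  It shows only the pointer arithmetic of
   bump_vector_ptr, ignoring the SSA def-use rewiring of PTR_INCR that the
   real function also performs.  */
#include <stddef.h>

static char *
sketch_bump_vector_ptr (char *dataref_ptr, size_t vector_size, size_t bump)
{
  size_t update = bump != 0 ? bump : vector_size;  /* BUMP defaults to VS */
  char *new_dataref_ptr = dataref_ptr + update;
  /* After this, the cross-iteration increment reads
     p_2 = NEW_DATAREF_PTR + step instead of p_2 = DATAREF_PTR + step.  */
  return new_dataref_ptr;
}
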
4343 tree
4344 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4345 gimple stmt, tree bump)
4347 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4348 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4349 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4350 tree update = TYPE_SIZE_UNIT (vectype);
4351 gassign *incr_stmt;
4352 ssa_op_iter iter;
4353 use_operand_p use_p;
4354 tree new_dataref_ptr;
4356 if (bump)
4357 update = bump;
4359 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4360 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4361 dataref_ptr, update);
4362 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4364 /* Copy the points-to information if it exists. */
4365 if (DR_PTR_INFO (dr))
4367 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4368 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4371 if (!ptr_incr)
4372 return new_dataref_ptr;
4374 /* Update the vector-pointer's cross-iteration increment. */
4375 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4377 tree use = USE_FROM_PTR (use_p);
4379 if (use == dataref_ptr)
4380 SET_USE (use_p, new_dataref_ptr);
4381 else
4382 gcc_assert (tree_int_cst_compare (use, update) == 0);
4385 return new_dataref_ptr;
4389 /* Function vect_create_destination_var.
4391 Create a new temporary of type VECTYPE. */
4393 tree
4394 vect_create_destination_var (tree scalar_dest, tree vectype)
4396 tree vec_dest;
4397 const char *name;
4398 char *new_name;
4399 tree type;
4400 enum vect_var_kind kind;
4402 kind = vectype ? vect_simple_var : vect_scalar_var;
4403 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4405 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4407 name = get_name (scalar_dest);
4408 if (name)
4409 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4410 else
4411 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4412 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4413 free (new_name);
4415 return vec_dest;
4418 /* Function vect_grouped_store_supported.
4420 Returns TRUE if interleave high and interleave low permutations
4421 are supported, and FALSE otherwise. */
4423 bool
4424 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4426 machine_mode mode = TYPE_MODE (vectype);
4428 /* vect_permute_store_chain requires the group size to be equal to 3 or
4429 be a power of two. */
4430 if (count != 3 && exact_log2 (count) == -1)
4432 if (dump_enabled_p ())
4433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4434 "the size of the group of accesses"
4435 " is not a power of 2 or not eqaul to 3\n");
4436 return false;
4439 /* Check that the permutation is supported. */
4440 if (VECTOR_MODE_P (mode))
4442 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4443 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4445 if (count == 3)
4447 unsigned int j0 = 0, j1 = 0, j2 = 0;
4448 unsigned int i, j;
4450 for (j = 0; j < 3; j++)
4452 int nelt0 = ((3 - j) * nelt) % 3;
4453 int nelt1 = ((3 - j) * nelt + 1) % 3;
4454 int nelt2 = ((3 - j) * nelt + 2) % 3;
4455 for (i = 0; i < nelt; i++)
4457 if (3 * i + nelt0 < nelt)
4458 sel[3 * i + nelt0] = j0++;
4459 if (3 * i + nelt1 < nelt)
4460 sel[3 * i + nelt1] = nelt + j1++;
4461 if (3 * i + nelt2 < nelt)
4462 sel[3 * i + nelt2] = 0;
4464 if (!can_vec_perm_p (mode, false, sel))
4466 if (dump_enabled_p ())
4467 dump_printf (MSG_MISSED_OPTIMIZATION,
4468 "permutaion op not supported by target.\n");
4469 return false;
4472 for (i = 0; i < nelt; i++)
4474 if (3 * i + nelt0 < nelt)
4475 sel[3 * i + nelt0] = 3 * i + nelt0;
4476 if (3 * i + nelt1 < nelt)
4477 sel[3 * i + nelt1] = 3 * i + nelt1;
4478 if (3 * i + nelt2 < nelt)
4479 sel[3 * i + nelt2] = nelt + j2++;
4481 if (!can_vec_perm_p (mode, false, sel))
4483 if (dump_enabled_p ())
4484 dump_printf (MSG_MISSED_OPTIMIZATION,
4485 "permutaion op not supported by target.\n");
4486 return false;
4489 return true;
4491 else
4493 /* If length is not equal to 3 then only power of 2 is supported. */
4494 gcc_assert (exact_log2 (count) != -1);
4496 for (i = 0; i < nelt / 2; i++)
4498 sel[i * 2] = i;
4499 sel[i * 2 + 1] = i + nelt;
4501 if (can_vec_perm_p (mode, false, sel))
4503 for (i = 0; i < nelt; i++)
4504 sel[i] += nelt / 2;
4505 if (can_vec_perm_p (mode, false, sel))
4506 return true;
4511 if (dump_enabled_p ())
4512 dump_printf (MSG_MISSED_OPTIMIZATION,
4513 "permutaion op not supported by target.\n");
4514 return false;
4518 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4519 type VECTYPE. */
4521 bool
4522 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4524 return vect_lanes_optab_supported_p ("vec_store_lanes",
4525 vec_store_lanes_optab,
4526 vectype, count);
4530 /* Function vect_permute_store_chain.
4532 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4533 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4534 the data correctly for the stores. Return the final references for stores
4535 in RESULT_CHAIN.
4537 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4538 The input is 4 vectors each containing 8 elements. We assign a number to
4539 each element, the input sequence is:
4541 1st vec: 0 1 2 3 4 5 6 7
4542 2nd vec: 8 9 10 11 12 13 14 15
4543 3rd vec: 16 17 18 19 20 21 22 23
4544 4th vec: 24 25 26 27 28 29 30 31
4546 The output sequence should be:
4548 1st vec: 0 8 16 24 1 9 17 25
4549 2nd vec: 2 10 18 26 3 11 19 27
4550 3rd vec: 4 12 20 28 5 13 21 29
4551 4th vec: 6 14 22 30 7 15 23 31
4553 i.e., we interleave the contents of the four vectors in their order.
4555 We use interleave_high/low instructions to create such output. The input of
4556 each interleave_high/low operation is two vectors:
4557 1st vec 2nd vec
4558 0 1 2 3 4 5 6 7
4559 the even elements of the result vector are obtained left-to-right from the
4560 high/low elements of the first vector. The odd elements of the result are
4561 obtained left-to-right from the high/low elements of the second vector.
4562 The output of interleave_high will be: 0 4 1 5
4563 and of interleave_low: 2 6 3 7
4566 The permutation is done in log LENGTH stages. In each stage interleave_high
4567 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4568 where the first argument is taken from the first half of DR_CHAIN and the
4569 second argument from its second half.
4570 In our example,
4572 I1: interleave_high (1st vec, 3rd vec)
4573 I2: interleave_low (1st vec, 3rd vec)
4574 I3: interleave_high (2nd vec, 4th vec)
4575 I4: interleave_low (2nd vec, 4th vec)
4577 The output for the first stage is:
4579 I1: 0 16 1 17 2 18 3 19
4580 I2: 4 20 5 21 6 22 7 23
4581 I3: 8 24 9 25 10 26 11 27
4582 I4: 12 28 13 29 14 30 15 31
4584 The output of the second stage, i.e. the final result is:
4586 I1: 0 8 16 24 1 9 17 25
4587 I2: 2 10 18 26 3 11 19 27
4588 I3: 4 12 20 28 5 13 21 29
4589 I4: 6 14 22 30 7 15 23 31. */
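
/* Editor's illustration (not part of GCC); all identifiers below are made up
   for this sketch.  It emulates VEC_PERM_EXPR on int arrays for NELT == 8
   and reproduces the first interleaving stage of the example above:
   I1 = interleave_high (1st vec, 3rd vec), I2 = interleave_low (1st vec,
   3rd vec).  */
enum { SKETCH_STORE_NELT = 8 };

static void
sketch_vec_perm8 (const int *v1, const int *v2, const unsigned char *sel,
		  int *out)
{
  for (int i = 0; i < SKETCH_STORE_NELT; i++)
    out[i] = sel[i] < SKETCH_STORE_NELT
	     ? v1[sel[i]] : v2[sel[i] - SKETCH_STORE_NELT];
}

static void
sketch_interleave_stage1 (int i1[SKETCH_STORE_NELT], int i2[SKETCH_STORE_NELT])
{
  int vec1[SKETCH_STORE_NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int vec3[SKETCH_STORE_NELT] = { 16, 17, 18, 19, 20, 21, 22, 23 };
  unsigned char sel_high[SKETCH_STORE_NELT], sel_low[SKETCH_STORE_NELT];

  for (int i = 0; i < SKETCH_STORE_NELT / 2; i++)
    {
      sel_high[i * 2] = i;			  /* {0, nelt, 1, nelt+1, ...} */
      sel_high[i * 2 + 1] = i + SKETCH_STORE_NELT;
      sel_low[i * 2] = i + SKETCH_STORE_NELT / 2;  /* sel_high[i] + nelt/2     */
      sel_low[i * 2 + 1] = i + SKETCH_STORE_NELT / 2 + SKETCH_STORE_NELT;
    }
  sketch_vec_perm8 (vec1, vec3, sel_high, i1);	/* I1: 0 16 1 17 2 18 3 19 */
  sketch_vec_perm8 (vec1, vec3, sel_low, i2);	/* I2: 4 20 5 21 6 22 7 23 */
}
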
4591 void
4592 vect_permute_store_chain (vec<tree> dr_chain,
4593 unsigned int length,
4594 gimple stmt,
4595 gimple_stmt_iterator *gsi,
4596 vec<tree> *result_chain)
4598 tree vect1, vect2, high, low;
4599 gimple perm_stmt;
4600 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4601 tree perm_mask_low, perm_mask_high;
4602 tree data_ref;
4603 tree perm3_mask_low, perm3_mask_high;
4604 unsigned int i, n, log_length = exact_log2 (length);
4605 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4606 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4608 result_chain->quick_grow (length);
4609 memcpy (result_chain->address (), dr_chain.address (),
4610 length * sizeof (tree));
4612 if (length == 3)
4614 unsigned int j0 = 0, j1 = 0, j2 = 0;
4616 for (j = 0; j < 3; j++)
4618 int nelt0 = ((3 - j) * nelt) % 3;
4619 int nelt1 = ((3 - j) * nelt + 1) % 3;
4620 int nelt2 = ((3 - j) * nelt + 2) % 3;
4622 for (i = 0; i < nelt; i++)
4624 if (3 * i + nelt0 < nelt)
4625 sel[3 * i + nelt0] = j0++;
4626 if (3 * i + nelt1 < nelt)
4627 sel[3 * i + nelt1] = nelt + j1++;
4628 if (3 * i + nelt2 < nelt)
4629 sel[3 * i + nelt2] = 0;
4631 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4633 for (i = 0; i < nelt; i++)
4635 if (3 * i + nelt0 < nelt)
4636 sel[3 * i + nelt0] = 3 * i + nelt0;
4637 if (3 * i + nelt1 < nelt)
4638 sel[3 * i + nelt1] = 3 * i + nelt1;
4639 if (3 * i + nelt2 < nelt)
4640 sel[3 * i + nelt2] = nelt + j2++;
4642 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4644 vect1 = dr_chain[0];
4645 vect2 = dr_chain[1];
4647 /* Create interleaving stmt:
4648 low = VEC_PERM_EXPR <vect1, vect2,
4649 {j, nelt, *, j + 1, nelt + j + 1, *,
4650 j + 2, nelt + j + 2, *, ...}> */
4651 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4652 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4653 vect2, perm3_mask_low);
4654 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4656 vect1 = data_ref;
4657 vect2 = dr_chain[2];
4658 /* Create interleaving stmt:
4659 low = VEC_PERM_EXPR <vect1, vect2,
4660 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4661 6, 7, nelt + j + 2, ...}> */
4662 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4663 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4664 vect2, perm3_mask_high);
4665 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4666 (*result_chain)[j] = data_ref;
4669 else
4671 /* If length is not equal to 3 then only power of 2 is supported. */
4672 gcc_assert (exact_log2 (length) != -1);
4674 for (i = 0, n = nelt / 2; i < n; i++)
4676 sel[i * 2] = i;
4677 sel[i * 2 + 1] = i + nelt;
4679 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4681 for (i = 0; i < nelt; i++)
4682 sel[i] += nelt / 2;
4683 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4685 for (i = 0, n = log_length; i < n; i++)
4687 for (j = 0; j < length/2; j++)
4689 vect1 = dr_chain[j];
4690 vect2 = dr_chain[j+length/2];
4692 /* Create interleaving stmt:
4693 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4694 ...}> */
4695 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4696 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4697 vect2, perm_mask_high);
4698 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4699 (*result_chain)[2*j] = high;
4701 /* Create interleaving stmt:
4702 low = VEC_PERM_EXPR <vect1, vect2,
4703 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4704 ...}> */
4705 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4706 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4707 vect2, perm_mask_low);
4708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4709 (*result_chain)[2*j+1] = low;
4711 memcpy (dr_chain.address (), result_chain->address (),
4712 length * sizeof (tree));
4717 /* Function vect_setup_realignment
4719 This function is called when vectorizing an unaligned load using
4720 the dr_explicit_realign[_optimized] scheme.
4721 This function generates the following code at the loop prolog:
4723 p = initial_addr;
4724 x msq_init = *(floor(p)); # prolog load
4725 realignment_token = call target_builtin;
4726 loop:
4727 x msq = phi (msq_init, ---)
4729 The stmts marked with x are generated only for the case of
4730 dr_explicit_realign_optimized.
4732 The code above sets up a new (vector) pointer, pointing to the first
4733 location accessed by STMT, and a "floor-aligned" load using that pointer.
4734 It also generates code to compute the "realignment-token" (if the relevant
4735 target hook was defined), and creates a phi-node at the loop-header bb
4736 whose arguments are the result of the prolog-load (created by this
4737 function) and the result of a load that takes place in the loop (to be
4738 created by the caller to this function).
4740 For the case of dr_explicit_realign_optimized:
4741 The caller to this function uses the phi-result (msq) to create the
4742 realignment code inside the loop, and sets up the missing phi argument,
4743 as follows:
4744 loop:
4745 msq = phi (msq_init, lsq)
4746 lsq = *(floor(p')); # load in loop
4747 result = realign_load (msq, lsq, realignment_token);
4749 For the case of dr_explicit_realign:
4750 loop:
4751 msq = *(floor(p)); # load in loop
4752 p' = p + (VS-1);
4753 lsq = *(floor(p')); # load in loop
4754 result = realign_load (msq, lsq, realignment_token);
4756 Input:
4757 STMT - (scalar) load stmt to be vectorized. This load accesses
4758 a memory location that may be unaligned.
4759 BSI - place where new code is to be inserted.
4760 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4761 is used.
4763 Output:
4764 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4765 target hook, if defined.
4766 Return value - the result of the loop-header phi node. */
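
/* Editor's sketch (not part of GCC): a scalar emulation of the
   dr_explicit_realign scheme for a 16-byte vector.  sketch_realign_load and
   VS are illustrative names only; real targets use REALIGN_LOAD and the
   realignment token instead of the byte loop below.  */
#include <string.h>
#include <stdint.h>

#define VS 16

static void
sketch_realign_load (const uint8_t *p, uint8_t result[VS])
{
  /* msq = *(floor(p));  lsq = *(floor(p + VS - 1));  both aligned loads.  */
  const uint8_t *floor_p = (const uint8_t *) ((uintptr_t) p & -(uintptr_t) VS);
  const uint8_t *floor_q
    = (const uint8_t *) (((uintptr_t) p + VS - 1) & -(uintptr_t) VS);
  size_t misalign = (size_t) (p - floor_p);
  uint8_t msq[VS], lsq[VS];

  memcpy (msq, floor_p, VS);
  memcpy (lsq, floor_q, VS);
  /* result = realign_load (msq, lsq, token): shift the concatenation of the
     two aligned vectors by the misalignment.  */
  for (size_t i = 0; i < VS; i++)
    result[i] = i + misalign < VS ? msq[i + misalign] : lsq[i + misalign - VS];
}
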
4768 tree
4769 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4770 tree *realignment_token,
4771 enum dr_alignment_support alignment_support_scheme,
4772 tree init_addr,
4773 struct loop **at_loop)
4775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4776 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4777 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4778 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4779 struct loop *loop = NULL;
4780 edge pe = NULL;
4781 tree scalar_dest = gimple_assign_lhs (stmt);
4782 tree vec_dest;
4783 gimple inc;
4784 tree ptr;
4785 tree data_ref;
4786 basic_block new_bb;
4787 tree msq_init = NULL_TREE;
4788 tree new_temp;
4789 gphi *phi_stmt;
4790 tree msq = NULL_TREE;
4791 gimple_seq stmts = NULL;
4792 bool inv_p;
4793 bool compute_in_loop = false;
4794 bool nested_in_vect_loop = false;
4795 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4796 struct loop *loop_for_initial_load = NULL;
4798 if (loop_vinfo)
4800 loop = LOOP_VINFO_LOOP (loop_vinfo);
4801 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4804 gcc_assert (alignment_support_scheme == dr_explicit_realign
4805 || alignment_support_scheme == dr_explicit_realign_optimized);
4807 /* We need to generate three things:
4808 1. the misalignment computation
4809 2. the extra vector load (for the optimized realignment scheme).
4810 3. the phi node for the two vectors from which the realignment is
4811 done (for the optimized realignment scheme). */
4813 /* 1. Determine where to generate the misalignment computation.
4815 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4816 calculation will be generated by this function, outside the loop (in the
4817 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4818 caller, inside the loop.
4820 Background: If the misalignment remains fixed throughout the iterations of
4821 the loop, then both realignment schemes are applicable, and also the
4822 misalignment computation can be done outside LOOP. This is because we are
4823 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4824 are a multiple of VS (the Vector Size), and therefore the misalignment in
4825 different vectorized LOOP iterations is always the same.
4826 The problem arises only if the memory access is in an inner-loop nested
4827 inside LOOP, which is now being vectorized using outer-loop vectorization.
4828 This is the only case when the misalignment of the memory access may not
4829 remain fixed throughout the iterations of the inner-loop (as explained in
4830 detail in vect_supportable_dr_alignment). In this case, not only is the
4831 optimized realignment scheme not applicable, but also the misalignment
4832 computation (and generation of the realignment token that is passed to
4833 REALIGN_LOAD) have to be done inside the loop.
4835 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4836 or not, which in turn determines if the misalignment is computed inside
4837 the inner-loop, or outside LOOP. */
4839 if (init_addr != NULL_TREE || !loop_vinfo)
4841 compute_in_loop = true;
4842 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4846 /* 2. Determine where to generate the extra vector load.
4848 For the optimized realignment scheme, instead of generating two vector
4849 loads in each iteration, we generate a single extra vector load in the
4850 preheader of the loop, and in each iteration reuse the result of the
4851 vector load from the previous iteration. In case the memory access is in
4852 an inner-loop nested inside LOOP, which is now being vectorized using
4853 outer-loop vectorization, we need to determine whether this initial vector
4854 load should be generated at the preheader of the inner-loop, or can be
4855 generated at the preheader of LOOP. If the memory access has no evolution
4856 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4857 to be generated inside LOOP (in the preheader of the inner-loop). */
4859 if (nested_in_vect_loop)
4861 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4862 bool invariant_in_outerloop =
4863 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4864 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4866 else
4867 loop_for_initial_load = loop;
4868 if (at_loop)
4869 *at_loop = loop_for_initial_load;
4871 if (loop_for_initial_load)
4872 pe = loop_preheader_edge (loop_for_initial_load);
4874 /* 3. For the case of the optimized realignment, create the first vector
4875 load at the loop preheader. */
4877 if (alignment_support_scheme == dr_explicit_realign_optimized)
4879 /* Create msq_init = *(floor(p1)) in the loop preheader */
4880 gassign *new_stmt;
4882 gcc_assert (!compute_in_loop);
4883 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4884 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4885 NULL_TREE, &init_addr, NULL, &inc,
4886 true, &inv_p);
4887 new_temp = copy_ssa_name (ptr);
4888 new_stmt = gimple_build_assign
4889 (new_temp, BIT_AND_EXPR, ptr,
4890 build_int_cst (TREE_TYPE (ptr),
4891 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4892 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4893 gcc_assert (!new_bb);
4894 data_ref
4895 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4896 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4897 new_stmt = gimple_build_assign (vec_dest, data_ref);
4898 new_temp = make_ssa_name (vec_dest, new_stmt);
4899 gimple_assign_set_lhs (new_stmt, new_temp);
4900 if (pe)
4902 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4903 gcc_assert (!new_bb);
4905 else
4906 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4908 msq_init = gimple_assign_lhs (new_stmt);
4911 /* 4. Create realignment token using a target builtin, if available.
4912 It is done either inside the containing loop, or before LOOP (as
4913 determined above). */
4915 if (targetm.vectorize.builtin_mask_for_load)
4917 gcall *new_stmt;
4918 tree builtin_decl;
4920 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4921 if (!init_addr)
4923 /* Generate the INIT_ADDR computation outside LOOP. */
4924 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4925 NULL_TREE, loop);
4926 if (loop)
4928 pe = loop_preheader_edge (loop);
4929 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4930 gcc_assert (!new_bb);
4932 else
4933 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4936 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4937 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4938 vec_dest =
4939 vect_create_destination_var (scalar_dest,
4940 gimple_call_return_type (new_stmt));
4941 new_temp = make_ssa_name (vec_dest, new_stmt);
4942 gimple_call_set_lhs (new_stmt, new_temp);
4944 if (compute_in_loop)
4945 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4946 else
4948 /* Generate the misalignment computation outside LOOP. */
4949 pe = loop_preheader_edge (loop);
4950 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4951 gcc_assert (!new_bb);
4954 *realignment_token = gimple_call_lhs (new_stmt);
4956 /* The result of the CALL_EXPR to this builtin is determined from
4957 the value of the parameter and no global variables are touched
4958 which makes the builtin a "const" function. Requiring the
4959 builtin to have the "const" attribute makes it unnecessary
4960 to call mark_call_clobbered. */
4961 gcc_assert (TREE_READONLY (builtin_decl));
4964 if (alignment_support_scheme == dr_explicit_realign)
4965 return msq;
4967 gcc_assert (!compute_in_loop);
4968 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4971 /* 5. Create msq = phi <msq_init, lsq> in loop */
4973 pe = loop_preheader_edge (containing_loop);
4974 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4975 msq = make_ssa_name (vec_dest);
4976 phi_stmt = create_phi_node (msq, containing_loop->header);
4977 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4979 return msq;
4983 /* Function vect_grouped_load_supported.
4985 Returns TRUE if even and odd permutations are supported,
4986 and FALSE otherwise. */
4988 bool
4989 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4991 machine_mode mode = TYPE_MODE (vectype);
4993 /* vect_permute_load_chain requires the group size to be equal to 3 or
4994 be a power of two. */
4995 if (count != 3 && exact_log2 (count) == -1)
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999 "the size of the group of accesses"
5000 " is not a power of 2 or not equal to 3\n");
5001 return false;
5004 /* Check that the permutation is supported. */
5005 if (VECTOR_MODE_P (mode))
5007 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5008 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5010 if (count == 3)
5012 unsigned int k;
5013 for (k = 0; k < 3; k++)
5015 for (i = 0; i < nelt; i++)
5016 if (3 * i + k < 2 * nelt)
5017 sel[i] = 3 * i + k;
5018 else
5019 sel[i] = 0;
5020 if (!can_vec_perm_p (mode, false, sel))
5022 if (dump_enabled_p ())
5023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5024 "shuffle of 3 loads is not supported by"
5025 " target\n");
5026 return false;
5028 for (i = 0, j = 0; i < nelt; i++)
5029 if (3 * i + k < 2 * nelt)
5030 sel[i] = i;
5031 else
5032 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5033 if (!can_vec_perm_p (mode, false, sel))
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5037 "shuffle of 3 loads is not supported by"
5038 " target\n");
5039 return false;
5042 return true;
5044 else
5046 /* If length is not equal to 3 then only power of 2 is supported. */
5047 gcc_assert (exact_log2 (count) != -1);
5048 for (i = 0; i < nelt; i++)
5049 sel[i] = i * 2;
5050 if (can_vec_perm_p (mode, false, sel))
5052 for (i = 0; i < nelt; i++)
5053 sel[i] = i * 2 + 1;
5054 if (can_vec_perm_p (mode, false, sel))
5055 return true;
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "extract even/odd not supported by target\n");
5063 return false;
5066 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5067 type VECTYPE. */
5069 bool
5070 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5072 return vect_lanes_optab_supported_p ("vec_load_lanes",
5073 vec_load_lanes_optab,
5074 vectype, count);
5077 /* Function vect_permute_load_chain.
5079 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5080 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5081 the input data correctly. Return the final references for loads in
5082 RESULT_CHAIN.
5084 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5085 The input is 4 vectors each containing 8 elements. We assign a number to each
5086 element, the input sequence is:
5088 1st vec: 0 1 2 3 4 5 6 7
5089 2nd vec: 8 9 10 11 12 13 14 15
5090 3rd vec: 16 17 18 19 20 21 22 23
5091 4th vec: 24 25 26 27 28 29 30 31
5093 The output sequence should be:
5095 1st vec: 0 4 8 12 16 20 24 28
5096 2nd vec: 1 5 9 13 17 21 25 29
5097 3rd vec: 2 6 10 14 18 22 26 30
5098 4th vec: 3 7 11 15 19 23 27 31
5100 i.e., the first output vector should contain the first elements of each
5101 interleaving group, etc.
5103 We use extract_even/odd instructions to create such output. The input of
5104 each extract_even/odd operation is two vectors
5105 1st vec 2nd vec
5106 0 1 2 3 4 5 6 7
5108 and the output is the vector of extracted even/odd elements. The output of
5109 extract_even will be: 0 2 4 6
5110 and of extract_odd: 1 3 5 7
5113 The permutation is done in log LENGTH stages. In each stage extract_even
5114 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5115 their order. In our example,
5117 E1: extract_even (1st vec, 2nd vec)
5118 E2: extract_odd (1st vec, 2nd vec)
5119 E3: extract_even (3rd vec, 4th vec)
5120 E4: extract_odd (3rd vec, 4th vec)
5122 The output for the first stage will be:
5124 E1: 0 2 4 6 8 10 12 14
5125 E2: 1 3 5 7 9 11 13 15
5126 E3: 16 18 20 22 24 26 28 30
5127 E4: 17 19 21 23 25 27 29 31
5129 In order to proceed and create the correct sequence for the next stage (or
5130 for the correct output, if the second stage is the last one, as in our
5131 example), we first put the output of extract_even operation and then the
5132 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5133 The input for the second stage is:
5135 1st vec (E1): 0 2 4 6 8 10 12 14
5136 2nd vec (E3): 16 18 20 22 24 26 28 30
5137 3rd vec (E2): 1 3 5 7 9 11 13 15
5138 4th vec (E4): 17 19 21 23 25 27 29 31
5140 The output of the second stage:
5142 E1: 0 4 8 12 16 20 24 28
5143 E2: 2 6 10 14 18 22 26 30
5144 E3: 1 5 9 13 17 21 25 29
5145 E4: 3 7 11 15 19 23 27 31
5147 And RESULT_CHAIN after reordering:
5149 1st vec (E1): 0 4 8 12 16 20 24 28
5150 2nd vec (E3): 1 5 9 13 17 21 25 29
5151 3rd vec (E2): 2 6 10 14 18 22 26 30
5152 4th vec (E4): 3 7 11 15 19 23 27 31. */
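
/* Editor's illustration (not part of GCC); the identifiers are made up for
   this sketch.  For NELT == 8 it applies the extract-even and extract-odd
   selectors ({0,2,4,...} and {1,3,5,...}) to two concatenated input
   vectors, as in the first stage of the example above.  */
enum { SKETCH_LOAD_NELT = 8 };

static void
sketch_extract_even_odd (const int *v1, const int *v2,
			 int e_even[SKETCH_LOAD_NELT],
			 int e_odd[SKETCH_LOAD_NELT])
{
  for (int i = 0; i < SKETCH_LOAD_NELT; i++)
    {
      int even_idx = 2 * i;		/* sel[i] = i * 2     */
      int odd_idx = 2 * i + 1;		/* sel[i] = i * 2 + 1 */
      e_even[i] = even_idx < SKETCH_LOAD_NELT
		  ? v1[even_idx] : v2[even_idx - SKETCH_LOAD_NELT];
      e_odd[i] = odd_idx < SKETCH_LOAD_NELT
		 ? v1[odd_idx] : v2[odd_idx - SKETCH_LOAD_NELT];
    }
}

/* With v1 = {0 ... 7} and v2 = {8 ... 15} this yields
   E1: 0 2 4 6 8 10 12 14 and E2: 1 3 5 7 9 11 13 15, the first-stage
   output shown in the comment above.  */
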
5154 static void
5155 vect_permute_load_chain (vec<tree> dr_chain,
5156 unsigned int length,
5157 gimple stmt,
5158 gimple_stmt_iterator *gsi,
5159 vec<tree> *result_chain)
5161 tree data_ref, first_vect, second_vect;
5162 tree perm_mask_even, perm_mask_odd;
5163 tree perm3_mask_low, perm3_mask_high;
5164 gimple perm_stmt;
5165 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5166 unsigned int i, j, log_length = exact_log2 (length);
5167 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5168 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5170 result_chain->quick_grow (length);
5171 memcpy (result_chain->address (), dr_chain.address (),
5172 length * sizeof (tree));
5174 if (length == 3)
5176 unsigned int k;
5178 for (k = 0; k < 3; k++)
5180 for (i = 0; i < nelt; i++)
5181 if (3 * i + k < 2 * nelt)
5182 sel[i] = 3 * i + k;
5183 else
5184 sel[i] = 0;
5185 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5187 for (i = 0, j = 0; i < nelt; i++)
5188 if (3 * i + k < 2 * nelt)
5189 sel[i] = i;
5190 else
5191 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5193 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5195 first_vect = dr_chain[0];
5196 second_vect = dr_chain[1];
5198 /* Create interleaving stmt (low part of):
5199 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5200 ...}> */
5201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5203 second_vect, perm3_mask_low);
5204 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5206 /* Create interleaving stmt (high part of):
5207 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5208 ...}> */
5209 first_vect = data_ref;
5210 second_vect = dr_chain[2];
5211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5213 second_vect, perm3_mask_high);
5214 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5215 (*result_chain)[k] = data_ref;
5218 else
5220 /* If length is not equal to 3 then only power of 2 is supported. */
5221 gcc_assert (exact_log2 (length) != -1);
5223 for (i = 0; i < nelt; ++i)
5224 sel[i] = i * 2;
5225 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5227 for (i = 0; i < nelt; ++i)
5228 sel[i] = i * 2 + 1;
5229 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5231 for (i = 0; i < log_length; i++)
5233 for (j = 0; j < length; j += 2)
5235 first_vect = dr_chain[j];
5236 second_vect = dr_chain[j+1];
5238 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5239 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5240 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5241 first_vect, second_vect,
5242 perm_mask_even);
5243 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5244 (*result_chain)[j/2] = data_ref;
5246 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5247 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5248 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5249 first_vect, second_vect,
5250 perm_mask_odd);
5251 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5252 (*result_chain)[j/2+length/2] = data_ref;
5254 memcpy (dr_chain.address (), result_chain->address (),
5255 length * sizeof (tree));
5260 /* Function vect_shift_permute_load_chain.
5262 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5263 sequence of stmts to reorder the input data accordingly.
5264 Return the final references for loads in RESULT_CHAIN.
5265 Return true if successful, false otherwise.
5267 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5268 The input is 3 vectors each containing 8 elements. We assign a
5269 number to each element, the input sequence is:
5271 1st vec: 0 1 2 3 4 5 6 7
5272 2nd vec: 8 9 10 11 12 13 14 15
5273 3rd vec: 16 17 18 19 20 21 22 23
5275 The output sequence should be:
5277 1st vec: 0 3 6 9 12 15 18 21
5278 2nd vec: 1 4 7 10 13 16 19 22
5279 3rd vec: 2 5 8 11 14 17 20 23
5281 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5283 First we shuffle all 3 vectors to get correct elements order:
5285 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5286 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5287 3rd vec: (16 19 22) (17 20 23) (18 21)
5289 Next we unite and shift vector 3 times:
5291 1st step:
5292 shift right by 6 the concatenation of:
5293 "1st vec" and "2nd vec"
5294 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5295 "2nd vec" and "3rd vec"
5296 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5297 "3rd vec" and "1st vec"
5298 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5299 | New vectors |
5301 So that now new vectors are:
5303 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5304 2nd vec: (10 13) (16 19 22) (17 20 23)
5305 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5307 2nd step:
5308 shift right by 5 the concatenation of:
5309 "1st vec" and "3rd vec"
5310 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5311 "2nd vec" and "1st vec"
5312 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5313 "3rd vec" and "2nd vec"
5314 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5315 | New vectors |
5317 So that now new vectors are:
5319 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5320 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5321 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5323 3rd step:
5324 shift right by 5 the concatenation of:
5325 "1st vec" and "1st vec"
5326 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5327 shift right by 3 the concatenation of:
5328 "2nd vec" and "2nd vec"
5329 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5330 | New vectors |
5332 So that now all vectors are READY:
5333 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5334 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5335 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5337 This algorithm is faster than the one in vect_permute_load_chain if:
5338 1. "shift of a concatenation" is faster than general permutation.
5339 This is usually so.
5340 2. The TARGET machine can't execute vector instructions in parallel.
5341 This matters because each step of this algorithm depends on the previous
5342 one, while the algorithm in vect_permute_load_chain is much more parallel.
5344 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
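
/* Editor's illustration (not part of GCC); all names are made up for this
   sketch.  It traces the two-vector (power-of-2) branch below for
   NELT == 8: each input vector holds interleaved {x,y} pairs, and one
   shuffle per vector plus one shift and one select of the concatenation
   separate the x and y fields.  */
enum { SKETCH_SHIFT_NELT = 8 };

static void
sketch_vec_perm_shift8 (const int *v1, const int *v2,
			const unsigned char *sel, int *out)
{
  for (int i = 0; i < SKETCH_SHIFT_NELT; i++)
    out[i] = sel[i] < SKETCH_SHIFT_NELT
	     ? v1[sel[i]] : v2[sel[i] - SKETCH_SHIFT_NELT];
}

static void
sketch_shift_permute_2fields (const int first[SKETCH_SHIFT_NELT],
			      const int second[SKETCH_SHIFT_NELT],
			      int xs[SKETCH_SHIFT_NELT],
			      int ys[SKETCH_SHIFT_NELT])
{
  /* The masks the code below builds for nelt == 8.  */
  static const unsigned char perm2_mask1[] = { 0, 2, 4, 6, 1, 3, 5, 7 };
  static const unsigned char perm2_mask2[] = { 1, 3, 5, 7, 0, 2, 4, 6 };
  static const unsigned char shift1_mask[] = { 4, 5, 6, 7, 8, 9, 10, 11 };
  static const unsigned char select_mask[] = { 0, 1, 2, 3, 12, 13, 14, 15 };
  int v0[SKETCH_SHIFT_NELT], v1[SKETCH_SHIFT_NELT];

  sketch_vec_perm_shift8 (first, first, perm2_mask1, v0);   /* x0..x3 y0..y3 */
  sketch_vec_perm_shift8 (second, second, perm2_mask2, v1); /* y4..y7 x4..x7 */
  sketch_vec_perm_shift8 (v0, v1, select_mask, xs);	     /* x0 ...... x7  */
  sketch_vec_perm_shift8 (v0, v1, shift1_mask, ys);	     /* y0 ...... y7  */
}
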
5347 static bool
5348 vect_shift_permute_load_chain (vec<tree> dr_chain,
5349 unsigned int length,
5350 gimple stmt,
5351 gimple_stmt_iterator *gsi,
5352 vec<tree> *result_chain)
5354 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5355 tree perm2_mask1, perm2_mask2, perm3_mask;
5356 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5357 gimple perm_stmt;
5359 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5360 unsigned int i;
5361 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5362 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5363 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5366 result_chain->quick_grow (length);
5367 memcpy (result_chain->address (), dr_chain.address (),
5368 length * sizeof (tree));
5370 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5372 unsigned int j, log_length = exact_log2 (length);
5373 for (i = 0; i < nelt / 2; ++i)
5374 sel[i] = i * 2;
5375 for (i = 0; i < nelt / 2; ++i)
5376 sel[nelt / 2 + i] = i * 2 + 1;
5377 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5379 if (dump_enabled_p ())
5380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5381 "shuffle of 2 fields structure is not \
5382 supported by target\n");
5383 return false;
5385 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5387 for (i = 0; i < nelt / 2; ++i)
5388 sel[i] = i * 2 + 1;
5389 for (i = 0; i < nelt / 2; ++i)
5390 sel[nelt / 2 + i] = i * 2;
5391 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5395 "shuffle of 2 fields structure is not \
5396 supported by target\n");
5397 return false;
5399 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5401 /* Generating permutation constant to shift all elements.
5402 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5403 for (i = 0; i < nelt; i++)
5404 sel[i] = nelt / 2 + i;
5405 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5407 if (dump_enabled_p ())
5408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5409 "shift permutation is not supported by target\n");
5410 return false;
5412 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5414 /* Generating permutation constant to select vector from 2.
5415 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5416 for (i = 0; i < nelt / 2; i++)
5417 sel[i] = i;
5418 for (i = nelt / 2; i < nelt; i++)
5419 sel[i] = nelt + i;
5420 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5422 if (dump_enabled_p ())
5423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5424 "select is not supported by target\n");
5425 return false;
5427 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5429 for (i = 0; i < log_length; i++)
5431 for (j = 0; j < length; j += 2)
5433 first_vect = dr_chain[j];
5434 second_vect = dr_chain[j + 1];
5436 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5437 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5438 first_vect, first_vect,
5439 perm2_mask1);
5440 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5441 vect[0] = data_ref;
5443 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5444 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5445 second_vect, second_vect,
5446 perm2_mask2);
5447 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5448 vect[1] = data_ref;
5450 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5451 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5452 vect[0], vect[1], shift1_mask);
5453 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5454 (*result_chain)[j/2 + length/2] = data_ref;
5456 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5457 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5458 vect[0], vect[1], select_mask);
5459 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5460 (*result_chain)[j/2] = data_ref;
5462 memcpy (dr_chain.address (), result_chain->address (),
5463 length * sizeof (tree));
5465 return true;
5467 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
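/* The load group consists of exactly three vectors, i.e. three
   interleaved fields; de-interleave them with shift permutations.  */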
5469 unsigned int k = 0, l = 0;
5471 /* Generating permutation constant to get all elements in right order.
5472 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5473 for (i = 0; i < nelt; i++)
5475 if (3 * k + (l % 3) >= nelt)
5477 k = 0;
5478 l += (3 - (nelt % 3));
5480 sel[i] = 3 * k + (l % 3);
5481 k++;
5483 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5485 if (dump_enabled_p ())
5486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487 "shuffle of 3 fields structure is not \
5488 supported by target\n");
5489 return false;
5491 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5493 /* Generating permutation constant to shift all elements.
5494 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5495 for (i = 0; i < nelt; i++)
5496 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5497 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5499 if (dump_enabled_p ())
5500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5501 "shift permutation is not supported by target\n");
5502 return false;
5504 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5506 /* Generating permutation constant to shift all elements.
5507 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5508 for (i = 0; i < nelt; i++)
5509 sel[i] = 2 * (nelt / 3) + 1 + i;
5510 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5512 if (dump_enabled_p ())
5513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5514 "shift permutation is not supported by target\n");
5515 return false;
5517 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5519 /* Generating permutation constant to shift all elements.
5520 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5521 for (i = 0; i < nelt; i++)
5522 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5523 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5525 if (dump_enabled_p ())
5526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5527 "shift permutation is not supported by target\n");
5528 return false;
5530 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5532 /* Generating permutation constant to shift all elements.
5533 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5534 for (i = 0; i < nelt; i++)
5535 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5536 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5538 if (dump_enabled_p ())
5539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5540 "shift permutation is not supported by target\n");
5541 return false;
5543 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
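/* Apply PERM3_MASK to each input vector, then use the shift permutations
   to move elements across the three vectors so that each result vector
   gathers the elements of a single field.  */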
5545 for (k = 0; k < 3; k++)
5547 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5548 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5549 dr_chain[k], dr_chain[k],
5550 perm3_mask);
5551 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5552 vect[k] = data_ref;
5555 for (k = 0; k < 3; k++)
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5559 vect[k % 3], vect[(k + 1) % 3],
5560 shift1_mask);
5561 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5562 vect_shift[k] = data_ref;
5565 for (k = 0; k < 3; k++)
5567 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5568 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5569 vect_shift[(4 - k) % 3],
5570 vect_shift[(3 - k) % 3],
5571 shift2_mask);
5572 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5573 vect[k] = data_ref;
5576 (*result_chain)[3 - (nelt % 3)] = vect[2];
5578 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5579 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5580 vect[0], shift3_mask);
5581 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5582 (*result_chain)[nelt % 3] = data_ref;
5584 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5585 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5586 vect[1], shift4_mask);
5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 (*result_chain)[0] = data_ref;
5589 return true;
5591 return false;
5594 /* Function vect_transform_grouped_load.
5596 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5597 to perform their permutation and assign the resulting vectorized statements to
5598 the scalar statements.
5601 void
5602 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5603 gimple_stmt_iterator *gsi)
5605 machine_mode mode;
5606 vec<tree> result_chain = vNULL;
5608 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5609 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5610 vectors that are ready for vector computation. */
5611 result_chain.create (size);
5613 /* If the reassociation width for the vector type is 2 or greater, the target
5614 machine can execute two or more vector instructions in parallel.  Otherwise
5615 try to build the chain for the load group using vect_shift_permute_load_chain. */
5616 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5617 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5618 || exact_log2 (size) != -1
5619 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5620 gsi, &result_chain))
5621 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5622 vect_record_grouped_load_vectors (stmt, result_chain);
5623 result_chain.release ();
5626 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5627 generated as part of the vectorization of STMT. Assign the statement
5628 for each vector to the associated scalar statement. */
5630 void
5631 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5633 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5634 gimple next_stmt, new_stmt;
5635 unsigned int i, gap_count;
5636 tree tmp_data_ref;
5638 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5639 Since we scan the chain starting from its first node, their order
5640 corresponds to the order of data-refs in RESULT_CHAIN. */
5641 next_stmt = first_stmt;
5642 gap_count = 1;
5643 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5645 if (!next_stmt)
5646 break;
5648 /* Skip the gaps.  Loads created for the gaps will be removed later by the
5649 dead code elimination pass.  No need to check for the first stmt in
5650 the group, since it always exists.
5651 GROUP_GAP is the number of steps in elements from the previous
5652 access (if there is no gap GROUP_GAP is 1). We skip loads that
5653 correspond to the gaps. */
5654 if (next_stmt != first_stmt
5655 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5657 gap_count++;
5658 continue;
5661 while (next_stmt)
5663 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5664 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5665 copies, and we put the new vector statement in the first available
5666 RELATED_STMT. */
5667 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5668 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5669 else
5671 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5673 gimple prev_stmt =
5674 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5675 gimple rel_stmt =
5676 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5677 while (rel_stmt)
5679 prev_stmt = rel_stmt;
5680 rel_stmt =
5681 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5684 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5685 new_stmt;
5689 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5690 gap_count = 1;
5691 /* If NEXT_STMT accesses the same DR as the previous statement,
5692 put the same TMP_DATA_REF as its vectorized statement; otherwise
5693 get the next data-ref from RESULT_CHAIN. */
5694 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5695 break;
5700 /* Function vect_can_force_dr_alignment_p.
5702 Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
5703 boundary. */
5705 bool
5706 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5708 if (TREE_CODE (decl) != VAR_DECL)
5709 return false;
5711 if (decl_in_symtab_p (decl)
5712 && !symtab_node::get (decl)->can_increase_alignment_p ())
5713 return false;
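/* Static objects are limited by what the object file format allows;
   automatic variables by the supported stack alignment.  */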
5715 if (TREE_STATIC (decl))
5716 return (alignment <= MAX_OFILE_ALIGNMENT);
5717 else
5718 return (alignment <= MAX_STACK_ALIGNMENT);
5722 /* Return whether the data reference DR is supported with respect to its
5723 alignment.
5724 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5725 if it is aligned, i.e., check if it is possible to vectorize it with different
5726 alignment. */
5728 enum dr_alignment_support
5729 vect_supportable_dr_alignment (struct data_reference *dr,
5730 bool check_aligned_accesses)
5732 gimple stmt = DR_STMT (dr);
5733 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5734 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5735 machine_mode mode = TYPE_MODE (vectype);
5736 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5737 struct loop *vect_loop = NULL;
5738 bool nested_in_vect_loop = false;
5740 if (aligned_access_p (dr) && !check_aligned_accesses)
5741 return dr_aligned;
5743 /* For now assume all conditional loads/stores support unaligned
5744 access without any special code. */
5745 if (is_gimple_call (stmt)
5746 && gimple_call_internal_p (stmt)
5747 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5748 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5749 return dr_unaligned_supported;
5751 if (loop_vinfo)
5753 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5754 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5757 /* Possibly unaligned access. */
5759 /* We can choose between using the implicit realignment scheme (generating
5760 a misaligned_move stmt) and the explicit realignment scheme (generating
5761 aligned loads with a REALIGN_LOAD). There are two variants to the
5762 explicit realignment scheme: optimized, and unoptimized.
5763 We can optimize the realignment only if the step between consecutive
5764 vector loads is equal to the vector size. Since the vector memory
5765 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5766 is guaranteed that the misalignment amount remains the same throughout the
5767 execution of the vectorized loop. Therefore, we can create the
5768 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5769 at the loop preheader.
5771 However, in the case of outer-loop vectorization, when vectorizing a
5772 memory access in the inner-loop nested within the LOOP that is now being
5773 vectorized, while it is guaranteed that the misalignment of the
5774 vectorized memory access will remain the same in different outer-loop
5775 iterations, it is *not* guaranteed that it will remain the same throughout
5776 the execution of the inner-loop. This is because the inner-loop advances
5777 with the original scalar step (and not in steps of VS). If the inner-loop
5778 step happens to be a multiple of VS, then the misalignment remains fixed
5779 and we can use the optimized realignment scheme. For example:
5781 for (i=0; i<N; i++)
5782 for (j=0; j<M; j++)
5783 s += a[i+j];
5785 When vectorizing the i-loop in the above example, the step between
5786 consecutive vector loads is 1, and so the misalignment does not remain
5787 fixed across the execution of the inner-loop, and the realignment cannot
5788 be optimized (as illustrated in the following pseudo vectorized loop):
5790 for (i=0; i<N; i+=4)
5791 for (j=0; j<M; j++){
5792 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5793 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5794 // (assuming that we start from an aligned address).
5797 We therefore have to use the unoptimized realignment scheme:
5799 for (i=0; i<N; i+=4)
5800 for (j=k; j<M; j+=4)
5801 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5802 // that the misalignment of the initial address is
5803 // 0).
5805 The loop can then be vectorized as follows:
5807 for (k=0; k<4; k++){
5808 rt = get_realignment_token (&vp[k]);
5809 for (i=0; i<N; i+=4){
5810 v1 = vp[i+k];
5811 for (j=k; j<M; j+=4){
5812 v2 = vp[i+j+VS-1];
5813 va = REALIGN_LOAD <v1,v2,rt>;
5814 vs += va;
5815 v1 = v2;
5818 } */
5820 if (DR_IS_READ (dr))
5822 bool is_packed = false;
5823 tree type = (TREE_TYPE (DR_REF (dr)));
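/* The explicit realignment scheme requires the vec_realign_load optab
   and, if the target defines builtin_mask_for_load, that the hook
   actually returns a builtin.  */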
5825 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5826 && (!targetm.vectorize.builtin_mask_for_load
5827 || targetm.vectorize.builtin_mask_for_load ()))
5829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5830 if ((nested_in_vect_loop
5831 && (TREE_INT_CST_LOW (DR_STEP (dr))
5832 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5833 || !loop_vinfo)
5834 return dr_explicit_realign;
5835 else
5836 return dr_explicit_realign_optimized;
5838 if (!known_alignment_for_access_p (dr))
5839 is_packed = not_size_aligned (DR_REF (dr));
5841 if ((TYPE_USER_ALIGN (type) && !is_packed)
5842 || targetm.vectorize.
5843 support_vector_misalignment (mode, type,
5844 DR_MISALIGNMENT (dr), is_packed))
5845 /* Can't software pipeline the loads, but can at least do them. */
5846 return dr_unaligned_supported;
5848 else
5850 bool is_packed = false;
5851 tree type = (TREE_TYPE (DR_REF (dr)));
5853 if (!known_alignment_for_access_p (dr))
5854 is_packed = not_size_aligned (DR_REF (dr));
5856 if ((TYPE_USER_ALIGN (type) && !is_packed)
5857 || targetm.vectorize.
5858 support_vector_misalignment (mode, type,
5859 DR_MISALIGNMENT (dr), is_packed))
5860 return dr_unaligned_supported;
5863 /* Unsupported. */
5864 return dr_unaligned_unsupported;