[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "predict.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "tm_p.h"
36 #include "target.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
51 #include "cgraph.h"
52 /* Need to include rtl.h, expr.h, etc. for optabs. */
53 #include "flags.h"
54 #include "insn-config.h"
55 #include "expmed.h"
56 #include "dojump.h"
57 #include "explow.h"
58 #include "calls.h"
59 #include "emit-rtl.h"
60 #include "varasm.h"
61 #include "stmt.h"
62 #include "expr.h"
63 #include "insn-codes.h"
64 #include "optabs.h"
65 #include "builtins.h"
67 /* Return true if load- or store-lanes optab OPTAB is implemented for
68 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
70 static bool
71 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
72 tree vectype, unsigned HOST_WIDE_INT count)
74 machine_mode mode, array_mode;
75 bool limit_p;
77 mode = TYPE_MODE (vectype);
78 limit_p = !targetm.array_mode_supported_p (mode, count);
79 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
80 MODE_INT, limit_p);
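/* For example, for two V4SI vectors (16 bytes each) this asks for a
   256-bit integer mode to hold the whole array; if the target has no
   such mode, mode_for_size returns BLKmode and we give up below.  */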
82 if (array_mode == BLKmode)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
87 GET_MODE_NAME (mode), count);
88 return false;
91 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
95 "cannot use %s<%s><%s>\n", name,
96 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
97 return false;
100 if (dump_enabled_p ())
101 dump_printf_loc (MSG_NOTE, vect_location,
102 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
103 GET_MODE_NAME (mode));
105 return true;
109 /* Return the smallest scalar part of STMT.
110 This is used to determine the vectype of the stmt. We generally set the
111 vectype according to the type of the result (lhs). For stmts whose
112 result-type is different than the type of the arguments (e.g., demotion,
113 promotion), vectype will be reset appropriately (later). Note that we have
114 to visit the smallest datatype in this function, because that determines the
115 VF. If the smallest datatype in the loop is present only as the rhs of a
116 promotion operation, we would miss it.
117 Such a case, where a variable of this datatype does not appear in the lhs
118 anywhere in the loop, can only occur if it's an invariant: e.g.:
119 'int_x = (int) short_inv', which we'd expect to have been optimized away by
120 invariant motion. However, we cannot rely on invariant motion to always
121 take invariants out of the loop, and so in the case of promotion we also
122 have to check the rhs.
123 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
124 types. */
126 tree
127 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
128 HOST_WIDE_INT *rhs_size_unit)
130 tree scalar_type = gimple_expr_type (stmt);
131 HOST_WIDE_INT lhs, rhs;
133 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
135 if (is_gimple_assign (stmt)
136 && (gimple_assign_cast_p (stmt)
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
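/* E.g. for int_x = (int) short_y the rhs type (2 bytes) is smaller than
   the lhs type (4 bytes), so the short type is returned and determines
   the vectorization factor.  */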
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (dump_enabled_p ())
168 dump_printf_loc (MSG_NOTE, vect_location,
169 "mark for run-time aliasing test between ");
170 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
171 dump_printf (MSG_NOTE, " and ");
172 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
173 dump_printf (MSG_NOTE, "\n");
176 if (optimize_loop_nest_for_size_p (loop))
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
180 "versioning not supported when optimizing"
181 " for size.\n");
182 return false;
185 /* FORNOW: We don't support versioning with outer-loop vectorization. */
186 if (loop->inner)
188 if (dump_enabled_p ())
189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
190 "versioning not yet supported for outer-loops.\n");
191 return false;
194 /* FORNOW: We don't support creating runtime alias tests for non-constant
195 step. */
196 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
197 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
199 if (dump_enabled_p ())
200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
201 "versioning not yet supported for non-constant "
202 "step\n");
203 return false;
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return true;
211 /* Function vect_analyze_data_ref_dependence.
213 Return TRUE if there (might) exist a dependence between a memory-reference
214 DRA and a memory-reference DRB. When versioning for alias can handle the
215 dependence with a run-time check, return FALSE. Adjust *MAX_VF according to
216 the data dependence. */
218 static bool
219 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
220 loop_vec_info loop_vinfo, int *max_vf)
222 unsigned int i;
223 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
224 struct data_reference *dra = DDR_A (ddr);
225 struct data_reference *drb = DDR_B (ddr);
226 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
227 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
228 lambda_vector dist_v;
229 unsigned int loop_depth;
231 /* In loop analysis all data references should be vectorizable. */
232 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
233 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
234 gcc_unreachable ();
236 /* Independent data accesses. */
237 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
238 return false;
240 if (dra == drb
241 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
242 return false;
244 /* Even if we have an anti-dependence then, as the vectorized loop covers at
245 least two scalar iterations, there is always also a true dependence.
246 As the vectorizer does not re-order loads and stores we can ignore
247 the anti-dependence if TBAA can disambiguate both DRs similar to the
248 case with known negative distance anti-dependences (positive
249 distance anti-dependences would violate TBAA constraints). */
250 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
251 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
252 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
253 get_alias_set (DR_REF (drb))))
254 return false;
256 /* Unknown data dependence. */
257 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
259 /* If user asserted safelen consecutive iterations can be
260 executed concurrently, assume independence. */
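/* For example, #pragma omp simd safelen(4) sets loop->safelen to 4,
   so we assume independence here and cap *MAX_VF at 4.  */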
261 if (loop->safelen >= 2)
263 if (loop->safelen < *max_vf)
264 *max_vf = loop->safelen;
265 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
266 return false;
269 if (STMT_VINFO_GATHER_P (stmtinfo_a)
270 || STMT_VINFO_GATHER_P (stmtinfo_b))
272 if (dump_enabled_p ())
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "versioning for alias not supported for: "
276 "can't determine dependence between ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 DR_REF (dra));
279 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
280 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
281 DR_REF (drb));
282 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
284 return true;
287 if (dump_enabled_p ())
289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
290 "versioning for alias required: "
291 "can't determine dependence between ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (dra));
294 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
295 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
296 DR_REF (drb));
297 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
300 /* Add to list of ddrs that need to be tested at run-time. */
301 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
304 /* Known data dependence. */
305 if (DDR_NUM_DIST_VECTS (ddr) == 0)
307 /* If user asserted safelen consecutive iterations can be
308 executed concurrently, assume independence. */
309 if (loop->safelen >= 2)
311 if (loop->safelen < *max_vf)
312 *max_vf = loop->safelen;
313 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
314 return false;
317 if (STMT_VINFO_GATHER_P (stmtinfo_a)
318 || STMT_VINFO_GATHER_P (stmtinfo_b))
320 if (dump_enabled_p ())
322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
323 "versioning for alias not supported for: "
324 "bad dist vector for ");
325 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
326 DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
329 DR_REF (drb));
330 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
332 return true;
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
338 "versioning for alias required: "
339 "bad dist vector for ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
341 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 /* Add to list of ddrs that need to be tested at run-time. */
346 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
349 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
350 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
352 int dist = dist_v[loop_depth];
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance = %d.\n", dist);
358 if (dist == 0)
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE, vect_location,
363 "dependence distance == 0 between ");
364 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
365 dump_printf (MSG_NOTE, " and ");
366 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
367 dump_printf (MSG_NOTE, "\n");
370 /* When we perform grouped accesses and perform implicit CSE
371 by detecting equal accesses and doing disambiguation with
372 runtime alias tests like for
373 .. = a[i];
374 .. = a[i+1];
375 a[i] = ..;
376 a[i+1] = ..;
377 *p = ..;
378 .. = a[i];
379 .. = a[i+1];
380 where we will end up loading { a[i], a[i+1] } once, make
381 sure that inserting group loads before the first load and
382 stores after the last store will do the right thing.
383 Similar for groups like
384 a[i] = ...;
385 ... = a[i];
386 a[i+1] = ...;
387 where loads from the group interleave with the store. */
388 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
389 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
391 gimple earlier_stmt;
392 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
393 if (DR_IS_WRITE
394 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
396 if (dump_enabled_p ())
397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
398 "READ_WRITE dependence in interleaving."
399 "\n");
400 return true;
404 continue;
407 if (dist > 0 && DDR_REVERSED_P (ddr))
409 /* If DDR_REVERSED_P the order of the data-refs in DDR was
410 reversed (to make distance vector positive), and the actual
411 distance is negative. */
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "dependence distance negative.\n");
415 /* Record a negative dependence distance to later limit the
416 amount of stmt copying / unrolling we can perform.
417 Only need to handle read-after-write dependence. */
418 if (DR_IS_READ (drb)
419 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
420 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
421 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
422 continue;
425 if (abs (dist) >= 2
426 && abs (dist) < *max_vf)
428 /* The dependence distance requires reduction of the maximal
429 vectorization factor. */
430 *max_vf = abs (dist);
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_NOTE, vect_location,
433 "adjusting maximal vectorization factor to %i\n",
434 *max_vf);
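/* For example, a dependence distance of 2 caps *MAX_VF at 2, so that
   a single vector iteration never spans both ends of the dependence.  */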
437 if (abs (dist) >= *max_vf)
439 /* Dependence distance does not create dependence, as far as
440 vectorization is concerned, in this case. */
441 if (dump_enabled_p ())
442 dump_printf_loc (MSG_NOTE, vect_location,
443 "dependence distance >= VF.\n");
444 continue;
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized, possible dependence "
451 "between data-refs ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
453 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
454 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
455 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
458 return true;
461 return false;
464 /* Function vect_analyze_data_ref_dependences.
466 Examine all the data references in the loop, and make sure there do not
467 exist any data dependences between them. Set *MAX_VF according to
468 the maximum vectorization factor the data dependences allow. */
470 bool
471 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
473 unsigned int i;
474 struct data_dependence_relation *ddr;
476 if (dump_enabled_p ())
477 dump_printf_loc (MSG_NOTE, vect_location,
478 "=== vect_analyze_data_ref_dependences ===\n");
480 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
481 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
482 &LOOP_VINFO_DDRS (loop_vinfo),
483 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
484 return false;
486 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
487 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
488 return false;
490 return true;
494 /* Function vect_slp_analyze_data_ref_dependence.
496 Return TRUE if there (might) exist a dependence between a memory-reference
497 DRA and a memory-reference DRB that prohibits SLP vectorization of the
498 enclosing basic block. Return FALSE otherwise. */
501 static bool
502 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
504 struct data_reference *dra = DDR_A (ddr);
505 struct data_reference *drb = DDR_B (ddr);
507 /* We need to check dependences of statements marked as unvectorizable
508 as well, they still can prohibit vectorization. */
510 /* Independent data accesses. */
511 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
512 return false;
514 if (dra == drb)
515 return false;
517 /* Read-read is OK. */
518 if (DR_IS_READ (dra) && DR_IS_READ (drb))
519 return false;
521 /* If dra and drb are part of the same interleaving chain consider
522 them independent. */
523 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
524 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
525 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
526 return false;
528 /* Unknown data dependence. */
529 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
531 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "can't determine dependence between ");
535 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
536 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
541 else if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location,
544 "determined dependence between ");
545 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
546 dump_printf (MSG_NOTE, " and ");
547 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
548 dump_printf (MSG_NOTE, "\n");
551 /* We do not vectorize basic blocks with write-write dependencies. */
552 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
553 return true;
555 /* If we have a read-write dependence, check that the load is before the store.
556 When we vectorize basic blocks, a vector load can only be placed before the
557 corresponding scalar load, and a vector store can only be placed after its
558 corresponding scalar store. So the order of the accesses is preserved in
559 case the load is before the store. */
560 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
561 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
563 /* That only holds for load-store pairs taking part in vectorization. */
564 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
565 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
566 return false;
569 return true;
573 /* Function vect_slp_analyze_data_ref_dependences.
575 Examine all the data references in the basic-block, and make sure there
576 do not exist any data dependences between them that would prohibit
577 vectorization. */
579 bool
580 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
582 struct data_dependence_relation *ddr;
583 unsigned int i;
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_NOTE, vect_location,
587 "=== vect_slp_analyze_data_ref_dependences ===\n");
589 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
590 &BB_VINFO_DDRS (bb_vinfo),
591 vNULL, true))
592 return false;
594 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
595 if (vect_slp_analyze_data_ref_dependence (ddr))
596 return false;
598 return true;
602 /* Function vect_compute_data_ref_alignment
604 Compute the misalignment of the data reference DR.
606 Output:
607 1. If during the misalignment computation it is found that the data reference
608 cannot be vectorized then false is returned.
609 2. DR_MISALIGNMENT (DR) is defined.
611 FOR NOW: No analysis is actually performed. Misalignment is calculated
612 only for trivial cases. TODO. */
614 static bool
615 vect_compute_data_ref_alignment (struct data_reference *dr)
617 gimple stmt = DR_STMT (dr);
618 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
619 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
620 struct loop *loop = NULL;
621 tree ref = DR_REF (dr);
622 tree vectype;
623 tree base, base_addr;
624 bool base_aligned;
625 tree misalign = NULL_TREE;
626 tree aligned_to;
627 unsigned HOST_WIDE_INT alignment;
629 if (dump_enabled_p ())
630 dump_printf_loc (MSG_NOTE, vect_location,
631 "vect_compute_data_ref_alignment:\n");
633 if (loop_vinfo)
634 loop = LOOP_VINFO_LOOP (loop_vinfo);
636 /* Initialize misalignment to unknown. */
637 SET_DR_MISALIGNMENT (dr, -1);
639 /* Strided accesses perform only component accesses, misalignment information
640 is irrelevant for them. */
641 if (STMT_VINFO_STRIDED_P (stmt_info)
642 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
643 return true;
645 if (tree_fits_shwi_p (DR_STEP (dr)))
646 misalign = DR_INIT (dr);
647 aligned_to = DR_ALIGNED_TO (dr);
648 base_addr = DR_BASE_ADDRESS (dr);
649 vectype = STMT_VINFO_VECTYPE (stmt_info);
651 /* In case the dataref is in an inner-loop of the loop that is being
652 vectorized (LOOP), we use the base and misalignment information
653 relative to the outer-loop (LOOP). This is ok only if the misalignment
654 stays the same throughout the execution of the inner-loop, which is why
655 we have to check that the stride of the dataref in the inner-loop is a
656 multiple of the vector size. */
657 if (loop && nested_in_vect_loop_p (loop, stmt))
659 tree step = DR_STEP (dr);
661 if (tree_fits_shwi_p (step)
662 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "inner step divides the vector-size.\n");
667 misalign = STMT_VINFO_DR_INIT (stmt_info);
668 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
669 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
671 else
673 if (dump_enabled_p ())
674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
675 "inner step doesn't divide the vector-size.\n");
676 misalign = NULL_TREE;
680 /* Similarly we can only use base and misalignment information relative to
681 an innermost loop if the misalignment stays the same throughout the
682 execution of the loop. As above, this is the case if the stride of
683 the dataref is a multiple of the vector size. */
684 else
686 tree step = DR_STEP (dr);
687 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
689 if (tree_fits_shwi_p (step)
690 && ((tree_to_shwi (step) * vf)
691 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "step doesn't divide the vector-size.\n");
696 misalign = NULL_TREE;
700 alignment = TYPE_ALIGN_UNIT (vectype);
702 if ((compare_tree_int (aligned_to, alignment) < 0)
703 || !misalign)
705 if (dump_enabled_p ())
707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
708 "Unknown alignment for access: ");
709 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
710 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
712 return true;
715 /* To look at alignment of the base we have to preserve an inner MEM_REF
716 as that carries alignment information of the actual access. */
717 base = ref;
718 while (handled_component_p (base))
719 base = TREE_OPERAND (base, 0);
720 if (TREE_CODE (base) == MEM_REF)
721 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
722 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
724 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
725 base_aligned = true;
726 else
727 base_aligned = false;
729 if (!base_aligned)
731 /* Strip an inner MEM_REF to a bare decl if possible. */
732 if (TREE_CODE (base) == MEM_REF
733 && integer_zerop (TREE_OPERAND (base, 1))
734 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
735 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
737 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
739 if (dump_enabled_p ())
741 dump_printf_loc (MSG_NOTE, vect_location,
742 "can't force alignment of ref: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
744 dump_printf (MSG_NOTE, "\n");
746 return true;
749 /* Force the alignment of the decl.
750 NOTE: This is the only change to the code we make during
751 the analysis phase, before deciding to vectorize the loop. */
752 if (dump_enabled_p ())
754 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
755 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
756 dump_printf (MSG_NOTE, "\n");
759 ((dataref_aux *)dr->aux)->base_decl = base;
760 ((dataref_aux *)dr->aux)->base_misaligned = true;
763 /* If this is a backward running DR then first access in the larger
764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
768 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
772 /* PLUS because DR_STEP was negative. */
773 misalign = size_binop (PLUS_EXPR, misalign, offset);
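/* For example, for a V4SI access with DR_STEP == -4, OFFSET is
   3 * -4 == -12, so the recorded misalignment moves back 12 bytes to
   the first element actually touched by the vector access.  */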
776 SET_DR_MISALIGNMENT (dr,
777 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
779 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
782 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
783 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
784 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
787 return true;
791 /* Function vect_compute_data_refs_alignment
793 Compute the misalignment of data references in the loop.
794 Return FALSE if a data reference is found that cannot be vectorized. */
796 static bool
797 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
798 bb_vec_info bb_vinfo)
800 vec<data_reference_p> datarefs;
801 struct data_reference *dr;
802 unsigned int i;
804 if (loop_vinfo)
805 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
806 else
807 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
809 FOR_EACH_VEC_ELT (datarefs, i, dr)
810 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
811 && !vect_compute_data_ref_alignment (dr))
813 if (bb_vinfo)
815 /* Mark unsupported statement as unvectorizable. */
816 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
817 continue;
819 else
820 return false;
823 return true;
827 /* Function vect_update_misalignment_for_peel
829 DR - the data reference whose misalignment is to be adjusted.
830 DR_PEEL - the data reference whose misalignment is being made
831 zero in the vector loop by the peel.
832 NPEEL - the number of iterations in the peel loop if the misalignment
833 of DR_PEEL is known at compile time. */
835 static void
836 vect_update_misalignment_for_peel (struct data_reference *dr,
837 struct data_reference *dr_peel, int npeel)
839 unsigned int i;
840 vec<dr_p> same_align_drs;
841 struct data_reference *current_dr;
842 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
843 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
844 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
845 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
847 /* For interleaved data accesses the step in the loop must be multiplied by
848 the size of the interleaving group. */
849 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
850 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
851 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
852 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
854 /* It can be assumed that the data refs with the same alignment as dr_peel
855 are aligned in the vector loop. */
856 same_align_drs
857 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
858 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
860 if (current_dr != dr)
861 continue;
862 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
863 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
864 SET_DR_MISALIGNMENT (dr, 0);
865 return;
868 if (known_alignment_for_access_p (dr)
869 && known_alignment_for_access_p (dr_peel))
871 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
872 int misal = DR_MISALIGNMENT (dr);
873 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
874 misal += negative ? -npeel * dr_size : npeel * dr_size;
875 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
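/* For example, with 4-byte elements, a 16-byte vector alignment and
   DR_MISALIGNMENT (dr) == 8, peeling npeel == 2 iterations of a
   forward access adds 8 bytes and masking with 15 gives 0.  */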
876 SET_DR_MISALIGNMENT (dr, misal);
877 return;
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
882 SET_DR_MISALIGNMENT (dr, -1);
886 /* Function vect_verify_datarefs_alignment
888 Return TRUE if all data references in the loop can be
889 handled with respect to alignment. */
891 bool
892 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
894 vec<data_reference_p> datarefs;
895 struct data_reference *dr;
896 enum dr_alignment_support supportable_dr_alignment;
897 unsigned int i;
899 if (loop_vinfo)
900 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
901 else
902 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
904 FOR_EACH_VEC_ELT (datarefs, i, dr)
906 gimple stmt = DR_STMT (dr);
907 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
909 if (!STMT_VINFO_RELEVANT_P (stmt_info))
910 continue;
912 /* For interleaving, only the alignment of the first access matters.
913 Skip statements marked as not vectorizable. */
914 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
915 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
916 || !STMT_VINFO_VECTORIZABLE (stmt_info))
917 continue;
919 /* Strided accesses perform only component accesses, alignment is
920 irrelevant for them. */
921 if (STMT_VINFO_STRIDED_P (stmt_info)
922 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
923 continue;
925 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
926 if (!supportable_dr_alignment)
928 if (dump_enabled_p ())
930 if (DR_IS_READ (dr))
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "not vectorized: unsupported unaligned load.");
933 else
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
935 "not vectorized: unsupported unaligned "
936 "store.");
938 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
939 DR_REF (dr));
940 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
942 return false;
944 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE, vect_location,
946 "Vectorizing an unaligned access.\n");
948 return true;
951 /* Given a memory reference EXP, return whether its alignment is less
952 than its size. */
954 static bool
955 not_size_aligned (tree exp)
957 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
958 return true;
960 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
961 > get_object_alignment (exp));
964 /* Function vector_alignment_reachable_p
966 Return true if vector alignment for DR is reachable by peeling
967 a few loop iterations. Return false otherwise. */
969 static bool
970 vector_alignment_reachable_p (struct data_reference *dr)
972 gimple stmt = DR_STMT (dr);
973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
978 /* For interleaved accesses we peel only if the number of iterations in
979 the prolog loop (VF - misalignment) is a multiple of the
980 number of interleaved accesses. */
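/* For example, with 4 elements per vector and a misalignment of one
   element, the prolog loop peels 3 iterations, which is a multiple of
   GROUP_SIZE 3 but not of GROUP_SIZE 2.  */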
981 int elem_size, mis_in_elements;
982 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
984 /* FORNOW: handle only known alignment. */
985 if (!known_alignment_for_access_p (dr))
986 return false;
988 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
989 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
991 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
992 return false;
995 /* If the misalignment is known at compile time then allow peeling
996 only if natural alignment is reachable through peeling. */
997 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
999 HOST_WIDE_INT elmsize =
1000 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location,
1004 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1005 dump_printf (MSG_NOTE,
1006 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1008 if (DR_MISALIGNMENT (dr) % elmsize)
1010 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 "data size does not divide the misalignment.\n");
1013 return false;
1017 if (!known_alignment_for_access_p (dr))
1019 tree type = TREE_TYPE (DR_REF (dr));
1020 bool is_packed = not_size_aligned (DR_REF (dr));
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023 "Unknown misalignment, is_packed = %d\n",is_packed);
1024 if ((TYPE_USER_ALIGN (type) && !is_packed)
1025 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1026 return true;
1027 else
1028 return false;
1031 return true;
1035 /* Calculate the cost of the memory access represented by DR. */
1037 static void
1038 vect_get_data_access_cost (struct data_reference *dr,
1039 unsigned int *inside_cost,
1040 unsigned int *outside_cost,
1041 stmt_vector_for_cost *body_cost_vec)
1043 gimple stmt = DR_STMT (dr);
1044 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1045 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1047 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1048 int ncopies = vf / nunits;
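/* For example, with a vectorization factor of 8 and 4 elements per
   vector, each scalar access expands to ncopies == 2 vector accesses,
   and the cost below is computed for that many copies.  */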
1050 if (DR_IS_READ (dr))
1051 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1052 NULL, body_cost_vec, false);
1053 else
1054 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_NOTE, vect_location,
1058 "vect_get_data_access_cost: inside_cost = %d, "
1059 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1063 /* Insert DR into peeling hash table with NPEEL as key. */
1065 static void
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1067 int npeel)
1069 struct _vect_peel_info elem, *slot;
1070 _vect_peel_info **new_slot;
1071 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1073 elem.npeel = npeel;
1074 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1075 if (slot)
1076 slot->count++;
1077 else
1079 slot = XNEW (struct _vect_peel_info);
1080 slot->npeel = npeel;
1081 slot->dr = dr;
1082 slot->count = 1;
1083 new_slot
1084 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1085 *new_slot = slot;
1088 if (!supportable_dr_alignment
1089 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1090 slot->count += VECT_MAX_COST;
1094 /* Traverse the peeling hash table to find the peeling option that aligns
1095 the maximum number of data accesses. */
1097 int
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099 _vect_peel_extended_info *max)
1101 vect_peel_info elem = *slot;
1103 if (elem->count > max->peel_info.count
1104 || (elem->count == max->peel_info.count
1105 && max->peel_info.npeel > elem->npeel))
1107 max->peel_info.npeel = elem->npeel;
1108 max->peel_info.count = elem->count;
1109 max->peel_info.dr = elem->dr;
1112 return 1;
1116 /* Traverse the peeling hash table and calculate the cost for each peeling
1117 option. Find the one with the lowest cost. */
1119 int
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121 _vect_peel_extended_info *min)
1123 vect_peel_info elem = *slot;
1124 int save_misalignment, dummy;
1125 unsigned int inside_cost = 0, outside_cost = 0, i;
1126 gimple stmt = DR_STMT (elem->dr);
1127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130 struct data_reference *dr;
1131 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1133 prologue_cost_vec.create (2);
1134 body_cost_vec.create (2);
1135 epilogue_cost_vec.create (2);
1137 FOR_EACH_VEC_ELT (datarefs, i, dr)
1139 stmt = DR_STMT (dr);
1140 stmt_info = vinfo_for_stmt (stmt);
1141 /* For interleaving, only the alignment of the first access
1142 matters. */
1143 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1144 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1145 continue;
1147 save_misalignment = DR_MISALIGNMENT (dr);
1148 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1149 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1150 &body_cost_vec);
1151 SET_DR_MISALIGNMENT (dr, save_misalignment);
1154 outside_cost += vect_get_known_peeling_cost
1155 (loop_vinfo, elem->npeel, &dummy,
1156 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1157 &prologue_cost_vec, &epilogue_cost_vec);
1159 /* Prologue and epilogue costs are added to the target model later.
1160 These costs depend only on the scalar iteration cost, the
1161 number of peeling iterations finally chosen, and the number of
1162 misaligned statements. So discard the information found here. */
1163 prologue_cost_vec.release ();
1164 epilogue_cost_vec.release ();
1166 if (inside_cost < min->inside_cost
1167 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1169 min->inside_cost = inside_cost;
1170 min->outside_cost = outside_cost;
1171 min->body_cost_vec.release ();
1172 min->body_cost_vec = body_cost_vec;
1173 min->peel_info.dr = elem->dr;
1174 min->peel_info.npeel = elem->npeel;
1176 else
1177 body_cost_vec.release ();
1179 return 1;
1183 /* Choose best peeling option by traversing peeling hash table and either
1184 choosing an option with the lowest cost (if cost model is enabled) or the
1185 option that aligns as many accesses as possible. */
1187 static struct data_reference *
1188 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1189 unsigned int *npeel,
1190 stmt_vector_for_cost *body_cost_vec)
1192 struct _vect_peel_extended_info res;
1194 res.peel_info.dr = NULL;
1195 res.body_cost_vec = stmt_vector_for_cost ();
1197 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1199 res.inside_cost = INT_MAX;
1200 res.outside_cost = INT_MAX;
1201 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1202 ->traverse <_vect_peel_extended_info *,
1203 vect_peeling_hash_get_lowest_cost> (&res);
1205 else
1207 res.peel_info.count = 0;
1208 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1209 ->traverse <_vect_peel_extended_info *,
1210 vect_peeling_hash_get_most_frequent> (&res);
1213 *npeel = res.peel_info.npeel;
1214 *body_cost_vec = res.body_cost_vec;
1215 return res.peel_info.dr;
1219 /* Function vect_enhance_data_refs_alignment
1221 This pass will use loop versioning and loop peeling in order to enhance
1222 the alignment of data references in the loop.
1224 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1225 original loop is to be vectorized. Any other loops that are created by
1226 the transformations performed in this pass are not supposed to be
1227 vectorized. This restriction will be relaxed.
1229 This pass will require a cost model to guide it whether to apply peeling
1230 or versioning or a combination of the two. For example, the scheme that
1231 Intel uses, when given a loop with several memory accesses, is as follows:
1232 choose one memory access ('p') whose alignment you want to force by doing
1233 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1234 other accesses are not necessarily aligned, or (2) use loop versioning to
1235 generate one loop in which all accesses are aligned, and another loop in
1236 which only 'p' is necessarily aligned.
1238 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1239 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1240 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1242 Devising a cost model is the most critical aspect of this work. It will
1243 guide us on which access to peel for, whether to use loop versioning, how
1244 many versions to create, etc. The cost model will probably consist of
1245 generic considerations as well as target specific considerations (on
1246 powerpc for example, misaligned stores are more painful than misaligned
1247 loads).
1249 Here are the general steps involved in alignment enhancements:
1251 -- original loop, before alignment analysis:
1252 for (i=0; i<N; i++){
1253 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1254 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1257 -- After vect_compute_data_refs_alignment:
1258 for (i=0; i<N; i++){
1259 x = q[i]; # DR_MISALIGNMENT(q) = 3
1260 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1263 -- Possibility 1: we do loop versioning:
1264 if (p is aligned) {
1265 for (i=0; i<N; i++){ # loop 1A
1266 x = q[i]; # DR_MISALIGNMENT(q) = 3
1267 p[i] = y; # DR_MISALIGNMENT(p) = 0
1270 else {
1271 for (i=0; i<N; i++){ # loop 1B
1272 x = q[i]; # DR_MISALIGNMENT(q) = 3
1273 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1277 -- Possibility 2: we do loop peeling:
1278 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1279 x = q[i];
1280 p[i] = y;
1282 for (i = 3; i < N; i++){ # loop 2A
1283 x = q[i]; # DR_MISALIGNMENT(q) = 0
1284 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1287 -- Possibility 3: combination of loop peeling and versioning:
1288 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1289 x = q[i];
1290 p[i] = y;
1292 if (p is aligned) {
1293 for (i = 3; i<N; i++){ # loop 3A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 0
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1298 else {
1299 for (i = 3; i<N; i++){ # loop 3B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 0
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1305 These loops are later passed to loop_transform to be vectorized. The
1306 vectorizer will use the alignment information to guide the transformation
1307 (whether to generate regular loads/stores, or with special handling for
1308 misalignment). */
1310 bool
1311 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1313 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1315 enum dr_alignment_support supportable_dr_alignment;
1316 struct data_reference *dr0 = NULL, *first_store = NULL;
1317 struct data_reference *dr;
1318 unsigned int i, j;
1319 bool do_peeling = false;
1320 bool do_versioning = false;
1321 bool stat;
1322 gimple stmt;
1323 stmt_vec_info stmt_info;
1324 unsigned int npeel = 0;
1325 bool all_misalignments_unknown = true;
1326 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1327 unsigned possible_npeel_number = 1;
1328 tree vectype;
1329 unsigned int nelements, mis, same_align_drs_max = 0;
1330 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1332 if (dump_enabled_p ())
1333 dump_printf_loc (MSG_NOTE, vect_location,
1334 "=== vect_enhance_data_refs_alignment ===\n");
1336 /* While cost model enhancements are expected in the future, the high level
1337 view of the code at this time is as follows:
1339 A) If there is a misaligned access then see if peeling to align
1340 this access can make all data references satisfy
1341 vect_supportable_dr_alignment. If so, update data structures
1342 as needed and return true.
1344 B) If peeling wasn't possible and there is a data reference with an
1345 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1346 then see if loop versioning checks can be used to make all data
1347 references satisfy vect_supportable_dr_alignment. If so, update
1348 data structures as needed and return true.
1350 C) If neither peeling nor versioning were successful then return false if
1351 any data reference does not satisfy vect_supportable_dr_alignment.
1353 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1355 Note, Possibility 3 above (which is peeling and versioning together) is not
1356 being done at this time. */
1358 /* (1) Peeling to force alignment. */
1360 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1361 Considerations:
1362 + How many accesses will become aligned due to the peeling
1363 - How many accesses will become unaligned due to the peeling,
1364 and the cost of misaligned accesses.
1365 - The cost of peeling (the extra runtime checks, the increase
1366 in code size). */
1368 FOR_EACH_VEC_ELT (datarefs, i, dr)
1370 stmt = DR_STMT (dr);
1371 stmt_info = vinfo_for_stmt (stmt);
1373 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1374 continue;
1376 /* For interleaving, only the alignment of the first access
1377 matters. */
1378 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1379 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1380 continue;
1382 /* For invariant accesses there is nothing to enhance. */
1383 if (integer_zerop (DR_STEP (dr)))
1384 continue;
1386 /* Strided accesses perform only component accesses, alignment is
1387 irrelevant for them. */
1388 if (STMT_VINFO_STRIDED_P (stmt_info)
1389 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1390 continue;
1392 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1393 do_peeling = vector_alignment_reachable_p (dr);
1394 if (do_peeling)
1396 if (known_alignment_for_access_p (dr))
1398 unsigned int npeel_tmp;
1399 bool negative = tree_int_cst_compare (DR_STEP (dr),
1400 size_zero_node) < 0;
1402 /* Save info about DR in the hash table. */
1403 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1404 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1405 = new hash_table<peel_info_hasher> (1);
1407 vectype = STMT_VINFO_VECTYPE (stmt_info);
1408 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1409 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1410 TREE_TYPE (DR_REF (dr))));
1411 npeel_tmp = (negative
1412 ? (mis - nelements) : (nelements - mis))
1413 & (nelements - 1);
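/* For example, with 4-byte elements, nelements == 4 and a known
   misalignment of 8 bytes, mis is 2 elements and a forward access
   needs npeel_tmp == (4 - 2) & 3 == 2 peeled scalar iterations.  */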
1415 /* For multiple types, it is possible that the bigger type access
1416 will have more than one peeling option. E.g., a loop with two
1417 types: one of size (vector size / 4), and the other one of
1418 size (vector size / 8). The vectorization factor will be 8. If both
1419 accesses are misaligned by 3, the first one needs one scalar
1420 iteration to be aligned, and the second one needs 5. But the
1421 first one will also be aligned by peeling 5 scalar
1422 iterations, and in that case both accesses will be aligned.
1423 Hence, except for the immediate peeling amount, we also want
1424 to try to add the full vector size, as long as we don't exceed
1425 the vectorization factor.
1426 We do this automatically for the cost model, since we calculate
1427 the cost for every peeling option. */
1428 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430 if (STMT_SLP_TYPE (stmt_info))
1431 possible_npeel_number
1432 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1433 else
1434 possible_npeel_number = vf / nelements;
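/* For example, with vf == 8 and nelements == 4 this tries two peeling
   amounts, npeel_tmp and npeel_tmp + nelements, in the loop below.  */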
1437 /* Handle the aligned case. We may decide to align some other
1438 access, making DR unaligned. */
1439 if (DR_MISALIGNMENT (dr) == 0)
1441 npeel_tmp = 0;
1442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443 possible_npeel_number++;
1446 for (j = 0; j < possible_npeel_number; j++)
1448 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1449 npeel_tmp += nelements;
1452 all_misalignments_unknown = false;
1453 /* Data-ref that was chosen for the case that all the
1454 misalignments are unknown is not relevant anymore, since we
1455 have a data-ref with known alignment. */
1456 dr0 = NULL;
1458 else
1460 /* If we don't know any misalignment values, we prefer
1461 peeling for the data-ref that has the maximum number of data-refs
1462 with the same alignment, unless the target prefers to align
1463 stores over loads. */
1464 if (all_misalignments_unknown)
1466 unsigned same_align_drs
1467 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1468 if (!dr0
1469 || same_align_drs_max < same_align_drs)
1471 same_align_drs_max = same_align_drs;
1472 dr0 = dr;
1474 /* For data-refs with the same number of related
1475 accesses prefer the one where the misalign
1476 computation will be invariant in the outermost loop. */
1477 else if (same_align_drs_max == same_align_drs)
1479 struct loop *ivloop0, *ivloop;
1480 ivloop0 = outermost_invariant_loop_for_expr
1481 (loop, DR_BASE_ADDRESS (dr0));
1482 ivloop = outermost_invariant_loop_for_expr
1483 (loop, DR_BASE_ADDRESS (dr));
1484 if ((ivloop && !ivloop0)
1485 || (ivloop && ivloop0
1486 && flow_loop_nested_p (ivloop, ivloop0)))
1487 dr0 = dr;
1490 if (!first_store && DR_IS_WRITE (dr))
1491 first_store = dr;
1494 /* If there are both known and unknown misaligned accesses in the
1495 loop, we choose peeling amount according to the known
1496 accesses. */
1497 if (!supportable_dr_alignment)
1499 dr0 = dr;
1500 if (!first_store && DR_IS_WRITE (dr))
1501 first_store = dr;
1505 else
1507 if (!aligned_access_p (dr))
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1511 "vector alignment may not be reachable\n");
1512 break;
1517 /* Check if we can possibly peel the loop. */
1518 if (!vect_can_advance_ivs_p (loop_vinfo)
1519 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1520 do_peeling = false;
1522 if (do_peeling
1523 && all_misalignments_unknown
1524 && vect_supportable_dr_alignment (dr0, false))
1526 /* Check whether the target prefers aligning stores over loads, i.e., if
1527 misaligned stores are more expensive than misaligned loads (taking
1528 drs with same alignment into account). */
1529 if (first_store && DR_IS_READ (dr0))
1531 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1532 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1533 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1534 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1535 stmt_vector_for_cost dummy;
1536 dummy.create (2);
1538 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1539 &dummy);
1540 vect_get_data_access_cost (first_store, &store_inside_cost,
1541 &store_outside_cost, &dummy);
1543 dummy.release ();
1545 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1546 aligning the load DR0). */
1547 load_inside_penalty = store_inside_cost;
1548 load_outside_penalty = store_outside_cost;
1549 for (i = 0;
1550 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1551 DR_STMT (first_store))).iterate (i, &dr);
1552 i++)
1553 if (DR_IS_READ (dr))
1555 load_inside_penalty += load_inside_cost;
1556 load_outside_penalty += load_outside_cost;
1558 else
1560 load_inside_penalty += store_inside_cost;
1561 load_outside_penalty += store_outside_cost;
1564 /* Calculate the penalty for leaving DR0 unaligned (by
1565 aligning the FIRST_STORE). */
1566 store_inside_penalty = load_inside_cost;
1567 store_outside_penalty = load_outside_cost;
1568 for (i = 0;
1569 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1570 DR_STMT (dr0))).iterate (i, &dr);
1571 i++)
1572 if (DR_IS_READ (dr))
1574 store_inside_penalty += load_inside_cost;
1575 store_outside_penalty += load_outside_cost;
1577 else
1579 store_inside_penalty += store_inside_cost;
1580 store_outside_penalty += store_outside_cost;
1583 if (load_inside_penalty > store_inside_penalty
1584 || (load_inside_penalty == store_inside_penalty
1585 && load_outside_penalty > store_outside_penalty))
1586 dr0 = first_store;
1589 /* In case there are only loads with different unknown misalignments, use
1590 peeling only if it may help to align other accesses in the loop or
1591 if it may help to improve load bandwidth when we'd end up using
1592 unaligned loads. */
1593 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1594 if (!first_store
1595 && !STMT_VINFO_SAME_ALIGN_REFS (
1596 vinfo_for_stmt (DR_STMT (dr0))).length ()
1597 && (vect_supportable_dr_alignment (dr0, false)
1598 != dr_unaligned_supported
1599 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1600 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1601 do_peeling = false;
1604 if (do_peeling && !dr0)
1606 /* Peeling is possible, but there is no data access that is supported
1607 only when aligned. So we try to choose the best possible peeling. */
1609 /* We should get here only if there are drs with known misalignment. */
1610 gcc_assert (!all_misalignments_unknown);
1612 /* Choose the best peeling from the hash table. */
1613 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1614 &body_cost_vec);
1615 if (!dr0 || !npeel)
1616 do_peeling = false;
1619 if (do_peeling)
1621 stmt = DR_STMT (dr0);
1622 stmt_info = vinfo_for_stmt (stmt);
1623 vectype = STMT_VINFO_VECTYPE (stmt_info);
1624 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1626 if (known_alignment_for_access_p (dr0))
1628 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1629 size_zero_node) < 0;
1630 if (!npeel)
1632 /* Since it's known at compile time, compute the number of
1633 iterations in the peeled loop (the peeling factor) for use in
1634 updating DR_MISALIGNMENT values. The peeling factor is the
1635 vectorization factor minus the misalignment as an element
1636 count. */
1637 mis = DR_MISALIGNMENT (dr0);
1638 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1639 npeel = ((negative ? mis - nelements : nelements - mis)
1640 & (nelements - 1));
1643 /* For interleaved data access every iteration accesses all the
1644 members of the group, therefore we divide the number of iterations
1645 by the group size. */
1646 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1647 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1648 npeel /= GROUP_SIZE (stmt_info);
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_NOTE, vect_location,
1652 "Try peeling by %d\n", npeel);
1655 /* Ensure that all data refs can be vectorized after the peel. */
1656 FOR_EACH_VEC_ELT (datarefs, i, dr)
1658 int save_misalignment;
1660 if (dr == dr0)
1661 continue;
1663 stmt = DR_STMT (dr);
1664 stmt_info = vinfo_for_stmt (stmt);
1665 /* For interleaving, only the alignment of the first access
1666 matters. */
1667 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1668 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1669 continue;
1671 /* Strided accesses perform only component accesses, alignment is
1672 irrelevant for them. */
1673 if (STMT_VINFO_STRIDED_P (stmt_info)
1674 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1675 continue;
1677 save_misalignment = DR_MISALIGNMENT (dr);
1678 vect_update_misalignment_for_peel (dr, dr0, npeel);
1679 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1680 SET_DR_MISALIGNMENT (dr, save_misalignment);
1682 if (!supportable_dr_alignment)
1684 do_peeling = false;
1685 break;
1689 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1691 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1692 if (!stat)
1693 do_peeling = false;
1694 else
1696 body_cost_vec.release ();
1697 return stat;
1701 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1702 if (do_peeling)
1704 unsigned max_allowed_peel
1705 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1706 if (max_allowed_peel != (unsigned)-1)
1708 unsigned max_peel = npeel;
1709 if (max_peel == 0)
1711 gimple dr_stmt = DR_STMT (dr0);
1712 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1713 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1714 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1716 if (max_peel > max_allowed_peel)
1718 do_peeling = false;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "Disable peeling, max peels reached: %d\n", max_peel);
1726 /* Cost model #2 - if peeling may result in a remaining loop not
1727 iterating enough to be vectorized then do not peel. */
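/* For example, with a vectorization factor of 4 and npeel == 3, the
   loop must be known to iterate at least 4 + 3 == 7 times; otherwise
   we do not peel.  */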
1728 if (do_peeling
1729 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1731 unsigned max_peel
1732 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1733 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1734 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1735 do_peeling = false;
1738 if (do_peeling)
1740 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1741 If the misalignment of DR_i is identical to that of dr0 then set
1742 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1743 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1744 by the peeling factor times the element size of DR_i (MOD the
1745 vectorization factor times the size). Otherwise, the
1746 misalignment of DR_i must be set to unknown. */
1747 FOR_EACH_VEC_ELT (datarefs, i, dr)
1748 if (dr != dr0)
1749 vect_update_misalignment_for_peel (dr, dr0, npeel);
1751 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1752 if (npeel)
1753 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1754 else
1755 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1756 = DR_MISALIGNMENT (dr0);
1757 SET_DR_MISALIGNMENT (dr0, 0);
1758 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE, vect_location,
1761 "Alignment of access forced using peeling.\n");
1762 dump_printf_loc (MSG_NOTE, vect_location,
1763 "Peeling for alignment will be applied.\n");
1765 /* The inside-loop cost will be accounted for in vectorizable_load
1766 and vectorizable_store correctly with adjusted alignments.
1767 Drop the body_cost_vec on the floor here. */
1768 body_cost_vec.release ();
1770 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1771 gcc_assert (stat);
1772 return stat;
1776 body_cost_vec.release ();
1778 /* (2) Versioning to force alignment. */
1780 /* Try versioning if:
1781 1) the loop is being optimized for speed, and
1782 2) there is at least one unsupported misaligned data ref with an unknown
1783 misalignment, and
1784 3) all misaligned data refs with a known misalignment are supported, and
1785 4) the number of runtime alignment checks is within reason. */
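/* As a rough sketch of what this versioning produces, assuming a single
   potentially misaligned pointer P and 16-byte vectors, the loop is
   guarded by a runtime test of the form

     if (((unsigned long) p & 15) == 0)
       ... vectorized loop ...
     else
       ... original scalar loop ...

   The mask and the statements to test are collected below into
   LOOP_VINFO_PTR_MASK and LOOP_VINFO_MAY_MISALIGN_STMTS.  */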
1787 do_versioning =
1788 optimize_loop_nest_for_speed_p (loop)
1789 && (!loop->inner); /* FORNOW */
1791 if (do_versioning)
1793 FOR_EACH_VEC_ELT (datarefs, i, dr)
1795 stmt = DR_STMT (dr);
1796 stmt_info = vinfo_for_stmt (stmt);
1798 /* For interleaving, only the alignment of the first access
1799 matters. */
1800 if (aligned_access_p (dr)
1801 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1802 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1803 continue;
1805 if (STMT_VINFO_STRIDED_P (stmt_info))
1807 /* Strided loads perform only component accesses, alignment is
1808 irrelevant for them. */
1809 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1810 continue;
1811 do_versioning = false;
1812 break;
1815 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1817 if (!supportable_dr_alignment)
1819 gimple stmt;
1820 int mask;
1821 tree vectype;
1823 if (known_alignment_for_access_p (dr)
1824 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1825 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1827 do_versioning = false;
1828 break;
1831 stmt = DR_STMT (dr);
1832 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1833 gcc_assert (vectype);
1835 /* The rightmost bits of an aligned address must be zeros.
1836 Construct the mask needed for this test. For example,
1837 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1838 mask must be 15 = 0xf. */
1839 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1841 /* FORNOW: use the same mask to test all potentially unaligned
1842 references in the loop. The vectorizer currently supports
1843 a single vector size, see the reference to
1844 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1845 vectorization factor is computed. */
1846 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1847 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1848 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1849 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1850 DR_STMT (dr));
1854 /* Versioning requires at least one misaligned data reference. */
1855 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1856 do_versioning = false;
1857 else if (!do_versioning)
1858 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1861 if (do_versioning)
1863 vec<gimple> may_misalign_stmts
1864 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1865 gimple stmt;
1867 /* It can now be assumed that the data references in the statements
1868 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1869 of the loop being vectorized. */
1870 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1872 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1873 dr = STMT_VINFO_DATA_REF (stmt_info);
1874 SET_DR_MISALIGNMENT (dr, 0);
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE, vect_location,
1877 "Alignment of access forced using versioning.\n");
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE, vect_location,
1882 "Versioning for alignment will be applied.\n");
1884 /* Peeling and versioning can't be done together at this time. */
1885 gcc_assert (! (do_peeling && do_versioning));
1887 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1888 gcc_assert (stat);
1889 return stat;
1892 /* This point is reached if neither peeling nor versioning is being done. */
1893 gcc_assert (! (do_peeling || do_versioning));
1895 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1896 return stat;
1900 /* Function vect_find_same_alignment_drs.
1902 Update group and alignment relations according to the chosen
1903 vectorization factor. */
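/* For example, with a vectorization factor of 4 and 4-byte elements,
   the accesses a[i] and a[i + 4] have dependence distance 4, a multiple
   of the VF, so they are recorded below as having the same alignment;
   a[i] and a[i + 1] (distance 1) are not.  */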
1905 static void
1906 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1907 loop_vec_info loop_vinfo)
1909 unsigned int i;
1910 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1911 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1912 struct data_reference *dra = DDR_A (ddr);
1913 struct data_reference *drb = DDR_B (ddr);
1914 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1915 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1916 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1917 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1918 lambda_vector dist_v;
1919 unsigned int loop_depth;
1921 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1922 return;
1924 if (dra == drb)
1925 return;
1927 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1928 return;
1930 /* Loop-based vectorization and known data dependence. */
1931 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1932 return;
1934 /* Data-dependence analysis reports a distance vector of zero
1935 for data-references that overlap only in the first iteration
1936 but have steps of different sign (see PR45764).
1937 So as a sanity check require equal DR_STEP. */
1938 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1939 return;
1941 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1942 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1944 int dist = dist_v[loop_depth];
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_NOTE, vect_location,
1948 "dependence distance = %d.\n", dist);
1950 /* Same loop iteration. */
1951 if (dist == 0
1952 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1954 /* References whose distance is zero, or a multiple of the vectorization factor with equal access sizes, have the same alignment. */
1955 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1956 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1957 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE, vect_location,
1960 "accesses have the same alignment.\n");
1961 dump_printf (MSG_NOTE,
1962 "dependence distance modulo vf == 0 between ");
1963 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1964 dump_printf (MSG_NOTE, " and ");
1965 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1966 dump_printf (MSG_NOTE, "\n");
1973 /* Function vect_analyze_data_refs_alignment
1975 Analyze the alignment of the data-references in the loop.
1976 Return FALSE if a data reference is found that cannot be vectorized. */
1978 bool
1979 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1980 bb_vec_info bb_vinfo)
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "=== vect_analyze_data_refs_alignment ===\n");
1986 /* Mark groups of data references with same alignment using
1987 data dependence information. */
1988 if (loop_vinfo)
1990 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1991 struct data_dependence_relation *ddr;
1992 unsigned int i;
1994 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1995 vect_find_same_alignment_drs (ddr, loop_vinfo);
1998 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 "not vectorized: can't calculate alignment "
2003 "for data ref.\n");
2004 return false;
2007 return true;
2011 /* Analyze groups of accesses: check that DR belongs to a group of
2012 accesses of legal size, step, etc. Detect gaps, single element
2013 interleaving, and other special cases. Set grouped access info.
2014 Collect groups of strided stores for further use in SLP analysis. */
2016 static bool
2017 vect_analyze_group_access (struct data_reference *dr)
2019 tree step = DR_STEP (dr);
2020 tree scalar_type = TREE_TYPE (DR_REF (dr));
2021 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2022 gimple stmt = DR_STMT (dr);
2023 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2024 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2025 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2026 HOST_WIDE_INT dr_step = -1;
2027 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2028 bool slp_impossible = false;
2029 struct loop *loop = NULL;
2031 if (loop_vinfo)
2032 loop = LOOP_VINFO_LOOP (loop_vinfo);
2034 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2035 size of the interleaving group (including gaps). */
2036 if (tree_fits_shwi_p (step))
2038 dr_step = tree_to_shwi (step);
2039 groupsize = absu_hwi (dr_step) / type_size;
2041 else
2042 groupsize = 0;
2044 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2045 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2047 /* Check whether this DR is part of an interleaving group and is a single
2048 element of the group that is accessed in the loop. */
2050 /* Gaps are supported only for loads. STEP must be a multiple of the type
2051 size. The size of the group must be a power of 2. */
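/* For instance, a lone load of a[4*i] with 4-byte elements has a step
   of 16 bytes, a multiple of the type size, and a group size of 4, a
   power of two, so it is accepted here as single element interleaving
   with gaps; a store with the same pattern falls through to the
   "not consecutive access" rejection below.  */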
2052 if (DR_IS_READ (dr)
2053 && (dr_step % type_size) == 0
2054 && groupsize > 0
2055 && exact_log2 (groupsize) != -1)
2057 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2058 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2059 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "Detected single element interleaving ");
2063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2064 dump_printf (MSG_NOTE, " step ");
2065 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2066 dump_printf (MSG_NOTE, "\n");
2069 if (loop_vinfo)
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "Data access with gaps requires scalar "
2074 "epilogue loop\n");
2075 if (loop->inner)
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "Peeling for outer loop is not"
2080 " supported\n");
2081 return false;
2084 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2087 return true;
2090 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2093 "not consecutive access ");
2094 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2095 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2098 if (bb_vinfo)
2100 /* Mark the statement as unvectorizable. */
2101 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2102 return true;
2105 return false;
2108 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2110 /* First stmt in the interleaving chain. Check the chain. */
2111 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2112 struct data_reference *data_ref = dr;
2113 unsigned int count = 1;
2114 tree prev_init = DR_INIT (data_ref);
2115 gimple prev = stmt;
2116 HOST_WIDE_INT diff, gaps = 0;
2118 while (next)
2120 /* Skip same data-refs. In case two or more stmts share a
2121 data-ref (supported only for loads), we vectorize only the first
2122 stmt, and the rest get their vectorized loads from the first
2123 one. */
2124 if (!tree_int_cst_compare (DR_INIT (data_ref),
2125 DR_INIT (STMT_VINFO_DATA_REF (
2126 vinfo_for_stmt (next)))))
2128 if (DR_IS_WRITE (data_ref))
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "Two store stmts share the same dr.\n");
2133 return false;
2136 /* For load use the same data-ref load. */
2137 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2139 prev = next;
2140 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2141 continue;
2144 prev = next;
2145 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2147 /* All group members have the same STEP by construction. */
2148 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2150 /* Check that the distance between two accesses is equal to the type
2151 size. Otherwise, we have gaps. */
2152 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2153 - TREE_INT_CST_LOW (prev_init)) / type_size;
2154 if (diff != 1)
2156 /* FORNOW: SLP of accesses with gaps is not supported. */
2157 slp_impossible = true;
2158 if (DR_IS_WRITE (data_ref))
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "interleaved store with gaps\n");
2163 return false;
2166 gaps += diff - 1;
2169 last_accessed_element += diff;
2171 /* Store the gap from the previous member of the group. If there is no
2172 gap in the access, GROUP_GAP is always 1. */
2173 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2175 prev_init = DR_INIT (data_ref);
2176 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2177 /* Count the number of data-refs in the chain. */
2178 count++;
2181 if (groupsize == 0)
2182 groupsize = count + gaps;
2184 /* Check that the size of the interleaving is equal to count for stores,
2185 i.e., that there are no gaps. */
2186 if (groupsize != count
2187 && !DR_IS_READ (dr))
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191 "interleaved store with gaps\n");
2192 return false;
2195 /* If there is a gap after the last load in the group it is the
2196 difference between the groupsize and the last accessed
2197 element.
2198 When there is no gap, this difference should be 0. */
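/* E.g. for a group that loads a[4*i] and a[4*i + 2] (GROUPSIZE 4),
   LAST_ACCESSED_ELEMENT ends up as 3, so the gap stored on the first
   statement is 4 - 3 = 1, while GROUP_GAP of the second member is the
   distance 2 recorded in the walk above.  */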
2199 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2201 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2202 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE, vect_location,
2205 "Detected interleaving of size %d starting with ",
2206 (int)groupsize);
2207 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2208 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2209 dump_printf_loc (MSG_NOTE, vect_location,
2210 "There is a gap of %d elements after the group\n",
2211 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2214 /* SLP: create an SLP data structure for every interleaving group of
2215 stores for further analysis in vect_analyse_slp. */
2216 if (DR_IS_WRITE (dr) && !slp_impossible)
2218 if (loop_vinfo)
2219 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2220 if (bb_vinfo)
2221 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2224 /* If there is a gap at the end of the group or the group size cannot
2225 be made a multiple of the vector element count then we access excess
2226 elements in the last iteration and thus need to peel that off. */
2227 if (loop_vinfo
2228 && (groupsize - last_accessed_element > 0
2229 || exact_log2 (groupsize) == -1))
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2234 "Data access with gaps requires scalar "
2235 "epilogue loop\n");
2236 if (loop->inner)
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240 "Peeling for outer loop is not supported\n");
2241 return false;
2244 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2248 return true;
2252 /* Analyze the access pattern of the data-reference DR.
2253 In case of non-consecutive accesses call vect_analyze_group_access() to
2254 analyze groups of accesses. */
2256 static bool
2257 vect_analyze_data_ref_access (struct data_reference *dr)
2259 tree step = DR_STEP (dr);
2260 tree scalar_type = TREE_TYPE (DR_REF (dr));
2261 gimple stmt = DR_STMT (dr);
2262 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2263 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2264 struct loop *loop = NULL;
2266 if (loop_vinfo)
2267 loop = LOOP_VINFO_LOOP (loop_vinfo);
2269 if (loop_vinfo && !step)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "bad data-ref access in loop\n");
2274 return false;
2277 /* Allow loads with zero step in inner-loop vectorization. */
2278 if (loop_vinfo && integer_zerop (step))
2280 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2281 if (!nested_in_vect_loop_p (loop, stmt))
2282 return DR_IS_READ (dr);
2283 /* Allow references with zero step for outer loops marked
2284 with pragma omp simd only - it guarantees absence of
2285 loop-carried dependencies between inner loop iterations. */
2286 if (!loop->force_vectorize)
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "zero step in inner loop of nest\n");
2291 return false;
2295 if (loop && nested_in_vect_loop_p (loop, stmt))
2297 /* Interleaved accesses are not yet supported within outer-loop
2298 vectorization for references in the inner-loop. */
2299 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2301 /* For the rest of the analysis we use the outer-loop step. */
2302 step = STMT_VINFO_DR_STEP (stmt_info);
2303 if (integer_zerop (step))
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_NOTE, vect_location,
2307 "zero step in outer loop.\n");
2308 if (DR_IS_READ (dr))
2309 return true;
2310 else
2311 return false;
2315 /* Consecutive? */
2316 if (TREE_CODE (step) == INTEGER_CST)
2318 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2319 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2320 || (dr_step < 0
2321 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2323 /* Mark that it is not interleaving. */
2324 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2325 return true;
2329 if (loop && nested_in_vect_loop_p (loop, stmt))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "grouped access in outer loop.\n");
2334 return false;
2338 /* Assume this is a DR handled by non-constant strided load case. */
2339 if (TREE_CODE (step) != INTEGER_CST)
2340 return (STMT_VINFO_STRIDED_P (stmt_info)
2341 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2342 || vect_analyze_group_access (dr)));
2344 /* Not a consecutive access - check whether it is part of an interleaving group. */
2345 return vect_analyze_group_access (dr);
2350 /* A helper function used in the comparator function to sort data
2351 references. T1 and T2 are two data references to be compared.
2352 The function returns -1, 0, or 1. */
2354 static int
2355 compare_tree (tree t1, tree t2)
2357 int i, cmp;
2358 enum tree_code code;
2359 char tclass;
2361 if (t1 == t2)
2362 return 0;
2363 if (t1 == NULL)
2364 return -1;
2365 if (t2 == NULL)
2366 return 1;
2369 if (TREE_CODE (t1) != TREE_CODE (t2))
2370 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2372 code = TREE_CODE (t1);
2373 switch (code)
2375 /* For const values, we can just use hash values for comparisons. */
2376 case INTEGER_CST:
2377 case REAL_CST:
2378 case FIXED_CST:
2379 case STRING_CST:
2380 case COMPLEX_CST:
2381 case VECTOR_CST:
2383 hashval_t h1 = iterative_hash_expr (t1, 0);
2384 hashval_t h2 = iterative_hash_expr (t2, 0);
2385 if (h1 != h2)
2386 return h1 < h2 ? -1 : 1;
2387 break;
2390 case SSA_NAME:
2391 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2392 if (cmp != 0)
2393 return cmp;
2395 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2396 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2397 break;
2399 default:
2400 tclass = TREE_CODE_CLASS (code);
2402 /* For var-decl, we could compare their UIDs. */
2403 if (tclass == tcc_declaration)
2405 if (DECL_UID (t1) != DECL_UID (t2))
2406 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2407 break;
2410 /* For expressions with operands, compare their operands recursively. */
2411 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2413 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2414 if (cmp != 0)
2415 return cmp;
2419 return 0;
2423 /* Compare two data-references DRA and DRB so that they can be sorted
2424 into chunks suitable for grouping. */
2426 static int
2427 dr_group_sort_cmp (const void *dra_, const void *drb_)
2429 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2430 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2431 int cmp;
2433 /* Stabilize sort. */
2434 if (dra == drb)
2435 return 0;
2437 /* Ordering of DRs according to base. */
2438 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2440 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2441 if (cmp != 0)
2442 return cmp;
2445 /* And according to DR_OFFSET. */
2446 if (!dr_equal_offsets_p (dra, drb))
2448 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2449 if (cmp != 0)
2450 return cmp;
2453 /* Put reads before writes. */
2454 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2455 return DR_IS_READ (dra) ? -1 : 1;
2457 /* Then sort by access size. */
2458 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2459 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2461 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2462 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2463 if (cmp != 0)
2464 return cmp;
2467 /* And by step. */
2468 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2470 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2471 if (cmp != 0)
2472 return cmp;
2475 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2476 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2477 if (cmp == 0)
2478 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2479 return cmp;
2482 /* Function vect_analyze_data_ref_accesses.
2484 Analyze the access pattern of all the data references in the loop.
2486 FORNOW: the only access pattern that is considered vectorizable is a
2487 simple step 1 (consecutive) access.
2489 FORNOW: handle only arrays and pointer accesses. */
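/* For example, the two stores in

     for (i = 0; i < n; i++)
       {
         a[2*i] = x;
         a[2*i+1] = y;
       }

   share base, offset and step and differ in DR_INIT by exactly one
   element, so the loop below chains them into a single interleaving
   group of size 2 headed by the first store.  */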
2491 bool
2492 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2494 unsigned int i;
2495 vec<data_reference_p> datarefs;
2496 struct data_reference *dr;
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "=== vect_analyze_data_ref_accesses ===\n");
2502 if (loop_vinfo)
2503 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2504 else
2505 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2507 if (datarefs.is_empty ())
2508 return true;
2510 /* Sort the array of datarefs to make building the interleaving chains
2511 linear. Don't modify the original vector's order, it is needed for
2512 determining what dependencies are reversed. */
2513 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2514 datarefs_copy.qsort (dr_group_sort_cmp);
2516 /* Build the interleaving chains. */
2517 for (i = 0; i < datarefs_copy.length () - 1;)
2519 data_reference_p dra = datarefs_copy[i];
2520 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2521 stmt_vec_info lastinfo = NULL;
2522 for (i = i + 1; i < datarefs_copy.length (); ++i)
2524 data_reference_p drb = datarefs_copy[i];
2525 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2527 /* ??? Imperfect sorting (non-compatible types, non-modulo
2528 accesses, same accesses) can lead to a group being artificially
2529 split here as we don't just skip over those. If it really
2530 matters we can push those to a worklist and re-iterate
2531 over them. Then we can just skip ahead to the next DR here. */
2533 /* Check that the data-refs have the same first location (except init)
2534 and that they are both either stores or loads (not a load and a store,
2535 and not masked loads or stores). */
2536 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2537 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2538 DR_BASE_ADDRESS (drb), 0)
2539 || !dr_equal_offsets_p (dra, drb)
2540 || !gimple_assign_single_p (DR_STMT (dra))
2541 || !gimple_assign_single_p (DR_STMT (drb)))
2542 break;
2544 /* Check that the data-refs have the same constant size. */
2545 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2546 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2547 if (!tree_fits_uhwi_p (sza)
2548 || !tree_fits_uhwi_p (szb)
2549 || !tree_int_cst_equal (sza, szb))
2550 break;
2552 /* Check that the data-refs have the same step. */
2553 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2554 break;
2556 /* Do not place the same access in the interleaving chain twice. */
2557 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2558 break;
2560 /* Check the types are compatible.
2561 ??? We don't distinguish this during sorting. */
2562 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2563 TREE_TYPE (DR_REF (drb))))
2564 break;
2566 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2567 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2568 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2569 gcc_assert (init_a < init_b);
2571 /* If init_b == init_a + the size of the type * k, we have an
2572 interleaving, and DRA is accessed before DRB. */
2573 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2574 if ((init_b - init_a) % type_size_a != 0)
2575 break;
2577 /* If we have a store, the accesses must be adjacent. This splits
2578 groups into chunks we support (we don't support vectorization
2579 of stores with gaps). */
2580 if (!DR_IS_READ (dra)
2581 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2582 (DR_INIT (datarefs_copy[i-1]))
2583 != type_size_a))
2584 break;
2586 /* If the step (when it is a non-zero constant) is greater than the
2587 difference between the data-refs' inits, this splits groups into
2588 suitable sizes. */
2589 if (tree_fits_shwi_p (DR_STEP (dra)))
2591 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2592 if (step != 0 && step <= (init_b - init_a))
2593 break;
2596 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_NOTE, vect_location,
2599 "Detected interleaving ");
2600 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2601 dump_printf (MSG_NOTE, " and ");
2602 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2603 dump_printf (MSG_NOTE, "\n");
2606 /* Link the found element into the group list. */
2607 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2609 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2610 lastinfo = stmtinfo_a;
2612 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2613 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2614 lastinfo = stmtinfo_b;
2618 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2619 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2620 && !vect_analyze_data_ref_access (dr))
2622 if (dump_enabled_p ())
2623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624 "not vectorized: complicated access pattern.\n");
2626 if (bb_vinfo)
2628 /* Mark the statement as not vectorizable. */
2629 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2630 continue;
2632 else
2634 datarefs_copy.release ();
2635 return false;
2639 datarefs_copy.release ();
2640 return true;
2644 /* Operator == between two dr_with_seg_len objects.
2646 This equality operator is used to make sure two data refs
2647 are the same one so that we will consider to combine the
2648 aliasing checks of those two pairs of data dependent data
2649 refs. */
2651 static bool
2652 operator == (const dr_with_seg_len& d1,
2653 const dr_with_seg_len& d2)
2655 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2656 DR_BASE_ADDRESS (d2.dr), 0)
2657 && compare_tree (d1.offset, d2.offset) == 0
2658 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2661 /* Function comp_dr_with_seg_len_pair.
2663 Comparison function for sorting objects of dr_with_seg_len_pair_t
2664 so that we can combine aliasing checks in one scan. */
2666 static int
2667 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2669 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2670 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2672 const dr_with_seg_len &p11 = p1->first,
2673 &p12 = p1->second,
2674 &p21 = p2->first,
2675 &p22 = p2->second;
2677 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2678 if a and c have the same base address and step, and b and d have the same
2679 base address and step. Therefore, if either a&c or b&d don't have the same
2680 address and step, we don't care about the order of those two pairs after sorting. */
2681 int comp_res;
2683 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2684 DR_BASE_ADDRESS (p21.dr))) != 0)
2685 return comp_res;
2686 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2687 DR_BASE_ADDRESS (p22.dr))) != 0)
2688 return comp_res;
2689 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2690 return comp_res;
2691 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2692 return comp_res;
2693 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2694 return comp_res;
2695 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2696 return comp_res;
2698 return 0;
2701 /* Function vect_vfa_segment_size.
2703 Create an expression that computes the size of segment
2704 that will be accessed for a data reference. The function takes into
2705 account that realignment loads may access one more vector.
2707 Input:
2708 DR: The data reference.
2709 LENGTH_FACTOR: segment length to consider.
2711 Return an expression whose value is the size of segment which will be
2712 accessed by DR. */
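/* For instance, with a DR_STEP of 4 bytes and LENGTH_FACTOR equal to a
   vectorization factor of 4, the segment length is 4 * 4 = 16 bytes;
   if the access uses the optimized explicit realignment scheme, one
   extra vector size is added on top of that.  */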
2714 static tree
2715 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2717 tree segment_length;
2719 if (integer_zerop (DR_STEP (dr)))
2720 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2721 else
2722 segment_length = size_binop (MULT_EXPR,
2723 fold_convert (sizetype, DR_STEP (dr)),
2724 fold_convert (sizetype, length_factor));
2726 if (vect_supportable_dr_alignment (dr, false)
2727 == dr_explicit_realign_optimized)
2729 tree vector_size = TYPE_SIZE_UNIT
2730 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2732 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2734 return segment_length;
2737 /* Function vect_prune_runtime_alias_test_list.
2739 Prune a list of ddrs to be tested at run-time by versioning for alias.
2740 Merge several alias checks into one if possible.
2741 Return FALSE if the resulting list of ddrs is longer than allowed by
2742 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2744 bool
2745 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2747 vec<ddr_p> may_alias_ddrs =
2748 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2749 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2750 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2751 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2752 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2754 ddr_p ddr;
2755 unsigned int i;
2756 tree length_factor;
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_NOTE, vect_location,
2760 "=== vect_prune_runtime_alias_test_list ===\n");
2762 if (may_alias_ddrs.is_empty ())
2763 return true;
2765 /* Basically, for each pair of dependent data refs store_ptr_0
2766 and load_ptr_0, we create an expression:
2768 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2769 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2771 for aliasing checks. However, in some cases we can decrease
2772 the number of checks by combining two checks into one. For
2773 example, suppose we have another pair of data refs store_ptr_0
2774 and load_ptr_1, and if the following condition is satisfied:
2776 load_ptr_0 < load_ptr_1 &&
2777 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2779 (this condition means that, in each iteration of the vectorized loop,
2780 the memory accessed through store_ptr_0 cannot lie between the memory
2781 of load_ptr_0 and load_ptr_1.)
2783 we can then use only the following expression to finish the
2784 aliasing checks between store_ptr_0 & load_ptr_0 and
2785 store_ptr_0 & load_ptr_1:
2787 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2788 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2790 Note that we only consider the case in which load_ptr_0 and load_ptr_1
2791 have the same base address. */
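/* As a small worked case, assuming 16-byte segment lengths: a store to
   a[i] is checked against loads from b[i] and b[i + 4] (4-byte
   elements).  The two loads share a base address and their offsets
   differ by 16, which does not exceed the store segment, so the two
   checks are merged below into one check of a[] against a combined
   32-byte range of b[].  */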
2793 comp_alias_ddrs.create (may_alias_ddrs.length ());
2795 /* First, we collect all data ref pairs for aliasing checks. */
2796 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2798 struct data_reference *dr_a, *dr_b;
2799 gimple dr_group_first_a, dr_group_first_b;
2800 tree segment_length_a, segment_length_b;
2801 gimple stmt_a, stmt_b;
2803 dr_a = DDR_A (ddr);
2804 stmt_a = DR_STMT (DDR_A (ddr));
2805 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2806 if (dr_group_first_a)
2808 stmt_a = dr_group_first_a;
2809 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2812 dr_b = DDR_B (ddr);
2813 stmt_b = DR_STMT (DDR_B (ddr));
2814 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2815 if (dr_group_first_b)
2817 stmt_b = dr_group_first_b;
2818 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2821 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2822 length_factor = scalar_loop_iters;
2823 else
2824 length_factor = size_int (vect_factor);
2825 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2826 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2828 dr_with_seg_len_pair_t dr_with_seg_len_pair
2829 (dr_with_seg_len (dr_a, segment_length_a),
2830 dr_with_seg_len (dr_b, segment_length_b));
2832 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2833 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2835 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2838 /* Second, we sort the collected data ref pairs so that we can scan
2839 them once to combine all possible aliasing checks. */
2840 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2842 /* Third, we scan the sorted dr pairs and check if we can combine
2843 alias checks of two neighbouring dr pairs. */
2844 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2846 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2847 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2848 *dr_b1 = &comp_alias_ddrs[i-1].second,
2849 *dr_a2 = &comp_alias_ddrs[i].first,
2850 *dr_b2 = &comp_alias_ddrs[i].second;
2852 /* Remove duplicate data ref pairs. */
2853 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2855 if (dump_enabled_p ())
2857 dump_printf_loc (MSG_NOTE, vect_location,
2858 "found equal ranges ");
2859 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2860 DR_REF (dr_a1->dr));
2861 dump_printf (MSG_NOTE, ", ");
2862 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2863 DR_REF (dr_b1->dr));
2864 dump_printf (MSG_NOTE, " and ");
2865 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2866 DR_REF (dr_a2->dr));
2867 dump_printf (MSG_NOTE, ", ");
2868 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2869 DR_REF (dr_b2->dr));
2870 dump_printf (MSG_NOTE, "\n");
2873 comp_alias_ddrs.ordered_remove (i--);
2874 continue;
2877 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2879 /* We consider the case in which DR_B1 and DR_B2 are the same memref,
2880 and DR_A1 and DR_A2 are two consecutive memrefs. */
2881 if (*dr_a1 == *dr_a2)
2883 std::swap (dr_a1, dr_b1);
2884 std::swap (dr_a2, dr_b2);
2887 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2888 DR_BASE_ADDRESS (dr_a2->dr),
2890 || !tree_fits_shwi_p (dr_a1->offset)
2891 || !tree_fits_shwi_p (dr_a2->offset))
2892 continue;
2894 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2895 - tree_to_shwi (dr_a1->offset));
2898 /* Now we check if the following condition is satisfied:
2900 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2902 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2903 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2904 have to make a best estimation. We can get the minimum value
2905 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2906 then either of the following two conditions can guarantee the
2907 one above:
2909 1: DIFF <= MIN_SEG_LEN_B
2910 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
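/* E.g. with DIFF == 16, SEGMENT_LENGTH_A == 16 and MIN_SEG_LEN_B == 16,
   condition 1 already holds (16 <= 16), so the second check is dropped
   and the surviving segment length becomes DR_A2's length plus DIFF.  */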
2914 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2915 ? tree_to_shwi (dr_b1->seg_len)
2916 : vect_factor);
2918 if (diff <= min_seg_len_b
2919 || (tree_fits_shwi_p (dr_a1->seg_len)
2920 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2922 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_NOTE, vect_location,
2925 "merging ranges for ");
2926 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2927 DR_REF (dr_a1->dr));
2928 dump_printf (MSG_NOTE, ", ");
2929 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2930 DR_REF (dr_b1->dr));
2931 dump_printf (MSG_NOTE, " and ");
2932 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2933 DR_REF (dr_a2->dr));
2934 dump_printf (MSG_NOTE, ", ");
2935 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2936 DR_REF (dr_b2->dr));
2937 dump_printf (MSG_NOTE, "\n");
2940 dr_a1->seg_len = size_binop (PLUS_EXPR,
2941 dr_a2->seg_len, size_int (diff));
2942 comp_alias_ddrs.ordered_remove (i--);
2947 dump_printf_loc (MSG_NOTE, vect_location,
2948 "improved number of alias checks from %d to %d\n",
2949 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2950 if ((int) comp_alias_ddrs.length () >
2951 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2952 return false;
2954 return true;
2957 /* Check whether a non-affine read in stmt is suitable for gather load
2958 and if so, return a builtin decl for that operation. */
2960 tree
2961 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2962 tree *offp, int *scalep)
2964 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2965 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2966 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2967 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2968 tree offtype = NULL_TREE;
2969 tree decl, base, off;
2970 machine_mode pmode;
2971 int punsignedp, pvolatilep;
2973 base = DR_REF (dr);
2974 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2975 see if we can use the def stmt of the address. */
2976 if (is_gimple_call (stmt)
2977 && gimple_call_internal_p (stmt)
2978 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2979 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2980 && TREE_CODE (base) == MEM_REF
2981 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2982 && integer_zerop (TREE_OPERAND (base, 1))
2983 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2985 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2986 if (is_gimple_assign (def_stmt)
2987 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2988 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2991 /* The gather builtins need an address of the form
2992 loop_invariant + vector * {1, 2, 4, 8}
2993 or
2994 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2995 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2996 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2997 multiplications and additions in it. To get a vector, we need
2998 a single SSA_NAME that will be defined in the loop and will
2999 contain everything that is not loop invariant and that can be
3000 vectorized. The following code attempts to find such a preexisting
3001 SSA_NAME OFF and put the loop invariants into a tree BASE
3002 that can be gimplified before the loop. */
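/* A typical candidate, assuming the target provides a gather builtin,
   is the indirect load in

     for (i = 0; i < n; i++)
       sum += a[idx[i]];

   where BASE ends up as the loop-invariant address of a (plus any
   constant offset), OFF as the SSA_NAME carrying idx[i], and SCALE as
   the element size peeled off the multiplication.  */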
3003 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3004 &pmode, &punsignedp, &pvolatilep, false);
3005 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3007 if (TREE_CODE (base) == MEM_REF)
3009 if (!integer_zerop (TREE_OPERAND (base, 1)))
3011 if (off == NULL_TREE)
3013 offset_int moff = mem_ref_offset (base);
3014 off = wide_int_to_tree (sizetype, moff);
3016 else
3017 off = size_binop (PLUS_EXPR, off,
3018 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3020 base = TREE_OPERAND (base, 0);
3022 else
3023 base = build_fold_addr_expr (base);
3025 if (off == NULL_TREE)
3026 off = size_zero_node;
3028 /* If base is not loop invariant, then if off is 0 we start with just
3029 the constant offset in the loop invariant BASE and continue with base
3030 as OFF; otherwise give up.
3031 We could handle that case by gimplifying the addition of base + off
3032 into some SSA_NAME and using that as off, but for now punt. */
3033 if (!expr_invariant_in_loop_p (loop, base))
3035 if (!integer_zerop (off))
3036 return NULL_TREE;
3037 off = base;
3038 base = size_int (pbitpos / BITS_PER_UNIT);
3040 /* Otherwise put base + constant offset into the loop invariant BASE
3041 and continue with OFF. */
3042 else
3044 base = fold_convert (sizetype, base);
3045 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3048 /* OFF at this point may be either a SSA_NAME or some tree expression
3049 from get_inner_reference. Try to peel off loop invariants from it
3050 into BASE as long as possible. */
3051 STRIP_NOPS (off);
3052 while (offtype == NULL_TREE)
3054 enum tree_code code;
3055 tree op0, op1, add = NULL_TREE;
3057 if (TREE_CODE (off) == SSA_NAME)
3059 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3061 if (expr_invariant_in_loop_p (loop, off))
3062 return NULL_TREE;
3064 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3065 break;
3067 op0 = gimple_assign_rhs1 (def_stmt);
3068 code = gimple_assign_rhs_code (def_stmt);
3069 op1 = gimple_assign_rhs2 (def_stmt);
3071 else
3073 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3074 return NULL_TREE;
3075 code = TREE_CODE (off);
3076 extract_ops_from_tree (off, &code, &op0, &op1);
3078 switch (code)
3080 case POINTER_PLUS_EXPR:
3081 case PLUS_EXPR:
3082 if (expr_invariant_in_loop_p (loop, op0))
3084 add = op0;
3085 off = op1;
3086 do_add:
3087 add = fold_convert (sizetype, add);
3088 if (scale != 1)
3089 add = size_binop (MULT_EXPR, add, size_int (scale));
3090 base = size_binop (PLUS_EXPR, base, add);
3091 continue;
3093 if (expr_invariant_in_loop_p (loop, op1))
3095 add = op1;
3096 off = op0;
3097 goto do_add;
3099 break;
3100 case MINUS_EXPR:
3101 if (expr_invariant_in_loop_p (loop, op1))
3103 add = fold_convert (sizetype, op1);
3104 add = size_binop (MINUS_EXPR, size_zero_node, add);
3105 off = op0;
3106 goto do_add;
3108 break;
3109 case MULT_EXPR:
3110 if (scale == 1 && tree_fits_shwi_p (op1))
3112 scale = tree_to_shwi (op1);
3113 off = op0;
3114 continue;
3116 break;
3117 case SSA_NAME:
3118 off = op0;
3119 continue;
3120 CASE_CONVERT:
3121 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3122 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3123 break;
3124 if (TYPE_PRECISION (TREE_TYPE (op0))
3125 == TYPE_PRECISION (TREE_TYPE (off)))
3127 off = op0;
3128 continue;
3130 if (TYPE_PRECISION (TREE_TYPE (op0))
3131 < TYPE_PRECISION (TREE_TYPE (off)))
3133 off = op0;
3134 offtype = TREE_TYPE (off);
3135 STRIP_NOPS (off);
3136 continue;
3138 break;
3139 default:
3140 break;
3142 break;
3145 /* If at the end OFF still isn't a SSA_NAME or isn't
3146 defined in the loop, punt. */
3147 if (TREE_CODE (off) != SSA_NAME
3148 || expr_invariant_in_loop_p (loop, off))
3149 return NULL_TREE;
3151 if (offtype == NULL_TREE)
3152 offtype = TREE_TYPE (off);
3154 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3155 offtype, scale);
3156 if (decl == NULL_TREE)
3157 return NULL_TREE;
3159 if (basep)
3160 *basep = base;
3161 if (offp)
3162 *offp = off;
3163 if (scalep)
3164 *scalep = scale;
3165 return decl;
3168 /* Function vect_analyze_data_refs.
3170 Find all the data references in the loop or basic block.
3172 The general structure of the analysis of data refs in the vectorizer is as
3173 follows:
3174 1- vect_analyze_data_refs(loop/bb): call
3175 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3176 in the loop/bb and their dependences.
3177 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3178 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3179 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
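/* For a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   this records three data references (two reads and one write), each
   with its base address, offset, init and step, which the later phases
   then check for dependences, alignment and access pattern.  */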
3183 bool
3184 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3185 bb_vec_info bb_vinfo,
3186 int *min_vf, unsigned *n_stmts)
3188 struct loop *loop = NULL;
3189 basic_block bb = NULL;
3190 unsigned int i;
3191 vec<data_reference_p> datarefs;
3192 struct data_reference *dr;
3193 tree scalar_type;
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_NOTE, vect_location,
3197 "=== vect_analyze_data_refs ===\n");
3199 if (loop_vinfo)
3201 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3203 loop = LOOP_VINFO_LOOP (loop_vinfo);
3204 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3205 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3209 "not vectorized: loop contains function calls"
3210 " or data references that cannot be analyzed\n");
3211 return false;
3214 for (i = 0; i < loop->num_nodes; i++)
3216 gimple_stmt_iterator gsi;
3218 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3220 gimple stmt = gsi_stmt (gsi);
3221 if (is_gimple_debug (stmt))
3222 continue;
3223 ++*n_stmts;
3224 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3226 if (is_gimple_call (stmt) && loop->safelen)
3228 tree fndecl = gimple_call_fndecl (stmt), op;
3229 if (fndecl != NULL_TREE)
3231 struct cgraph_node *node = cgraph_node::get (fndecl);
3232 if (node != NULL && node->simd_clones != NULL)
3234 unsigned int j, n = gimple_call_num_args (stmt);
3235 for (j = 0; j < n; j++)
3237 op = gimple_call_arg (stmt, j);
3238 if (DECL_P (op)
3239 || (REFERENCE_CLASS_P (op)
3240 && get_base_address (op)))
3241 break;
3243 op = gimple_call_lhs (stmt);
3244 /* Ignore #pragma omp declare simd functions
3245 if they don't have data references in the
3246 call stmt itself. */
3247 if (j == n
3248 && !(op
3249 && (DECL_P (op)
3250 || (REFERENCE_CLASS_P (op)
3251 && get_base_address (op)))))
3252 continue;
3256 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3259 "not vectorized: loop contains function "
3260 "calls or data references that cannot "
3261 "be analyzed\n");
3262 return false;
3267 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3269 else
3271 gimple_stmt_iterator gsi;
3273 bb = BB_VINFO_BB (bb_vinfo);
3274 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3276 gimple stmt = gsi_stmt (gsi);
3277 if (is_gimple_debug (stmt))
3278 continue;
3279 ++*n_stmts;
3280 if (!find_data_references_in_stmt (NULL, stmt,
3281 &BB_VINFO_DATAREFS (bb_vinfo)))
3283 /* Mark the rest of the basic-block as unvectorizable. */
3284 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3286 stmt = gsi_stmt (gsi);
3287 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3289 break;
3293 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3296 /* Go through the data-refs, check that the analysis succeeded. Update
3297 pointer from stmt_vec_info struct to DR and vectype. */
3299 FOR_EACH_VEC_ELT (datarefs, i, dr)
3301 gimple stmt;
3302 stmt_vec_info stmt_info;
3303 tree base, offset, init;
3304 bool gather = false;
3305 bool simd_lane_access = false;
3306 int vf;
3308 again:
3309 if (!dr || !DR_REF (dr))
3311 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3313 "not vectorized: unhandled data-ref\n");
3314 return false;
3317 stmt = DR_STMT (dr);
3318 stmt_info = vinfo_for_stmt (stmt);
3320 /* Discard clobbers from the dataref vector. We will remove
3321 clobber stmts during vectorization. */
3322 if (gimple_clobber_p (stmt))
3324 free_data_ref (dr);
3325 if (i == datarefs.length () - 1)
3327 datarefs.pop ();
3328 break;
3330 datarefs.ordered_remove (i);
3331 dr = datarefs[i];
3332 goto again;
3335 /* Check that analysis of the data-ref succeeded. */
3336 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3337 || !DR_STEP (dr))
3339 bool maybe_gather
3340 = DR_IS_READ (dr)
3341 && !TREE_THIS_VOLATILE (DR_REF (dr))
3342 && targetm.vectorize.builtin_gather != NULL;
3343 bool maybe_simd_lane_access
3344 = loop_vinfo && loop->simduid;
3346 /* If the target supports vector gather loads, or if this might be
3347 a SIMD lane access, see if one of them can be used. */
3348 if (loop_vinfo
3349 && (maybe_gather || maybe_simd_lane_access)
3350 && !nested_in_vect_loop_p (loop, stmt))
3352 struct data_reference *newdr
3353 = create_data_ref (NULL, loop_containing_stmt (stmt),
3354 DR_REF (dr), stmt, true);
3355 gcc_assert (newdr != NULL && DR_REF (newdr));
3356 if (DR_BASE_ADDRESS (newdr)
3357 && DR_OFFSET (newdr)
3358 && DR_INIT (newdr)
3359 && DR_STEP (newdr)
3360 && integer_zerop (DR_STEP (newdr)))
3362 if (maybe_simd_lane_access)
3364 tree off = DR_OFFSET (newdr);
3365 STRIP_NOPS (off);
3366 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3367 && TREE_CODE (off) == MULT_EXPR
3368 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3370 tree step = TREE_OPERAND (off, 1);
3371 off = TREE_OPERAND (off, 0);
3372 STRIP_NOPS (off);
3373 if (CONVERT_EXPR_P (off)
3374 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3375 0)))
3376 < TYPE_PRECISION (TREE_TYPE (off)))
3377 off = TREE_OPERAND (off, 0);
3378 if (TREE_CODE (off) == SSA_NAME)
3380 gimple def = SSA_NAME_DEF_STMT (off);
3381 tree reft = TREE_TYPE (DR_REF (newdr));
3382 if (is_gimple_call (def)
3383 && gimple_call_internal_p (def)
3384 && (gimple_call_internal_fn (def)
3385 == IFN_GOMP_SIMD_LANE))
3387 tree arg = gimple_call_arg (def, 0);
3388 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3389 arg = SSA_NAME_VAR (arg);
3390 if (arg == loop->simduid
3391 /* For now. */
3392 && tree_int_cst_equal
3393 (TYPE_SIZE_UNIT (reft),
3394 step))
3396 DR_OFFSET (newdr) = ssize_int (0);
3397 DR_STEP (newdr) = step;
3398 DR_ALIGNED_TO (newdr)
3399 = size_int (BIGGEST_ALIGNMENT);
3400 dr = newdr;
3401 simd_lane_access = true;
3407 if (!simd_lane_access && maybe_gather)
3409 dr = newdr;
3410 gather = true;
3413 if (!gather && !simd_lane_access)
3414 free_data_ref (newdr);
3417 if (!gather && !simd_lane_access)
3419 if (dump_enabled_p ())
3421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3422 "not vectorized: data ref analysis "
3423 "failed ");
3424 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3425 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3428 if (bb_vinfo)
3429 break;
3431 return false;
3435 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3437 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3439 "not vectorized: base addr of dr is a "
3440 "constant\n");
3442 if (bb_vinfo)
3443 break;
3445 if (gather || simd_lane_access)
3446 free_data_ref (dr);
3447 return false;
3450 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3452 if (dump_enabled_p ())
3454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3455 "not vectorized: volatile type ");
3456 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3457 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3460 if (bb_vinfo)
3461 break;
3463 return false;
3466 if (stmt_can_throw_internal (stmt))
3468 if (dump_enabled_p ())
3470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3471 "not vectorized: statement can throw an "
3472 "exception ");
3473 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3474 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3477 if (bb_vinfo)
3478 break;
3480 if (gather || simd_lane_access)
3481 free_data_ref (dr);
3482 return false;
3485 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3486 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3488 if (dump_enabled_p ())
3490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3491 "not vectorized: statement is bitfield "
3492 "access ");
3493 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3494 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3497 if (bb_vinfo)
3498 break;
3500 if (gather || simd_lane_access)
3501 free_data_ref (dr);
3502 return false;
3505 base = unshare_expr (DR_BASE_ADDRESS (dr));
3506 offset = unshare_expr (DR_OFFSET (dr));
3507 init = unshare_expr (DR_INIT (dr));
3509 if (is_gimple_call (stmt)
3510 && (!gimple_call_internal_p (stmt)
3511 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3512 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3514 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3517 "not vectorized: dr in a call ");
3518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3519 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3522 if (bb_vinfo)
3523 break;
3525 if (gather || simd_lane_access)
3526 free_data_ref (dr);
3527 return false;
3530 /* Update DR field in stmt_vec_info struct. */
3532 /* If the dataref is in an inner-loop of the loop that is considered
3533 for vectorization, we also want to analyze the access relative to
3534 the outer-loop (DR contains information only relative to the
3535 inner-most enclosing loop). We do that by building a reference to the
3536 first location accessed by the inner-loop, and analyze it relative to
3537 the outer-loop. */
3538 if (loop && nested_in_vect_loop_p (loop, stmt))
3540 tree outer_step, outer_base, outer_init;
3541 HOST_WIDE_INT pbitsize, pbitpos;
3542 tree poffset;
3543 machine_mode pmode;
3544 int punsignedp, pvolatilep;
3545 affine_iv base_iv, offset_iv;
3546 tree dinit;
3548 /* Build a reference to the first location accessed by the
3549 inner-loop: *(BASE+INIT). (The first location is actually
3550 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3551 tree inner_base = build_fold_indirect_ref
3552 (fold_build_pointer_plus (base, init));
3554 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_NOTE, vect_location,
3557 "analyze in outer-loop: ");
3558 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3559 dump_printf (MSG_NOTE, "\n");
3562 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3563 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3564 gcc_assert (outer_base != NULL_TREE);
3566 if (pbitpos % BITS_PER_UNIT != 0)
3568 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570 "failed: bit offset alignment.\n");
3571 return false;
3574 outer_base = build_fold_addr_expr (outer_base);
3575 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3576 &base_iv, false))
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3580 "failed: evolution of base is not affine.\n");
3581 return false;
3584 if (offset)
3586 if (poffset)
3587 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3588 poffset);
3589 else
3590 poffset = offset;
3593 if (!poffset)
3595 offset_iv.base = ssize_int (0);
3596 offset_iv.step = ssize_int (0);
3598 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3599 &offset_iv, false))
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "evolution of offset is not affine.\n");
3604 return false;
3607 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3608 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3609 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3610 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3611 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3613 outer_step = size_binop (PLUS_EXPR,
3614 fold_convert (ssizetype, base_iv.step),
3615 fold_convert (ssizetype, offset_iv.step));
3617 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3618 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3619 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3620 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3621 STMT_VINFO_DR_OFFSET (stmt_info) =
3622 fold_convert (ssizetype, offset_iv.base);
3623 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3624 size_int (highest_pow2_factor (offset_iv.base));
3626 if (dump_enabled_p ())
3628 dump_printf_loc (MSG_NOTE, vect_location,
3629 "\touter base_address: ");
3630 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3631 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3632 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3633 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3634 STMT_VINFO_DR_OFFSET (stmt_info));
3635 dump_printf (MSG_NOTE,
3636 "\n\touter constant offset from base address: ");
3637 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3638 STMT_VINFO_DR_INIT (stmt_info));
3639 dump_printf (MSG_NOTE, "\n\touter step: ");
3640 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3641 STMT_VINFO_DR_STEP (stmt_info));
3642 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3643 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3644 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3645 dump_printf (MSG_NOTE, "\n");
3649 if (STMT_VINFO_DATA_REF (stmt_info))
3651 if (dump_enabled_p ())
3653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3654 "not vectorized: more than one data ref "
3655 "in stmt: ");
3656 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3657 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3660 if (bb_vinfo)
3661 break;
3663 if (gather || simd_lane_access)
3664 free_data_ref (dr);
3665 return false;
3668 STMT_VINFO_DATA_REF (stmt_info) = dr;
3669 if (simd_lane_access)
3671 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3672 free_data_ref (datarefs[i]);
3673 datarefs[i] = dr;
3676 /* Set vectype for STMT. */
3677 scalar_type = TREE_TYPE (DR_REF (dr));
3678 STMT_VINFO_VECTYPE (stmt_info)
3679 = get_vectype_for_scalar_type (scalar_type);
3680 if (!STMT_VINFO_VECTYPE (stmt_info))
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3685 "not vectorized: no vectype for stmt: ");
3686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3687 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3688 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3689 scalar_type);
3690 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3693 if (bb_vinfo)
3694 break;
3696 if (gather || simd_lane_access)
3698 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3699 if (gather)
3700 free_data_ref (dr);
3702 return false;
3704 else
3706 if (dump_enabled_p ())
3708 dump_printf_loc (MSG_NOTE, vect_location,
3709 "got vectype for stmt: ");
3710 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3711 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3712 STMT_VINFO_VECTYPE (stmt_info));
3713 dump_printf (MSG_NOTE, "\n");
3717 /* Adjust the minimal vectorization factor according to the
3718 vector type. */
3719 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3720 if (vf > *min_vf)
3721 *min_vf = vf;
3723 if (gather)
3725 tree off;
3727 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3728 if (gather
3729 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3730 gather = false;
3731 if (!gather)
3733 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3734 free_data_ref (dr);
3735 if (dump_enabled_p ())
3737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3738 "not vectorized: not suitable for gather "
3739 "load ");
3740 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3741 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3743 return false;
3746 datarefs[i] = dr;
3747 STMT_VINFO_GATHER_P (stmt_info) = true;
3749 else if (loop_vinfo
3750 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3752 if (nested_in_vect_loop_p (loop, stmt))
3754 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 "not vectorized: not suitable for strided "
3758 "load ");
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3762 return false;
3764 STMT_VINFO_STRIDED_P (stmt_info) = true;
3768 /* If we stopped the analysis at the first dataref we could not analyze
3769 when trying to vectorize a basic block, mark the rest of the datarefs
3770 as not vectorizable and truncate the vector of datarefs. That
3771 avoids spending useless time analyzing their dependences. */
3772 if (i != datarefs.length ())
3774 gcc_assert (bb_vinfo != NULL);
3775 for (unsigned j = i; j < datarefs.length (); ++j)
3777 data_reference_p dr = datarefs[j];
3778 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3779 free_data_ref (dr);
3781 datarefs.truncate (i);
3784 return true;
3788 /* Function vect_get_new_vect_var.
3790 Returns a name for a new variable. The current naming scheme prepends
3791 the prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3792 to NAME, separated by an underscore, if NAME is provided; otherwise only
3793 the bare prefix is used. */
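/* Example (editor's illustration, derived from the code below): with
   NAME == "x" the three kinds yield temporaries named "vect_x",
   "stmp_x" and "vectp_x" respectively.  */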
3795 tree
3796 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3798 const char *prefix;
3799 tree new_vect_var;
3801 switch (var_kind)
3803 case vect_simple_var:
3804 prefix = "vect";
3805 break;
3806 case vect_scalar_var:
3807 prefix = "stmp";
3808 break;
3809 case vect_pointer_var:
3810 prefix = "vectp";
3811 break;
3812 default:
3813 gcc_unreachable ();
3816 if (name)
3818 char* tmp = concat (prefix, "_", name, NULL);
3819 new_vect_var = create_tmp_reg (type, tmp);
3820 free (tmp);
3822 else
3823 new_vect_var = create_tmp_reg (type, prefix);
3825 return new_vect_var;
3828 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3830 static void
3831 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3832 stmt_vec_info stmt_info)
3834 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3835 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3836 int misalign = DR_MISALIGNMENT (dr);
3837 if (misalign == -1)
3838 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3839 else
3840 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
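/* Editor's note (illustration, assuming the usual set_ptr_info_alignment
   semantics): for a V4SI vectype the unit alignment is typically 16, so a
   DR_MISALIGNMENT of 4 records that the pointer value is known to satisfy
   (addr % 16) == 4, while -1 leaves the alignment unknown.  */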
3843 /* Function vect_create_addr_base_for_vector_ref.
3845 Create an expression that computes the address of the first memory location
3846 that will be accessed for a data reference.
3848 Input:
3849 STMT: The statement containing the data reference.
3850 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3851 OFFSET: Optional. If supplied, it is added to the initial address.
3852 LOOP: Specify relative to which loop-nest the address should be computed.
3853 For example, when the dataref is in an inner-loop nested in an
3854 outer-loop that is now being vectorized, LOOP can be either the
3855 outer-loop, or the inner-loop. The first memory location accessed
3856 by the following dataref ('in' points to short):
3858 for (i=0; i<N; i++)
3859 for (j=0; j<M; j++)
3860 s += in[i+j]
3862 is as follows:
3863 if LOOP=i_loop: &in (relative to i_loop)
3864 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3865 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3866 initial address. Unlike OFFSET, which is a number of elements to
3867 be added, BYTE_OFFSET is measured in bytes.
3869 Output:
3870 1. Return an SSA_NAME whose value is the address of the memory location of
3871 the first vector of the data reference.
3872 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3873 these statement(s) which define the returned SSA_NAME.
3875 FORNOW: We are only handling array accesses with step 1. */
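/* Worked example (editor's illustration): for the dataref in[i+j] above,
   where 'in' points to short (element size 2), with DR_OFFSET and DR_INIT
   both zero, OFFSET 4 and BYTE_OFFSET 6, the address built below is

       addr_base = &in + (0 + 4*2 + 6) = &in + 14

   i.e. OFFSET is scaled by the element size first, while BYTE_OFFSET is
   added as-is.  */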
3877 tree
3878 vect_create_addr_base_for_vector_ref (gimple stmt,
3879 gimple_seq *new_stmt_list,
3880 tree offset,
3881 struct loop *loop,
3882 tree byte_offset)
3884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3886 tree data_ref_base;
3887 const char *base_name;
3888 tree addr_base;
3889 tree dest;
3890 gimple_seq seq = NULL;
3891 tree base_offset;
3892 tree init;
3893 tree vect_ptr_type;
3894 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3897 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3899 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3901 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3903 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3904 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3905 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3907 else
3909 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3910 base_offset = unshare_expr (DR_OFFSET (dr));
3911 init = unshare_expr (DR_INIT (dr));
3914 if (loop_vinfo)
3915 base_name = get_name (data_ref_base);
3916 else
3918 base_offset = ssize_int (0);
3919 init = ssize_int (0);
3920 base_name = get_name (DR_REF (dr));
3923 /* Create base_offset */
3924 base_offset = size_binop (PLUS_EXPR,
3925 fold_convert (sizetype, base_offset),
3926 fold_convert (sizetype, init));
3928 if (offset)
3930 offset = fold_build2 (MULT_EXPR, sizetype,
3931 fold_convert (sizetype, offset), step);
3932 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3933 base_offset, offset);
3935 if (byte_offset)
3937 byte_offset = fold_convert (sizetype, byte_offset);
3938 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3939 base_offset, byte_offset);
3942 /* base + base_offset */
3943 if (loop_vinfo)
3944 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3945 else
3947 addr_base = build1 (ADDR_EXPR,
3948 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3949 unshare_expr (DR_REF (dr)));
3952 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3953 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3954 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3955 gimple_seq_add_seq (new_stmt_list, seq);
3957 if (DR_PTR_INFO (dr)
3958 && TREE_CODE (addr_base) == SSA_NAME
3959 && !SSA_NAME_PTR_INFO (addr_base))
3961 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3962 if (offset || byte_offset)
3963 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3966 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3969 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3970 dump_printf (MSG_NOTE, "\n");
3973 return addr_base;
3977 /* Function vect_create_data_ref_ptr.
3979 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3980 location accessed in the loop by STMT, along with the def-use update
3981 chain to appropriately advance the pointer through the loop iterations.
3982 Also set aliasing information for the pointer. This pointer is used by
3983 the callers to this function to create a memory reference expression for
3984 vector load/store access.
3986 Input:
3987 1. STMT: a stmt that references memory. Expected to be of the form
3988 GIMPLE_ASSIGN <name, data-ref> or
3989 GIMPLE_ASSIGN <data-ref, name>.
3990 2. AGGR_TYPE: the type of the reference, which should be either a vector
3991 or an array.
3992 3. AT_LOOP: the loop where the vector memref is to be created.
3993 4. OFFSET (optional): an offset to be added to the initial address accessed
3994 by the data-ref in STMT.
3995 5. BSI: location where the new stmts are to be placed if there is no loop
3996 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remains
3997 pointing to the initial address.
3998 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3999 to the initial address accessed by the data-ref in STMT. This is
4000 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4001 in bytes.
4003 Output:
4004 1. Declare a new ptr to vector_type, and have it point to the base of the
4005 data reference (the initial address accessed by the data reference).
4006 For example, for vector of type V8HI, the following code is generated:
4008 v8hi *ap;
4009 ap = (v8hi *)initial_address;
4011 if OFFSET is not supplied:
4012 initial_address = &a[init];
4013 if OFFSET is supplied:
4014 initial_address = &a[init + OFFSET];
4015 if BYTE_OFFSET is supplied:
4016 initial_address = &a[init] + BYTE_OFFSET;
4018 Return the initial_address in INITIAL_ADDRESS.
4020 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4021 update the pointer in each iteration of the loop.
4023 Return the increment stmt that updates the pointer in PTR_INCR.
4025 3. Set INV_P to true if the access pattern of the data reference in the
4026 vectorized loop is invariant. Set it to false otherwise.
4028 4. Return the pointer. */
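/* Editor's sketch of the typical use by the callers mentioned above
   (illustrative pseudo-code, not actual GIMPLE syntax):

       ap = vect_create_data_ref_ptr (stmt, vectype, loop, ...);
       vect0 = *ap;                 # first vector of the group
       ap' = bump_vector_ptr (ap, ptr_incr, gsi, stmt, NULL_TREE);
       vect1 = *ap';                # next vector, vector_size bytes on

   with the cross-iteration update handled by the IV created in steps
   (3)-(5) below.  */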
4030 tree
4031 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4032 tree offset, tree *initial_address,
4033 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4034 bool only_init, bool *inv_p, tree byte_offset)
4036 const char *base_name;
4037 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4038 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4039 struct loop *loop = NULL;
4040 bool nested_in_vect_loop = false;
4041 struct loop *containing_loop = NULL;
4042 tree aggr_ptr_type;
4043 tree aggr_ptr;
4044 tree new_temp;
4045 gimple_seq new_stmt_list = NULL;
4046 edge pe = NULL;
4047 basic_block new_bb;
4048 tree aggr_ptr_init;
4049 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4050 tree aptr;
4051 gimple_stmt_iterator incr_gsi;
4052 bool insert_after;
4053 tree indx_before_incr, indx_after_incr;
4054 gimple incr;
4055 tree step;
4056 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4058 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4059 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4061 if (loop_vinfo)
4063 loop = LOOP_VINFO_LOOP (loop_vinfo);
4064 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4065 containing_loop = (gimple_bb (stmt))->loop_father;
4066 pe = loop_preheader_edge (loop);
4068 else
4070 gcc_assert (bb_vinfo);
4071 only_init = true;
4072 *ptr_incr = NULL;
4075 /* Check the step (evolution) of the load in LOOP, and record
4076 whether it's invariant. */
4077 if (nested_in_vect_loop)
4078 step = STMT_VINFO_DR_STEP (stmt_info);
4079 else
4080 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4082 if (integer_zerop (step))
4083 *inv_p = true;
4084 else
4085 *inv_p = false;
4087 /* Create an expression for the first address accessed by this load
4088 in LOOP. */
4089 base_name = get_name (DR_BASE_ADDRESS (dr));
4091 if (dump_enabled_p ())
4093 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4094 dump_printf_loc (MSG_NOTE, vect_location,
4095 "create %s-pointer variable to type: ",
4096 get_tree_code_name (TREE_CODE (aggr_type)));
4097 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4098 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4099 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4100 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4101 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4102 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4103 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4104 else
4105 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4106 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4107 dump_printf (MSG_NOTE, "\n");
4110 /* (1) Create the new aggregate-pointer variable.
4111 Vector and array types inherit the alias set of their component
4112 type by default so we need to use a ref-all pointer if the data
4113 reference does not conflict with the created aggregated data
4114 reference because it is not addressable. */
4115 bool need_ref_all = false;
4116 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4117 get_alias_set (DR_REF (dr))))
4118 need_ref_all = true;
4119 /* Likewise for any of the data references in the stmt group. */
4120 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4122 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4125 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4126 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4127 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4128 get_alias_set (DR_REF (sdr))))
4130 need_ref_all = true;
4131 break;
4133 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4135 while (orig_stmt);
4137 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4138 need_ref_all);
4139 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4142 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4143 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4144 def-use update cycles for the pointer: one relative to the outer-loop
4145 (LOOP), which is what steps (3) and (4) below do. The other is relative
4146 to the inner-loop (which is the inner-most loop containing the dataref),
4147 and this is done by step (5) below.
4149 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4150 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4151 redundant. Steps (3),(4) create the following:
4153 vp0 = &base_addr;
4154 LOOP: vp1 = phi(vp0,vp2)
4157 vp2 = vp1 + step
4158 goto LOOP
4160 If there is an inner-loop nested in loop, then step (5) will also be
4161 applied, and an additional update in the inner-loop will be created:
4163 vp0 = &base_addr;
4164 LOOP: vp1 = phi(vp0,vp2)
4166 inner: vp3 = phi(vp1,vp4)
4167 vp4 = vp3 + inner_step
4168 if () goto inner
4170 vp2 = vp1 + step
4171 if () goto LOOP */
4173 /* (2) Calculate the initial address of the aggregate-pointer, and set
4174 the aggregate-pointer to point to it before the loop. */
4176 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4178 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4179 offset, loop, byte_offset);
4180 if (new_stmt_list)
4182 if (pe)
4184 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4185 gcc_assert (!new_bb);
4187 else
4188 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4191 *initial_address = new_temp;
4192 aggr_ptr_init = new_temp;
4194 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4195 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4196 inner-loop nested in LOOP (during outer-loop vectorization). */
4198 /* No update in loop is required. */
4199 if (only_init && (!loop_vinfo || at_loop == loop))
4200 aptr = aggr_ptr_init;
4201 else
4203 /* The step of the aggregate pointer is the type size. */
4204 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4205 /* One exception to the above is when the scalar step of the load in
4206 LOOP is zero. In this case the step here is also zero. */
4207 if (*inv_p)
4208 iv_step = size_zero_node;
4209 else if (tree_int_cst_sgn (step) == -1)
4210 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4212 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4214 create_iv (aggr_ptr_init,
4215 fold_convert (aggr_ptr_type, iv_step),
4216 aggr_ptr, loop, &incr_gsi, insert_after,
4217 &indx_before_incr, &indx_after_incr);
4218 incr = gsi_stmt (incr_gsi);
4219 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4221 /* Copy the points-to information if it exists. */
4222 if (DR_PTR_INFO (dr))
4224 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4225 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4227 if (ptr_incr)
4228 *ptr_incr = incr;
4230 aptr = indx_before_incr;
4233 if (!nested_in_vect_loop || only_init)
4234 return aptr;
4237 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4238 nested in LOOP, if it exists. */
4240 gcc_assert (nested_in_vect_loop);
4241 if (!only_init)
4243 standard_iv_increment_position (containing_loop, &incr_gsi,
4244 &insert_after);
4245 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4246 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4247 &indx_after_incr);
4248 incr = gsi_stmt (incr_gsi);
4249 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4251 /* Copy the points-to information if it exists. */
4252 if (DR_PTR_INFO (dr))
4254 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4255 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4257 if (ptr_incr)
4258 *ptr_incr = incr;
4260 return indx_before_incr;
4262 else
4263 gcc_unreachable ();
4267 /* Function bump_vector_ptr
4269 Increment a pointer (to a vector type) by vector-size. If requested,
4270 i.e. if PTR-INCR is given, then also connect the new increment stmt
4271 to the existing def-use update-chain of the pointer, by modifying
4272 the PTR_INCR as illustrated below:
4274 The pointer def-use update-chain before this function:
4275 DATAREF_PTR = phi (p_0, p_2)
4276 ....
4277 PTR_INCR: p_2 = DATAREF_PTR + step
4279 The pointer def-use update-chain after this function:
4280 DATAREF_PTR = phi (p_0, p_2)
4281 ....
4282 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4283 ....
4284 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4286 Input:
4287 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4288 in the loop.
4289 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4290 the loop. The increment amount across iterations is expected
4291 to be vector_size.
4292 BSI - location where the new update stmt is to be placed.
4293 STMT - the original scalar memory-access stmt that is being vectorized.
4294 BUMP - optional. The offset by which to bump the pointer. If not given,
4295 the offset is assumed to be vector_size.
4297 Output: Return NEW_DATAREF_PTR as illustrated above.
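/* Editor's example (illustrative numbers): for a V4SI access vector_size
   is 16 bytes, so with no explicit BUMP the statement created below is

       NEW_DATAREF_PTR = DATAREF_PTR + 16;

   and, when PTR_INCR is given, its use of DATAREF_PTR is redirected to
   NEW_DATAREF_PTR exactly as in the chain shown above.  */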
4301 tree
4302 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4303 gimple stmt, tree bump)
4305 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4306 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4307 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4308 tree update = TYPE_SIZE_UNIT (vectype);
4309 gassign *incr_stmt;
4310 ssa_op_iter iter;
4311 use_operand_p use_p;
4312 tree new_dataref_ptr;
4314 if (bump)
4315 update = bump;
4317 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4318 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4319 else
4320 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4321 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4322 dataref_ptr, update);
4323 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4325 /* Copy the points-to information if it exists. */
4326 if (DR_PTR_INFO (dr))
4328 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4329 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4332 if (!ptr_incr)
4333 return new_dataref_ptr;
4335 /* Update the vector-pointer's cross-iteration increment. */
4336 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4338 tree use = USE_FROM_PTR (use_p);
4340 if (use == dataref_ptr)
4341 SET_USE (use_p, new_dataref_ptr);
4342 else
4343 gcc_assert (tree_int_cst_compare (use, update) == 0);
4346 return new_dataref_ptr;
4350 /* Function vect_create_destination_var.
4352 Create a new temporary of type VECTYPE. */
4354 tree
4355 vect_create_destination_var (tree scalar_dest, tree vectype)
4357 tree vec_dest;
4358 const char *name;
4359 char *new_name;
4360 tree type;
4361 enum vect_var_kind kind;
4363 kind = vectype ? vect_simple_var : vect_scalar_var;
4364 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4366 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4368 name = get_name (scalar_dest);
4369 if (name)
4370 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4371 else
4372 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4373 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4374 free (new_name);
4376 return vec_dest;
4379 /* Function vect_grouped_store_supported.
4381 Returns TRUE if interleave high and interleave low permutations
4382 are supported, and FALSE otherwise. */
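/* Editor's illustration: for a power-of-two group and an 8-element vector
   mode the two masks checked below are

       interleave high: { 0, 8, 1, 9, 2, 10, 3, 11 }
       interleave low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. element 2*i of the result comes from the first input and element
   2*i+1 from the second.  */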
4384 bool
4385 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4387 machine_mode mode = TYPE_MODE (vectype);
4389 /* vect_permute_store_chain requires the group size to be equal to 3 or
4390 be a power of two. */
4391 if (count != 3 && exact_log2 (count) == -1)
4393 if (dump_enabled_p ())
4394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4395 "the size of the group of accesses"
4396 " is not a power of 2 or not eqaul to 3\n");
4397 return false;
4400 /* Check that the permutation is supported. */
4401 if (VECTOR_MODE_P (mode))
4403 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4404 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4406 if (count == 3)
4408 unsigned int j0 = 0, j1 = 0, j2 = 0;
4409 unsigned int i, j;
4411 for (j = 0; j < 3; j++)
4413 int nelt0 = ((3 - j) * nelt) % 3;
4414 int nelt1 = ((3 - j) * nelt + 1) % 3;
4415 int nelt2 = ((3 - j) * nelt + 2) % 3;
4416 for (i = 0; i < nelt; i++)
4418 if (3 * i + nelt0 < nelt)
4419 sel[3 * i + nelt0] = j0++;
4420 if (3 * i + nelt1 < nelt)
4421 sel[3 * i + nelt1] = nelt + j1++;
4422 if (3 * i + nelt2 < nelt)
4423 sel[3 * i + nelt2] = 0;
4425 if (!can_vec_perm_p (mode, false, sel))
4427 if (dump_enabled_p ())
4428 dump_printf (MSG_MISSED_OPTIMIZATION,
4429 "permutaion op not supported by target.\n");
4430 return false;
4433 for (i = 0; i < nelt; i++)
4435 if (3 * i + nelt0 < nelt)
4436 sel[3 * i + nelt0] = 3 * i + nelt0;
4437 if (3 * i + nelt1 < nelt)
4438 sel[3 * i + nelt1] = 3 * i + nelt1;
4439 if (3 * i + nelt2 < nelt)
4440 sel[3 * i + nelt2] = nelt + j2++;
4442 if (!can_vec_perm_p (mode, false, sel))
4444 if (dump_enabled_p ())
4445 dump_printf (MSG_MISSED_OPTIMIZATION,
4446 "permutaion op not supported by target.\n");
4447 return false;
4450 return true;
4452 else
4454 /* If length is not equal to 3 then only power of 2 is supported. */
4455 gcc_assert (exact_log2 (count) != -1);
4457 for (i = 0; i < nelt / 2; i++)
4459 sel[i * 2] = i;
4460 sel[i * 2 + 1] = i + nelt;
4462 if (can_vec_perm_p (mode, false, sel))
4464 for (i = 0; i < nelt; i++)
4465 sel[i] += nelt / 2;
4466 if (can_vec_perm_p (mode, false, sel))
4467 return true;
4472 if (dump_enabled_p ())
4473 dump_printf (MSG_MISSED_OPTIMIZATION,
4474 "permutaion op not supported by target.\n");
4475 return false;
4479 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4480 type VECTYPE. */
4482 bool
4483 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4485 return vect_lanes_optab_supported_p ("vec_store_lanes",
4486 vec_store_lanes_optab,
4487 vectype, count);
4491 /* Function vect_permute_store_chain.
4493 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4494 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4495 the data correctly for the stores. Return the final references for stores
4496 in RESULT_CHAIN.
4498 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4499 The input is 4 vectors each containing 8 elements. We assign a number to
4500 each element, the input sequence is:
4502 1st vec: 0 1 2 3 4 5 6 7
4503 2nd vec: 8 9 10 11 12 13 14 15
4504 3rd vec: 16 17 18 19 20 21 22 23
4505 4th vec: 24 25 26 27 28 29 30 31
4507 The output sequence should be:
4509 1st vec: 0 8 16 24 1 9 17 25
4510 2nd vec: 2 10 18 26 3 11 19 27
4511 3rd vec: 4 12 20 28 5 13 21 29
4512 4th vec: 6 14 22 30 7 15 23 31
4514 i.e., we interleave the contents of the four vectors in their order.
4516 We use interleave_high/low instructions to create such output. The input of
4517 each interleave_high/low operation is two vectors:
4518 1st vec 2nd vec
4519 0 1 2 3 4 5 6 7
4520 the even elements of the result vector are obtained left-to-right from the
4521 high/low elements of the first vector. The odd elements of the result are
4522 obtained left-to-right from the high/low elements of the second vector.
4523 The output of interleave_high will be: 0 4 1 5
4524 and of interleave_low: 2 6 3 7
4527 The permutation is done in log LENGTH stages. In each stage interleave_high
4528 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4529 where the first argument is taken from the first half of DR_CHAIN and the
4530 second argument from its second half.
4531 In our example,
4533 I1: interleave_high (1st vec, 3rd vec)
4534 I2: interleave_low (1st vec, 3rd vec)
4535 I3: interleave_high (2nd vec, 4th vec)
4536 I4: interleave_low (2nd vec, 4th vec)
4538 The output for the first stage is:
4540 I1: 0 16 1 17 2 18 3 19
4541 I2: 4 20 5 21 6 22 7 23
4542 I3: 8 24 9 25 10 26 11 27
4543 I4: 12 28 13 29 14 30 15 31
4545 The output of the second stage, i.e. the final result is:
4547 I1: 0 8 16 24 1 9 17 25
4548 I2: 2 10 18 26 3 11 19 27
4549 I3: 4 12 20 28 5 13 21 29
4550 I4: 6 14 22 30 7 15 23 31. */
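/* Editor's sketch (stand-alone illustration only, not part of GCC): on
   plain NELT-element arrays the interleave_high/low permutes generated
   below behave like

       static void
       interleave (const int *a, const int *b, int nelt,
                   int *high, int *low)
       {
         for (int i = 0; i < nelt / 2; i++)
           {
             high[2 * i] = a[i];              // mask 0, nelt, 1, nelt+1, ...
             high[2 * i + 1] = b[i];
             low[2 * i] = a[nelt / 2 + i];    // mask nelt/2, 3*nelt/2, ...
             low[2 * i + 1] = b[nelt / 2 + i];
           }
       }

   applied log2 (LENGTH) times as described above.  */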
4552 void
4553 vect_permute_store_chain (vec<tree> dr_chain,
4554 unsigned int length,
4555 gimple stmt,
4556 gimple_stmt_iterator *gsi,
4557 vec<tree> *result_chain)
4559 tree vect1, vect2, high, low;
4560 gimple perm_stmt;
4561 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4562 tree perm_mask_low, perm_mask_high;
4563 tree data_ref;
4564 tree perm3_mask_low, perm3_mask_high;
4565 unsigned int i, n, log_length = exact_log2 (length);
4566 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4567 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4569 result_chain->quick_grow (length);
4570 memcpy (result_chain->address (), dr_chain.address (),
4571 length * sizeof (tree));
4573 if (length == 3)
4575 unsigned int j0 = 0, j1 = 0, j2 = 0;
4577 for (j = 0; j < 3; j++)
4579 int nelt0 = ((3 - j) * nelt) % 3;
4580 int nelt1 = ((3 - j) * nelt + 1) % 3;
4581 int nelt2 = ((3 - j) * nelt + 2) % 3;
4583 for (i = 0; i < nelt; i++)
4585 if (3 * i + nelt0 < nelt)
4586 sel[3 * i + nelt0] = j0++;
4587 if (3 * i + nelt1 < nelt)
4588 sel[3 * i + nelt1] = nelt + j1++;
4589 if (3 * i + nelt2 < nelt)
4590 sel[3 * i + nelt2] = 0;
4592 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4594 for (i = 0; i < nelt; i++)
4596 if (3 * i + nelt0 < nelt)
4597 sel[3 * i + nelt0] = 3 * i + nelt0;
4598 if (3 * i + nelt1 < nelt)
4599 sel[3 * i + nelt1] = 3 * i + nelt1;
4600 if (3 * i + nelt2 < nelt)
4601 sel[3 * i + nelt2] = nelt + j2++;
4603 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4605 vect1 = dr_chain[0];
4606 vect2 = dr_chain[1];
4608 /* Create interleaving stmt:
4609 low = VEC_PERM_EXPR <vect1, vect2,
4610 {j, nelt, *, j + 1, nelt + j + 1, *,
4611 j + 2, nelt + j + 2, *, ...}> */
4612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4613 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4614 vect2, perm3_mask_low);
4615 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4617 vect1 = data_ref;
4618 vect2 = dr_chain[2];
4619 /* Create interleaving stmt:
4620 low = VEC_PERM_EXPR <vect1, vect2,
4621 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4622 6, 7, nelt + j + 2, ...}> */
4623 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4624 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4625 vect2, perm3_mask_high);
4626 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4627 (*result_chain)[j] = data_ref;
4630 else
4632 /* If length is not equal to 3 then only power of 2 is supported. */
4633 gcc_assert (exact_log2 (length) != -1);
4635 for (i = 0, n = nelt / 2; i < n; i++)
4637 sel[i * 2] = i;
4638 sel[i * 2 + 1] = i + nelt;
4640 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4642 for (i = 0; i < nelt; i++)
4643 sel[i] += nelt / 2;
4644 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4646 for (i = 0, n = log_length; i < n; i++)
4648 for (j = 0; j < length/2; j++)
4650 vect1 = dr_chain[j];
4651 vect2 = dr_chain[j+length/2];
4653 /* Create interleaving stmt:
4654 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4655 ...}> */
4656 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4657 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4658 vect2, perm_mask_high);
4659 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4660 (*result_chain)[2*j] = high;
4662 /* Create interleaving stmt:
4663 low = VEC_PERM_EXPR <vect1, vect2,
4664 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4665 ...}> */
4666 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4667 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4668 vect2, perm_mask_low);
4669 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4670 (*result_chain)[2*j+1] = low;
4672 memcpy (dr_chain.address (), result_chain->address (),
4673 length * sizeof (tree));
4678 /* Function vect_setup_realignment
4680 This function is called when vectorizing an unaligned load using
4681 the dr_explicit_realign[_optimized] scheme.
4682 This function generates the following code at the loop prolog:
4684 p = initial_addr;
4685 x msq_init = *(floor(p)); # prolog load
4686 realignment_token = call target_builtin;
4687 loop:
4688 x msq = phi (msq_init, ---)
4690 The stmts marked with x are generated only for the case of
4691 dr_explicit_realign_optimized.
4693 The code above sets up a new (vector) pointer, pointing to the first
4694 location accessed by STMT, and a "floor-aligned" load using that pointer.
4695 It also generates code to compute the "realignment-token" (if the relevant
4696 target hook was defined), and creates a phi-node at the loop-header bb
4697 whose arguments are the result of the prolog-load (created by this
4698 function) and the result of a load that takes place in the loop (to be
4699 created by the caller to this function).
4701 For the case of dr_explicit_realign_optimized:
4702 The caller to this function uses the phi-result (msq) to create the
4703 realignment code inside the loop, and sets up the missing phi argument,
4704 as follows:
4705 loop:
4706 msq = phi (msq_init, lsq)
4707 lsq = *(floor(p')); # load in loop
4708 result = realign_load (msq, lsq, realignment_token);
4710 For the case of dr_explicit_realign:
4711 loop:
4712 msq = *(floor(p)); # load in loop
4713 p' = p + (VS-1);
4714 lsq = *(floor(p')); # load in loop
4715 result = realign_load (msq, lsq, realignment_token);
4717 Input:
4718 STMT - (scalar) load stmt to be vectorized. This load accesses
4719 a memory location that may be unaligned.
4720 BSI - place where new code is to be inserted.
4721 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4722 is used.
4724 Output:
4725 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4726 target hook, if defined.
4727 Return value - the result of the loop-header phi node. */
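/* Editor's illustration (assumed REALIGN_LOAD semantics): with a 16-byte
   vector and p misaligned by 4, floor(p) == p - 4, so the scheme loads

       msq = *(p - 4);      # bytes [p-4, p+12)
       lsq = *(p + 12);     # bytes [p+12, p+28)
       result = realign_load (msq, lsq, realignment_token);

   and the token makes realign_load combine the trailing 12 bytes of MSQ
   with the leading 4 bytes of LSQ, yielding bytes [p, p+16).  */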
4729 tree
4730 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4731 tree *realignment_token,
4732 enum dr_alignment_support alignment_support_scheme,
4733 tree init_addr,
4734 struct loop **at_loop)
4736 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4737 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4738 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4739 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4740 struct loop *loop = NULL;
4741 edge pe = NULL;
4742 tree scalar_dest = gimple_assign_lhs (stmt);
4743 tree vec_dest;
4744 gimple inc;
4745 tree ptr;
4746 tree data_ref;
4747 basic_block new_bb;
4748 tree msq_init = NULL_TREE;
4749 tree new_temp;
4750 gphi *phi_stmt;
4751 tree msq = NULL_TREE;
4752 gimple_seq stmts = NULL;
4753 bool inv_p;
4754 bool compute_in_loop = false;
4755 bool nested_in_vect_loop = false;
4756 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4757 struct loop *loop_for_initial_load = NULL;
4759 if (loop_vinfo)
4761 loop = LOOP_VINFO_LOOP (loop_vinfo);
4762 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4765 gcc_assert (alignment_support_scheme == dr_explicit_realign
4766 || alignment_support_scheme == dr_explicit_realign_optimized);
4768 /* We need to generate three things:
4769 1. the misalignment computation
4770 2. the extra vector load (for the optimized realignment scheme).
4771 3. the phi node for the two vectors from which the realignment is
4772 done (for the optimized realignment scheme). */
4774 /* 1. Determine where to generate the misalignment computation.
4776 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4777 calculation will be generated by this function, outside the loop (in the
4778 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4779 caller, inside the loop.
4781 Background: If the misalignment remains fixed throughout the iterations of
4782 the loop, then both realignment schemes are applicable, and also the
4783 misalignment computation can be done outside LOOP. This is because we are
4784 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4785 are a multiple of VS (the Vector Size), and therefore the misalignment in
4786 different vectorized LOOP iterations is always the same.
4787 The problem arises only if the memory access is in an inner-loop nested
4788 inside LOOP, which is now being vectorized using outer-loop vectorization.
4789 This is the only case when the misalignment of the memory access may not
4790 remain fixed throughout the iterations of the inner-loop (as explained in
4791 detail in vect_supportable_dr_alignment). In this case, not only is the
4792 optimized realignment scheme not applicable, but also the misalignment
4793 computation (and generation of the realignment token that is passed to
4794 REALIGN_LOAD) has to be done inside the loop.
4796 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4797 or not, which in turn determines if the misalignment is computed inside
4798 the inner-loop, or outside LOOP. */
4800 if (init_addr != NULL_TREE || !loop_vinfo)
4802 compute_in_loop = true;
4803 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4807 /* 2. Determine where to generate the extra vector load.
4809 For the optimized realignment scheme, instead of generating two vector
4810 loads in each iteration, we generate a single extra vector load in the
4811 preheader of the loop, and in each iteration reuse the result of the
4812 vector load from the previous iteration. In case the memory access is in
4813 an inner-loop nested inside LOOP, which is now being vectorized using
4814 outer-loop vectorization, we need to determine whether this initial vector
4815 load should be generated at the preheader of the inner-loop, or can be
4816 generated at the preheader of LOOP. If the memory access has no evolution
4817 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4818 to be generated inside LOOP (in the preheader of the inner-loop). */
4820 if (nested_in_vect_loop)
4822 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4823 bool invariant_in_outerloop =
4824 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4825 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4827 else
4828 loop_for_initial_load = loop;
4829 if (at_loop)
4830 *at_loop = loop_for_initial_load;
4832 if (loop_for_initial_load)
4833 pe = loop_preheader_edge (loop_for_initial_load);
4835 /* 3. For the case of the optimized realignment, create the first vector
4836 load at the loop preheader. */
4838 if (alignment_support_scheme == dr_explicit_realign_optimized)
4840 /* Create msq_init = *(floor(p1)) in the loop preheader */
4841 gassign *new_stmt;
4843 gcc_assert (!compute_in_loop);
4844 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4845 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4846 NULL_TREE, &init_addr, NULL, &inc,
4847 true, &inv_p);
4848 if (TREE_CODE (ptr) == SSA_NAME)
4849 new_temp = copy_ssa_name (ptr);
4850 else
4851 new_temp = make_ssa_name (TREE_TYPE (ptr));
4852 new_stmt = gimple_build_assign
4853 (new_temp, BIT_AND_EXPR, ptr,
4854 build_int_cst (TREE_TYPE (ptr),
4855 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4856 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4857 gcc_assert (!new_bb);
4858 data_ref
4859 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4860 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4861 new_stmt = gimple_build_assign (vec_dest, data_ref);
4862 new_temp = make_ssa_name (vec_dest, new_stmt);
4863 gimple_assign_set_lhs (new_stmt, new_temp);
4864 if (pe)
4866 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4867 gcc_assert (!new_bb);
4869 else
4870 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4872 msq_init = gimple_assign_lhs (new_stmt);
4875 /* 4. Create realignment token using a target builtin, if available.
4876 It is done either inside the containing loop, or before LOOP (as
4877 determined above). */
4879 if (targetm.vectorize.builtin_mask_for_load)
4881 gcall *new_stmt;
4882 tree builtin_decl;
4884 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4885 if (!init_addr)
4887 /* Generate the INIT_ADDR computation outside LOOP. */
4888 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4889 NULL_TREE, loop);
4890 if (loop)
4892 pe = loop_preheader_edge (loop);
4893 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4894 gcc_assert (!new_bb);
4896 else
4897 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4900 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4901 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4902 vec_dest =
4903 vect_create_destination_var (scalar_dest,
4904 gimple_call_return_type (new_stmt));
4905 new_temp = make_ssa_name (vec_dest, new_stmt);
4906 gimple_call_set_lhs (new_stmt, new_temp);
4908 if (compute_in_loop)
4909 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4910 else
4912 /* Generate the misalignment computation outside LOOP. */
4913 pe = loop_preheader_edge (loop);
4914 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4915 gcc_assert (!new_bb);
4918 *realignment_token = gimple_call_lhs (new_stmt);
4920 /* The result of the CALL_EXPR to this builtin is determined from
4921 the value of the parameter and no global variables are touched
4922 which makes the builtin a "const" function. Requiring the
4923 builtin to have the "const" attribute makes it unnecessary
4924 to call mark_call_clobbered. */
4925 gcc_assert (TREE_READONLY (builtin_decl));
4928 if (alignment_support_scheme == dr_explicit_realign)
4929 return msq;
4931 gcc_assert (!compute_in_loop);
4932 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4935 /* 5. Create msq = phi <msq_init, lsq> in loop */
4937 pe = loop_preheader_edge (containing_loop);
4938 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4939 msq = make_ssa_name (vec_dest);
4940 phi_stmt = create_phi_node (msq, containing_loop->header);
4941 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4943 return msq;
4947 /* Function vect_grouped_load_supported.
4949 Returns TRUE if even and odd permutations are supported,
4950 and FALSE otherwise. */
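/* Editor's illustration: for a power-of-two group and an 8-element vector
   mode the masks checked below are

       extract even: { 0, 2, 4, 6, 8, 10, 12, 14 }
       extract odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }

   selecting the even/odd elements of the 16-element concatenation of the
   two input vectors.  */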
4952 bool
4953 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4955 machine_mode mode = TYPE_MODE (vectype);
4957 /* vect_permute_load_chain requires the group size to be equal to 3 or
4958 be a power of two. */
4959 if (count != 3 && exact_log2 (count) == -1)
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4963 "the size of the group of accesses"
4964 " is not a power of 2 or not equal to 3\n");
4965 return false;
4968 /* Check that the permutation is supported. */
4969 if (VECTOR_MODE_P (mode))
4971 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4972 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4974 if (count == 3)
4976 unsigned int k;
4977 for (k = 0; k < 3; k++)
4979 for (i = 0; i < nelt; i++)
4980 if (3 * i + k < 2 * nelt)
4981 sel[i] = 3 * i + k;
4982 else
4983 sel[i] = 0;
4984 if (!can_vec_perm_p (mode, false, sel))
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4988 "shuffle of 3 loads is not supported by"
4989 " target\n");
4990 return false;
4992 for (i = 0, j = 0; i < nelt; i++)
4993 if (3 * i + k < 2 * nelt)
4994 sel[i] = i;
4995 else
4996 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4997 if (!can_vec_perm_p (mode, false, sel))
4999 if (dump_enabled_p ())
5000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5001 "shuffle of 3 loads is not supported by"
5002 " target\n");
5003 return false;
5006 return true;
5008 else
5010 /* If length is not equal to 3 then only power of 2 is supported. */
5011 gcc_assert (exact_log2 (count) != -1);
5012 for (i = 0; i < nelt; i++)
5013 sel[i] = i * 2;
5014 if (can_vec_perm_p (mode, false, sel))
5016 for (i = 0; i < nelt; i++)
5017 sel[i] = i * 2 + 1;
5018 if (can_vec_perm_p (mode, false, sel))
5019 return true;
5024 if (dump_enabled_p ())
5025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5026 "extract even/odd not supported by target\n");
5027 return false;
5030 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5031 type VECTYPE. */
5033 bool
5034 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5036 return vect_lanes_optab_supported_p ("vec_load_lanes",
5037 vec_load_lanes_optab,
5038 vectype, count);
5041 /* Function vect_permute_load_chain.
5043 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5044 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5045 the input data correctly. Return the final references for loads in
5046 RESULT_CHAIN.
5048 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5049 The input is 4 vectors each containing 8 elements. We assign a number to each
5050 element, the input sequence is:
5052 1st vec: 0 1 2 3 4 5 6 7
5053 2nd vec: 8 9 10 11 12 13 14 15
5054 3rd vec: 16 17 18 19 20 21 22 23
5055 4th vec: 24 25 26 27 28 29 30 31
5057 The output sequence should be:
5059 1st vec: 0 4 8 12 16 20 24 28
5060 2nd vec: 1 5 9 13 17 21 25 29
5061 3rd vec: 2 6 10 14 18 22 26 30
5062 4th vec: 3 7 11 15 19 23 27 31
5064 i.e., the first output vector should contain the first elements of each
5065 interleaving group, etc.
5067 We use extract_even/odd instructions to create such output. The input of
5068 each extract_even/odd operation is two vectors
5069 1st vec 2nd vec
5070 0 1 2 3 4 5 6 7
5072 and the output is the vector of extracted even/odd elements. The output of
5073 extract_even will be: 0 2 4 6
5074 and of extract_odd: 1 3 5 7
5077 The permutation is done in log LENGTH stages. In each stage extract_even
5078 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5079 their order. In our example,
5081 E1: extract_even (1st vec, 2nd vec)
5082 E2: extract_odd (1st vec, 2nd vec)
5083 E3: extract_even (3rd vec, 4th vec)
5084 E4: extract_odd (3rd vec, 4th vec)
5086 The output for the first stage will be:
5088 E1: 0 2 4 6 8 10 12 14
5089 E2: 1 3 5 7 9 11 13 15
5090 E3: 16 18 20 22 24 26 28 30
5091 E4: 17 19 21 23 25 27 29 31
5093 In order to proceed and create the correct sequence for the next stage (or
5094 for the correct output, if the second stage is the last one, as in our
5095 example), we first put the output of extract_even operation and then the
5096 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5097 The input for the second stage is:
5099 1st vec (E1): 0 2 4 6 8 10 12 14
5100 2nd vec (E3): 16 18 20 22 24 26 28 30
5101 3rd vec (E2): 1 3 5 7 9 11 13 15
5102 4th vec (E4): 17 19 21 23 25 27 29 31
5104 The output of the second stage:
5106 E1: 0 4 8 12 16 20 24 28
5107 E2: 2 6 10 14 18 22 26 30
5108 E3: 1 5 9 13 17 21 25 29
5109 E4: 3 7 11 15 19 23 27 31
5111 And RESULT_CHAIN after reordering:
5113 1st vec (E1): 0 4 8 12 16 20 24 28
5114 2nd vec (E3): 1 5 9 13 17 21 25 29
5115 3rd vec (E2): 2 6 10 14 18 22 26 30
5116 4th vec (E4): 3 7 11 15 19 23 27 31. */
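/* Editor's sketch (stand-alone illustration only, not part of GCC): on
   plain NELT-element arrays the extract_even/odd permutes generated below
   behave like

       static void
       extract_even_odd (const int *a, const int *b, int nelt,
                         int *even, int *odd)
       {
         for (int i = 0; i < nelt; i++)
           {
             // element 2*i resp. 2*i+1 of the concatenation of a and b
             even[i] = 2 * i < nelt ? a[2 * i] : b[2 * i - nelt];
             odd[i] = 2 * i + 1 < nelt ? a[2 * i + 1] : b[2 * i + 1 - nelt];
           }
       }

   applied log2 (LENGTH) times with the reordering described above.  */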
5118 static void
5119 vect_permute_load_chain (vec<tree> dr_chain,
5120 unsigned int length,
5121 gimple stmt,
5122 gimple_stmt_iterator *gsi,
5123 vec<tree> *result_chain)
5125 tree data_ref, first_vect, second_vect;
5126 tree perm_mask_even, perm_mask_odd;
5127 tree perm3_mask_low, perm3_mask_high;
5128 gimple perm_stmt;
5129 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5130 unsigned int i, j, log_length = exact_log2 (length);
5131 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5132 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5134 result_chain->quick_grow (length);
5135 memcpy (result_chain->address (), dr_chain.address (),
5136 length * sizeof (tree));
5138 if (length == 3)
5140 unsigned int k;
5142 for (k = 0; k < 3; k++)
5144 for (i = 0; i < nelt; i++)
5145 if (3 * i + k < 2 * nelt)
5146 sel[i] = 3 * i + k;
5147 else
5148 sel[i] = 0;
5149 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5151 for (i = 0, j = 0; i < nelt; i++)
5152 if (3 * i + k < 2 * nelt)
5153 sel[i] = i;
5154 else
5155 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5157 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5159 first_vect = dr_chain[0];
5160 second_vect = dr_chain[1];
5162 /* Create interleaving stmt (low part of):
5163 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5164 ...}> */
5165 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5166 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5167 second_vect, perm3_mask_low);
5168 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5170 /* Create interleaving stmt (high part of):
5171 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5172 ...}> */
5173 first_vect = data_ref;
5174 second_vect = dr_chain[2];
5175 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5176 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5177 second_vect, perm3_mask_high);
5178 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5179 (*result_chain)[k] = data_ref;
5182 else
5184 /* If length is not equal to 3 then only power of 2 is supported. */
5185 gcc_assert (exact_log2 (length) != -1);
5187 for (i = 0; i < nelt; ++i)
5188 sel[i] = i * 2;
5189 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5191 for (i = 0; i < nelt; ++i)
5192 sel[i] = i * 2 + 1;
5193 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5195 for (i = 0; i < log_length; i++)
5197 for (j = 0; j < length; j += 2)
5199 first_vect = dr_chain[j];
5200 second_vect = dr_chain[j+1];
5202 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5203 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5204 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5205 first_vect, second_vect,
5206 perm_mask_even);
5207 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5208 (*result_chain)[j/2] = data_ref;
5210 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5213 first_vect, second_vect,
5214 perm_mask_odd);
5215 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5216 (*result_chain)[j/2+length/2] = data_ref;
5218 memcpy (dr_chain.address (), result_chain->address (),
5219 length * sizeof (tree));
5224 /* Function vect_shift_permute_load_chain.
5226 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5227 sequence of stmts to reorder the input data accordingly.
5228 Return the final references for loads in RESULT_CHAIN.
5229 Return true if successful, false otherwise.
5231 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5232 The input is 3 vectors each containing 8 elements. We assign a
5233 number to each element, the input sequence is:
5235 1st vec: 0 1 2 3 4 5 6 7
5236 2nd vec: 8 9 10 11 12 13 14 15
5237 3rd vec: 16 17 18 19 20 21 22 23
5239 The output sequence should be:
5241 1st vec: 0 3 6 9 12 15 18 21
5242 2nd vec: 1 4 7 10 13 16 19 22
5243 3rd vec: 2 5 8 11 14 17 20 23
5245 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5247 First we shuffle all 3 vectors to get correct elements order:
5249 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5250 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5251 3rd vec: (16 19 22) (17 20 23) (18 21)
5253 Next we unite and shift vector 3 times:
5255 1st step:
5256 shift right by 6 the concatenation of:
5257 "1st vec" and "2nd vec"
5258 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5259 "2nd vec" and "3rd vec"
5260 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5261 "3rd vec" and "1st vec"
5262 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5263 | New vectors |
5265 So that now new vectors are:
5267 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5268 2nd vec: (10 13) (16 19 22) (17 20 23)
5269 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5271 2nd step:
5272 shift right by 5 the concatenation of:
5273 "1st vec" and "3rd vec"
5274 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5275 "2nd vec" and "1st vec"
5276 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5277 "3rd vec" and "2nd vec"
5278 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5279 | New vectors |
5281 So that now new vectors are:
5283 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5284 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5285 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5287 3rd step:
5288 shift right by 5 the concatenation of:
5289 "1st vec" and "1st vec"
5290 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5291 shift right by 3 the concatenation of:
5292 "2nd vec" and "2nd vec"
5293 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5294 | New vectors |
5296 So that now all vectors are READY:
5297 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5298 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5299 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5301 This algorithm is faster than the one in vect_permute_load_chain if:
5302 1. "shift of a concatenation" is faster than general permutation.
5303 This is usually so.
5304 2. The TARGET machine can't execute vector instructions in parallel.
5305 This is because each step of the algorithm depends on the previous one.
5306 The algorithm in vect_permute_load_chain is much more parallel.
5308 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
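/* Editor's note (assumed VEC_PERM_EXPR semantics): each "shift right by K
   the concatenation of v1 and v2" step above is a single permute whose
   selector is { K, K+1, ..., K+nelt-1 }, i.e.

       result[i] = concat (v1, v2)[K + i];

   e.g. shift1_mask for an 8-element vector in the two-vector case below
   is { 4, 5, 6, 7, 8, 9, 10, 11 }.  */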
5311 static bool
5312 vect_shift_permute_load_chain (vec<tree> dr_chain,
5313 unsigned int length,
5314 gimple stmt,
5315 gimple_stmt_iterator *gsi,
5316 vec<tree> *result_chain)
5318 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5319 tree perm2_mask1, perm2_mask2, perm3_mask;
5320 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5321 gimple perm_stmt;
5323 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5324 unsigned int i;
5325 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5326 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5327 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5328 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5330 result_chain->quick_grow (length);
5331 memcpy (result_chain->address (), dr_chain.address (),
5332 length * sizeof (tree));
5334 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5336 unsigned int j, log_length = exact_log2 (length);
5337 for (i = 0; i < nelt / 2; ++i)
5338 sel[i] = i * 2;
5339 for (i = 0; i < nelt / 2; ++i)
5340 sel[nelt / 2 + i] = i * 2 + 1;
5341 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5343 if (dump_enabled_p ())
5344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5345 "shuffle of 2 fields structure is not \
5346 supported by target\n");
5347 return false;
5349 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5351 for (i = 0; i < nelt / 2; ++i)
5352 sel[i] = i * 2 + 1;
5353 for (i = 0; i < nelt / 2; ++i)
5354 sel[nelt / 2 + i] = i * 2;
5355 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5357 if (dump_enabled_p ())
5358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5359 "shuffle of 2 fields structure is not \
5360 supported by target\n");
5361 return false;
5363 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5365 /* Generating permutation constant to shift all elements.
5366 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5367 for (i = 0; i < nelt; i++)
5368 sel[i] = nelt / 2 + i;
5369 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5371 if (dump_enabled_p ())
5372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5373 "shift permutation is not supported by target\n");
5374 return false;
5376 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5378 /* Generating permutation constant to select a vector from two vectors.
5379 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5380 for (i = 0; i < nelt / 2; i++)
5381 sel[i] = i;
5382 for (i = nelt / 2; i < nelt; i++)
5383 sel[i] = nelt + i;
5384 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5386 if (dump_enabled_p ())
5387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5388 "select is not supported by target\n");
5389 return false;
5391 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5393 for (i = 0; i < log_length; i++)
5395 for (j = 0; j < length; j += 2)
5397 first_vect = dr_chain[j];
5398 second_vect = dr_chain[j + 1];
5400 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5401 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5402 first_vect, first_vect,
5403 perm2_mask1);
5404 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5405 vect[0] = data_ref;
5407 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5408 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5409 second_vect, second_vect,
5410 perm2_mask2);
5411 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5412 vect[1] = data_ref;
5414 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5415 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5416 vect[0], vect[1], shift1_mask);
5417 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5418 (*result_chain)[j/2 + length/2] = data_ref;
5420 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5421 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5422 vect[0], vect[1], select_mask);
5423 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5424 (*result_chain)[j/2] = data_ref;
5426 memcpy (dr_chain.address (), result_chain->address (),
5427 length * sizeof (tree));
5429 return true;
5431 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5433 unsigned int k = 0, l = 0;
5435 /* Generating permutation constant to get all elements in the right order.
5436 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5437 for (i = 0; i < nelt; i++)
5439 if (3 * k + (l % 3) >= nelt)
5441 k = 0;
5442 l += (3 - (nelt % 3));
5444 sel[i] = 3 * k + (l % 3);
5445 k++;
5447 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5449 if (dump_enabled_p ())
5450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5451 "shuffle of 3 fields structure is not \
5452 supported by target\n");
5453 return false;
5455 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5457 /* Generating permutation constant to shift all elements.
5458 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5459 for (i = 0; i < nelt; i++)
5460 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5461 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5463 if (dump_enabled_p ())
5464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5465 "shift permutation is not supported by target\n");
5466 return false;
5468 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5470 /* Generating permutation constant to shift all elements.
5471 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5472 for (i = 0; i < nelt; i++)
5473 sel[i] = 2 * (nelt / 3) + 1 + i;
5474 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476 if (dump_enabled_p ())
5477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5478 "shift permutation is not supported by target\n");
5479 return false;
5481 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5483 /* Generating permutation constant to shift all elements.
5484 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5485 for (i = 0; i < nelt; i++)
5486 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5487 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5491 "shift permutation is not supported by target\n");
5492 return false;
5494 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5496 /* Generating permutation constant to shift all elements.
5497 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5498 for (i = 0; i < nelt; i++)
5499 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5500 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5502 if (dump_enabled_p ())
5503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5504 "shift permutation is not supported by target\n");
5505 return false;
5507 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5509 for (k = 0; k < 3; k++)
5511 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5512 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5513 dr_chain[k], dr_chain[k],
5514 perm3_mask);
5515 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5516 vect[k] = data_ref;
5519 for (k = 0; k < 3; k++)
5521 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5522 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5523 vect[k % 3], vect[(k + 1) % 3],
5524 shift1_mask);
5525 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5526 vect_shift[k] = data_ref;
5529 for (k = 0; k < 3; k++)
5531 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5532 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5533 vect_shift[(4 - k) % 3],
5534 vect_shift[(3 - k) % 3],
5535 shift2_mask);
5536 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5537 vect[k] = data_ref;
5540 (*result_chain)[3 - (nelt % 3)] = vect[2];
5542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5544 vect[0], shift3_mask);
5545 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5546 (*result_chain)[nelt % 3] = data_ref;
5548 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5549 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5550 vect[1], shift4_mask);
5551 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5552 (*result_chain)[0] = data_ref;
5553 return true;
5555 return false;
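For reference, the five selection masks built in the length == 3 path can be recomputed for NELT == 8 with the short stand-alone program below. This is an editorial sketch, not GCC code; it only mirrors the index arithmetic above so the mask values quoted in the comments can be checked.

#include <stdio.h>

#define NELT 8

static void
print_mask (const char *name, const unsigned *sel)
{
  printf ("%s = {", name);
  for (unsigned i = 0; i < NELT; i++)
    printf (" %u", sel[i]);
  printf (" }\n");
}

int
main (void)
{
  unsigned sel[NELT];
  unsigned i, k = 0, l = 0;

  /* perm3_mask: gather all elements in the right order,
     {0 3 6 1 4 7 2 5} for NELT == 8.  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
        {
          k = 0;
          l += (3 - (NELT % 3));
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_mask ("perm3_mask", sel);

  for (i = 0; i < NELT; i++)            /* shift1: {6 7 8 9 10 11 12 13} */
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_mask ("shift1_mask", sel);

  for (i = 0; i < NELT; i++)            /* shift2: {5 6 7 8 9 10 11 12} */
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_mask ("shift2_mask", sel);

  for (i = 0; i < NELT; i++)            /* shift3: {3 4 5 6 7 8 9 10} */
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_mask ("shift3_mask", sel);

  for (i = 0; i < NELT; i++)            /* shift4: {5 6 7 8 9 10 11 12} */
    sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
  print_mask ("shift4_mask", sel);

  return 0;
}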
5558 /* Function vect_transform_grouped_load.
5560 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5561 to perform their permutation and ascribe the resulting vectorized statements to
5562 the scalar statements.
5565 void
5566 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5567 gimple_stmt_iterator *gsi)
5569 machine_mode mode;
5570 vec<tree> result_chain = vNULL;
5572 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5573 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5574 vectors that are ready for vector computation. */
5575 result_chain.create (size);
5577 /* If the reassociation width for the vector type is 2 or greater, the target
5578 machine can execute 2 or more vector instructions in parallel. Otherwise try
5579 to get the chain for the load group using vect_shift_permute_load_chain. */
5580 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5581 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5582 || exact_log2 (size) != -1
5583 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5584 gsi, &result_chain))
5585 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5586 vect_record_grouped_load_vectors (stmt, result_chain);
5587 result_chain.release ();
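The dispatch just above can be read as a single predicate. The following restatement is an editorial sketch with hypothetical names (is_power_of_two stands in for GCC's exact_log2 (size) != -1 test, and the result of the shift-chain attempt is passed in as a flag, whereas in the real code the third test is the call itself, so the shift-based chain is only attempted when the first two tests fail).

#include <stdbool.h>

/* Stand-in for GCC's exact_log2 (n) != -1 test.  */
static bool
is_power_of_two (unsigned int n)
{
  return n != 0 && (n & (n - 1)) == 0;
}

/* True when the generic vect_permute_load_chain should be used instead of
   the shift-based chain (hypothetical helper mirroring the condition above).  */
static bool
use_generic_permute_chain_p (int reassoc_width, unsigned int size,
                             bool shift_chain_ok)
{
  return reassoc_width > 1
         || is_power_of_two (size)
         || !shift_chain_ok;
}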
5590 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5591 generated as part of the vectorization of STMT. Assign the statement
5592 for each vector to the associated scalar statement. */
5594 void
5595 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5597 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5598 gimple next_stmt, new_stmt;
5599 unsigned int i, gap_count;
5600 tree tmp_data_ref;
5602 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5603 Since we scan the chain starting from its first node, their order
5604 corresponds to the order of data-refs in RESULT_CHAIN. */
5605 next_stmt = first_stmt;
5606 gap_count = 1;
5607 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5609 if (!next_stmt)
5610 break;
5612 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5613 code elimination pass later. No need to check for the first stmt in
5614 the group, since it always exists.
5615 GROUP_GAP is the number of steps in elements from the previous
5616 access (if there is no gap GROUP_GAP is 1). We skip loads that
5617 correspond to the gaps. */
5618 if (next_stmt != first_stmt
5619 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5621 gap_count++;
5622 continue;
5625 while (next_stmt)
5627 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5628 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5629 copies, and we put the new vector statement in the first available
5630 RELATED_STMT. */
5631 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5632 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5633 else
5635 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5637 gimple prev_stmt =
5638 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5639 gimple rel_stmt =
5640 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5641 while (rel_stmt)
5643 prev_stmt = rel_stmt;
5644 rel_stmt =
5645 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5648 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5649 new_stmt;
5653 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5654 gap_count = 1;
5655 /* If NEXT_STMT accesses the same DR as the previous statement,
5656 put the same TMP_DATA_REF as its vectorized statement; otherwise
5657 get the next data-ref from RESULT_CHAIN. */
5658 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5659 break;
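The RELATED_STMT handling above is essentially an append to a singly linked chain: walk the RELATED_STMT links from the first vector statement to the last one and attach the new statement there. The sketch below restates that with hypothetical types; GCC's stmt_vec_info accessors are not used.

#include <stddef.h>

struct vec_stmt
{
  struct vec_stmt *related;   /* models STMT_VINFO_RELATED_STMT */
};

/* Append NEW_STMT at the end of the RELATED_STMT chain headed by FIRST.  */
static void
append_related (struct vec_stmt *first, struct vec_stmt *new_stmt)
{
  struct vec_stmt *prev = first;
  while (prev->related)
    prev = prev->related;
  prev->related = new_stmt;
  new_stmt->related = NULL;   /* the sketch NULL-terminates explicitly */
}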
5664 /* Function vect_can_force_dr_alignment_p.
5666 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5667 boundary. */
5669 bool
5670 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5672 if (TREE_CODE (decl) != VAR_DECL)
5673 return false;
5675 if (decl_in_symtab_p (decl)
5676 && !symtab_node::get (decl)->can_increase_alignment_p ())
5677 return false;
5679 if (TREE_STATIC (decl))
5680 return (alignment <= MAX_OFILE_ALIGNMENT);
5681 else
5682 return (alignment <= MAX_STACK_ALIGNMENT);
5686 /* Return whether the data reference DR is supported with respect to its
5687 alignment.
5688 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5689 if it is aligned, i.e., check if it is possible to vectorize it with different
5690 alignment. */
5692 enum dr_alignment_support
5693 vect_supportable_dr_alignment (struct data_reference *dr,
5694 bool check_aligned_accesses)
5696 gimple stmt = DR_STMT (dr);
5697 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5698 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5699 machine_mode mode = TYPE_MODE (vectype);
5700 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5701 struct loop *vect_loop = NULL;
5702 bool nested_in_vect_loop = false;
5704 if (aligned_access_p (dr) && !check_aligned_accesses)
5705 return dr_aligned;
5707 /* For now assume all conditional loads/stores support unaligned
5708 access without any special code. */
5709 if (is_gimple_call (stmt)
5710 && gimple_call_internal_p (stmt)
5711 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5712 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5713 return dr_unaligned_supported;
5715 if (loop_vinfo)
5717 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5718 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5721 /* Possibly unaligned access. */
5723 /* We can choose between using the implicit realignment scheme (generating
5724 a misaligned_move stmt) and the explicit realignment scheme (generating
5725 aligned loads with a REALIGN_LOAD). There are two variants of the
5726 explicit realignment scheme: optimized and unoptimized.
5727 We can optimize the realignment only if the step between consecutive
5728 vector loads is equal to the vector size. Since the vector memory
5729 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5730 is guaranteed that the misalignment amount remains the same throughout the
5731 execution of the vectorized loop. Therefore, we can create the
5732 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5733 at the loop preheader.
5735 However, in the case of outer-loop vectorization, when vectorizing a
5736 memory access in the inner-loop nested within the LOOP that is now being
5737 vectorized, while it is guaranteed that the misalignment of the
5738 vectorized memory access will remain the same in different outer-loop
5739 iterations, it is *not* guaranteed that it will remain the same throughout
5740 the execution of the inner-loop. This is because the inner-loop advances
5741 with the original scalar step (and not in steps of VS). If the inner-loop
5742 step happens to be a multiple of VS, then the misalignment remains fixed
5743 and we can use the optimized realignment scheme. For example:
5745 for (i=0; i<N; i++)
5746 for (j=0; j<M; j++)
5747 s += a[i+j];
5749 When vectorizing the i-loop in the above example, the step between
5750 consecutive vector loads is 1, and so the misalignment does not remain
5751 fixed across the execution of the inner-loop, and the realignment cannot
5752 be optimized (as illustrated in the following pseudo vectorized loop):
5754 for (i=0; i<N; i+=4)
5755 for (j=0; j<M; j++){
5756 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5757 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5758 // (assuming that we start from an aligned address).
5761 We therefore have to use the unoptimized realignment scheme:
5763 for (i=0; i<N; i+=4)
5764 for (j=k; j<M; j+=4)
5765 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5766 // that the misalignment of the initial address is
5767 // 0).
5769 The loop can then be vectorized as follows:
5771 for (k=0; k<4; k++){
5772 rt = get_realignment_token (&vp[k]);
5773 for (i=0; i<N; i+=4){
5774 v1 = vp[i+k];
5775 for (j=k; j<M; j+=4){
5776 v2 = vp[i+j+VS-1];
5777 va = REALIGN_LOAD <v1,v2,rt>;
5778 vs += va;
5779 v1 = v2;
5782 } */
5784 if (DR_IS_READ (dr))
5786 bool is_packed = false;
5787 tree type = (TREE_TYPE (DR_REF (dr)));
5789 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5790 && (!targetm.vectorize.builtin_mask_for_load
5791 || targetm.vectorize.builtin_mask_for_load ()))
5793 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5794 if ((nested_in_vect_loop
5795 && (TREE_INT_CST_LOW (DR_STEP (dr))
5796 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5797 || !loop_vinfo)
5798 return dr_explicit_realign;
5799 else
5800 return dr_explicit_realign_optimized;
5802 if (!known_alignment_for_access_p (dr))
5803 is_packed = not_size_aligned (DR_REF (dr));
5805 if ((TYPE_USER_ALIGN (type) && !is_packed)
5806 || targetm.vectorize.
5807 support_vector_misalignment (mode, type,
5808 DR_MISALIGNMENT (dr), is_packed))
5809 /* Can't software pipeline the loads, but can at least do them. */
5810 return dr_unaligned_supported;
5812 else
5814 bool is_packed = false;
5815 tree type = (TREE_TYPE (DR_REF (dr)));
5817 if (!known_alignment_for_access_p (dr))
5818 is_packed = not_size_aligned (DR_REF (dr));
5820 if ((TYPE_USER_ALIGN (type) && !is_packed)
5821 || targetm.vectorize.
5822 support_vector_misalignment (mode, type,
5823 DR_MISALIGNMENT (dr), is_packed))
5824 return dr_unaligned_supported;
5827 /* Unsupported. */
5828 return dr_unaligned_unsupported;
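To make the misalignment pattern from the outer-loop discussion above concrete, the toy program below (an editorial sketch, assuming a vector of VS == 4 elements and an initially aligned base) prints the element offset modulo VS for a load at &a[i + j]. With the inner loop keeping the scalar step of 1, the misalignment cycles 0,1,2,3,... across inner iterations, which is exactly why only the unoptimized explicit realignment scheme applies in that case.

#include <stdio.h>

#define VS 4   /* vector size in elements; an assumption for the example */

int
main (void)
{
  /* Outer loop vectorized: i advances by VS.  The inner loop keeps the
     scalar step of 1, so the misalignment (i + j) % VS changes with j.  */
  for (int i = 0; i < 8; i += VS)
    for (int j = 0; j < 8; j++)
      printf ("i=%d j=%d misalignment=%d\n", i, j, (i + j) % VS);
  return 0;
}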