AVX-512. Branch to hold overall changes introduced by 20140717 EAS.
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobea2316cefa057fbb3c9d2e5b5395f1e5e358661f
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
50 #include "dumpfile.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
58 #include "expr.h"
59 #include "optabs.h"
60 #include "builtins.h"
61 #include "varasm.h"
63 /* Return true if load- or store-lanes optab OPTAB is implemented for
64 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
66 static bool
67 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
68 tree vectype, unsigned HOST_WIDE_INT count)
70 enum machine_mode mode, array_mode;
71 bool limit_p;
73 mode = TYPE_MODE (vectype);
74 limit_p = !targetm.array_mode_supported_p (mode, count);
75 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
76 MODE_INT, limit_p);
78 if (array_mode == BLKmode)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
83 GET_MODE_NAME (mode), count);
84 return false;
87 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
91 "cannot use %s<%s><%s>\n", name,
92 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
93 return false;
96 if (dump_enabled_p ())
97 dump_printf_loc (MSG_NOTE, vect_location,
98 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
99 GET_MODE_NAME (mode));
101 return true;
105 /* Return the smallest scalar part of STMT.
106 This is used to determine the vectype of the stmt. We generally set the
107 vectype according to the type of the result (lhs). For stmts whose
108 result-type is different than the type of the arguments (e.g., demotion,
109 promotion), vectype will be reset appropriately (later). Note that we have
110 to visit the smallest datatype in this function, because that determines the
111 VF. If the smallest datatype in the loop is present only as the rhs of a
112 promotion operation - we'd miss it.
113 Such a case, where a variable of this datatype does not appear in the lhs
114 anywhere in the loop, can only occur if it's an invariant: e.g.:
115 'int_x = (int) short_inv', which we'd expect to have been optimized away by
116 invariant motion. However, we cannot rely on invariant motion to always
117 take invariants out of the loop, and so in the case of promotion we also
118 have to check the rhs.
119 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
120 types. */
122 tree
123 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
124 HOST_WIDE_INT *rhs_size_unit)
126 tree scalar_type = gimple_expr_type (stmt);
127 HOST_WIDE_INT lhs, rhs;
129 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131 if (is_gimple_assign (stmt)
132 && (gimple_assign_cast_p (stmt)
133 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
134 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
135 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
137 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
139 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
140 if (rhs < lhs)
141 scalar_type = rhs_type;
144 *lhs_size_unit = lhs;
145 *rhs_size_unit = rhs;
146 return scalar_type;
150 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
151 tested at run-time. Return TRUE if DDR was successfully inserted.
152 Return false if versioning is not supported. */
154 static bool
155 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
157 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
159 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
160 return false;
162 if (dump_enabled_p ())
164 dump_printf_loc (MSG_NOTE, vect_location,
165 "mark for run-time aliasing test between ");
166 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
167 dump_printf (MSG_NOTE, " and ");
168 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
169 dump_printf (MSG_NOTE, "\n");
172 if (optimize_loop_nest_for_size_p (loop))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not supported when optimizing"
177 " for size.\n");
178 return false;
181 /* FORNOW: We don't support versioning with outer-loop vectorization. */
182 if (loop->inner)
184 if (dump_enabled_p ())
185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
186 "versioning not yet supported for outer-loops.\n");
187 return false;
190 /* FORNOW: We don't support creating runtime alias tests for non-constant
191 step. */
192 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
193 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
195 if (dump_enabled_p ())
196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
197 "versioning not yet supported for non-constant "
198 "step\n");
199 return false;
202 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
203 return true;
207 /* Function vect_analyze_data_ref_dependence.
209 Return TRUE if there (might) exist a dependence between a memory-reference
210 DRA and a memory-reference DRB. When versioning for alias may check a
211 dependence at run-time, return FALSE. Adjust *MAX_VF according to
212 the data dependence. */
214 static bool
215 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
216 loop_vec_info loop_vinfo, int *max_vf)
218 unsigned int i;
219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
220 struct data_reference *dra = DDR_A (ddr);
221 struct data_reference *drb = DDR_B (ddr);
222 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
223 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
224 lambda_vector dist_v;
225 unsigned int loop_depth;
227 /* In loop analysis all data references should be vectorizable. */
228 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
229 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
230 gcc_unreachable ();
232 /* Independent data accesses. */
233 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
234 return false;
236 if (dra == drb
237 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
238 return false;
240 /* Even if we have an anti-dependence then, as the vectorized loop covers at
241 least two scalar iterations, there is always also a true dependence.
242 As the vectorizer does not re-order loads and stores we can ignore
243 the anti-dependence if TBAA can disambiguate both DRs similar to the
244 case with known negative distance anti-dependences (positive
245 distance anti-dependences would violate TBAA constraints). */
246 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
247 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
248 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
249 get_alias_set (DR_REF (drb))))
250 return false;
252 /* Unknown data dependence. */
253 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
255 /* If user asserted safelen consecutive iterations can be
256 executed concurrently, assume independence. */
257 if (loop->safelen >= 2)
259 if (loop->safelen < *max_vf)
260 *max_vf = loop->safelen;
261 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
262 return false;
265 if (STMT_VINFO_GATHER_P (stmtinfo_a)
266 || STMT_VINFO_GATHER_P (stmtinfo_b))
268 if (dump_enabled_p ())
270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
271 "versioning for alias not supported for: "
272 "can't determine dependence between ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (dra));
275 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
276 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
277 DR_REF (drb));
278 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
280 return true;
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
286 "versioning for alias required: "
287 "can't determine dependence between ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (dra));
290 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (drb));
293 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
296 /* Add to list of ddrs that need to be tested at run-time. */
297 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
300 /* Known data dependence. */
301 if (DDR_NUM_DIST_VECTS (ddr) == 0)
303 /* If user asserted safelen consecutive iterations can be
304 executed concurrently, assume independence. */
305 if (loop->safelen >= 2)
307 if (loop->safelen < *max_vf)
308 *max_vf = loop->safelen;
309 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
310 return false;
313 if (STMT_VINFO_GATHER_P (stmtinfo_a)
314 || STMT_VINFO_GATHER_P (stmtinfo_b))
316 if (dump_enabled_p ())
318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
319 "versioning for alias not supported for: "
320 "bad dist vector for ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (dra));
323 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
324 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
325 DR_REF (drb));
326 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
328 return true;
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
334 "versioning for alias required: "
335 "bad dist vector for ");
336 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
337 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
338 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
339 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
341 /* Add to list of ddrs that need to be tested at run-time. */
342 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
345 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
346 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
348 int dist = dist_v[loop_depth];
350 if (dump_enabled_p ())
351 dump_printf_loc (MSG_NOTE, vect_location,
352 "dependence distance = %d.\n", dist);
354 if (dist == 0)
356 if (dump_enabled_p ())
358 dump_printf_loc (MSG_NOTE, vect_location,
359 "dependence distance == 0 between ");
360 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
361 dump_printf (MSG_NOTE, " and ");
362 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
363 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
366 /* When we perform grouped accesses and perform implicit CSE
367 by detecting equal accesses and doing disambiguation with
368 runtime alias tests like for
369 .. = a[i];
370 .. = a[i+1];
371 a[i] = ..;
372 a[i+1] = ..;
373 *p = ..;
374 .. = a[i];
375 .. = a[i+1];
376 where we will end up loading { a[i], a[i+1] } once, make
377 sure that inserting group loads before the first load and
378 stores after the last store will do the right thing.
379 Similar for groups like
380 a[i] = ...;
381 ... = a[i];
382 a[i+1] = ...;
383 where loads from the group interleave with the store. */
384 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
385 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
387 gimple earlier_stmt;
388 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
389 if (DR_IS_WRITE
390 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
392 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "READ_WRITE dependence in interleaving."
395 "\n");
396 return true;
400 continue;
403 if (dist > 0 && DDR_REVERSED_P (ddr))
405 /* If DDR_REVERSED_P the order of the data-refs in DDR was
406 reversed (to make distance vector positive), and the actual
407 distance is negative. */
408 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "dependence distance negative.\n");
411 /* Record a negative dependence distance to later limit the
412 amount of stmt copying / unrolling we can perform.
413 Only need to handle read-after-write dependence. */
414 if (DR_IS_READ (drb)
415 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
416 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
417 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
418 continue;
421 if (abs (dist) >= 2
422 && abs (dist) < *max_vf)
424 /* The dependence distance requires reduction of the maximal
425 vectorization factor. */
426 *max_vf = abs (dist);
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "adjusting maximal vectorization factor to %i\n",
430 *max_vf);
433 if (abs (dist) >= *max_vf)
435 /* Dependence distance does not create dependence, as far as
436 vectorization is concerned, in this case. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_NOTE, vect_location,
439 "dependence distance >= VF.\n");
440 continue;
443 if (dump_enabled_p ())
445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
446 "not vectorized, possible dependence "
447 "between data-refs ");
448 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
449 dump_printf (MSG_NOTE, " and ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
451 dump_printf (MSG_NOTE, "\n");
454 return true;
457 return false;
460 /* Function vect_analyze_data_ref_dependences.
462 Examine all the data references in the loop, and make sure there do not
463 exist any data dependences between them. Set *MAX_VF according to
464 the maximum vectorization factor the data dependences allow. */
466 bool
467 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
469 unsigned int i;
470 struct data_dependence_relation *ddr;
472 if (dump_enabled_p ())
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "=== vect_analyze_data_ref_dependences ===\n");
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
478 &LOOP_VINFO_DDRS (loop_vinfo),
479 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
480 return false;
482 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
483 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
484 return false;
486 return true;
490 /* Function vect_slp_analyze_data_ref_dependence.
492 Return TRUE if there (might) exist a dependence between a memory-reference
493 DRA and a memory-reference DRB. When versioning for alias may check a
494 dependence at run-time, return FALSE. Adjust *MAX_VF according to
495 the data dependence. */
497 static bool
498 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
500 struct data_reference *dra = DDR_A (ddr);
501 struct data_reference *drb = DDR_B (ddr);
503 /* We need to check dependences of statements marked as unvectorizable
504 as well, they still can prohibit vectorization. */
506 /* Independent data accesses. */
507 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
508 return false;
510 if (dra == drb)
511 return false;
513 /* Read-read is OK. */
514 if (DR_IS_READ (dra) && DR_IS_READ (drb))
515 return false;
517 /* If dra and drb are part of the same interleaving chain consider
518 them independent. */
519 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
520 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
521 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
522 return false;
524 /* Unknown data dependence. */
525 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "can't determine dependence between ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
532 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
533 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
534 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
537 else if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location,
540 "determined dependence between ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
542 dump_printf (MSG_NOTE, " and ");
543 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
544 dump_printf (MSG_NOTE, "\n");
547 /* We do not vectorize basic blocks with write-write dependencies. */
548 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
549 return true;
551 /* If we have a read-write dependence check that the load is before the store.
552 When we vectorize basic blocks, vector load can be only before
553 corresponding scalar load, and vector store can be only after its
554 corresponding scalar store. So the order of the acceses is preserved in
555 case the load is before the store. */
556 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
557 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
559 /* That only holds for load-store pairs taking part in vectorization. */
560 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
561 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
562 return false;
565 return true;
569 /* Function vect_analyze_data_ref_dependences.
571 Examine all the data references in the basic-block, and make sure there
572 do not exist any data dependences between them. Set *MAX_VF according to
573 the maximum vectorization factor the data dependences allow. */
575 bool
576 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
578 struct data_dependence_relation *ddr;
579 unsigned int i;
581 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "=== vect_slp_analyze_data_ref_dependences ===\n");
585 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
586 &BB_VINFO_DDRS (bb_vinfo),
587 vNULL, true))
588 return false;
590 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
591 if (vect_slp_analyze_data_ref_dependence (ddr))
592 return false;
594 return true;
598 /* Function vect_compute_data_ref_alignment
600 Compute the misalignment of the data reference DR.
602 Output:
603 1. If during the misalignment computation it is found that the data reference
604 cannot be vectorized then false is returned.
605 2. DR_MISALIGNMENT (DR) is defined.
607 FOR NOW: No analysis is actually performed. Misalignment is calculated
608 only for trivial cases. TODO. */
610 static bool
611 vect_compute_data_ref_alignment (struct data_reference *dr)
613 gimple stmt = DR_STMT (dr);
614 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
615 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
616 struct loop *loop = NULL;
617 tree ref = DR_REF (dr);
618 tree vectype;
619 tree base, base_addr;
620 bool base_aligned;
621 tree misalign;
622 tree aligned_to, alignment;
624 if (dump_enabled_p ())
625 dump_printf_loc (MSG_NOTE, vect_location,
626 "vect_compute_data_ref_alignment:\n");
628 if (loop_vinfo)
629 loop = LOOP_VINFO_LOOP (loop_vinfo);
631 /* Initialize misalignment to unknown. */
632 SET_DR_MISALIGNMENT (dr, -1);
634 /* Strided loads perform only component accesses, misalignment information
635 is irrelevant for them. */
636 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
637 return true;
639 misalign = DR_INIT (dr);
640 aligned_to = DR_ALIGNED_TO (dr);
641 base_addr = DR_BASE_ADDRESS (dr);
642 vectype = STMT_VINFO_VECTYPE (stmt_info);
644 /* In case the dataref is in an inner-loop of the loop that is being
645 vectorized (LOOP), we use the base and misalignment information
646 relative to the outer-loop (LOOP). This is ok only if the misalignment
647 stays the same throughout the execution of the inner-loop, which is why
648 we have to check that the stride of the dataref in the inner-loop evenly
649 divides by the vector size. */
650 if (loop && nested_in_vect_loop_p (loop, stmt))
652 tree step = DR_STEP (dr);
653 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
655 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
657 if (dump_enabled_p ())
658 dump_printf_loc (MSG_NOTE, vect_location,
659 "inner step divides the vector-size.\n");
660 misalign = STMT_VINFO_DR_INIT (stmt_info);
661 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
662 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
664 else
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
668 "inner step doesn't divide the vector-size.\n");
669 misalign = NULL_TREE;
673 /* Similarly, if we're doing basic-block vectorization, we can only use
674 base and misalignment information relative to an innermost loop if the
675 misalignment stays the same throughout the execution of the loop.
676 As above, this is the case if the stride of the dataref evenly divides
677 by the vector size. */
678 if (!loop)
680 tree step = DR_STEP (dr);
681 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
683 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
685 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "SLP: step doesn't divide the vector-size.\n");
688 misalign = NULL_TREE;
692 base = build_fold_indirect_ref (base_addr);
693 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
695 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
696 || !misalign)
698 if (dump_enabled_p ())
700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
701 "Unknown alignment for access: ");
702 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
703 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
705 return true;
708 if ((DECL_P (base)
709 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
710 alignment) >= 0)
711 || (TREE_CODE (base_addr) == SSA_NAME
712 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
713 TREE_TYPE (base_addr)))),
714 alignment) >= 0)
715 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
716 base_aligned = true;
717 else
718 base_aligned = false;
720 if (!base_aligned)
722 /* Do not change the alignment of global variables here if
723 flag_section_anchors is enabled as we already generated
724 RTL for other functions. Most global variables should
725 have been aligned during the IPA increase_alignment pass. */
726 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
727 || (TREE_STATIC (base) && flag_section_anchors))
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "can't force alignment of ref: ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
734 dump_printf (MSG_NOTE, "\n");
736 return true;
739 /* Force the alignment of the decl.
740 NOTE: This is the only change to the code we make during
741 the analysis phase, before deciding to vectorize the loop. */
742 if (dump_enabled_p ())
744 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
745 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
746 dump_printf (MSG_NOTE, "\n");
749 ((dataref_aux *)dr->aux)->base_decl = base;
750 ((dataref_aux *)dr->aux)->base_misaligned = true;
753 /* If this is a backward running DR then first access in the larger
754 vectype actually is N-1 elements before the address in the DR.
755 Adjust misalign accordingly. */
756 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
758 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
759 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
760 otherwise we wouldn't be here. */
761 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
762 /* PLUS because DR_STEP was negative. */
763 misalign = size_binop (PLUS_EXPR, misalign, offset);
766 /* Modulo alignment. */
767 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
769 if (!tree_fits_uhwi_p (misalign))
771 /* Negative or overflowed misalignment value. */
772 if (dump_enabled_p ())
773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
774 "unexpected misalign value\n");
775 return false;
778 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
783 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
784 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
785 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
788 return true;
792 /* Function vect_compute_data_refs_alignment
794 Compute the misalignment of data references in the loop.
795 Return FALSE if a data reference is found that cannot be vectorized. */
797 static bool
798 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
799 bb_vec_info bb_vinfo)
801 vec<data_reference_p> datarefs;
802 struct data_reference *dr;
803 unsigned int i;
805 if (loop_vinfo)
806 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
807 else
808 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
810 FOR_EACH_VEC_ELT (datarefs, i, dr)
811 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
812 && !vect_compute_data_ref_alignment (dr))
814 if (bb_vinfo)
816 /* Mark unsupported statement as unvectorizable. */
817 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
818 continue;
820 else
821 return false;
824 return true;
828 /* Function vect_update_misalignment_for_peel
830 DR - the data reference whose misalignment is to be adjusted.
831 DR_PEEL - the data reference whose misalignment is being made
832 zero in the vector loop by the peel.
833 NPEEL - the number of iterations in the peel loop if the misalignment
834 of DR_PEEL is known at compile time. */
836 static void
837 vect_update_misalignment_for_peel (struct data_reference *dr,
838 struct data_reference *dr_peel, int npeel)
840 unsigned int i;
841 vec<dr_p> same_align_drs;
842 struct data_reference *current_dr;
843 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
844 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
845 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
846 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
848 /* For interleaved data accesses the step in the loop must be multiplied by
849 the size of the interleaving group. */
850 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
851 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
852 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
853 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
855 /* It can be assumed that the data refs with the same alignment as dr_peel
856 are aligned in the vector loop. */
857 same_align_drs
858 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
859 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
861 if (current_dr != dr)
862 continue;
863 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
864 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
865 SET_DR_MISALIGNMENT (dr, 0);
866 return;
869 if (known_alignment_for_access_p (dr)
870 && known_alignment_for_access_p (dr_peel))
872 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
873 int misal = DR_MISALIGNMENT (dr);
874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
875 misal += negative ? -npeel * dr_size : npeel * dr_size;
876 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
877 SET_DR_MISALIGNMENT (dr, misal);
878 return;
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
883 SET_DR_MISALIGNMENT (dr, -1);
887 /* Function vect_verify_datarefs_alignment
889 Return TRUE if all data references in the loop can be
890 handled with respect to alignment. */
892 bool
893 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
895 vec<data_reference_p> datarefs;
896 struct data_reference *dr;
897 enum dr_alignment_support supportable_dr_alignment;
898 unsigned int i;
900 if (loop_vinfo)
901 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
902 else
903 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
905 FOR_EACH_VEC_ELT (datarefs, i, dr)
907 gimple stmt = DR_STMT (dr);
908 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
910 if (!STMT_VINFO_RELEVANT_P (stmt_info))
911 continue;
913 /* For interleaving, only the alignment of the first access matters.
914 Skip statements marked as not vectorizable. */
915 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
916 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
917 || !STMT_VINFO_VECTORIZABLE (stmt_info))
918 continue;
920 /* Strided loads perform only component accesses, alignment is
921 irrelevant for them. */
922 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
923 continue;
925 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
926 if (!supportable_dr_alignment)
928 if (dump_enabled_p ())
930 if (DR_IS_READ (dr))
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "not vectorized: unsupported unaligned load.");
933 else
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
935 "not vectorized: unsupported unaligned "
936 "store.");
938 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
939 DR_REF (dr));
940 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
942 return false;
944 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
945 dump_printf_loc (MSG_NOTE, vect_location,
946 "Vectorizing an unaligned access.\n");
948 return true;
951 /* Given an memory reference EXP return whether its alignment is less
952 than its size. */
954 static bool
955 not_size_aligned (tree exp)
957 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
958 return true;
960 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
961 > get_object_alignment (exp));
964 /* Function vector_alignment_reachable_p
966 Return true if vector alignment for DR is reachable by peeling
967 a few loop iterations. Return false otherwise. */
969 static bool
970 vector_alignment_reachable_p (struct data_reference *dr)
972 gimple stmt = DR_STMT (dr);
973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
978 /* For interleaved access we peel only if number of iterations in
979 the prolog loop ({VF - misalignment}), is a multiple of the
980 number of the interleaved accesses. */
981 int elem_size, mis_in_elements;
982 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
984 /* FORNOW: handle only known alignment. */
985 if (!known_alignment_for_access_p (dr))
986 return false;
988 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
989 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
991 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
992 return false;
995 /* If misalignment is known at the compile time then allow peeling
996 only if natural alignment is reachable through peeling. */
997 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
999 HOST_WIDE_INT elmsize =
1000 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location,
1004 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1005 dump_printf (MSG_NOTE,
1006 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1008 if (DR_MISALIGNMENT (dr) % elmsize)
1010 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012 "data size does not divide the misalignment.\n");
1013 return false;
1017 if (!known_alignment_for_access_p (dr))
1019 tree type = TREE_TYPE (DR_REF (dr));
1020 bool is_packed = not_size_aligned (DR_REF (dr));
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023 "Unknown misalignment, is_packed = %d\n",is_packed);
1024 if ((TYPE_USER_ALIGN (type) && !is_packed)
1025 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1026 return true;
1027 else
1028 return false;
1031 return true;
1035 /* Calculate the cost of the memory access represented by DR. */
1037 static void
1038 vect_get_data_access_cost (struct data_reference *dr,
1039 unsigned int *inside_cost,
1040 unsigned int *outside_cost,
1041 stmt_vector_for_cost *body_cost_vec)
1043 gimple stmt = DR_STMT (dr);
1044 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1045 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1047 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1048 int ncopies = vf / nunits;
1050 if (DR_IS_READ (dr))
1051 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1052 NULL, body_cost_vec, false);
1053 else
1054 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_NOTE, vect_location,
1058 "vect_get_data_access_cost: inside_cost = %d, "
1059 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1063 /* Insert DR into peeling hash table with NPEEL as key. */
1065 static void
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1067 int npeel)
1069 struct _vect_peel_info elem, *slot;
1070 _vect_peel_info **new_slot;
1071 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1073 elem.npeel = npeel;
1074 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1075 if (slot)
1076 slot->count++;
1077 else
1079 slot = XNEW (struct _vect_peel_info);
1080 slot->npeel = npeel;
1081 slot->dr = dr;
1082 slot->count = 1;
1083 new_slot
1084 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1085 *new_slot = slot;
1088 if (!supportable_dr_alignment
1089 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1090 slot->count += VECT_MAX_COST;
1094 /* Traverse peeling hash table to find peeling option that aligns maximum
1095 number of data accesses. */
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099 _vect_peel_extended_info *max)
1101 vect_peel_info elem = *slot;
1103 if (elem->count > max->peel_info.count
1104 || (elem->count == max->peel_info.count
1105 && max->peel_info.npeel > elem->npeel))
1107 max->peel_info.npeel = elem->npeel;
1108 max->peel_info.count = elem->count;
1109 max->peel_info.dr = elem->dr;
1112 return 1;
1116 /* Traverse peeling hash table and calculate cost for each peeling option.
1117 Find the one with the lowest cost. */
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121 _vect_peel_extended_info *min)
1123 vect_peel_info elem = *slot;
1124 int save_misalignment, dummy;
1125 unsigned int inside_cost = 0, outside_cost = 0, i;
1126 gimple stmt = DR_STMT (elem->dr);
1127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130 struct data_reference *dr;
1131 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1132 int single_iter_cost;
1134 prologue_cost_vec.create (2);
1135 body_cost_vec.create (2);
1136 epilogue_cost_vec.create (2);
1138 FOR_EACH_VEC_ELT (datarefs, i, dr)
1140 stmt = DR_STMT (dr);
1141 stmt_info = vinfo_for_stmt (stmt);
1142 /* For interleaving, only the alignment of the first access
1143 matters. */
1144 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1146 continue;
1148 save_misalignment = DR_MISALIGNMENT (dr);
1149 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1150 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1151 &body_cost_vec);
1152 SET_DR_MISALIGNMENT (dr, save_misalignment);
1155 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1156 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1157 &dummy, single_iter_cost,
1158 &prologue_cost_vec,
1159 &epilogue_cost_vec);
1161 /* Prologue and epilogue costs are added to the target model later.
1162 These costs depend only on the scalar iteration cost, the
1163 number of peeling iterations finally chosen, and the number of
1164 misaligned statements. So discard the information found here. */
1165 prologue_cost_vec.release ();
1166 epilogue_cost_vec.release ();
1168 if (inside_cost < min->inside_cost
1169 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1171 min->inside_cost = inside_cost;
1172 min->outside_cost = outside_cost;
1173 min->body_cost_vec.release ();
1174 min->body_cost_vec = body_cost_vec;
1175 min->peel_info.dr = elem->dr;
1176 min->peel_info.npeel = elem->npeel;
1178 else
1179 body_cost_vec.release ();
1181 return 1;
1185 /* Choose best peeling option by traversing peeling hash table and either
1186 choosing an option with the lowest cost (if cost model is enabled) or the
1187 option that aligns as many accesses as possible. */
1189 static struct data_reference *
1190 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1191 unsigned int *npeel,
1192 stmt_vector_for_cost *body_cost_vec)
1194 struct _vect_peel_extended_info res;
1196 res.peel_info.dr = NULL;
1197 res.body_cost_vec = stmt_vector_for_cost ();
1199 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1201 res.inside_cost = INT_MAX;
1202 res.outside_cost = INT_MAX;
1203 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1204 ->traverse <_vect_peel_extended_info *,
1205 vect_peeling_hash_get_lowest_cost> (&res);
1207 else
1209 res.peel_info.count = 0;
1210 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1211 ->traverse <_vect_peel_extended_info *,
1212 vect_peeling_hash_get_most_frequent> (&res);
1215 *npeel = res.peel_info.npeel;
1216 *body_cost_vec = res.body_cost_vec;
1217 return res.peel_info.dr;
1221 /* Function vect_enhance_data_refs_alignment
1223 This pass will use loop versioning and loop peeling in order to enhance
1224 the alignment of data references in the loop.
1226 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1227 original loop is to be vectorized. Any other loops that are created by
1228 the transformations performed in this pass - are not supposed to be
1229 vectorized. This restriction will be relaxed.
1231 This pass will require a cost model to guide it whether to apply peeling
1232 or versioning or a combination of the two. For example, the scheme that
1233 intel uses when given a loop with several memory accesses, is as follows:
1234 choose one memory access ('p') which alignment you want to force by doing
1235 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1236 other accesses are not necessarily aligned, or (2) use loop versioning to
1237 generate one loop in which all accesses are aligned, and another loop in
1238 which only 'p' is necessarily aligned.
1240 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1241 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1242 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1244 Devising a cost model is the most critical aspect of this work. It will
1245 guide us on which access to peel for, whether to use loop versioning, how
1246 many versions to create, etc. The cost model will probably consist of
1247 generic considerations as well as target specific considerations (on
1248 powerpc for example, misaligned stores are more painful than misaligned
1249 loads).
1251 Here are the general steps involved in alignment enhancements:
1253 -- original loop, before alignment analysis:
1254 for (i=0; i<N; i++){
1255 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1256 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1259 -- After vect_compute_data_refs_alignment:
1260 for (i=0; i<N; i++){
1261 x = q[i]; # DR_MISALIGNMENT(q) = 3
1262 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1265 -- Possibility 1: we do loop versioning:
1266 if (p is aligned) {
1267 for (i=0; i<N; i++){ # loop 1A
1268 x = q[i]; # DR_MISALIGNMENT(q) = 3
1269 p[i] = y; # DR_MISALIGNMENT(p) = 0
1272 else {
1273 for (i=0; i<N; i++){ # loop 1B
1274 x = q[i]; # DR_MISALIGNMENT(q) = 3
1275 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1279 -- Possibility 2: we do loop peeling:
1280 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1281 x = q[i];
1282 p[i] = y;
1284 for (i = 3; i < N; i++){ # loop 2A
1285 x = q[i]; # DR_MISALIGNMENT(q) = 0
1286 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 -- Possibility 3: combination of loop peeling and versioning:
1290 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1291 x = q[i];
1292 p[i] = y;
1294 if (p is aligned) {
1295 for (i = 3; i<N; i++){ # loop 3A
1296 x = q[i]; # DR_MISALIGNMENT(q) = 0
1297 p[i] = y; # DR_MISALIGNMENT(p) = 0
1300 else {
1301 for (i = 3; i<N; i++){ # loop 3B
1302 x = q[i]; # DR_MISALIGNMENT(q) = 0
1303 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1307 These loops are later passed to loop_transform to be vectorized. The
1308 vectorizer will use the alignment information to guide the transformation
1309 (whether to generate regular loads/stores, or with special handling for
1310 misalignment). */
1312 bool
1313 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1315 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1317 enum dr_alignment_support supportable_dr_alignment;
1318 struct data_reference *dr0 = NULL, *first_store = NULL;
1319 struct data_reference *dr;
1320 unsigned int i, j;
1321 bool do_peeling = false;
1322 bool do_versioning = false;
1323 bool stat;
1324 gimple stmt;
1325 stmt_vec_info stmt_info;
1326 unsigned int npeel = 0;
1327 bool all_misalignments_unknown = true;
1328 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1329 unsigned possible_npeel_number = 1;
1330 tree vectype;
1331 unsigned int nelements, mis, same_align_drs_max = 0;
1332 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_NOTE, vect_location,
1336 "=== vect_enhance_data_refs_alignment ===\n");
1338 /* While cost model enhancements are expected in the future, the high level
1339 view of the code at this time is as follows:
1341 A) If there is a misaligned access then see if peeling to align
1342 this access can make all data references satisfy
1343 vect_supportable_dr_alignment. If so, update data structures
1344 as needed and return true.
1346 B) If peeling wasn't possible and there is a data reference with an
1347 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1348 then see if loop versioning checks can be used to make all data
1349 references satisfy vect_supportable_dr_alignment. If so, update
1350 data structures as needed and return true.
1352 C) If neither peeling nor versioning were successful then return false if
1353 any data reference does not satisfy vect_supportable_dr_alignment.
1355 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1357 Note, Possibility 3 above (which is peeling and versioning together) is not
1358 being done at this time. */
1360 /* (1) Peeling to force alignment. */
1362 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1363 Considerations:
1364 + How many accesses will become aligned due to the peeling
1365 - How many accesses will become unaligned due to the peeling,
1366 and the cost of misaligned accesses.
1367 - The cost of peeling (the extra runtime checks, the increase
1368 in code size). */
1370 FOR_EACH_VEC_ELT (datarefs, i, dr)
1372 stmt = DR_STMT (dr);
1373 stmt_info = vinfo_for_stmt (stmt);
1375 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1376 continue;
1378 /* For interleaving, only the alignment of the first access
1379 matters. */
1380 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1381 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1382 continue;
1384 /* For invariant accesses there is nothing to enhance. */
1385 if (integer_zerop (DR_STEP (dr)))
1386 continue;
1388 /* Strided loads perform only component accesses, alignment is
1389 irrelevant for them. */
1390 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1391 continue;
1393 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1394 do_peeling = vector_alignment_reachable_p (dr);
1395 if (do_peeling)
1397 if (known_alignment_for_access_p (dr))
1399 unsigned int npeel_tmp;
1400 bool negative = tree_int_cst_compare (DR_STEP (dr),
1401 size_zero_node) < 0;
1403 /* Save info about DR in the hash table. */
1404 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1405 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1406 = new hash_table<peel_info_hasher> (1);
1408 vectype = STMT_VINFO_VECTYPE (stmt_info);
1409 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411 TREE_TYPE (DR_REF (dr))));
1412 npeel_tmp = (negative
1413 ? (mis - nelements) : (nelements - mis))
1414 & (nelements - 1);
1416 /* For multiple types, it is possible that the bigger type access
1417 will have more than one peeling option. E.g., a loop with two
1418 types: one of size (vector size / 4), and the other one of
1419 size (vector size / 8). Vectorization factor will 8. If both
1420 access are misaligned by 3, the first one needs one scalar
1421 iteration to be aligned, and the second one needs 5. But the
1422 the first one will be aligned also by peeling 5 scalar
1423 iterations, and in that case both accesses will be aligned.
1424 Hence, except for the immediate peeling amount, we also want
1425 to try to add full vector size, while we don't exceed
1426 vectorization factor.
1427 We do this automtically for cost model, since we calculate cost
1428 for every peeling option. */
1429 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430 possible_npeel_number = vf /nelements;
1432 /* Handle the aligned case. We may decide to align some other
1433 access, making DR unaligned. */
1434 if (DR_MISALIGNMENT (dr) == 0)
1436 npeel_tmp = 0;
1437 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1438 possible_npeel_number++;
1441 for (j = 0; j < possible_npeel_number; j++)
1443 gcc_assert (npeel_tmp <= vf);
1444 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445 npeel_tmp += nelements;
1448 all_misalignments_unknown = false;
1449 /* Data-ref that was chosen for the case that all the
1450 misalignments are unknown is not relevant anymore, since we
1451 have a data-ref with known alignment. */
1452 dr0 = NULL;
1454 else
1456 /* If we don't know any misalignment values, we prefer
1457 peeling for data-ref that has the maximum number of data-refs
1458 with the same alignment, unless the target prefers to align
1459 stores over load. */
1460 if (all_misalignments_unknown)
1462 unsigned same_align_drs
1463 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464 if (!dr0
1465 || same_align_drs_max < same_align_drs)
1467 same_align_drs_max = same_align_drs;
1468 dr0 = dr;
1470 /* For data-refs with the same number of related
1471 accesses prefer the one where the misalign
1472 computation will be invariant in the outermost loop. */
1473 else if (same_align_drs_max == same_align_drs)
1475 struct loop *ivloop0, *ivloop;
1476 ivloop0 = outermost_invariant_loop_for_expr
1477 (loop, DR_BASE_ADDRESS (dr0));
1478 ivloop = outermost_invariant_loop_for_expr
1479 (loop, DR_BASE_ADDRESS (dr));
1480 if ((ivloop && !ivloop0)
1481 || (ivloop && ivloop0
1482 && flow_loop_nested_p (ivloop, ivloop0)))
1483 dr0 = dr;
1486 if (!first_store && DR_IS_WRITE (dr))
1487 first_store = dr;
1490 /* If there are both known and unknown misaligned accesses in the
1491 loop, we choose peeling amount according to the known
1492 accesses. */
1493 if (!supportable_dr_alignment)
1495 dr0 = dr;
1496 if (!first_store && DR_IS_WRITE (dr))
1497 first_store = dr;
1501 else
1503 if (!aligned_access_p (dr))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "vector alignment may not be reachable\n");
1508 break;
1513 /* Check if we can possibly peel the loop. */
1514 if (!vect_can_advance_ivs_p (loop_vinfo)
1515 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1516 do_peeling = false;
1518 if (do_peeling && all_misalignments_unknown
1519 && vect_supportable_dr_alignment (dr0, false))
1522 /* Check if the target requires to prefer stores over loads, i.e., if
1523 misaligned stores are more expensive than misaligned loads (taking
1524 drs with same alignment into account). */
1525 if (first_store && DR_IS_READ (dr0))
1527 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1528 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1529 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1530 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1531 stmt_vector_for_cost dummy;
1532 dummy.create (2);
1534 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1535 &dummy);
1536 vect_get_data_access_cost (first_store, &store_inside_cost,
1537 &store_outside_cost, &dummy);
1539 dummy.release ();
1541 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1542 aligning the load DR0). */
1543 load_inside_penalty = store_inside_cost;
1544 load_outside_penalty = store_outside_cost;
1545 for (i = 0;
1546 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1547 DR_STMT (first_store))).iterate (i, &dr);
1548 i++)
1549 if (DR_IS_READ (dr))
1551 load_inside_penalty += load_inside_cost;
1552 load_outside_penalty += load_outside_cost;
1554 else
1556 load_inside_penalty += store_inside_cost;
1557 load_outside_penalty += store_outside_cost;
1560 /* Calculate the penalty for leaving DR0 unaligned (by
1561 aligning the FIRST_STORE). */
1562 store_inside_penalty = load_inside_cost;
1563 store_outside_penalty = load_outside_cost;
1564 for (i = 0;
1565 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1566 DR_STMT (dr0))).iterate (i, &dr);
1567 i++)
1568 if (DR_IS_READ (dr))
1570 store_inside_penalty += load_inside_cost;
1571 store_outside_penalty += load_outside_cost;
1573 else
1575 store_inside_penalty += store_inside_cost;
1576 store_outside_penalty += store_outside_cost;
1579 if (load_inside_penalty > store_inside_penalty
1580 || (load_inside_penalty == store_inside_penalty
1581 && load_outside_penalty > store_outside_penalty))
1582 dr0 = first_store;
1585 /* In case there are only loads with different unknown misalignments, use
1586 peeling only if it may help to align other accesses in the loop. */
1587 if (!first_store
1588 && !STMT_VINFO_SAME_ALIGN_REFS (
1589 vinfo_for_stmt (DR_STMT (dr0))).length ()
1590 && vect_supportable_dr_alignment (dr0, false)
1591 != dr_unaligned_supported)
1592 do_peeling = false;
1595 if (do_peeling && !dr0)
1597 /* Peeling is possible, but there is no data access that is not supported
1598 unless aligned. So we try to choose the best possible peeling. */
1600 /* We should get here only if there are drs with known misalignment. */
1601 gcc_assert (!all_misalignments_unknown);
1603 /* Choose the best peeling from the hash table. */
1604 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1605 &body_cost_vec);
1606 if (!dr0 || !npeel)
1607 do_peeling = false;
1610 if (do_peeling)
1612 stmt = DR_STMT (dr0);
1613 stmt_info = vinfo_for_stmt (stmt);
1614 vectype = STMT_VINFO_VECTYPE (stmt_info);
1615 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1617 if (known_alignment_for_access_p (dr0))
1619 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1620 size_zero_node) < 0;
1621 if (!npeel)
1623 /* Since it's known at compile time, compute the number of
1624 iterations in the peeled loop (the peeling factor) for use in
1625 updating DR_MISALIGNMENT values. The peeling factor is the
1626 vectorization factor minus the misalignment as an element
1627 count. */
1628 mis = DR_MISALIGNMENT (dr0);
1629 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1630 npeel = ((negative ? mis - nelements : nelements - mis)
1631 & (nelements - 1));
1634 /* For interleaved data access every iteration accesses all the
1635 members of the group, therefore we divide the number of iterations
1636 by the group size. */
1637 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1638 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1639 npeel /= GROUP_SIZE (stmt_info);
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "Try peeling by %d\n", npeel);
1646 /* Ensure that all data refs can be vectorized after the peel. */
1647 FOR_EACH_VEC_ELT (datarefs, i, dr)
1649 int save_misalignment;
1651 if (dr == dr0)
1652 continue;
1654 stmt = DR_STMT (dr);
1655 stmt_info = vinfo_for_stmt (stmt);
1656 /* For interleaving, only the alignment of the first access
1657 matters. */
1658 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1659 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1660 continue;
1662 /* Strided loads perform only component accesses, alignment is
1663 irrelevant for them. */
1664 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1665 continue;
1667 save_misalignment = DR_MISALIGNMENT (dr);
1668 vect_update_misalignment_for_peel (dr, dr0, npeel);
1669 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1670 SET_DR_MISALIGNMENT (dr, save_misalignment);
1672 if (!supportable_dr_alignment)
1674 do_peeling = false;
1675 break;
1679 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1681 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1682 if (!stat)
1683 do_peeling = false;
1684 else
1686 body_cost_vec.release ();
1687 return stat;
1691 if (do_peeling)
1693 unsigned max_allowed_peel
1694 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1695 if (max_allowed_peel != (unsigned)-1)
1697 unsigned max_peel = npeel;
1698 if (max_peel == 0)
1700 gimple dr_stmt = DR_STMT (dr0);
1701 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1702 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1703 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1705 if (max_peel > max_allowed_peel)
1707 do_peeling = false;
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "Disable peeling, max peels reached: %d\n", max_peel);
1715 if (do_peeling)
1717 stmt_info_for_cost *si;
1718 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1720 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1721 If the misalignment of DR_i is identical to that of dr0 then set
1722 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1723 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1724 by the peeling factor times the element size of DR_i (MOD the
1725 vectorization factor times the size). Otherwise, the
1726 misalignment of DR_i must be set to unknown. */
1727 FOR_EACH_VEC_ELT (datarefs, i, dr)
1728 if (dr != dr0)
1729 vect_update_misalignment_for_peel (dr, dr0, npeel);
1731 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1732 if (npeel)
1733 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1734 else
1735 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1736 = DR_MISALIGNMENT (dr0);
1737 SET_DR_MISALIGNMENT (dr0, 0);
1738 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Alignment of access forced using peeling.\n");
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Peeling for alignment will be applied.\n");
1745 /* We've delayed passing the inside-loop peeling costs to the
1746 target cost model until we were sure peeling would happen.
1747 Do so now. */
1748 if (body_cost_vec.exists ())
1750 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1752 struct _stmt_vec_info *stmt_info
1753 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1754 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1755 si->misalign, vect_body);
1757 body_cost_vec.release ();
1760 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1761 gcc_assert (stat);
1762 return stat;
1766 body_cost_vec.release ();
1768 /* (2) Versioning to force alignment. */
1770 /* Try versioning if:
1771 1) optimize loop for speed
1772 2) there is at least one unsupported misaligned data ref with an unknown
1773 misalignment, and
1774 3) all misaligned data refs with a known misalignment are supported, and
1775 4) the number of runtime alignment checks is within reason. */
1777 do_versioning =
1778 optimize_loop_nest_for_speed_p (loop)
1779 && (!loop->inner); /* FORNOW */
1781 if (do_versioning)
1783 FOR_EACH_VEC_ELT (datarefs, i, dr)
1785 stmt = DR_STMT (dr);
1786 stmt_info = vinfo_for_stmt (stmt);
1788 /* For interleaving, only the alignment of the first access
1789 matters. */
1790 if (aligned_access_p (dr)
1791 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1792 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1793 continue;
1795 /* Strided loads perform only component accesses, alignment is
1796 irrelevant for them. */
1797 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1798 continue;
1800 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1802 if (!supportable_dr_alignment)
1804 gimple stmt;
1805 int mask;
1806 tree vectype;
1808 if (known_alignment_for_access_p (dr)
1809 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1810 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1812 do_versioning = false;
1813 break;
1816 stmt = DR_STMT (dr);
1817 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1818 gcc_assert (vectype);
1820 /* The rightmost bits of an aligned address must be zeros.
1821 Construct the mask needed for this test. For example,
1822 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1823 mask must be 15 = 0xf. */
1824 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1826 /* FORNOW: use the same mask to test all potentially unaligned
1827 references in the loop. The vectorizer currently supports
1828 a single vector size, see the reference to
1829 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1830 vectorization factor is computed. */
1831 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1832 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1833 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1834 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1835 DR_STMT (dr));
1839 /* Versioning requires at least one misaligned data reference. */
1840 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1841 do_versioning = false;
1842 else if (!do_versioning)
1843 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1846 if (do_versioning)
1848 vec<gimple> may_misalign_stmts
1849 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1850 gimple stmt;
1852 /* It can now be assumed that the data references in the statements
1853 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1854 of the loop being vectorized. */
1855 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1857 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1858 dr = STMT_VINFO_DATA_REF (stmt_info);
1859 SET_DR_MISALIGNMENT (dr, 0);
1860 if (dump_enabled_p ())
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Alignment of access forced using versioning.\n");
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_NOTE, vect_location,
1867 "Versioning for alignment will be applied.\n");
1869 /* Peeling and versioning can't be done together at this time. */
1870 gcc_assert (! (do_peeling && do_versioning));
1872 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1873 gcc_assert (stat);
1874 return stat;
1877 /* This point is reached if neither peeling nor versioning is being done. */
1878 gcc_assert (! (do_peeling || do_versioning));
1880 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1881 return stat;
1885 /* Function vect_find_same_alignment_drs.
1887 Update group and alignment relations according to the chosen
1888 vectorization factor. */
1890 static void
1891 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1892 loop_vec_info loop_vinfo)
1894 unsigned int i;
1895 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1896 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1897 struct data_reference *dra = DDR_A (ddr);
1898 struct data_reference *drb = DDR_B (ddr);
1899 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1900 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1901 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1902 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1903 lambda_vector dist_v;
1904 unsigned int loop_depth;
1906 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1907 return;
1909 if (dra == drb)
1910 return;
1912 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1913 return;
1915 /* Loop-based vectorization and known data dependence. */
1916 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1917 return;
1919 /* Data-dependence analysis reports a distance vector of zero
1920 for data-references that overlap only in the first iteration
1921 but have different sign step (see PR45764).
1922 So as a sanity check require equal DR_STEP. */
1923 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1924 return;
1926 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1927 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1929 int dist = dist_v[loop_depth];
1931 if (dump_enabled_p ())
1932 dump_printf_loc (MSG_NOTE, vect_location,
1933 "dependence distance = %d.\n", dist);
1935 /* Same loop iteration. */
1936 if (dist == 0
1937 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1939 /* Two references with distance zero have the same alignment. */
1940 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1941 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1942 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_NOTE, vect_location,
1945 "accesses have the same alignment.\n");
1946 dump_printf (MSG_NOTE,
1947 "dependence distance modulo vf == 0 between ");
1948 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1949 dump_printf (MSG_NOTE, " and ");
1950 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1951 dump_printf (MSG_NOTE, "\n");
1958 /* Function vect_analyze_data_refs_alignment
1960 Analyze the alignment of the data-references in the loop.
1961 Return FALSE if a data reference is found that cannot be vectorized. */
1963 bool
1964 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1965 bb_vec_info bb_vinfo)
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "=== vect_analyze_data_refs_alignment ===\n");
1971 /* Mark groups of data references with same alignment using
1972 data dependence information. */
1973 if (loop_vinfo)
1975 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1976 struct data_dependence_relation *ddr;
1977 unsigned int i;
1979 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1980 vect_find_same_alignment_drs (ddr, loop_vinfo);
1983 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1985 if (dump_enabled_p ())
1986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1987 "not vectorized: can't calculate alignment "
1988 "for data ref.\n");
1989 return false;
1992 return true;
1996 /* Analyze groups of accesses: check that DR belongs to a group of
1997 accesses of legal size, step, etc. Detect gaps, single element
1998 interleaving, and other special cases. Set grouped access info.
1999 Collect groups of strided stores for further use in SLP analysis. */
2001 static bool
2002 vect_analyze_group_access (struct data_reference *dr)
2004 tree step = DR_STEP (dr);
2005 tree scalar_type = TREE_TYPE (DR_REF (dr));
2006 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2007 gimple stmt = DR_STMT (dr);
2008 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2009 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2010 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2011 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2012 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2013 bool slp_impossible = false;
2014 struct loop *loop = NULL;
2016 if (loop_vinfo)
2017 loop = LOOP_VINFO_LOOP (loop_vinfo);
2019 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2020 size of the interleaving group (including gaps). */
2021 groupsize = absu_hwi (dr_step) / type_size;
2023 /* Not consecutive access is possible only if it is a part of interleaving. */
2024 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2026 /* Check if it this DR is a part of interleaving, and is a single
2027 element of the group that is accessed in the loop. */
2029 /* Gaps are supported only for loads. STEP must be a multiple of the type
2030 size. The size of the group must be a power of 2. */
2031 if (DR_IS_READ (dr)
2032 && (dr_step % type_size) == 0
2033 && groupsize > 0
2034 && exact_log2 (groupsize) != -1)
2036 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2037 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2038 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Detected single element interleaving ");
2042 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2043 dump_printf (MSG_NOTE, " step ");
2044 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2045 dump_printf (MSG_NOTE, "\n");
2048 if (loop_vinfo)
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Data access with gaps requires scalar "
2053 "epilogue loop\n");
2054 if (loop->inner)
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "Peeling for outer loop is not"
2059 " supported\n");
2060 return false;
2063 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2066 return true;
2069 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2072 "not consecutive access ");
2073 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2074 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2077 if (bb_vinfo)
2079 /* Mark the statement as unvectorizable. */
2080 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2081 return true;
2084 return false;
2087 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2089 /* First stmt in the interleaving chain. Check the chain. */
2090 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2091 struct data_reference *data_ref = dr;
2092 unsigned int count = 1;
2093 tree prev_init = DR_INIT (data_ref);
2094 gimple prev = stmt;
2095 HOST_WIDE_INT diff, gaps = 0;
2096 unsigned HOST_WIDE_INT count_in_bytes;
2098 while (next)
2100 /* Skip same data-refs. In case that two or more stmts share
2101 data-ref (supported only for loads), we vectorize only the first
2102 stmt, and the rest get their vectorized loads from the first
2103 one. */
2104 if (!tree_int_cst_compare (DR_INIT (data_ref),
2105 DR_INIT (STMT_VINFO_DATA_REF (
2106 vinfo_for_stmt (next)))))
2108 if (DR_IS_WRITE (data_ref))
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "Two store stmts share the same dr.\n");
2113 return false;
2116 /* For load use the same data-ref load. */
2117 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2119 prev = next;
2120 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2121 continue;
2124 prev = next;
2125 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2127 /* All group members have the same STEP by construction. */
2128 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2130 /* Check that the distance between two accesses is equal to the type
2131 size. Otherwise, we have gaps. */
2132 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2133 - TREE_INT_CST_LOW (prev_init)) / type_size;
2134 if (diff != 1)
2136 /* FORNOW: SLP of accesses with gaps is not supported. */
2137 slp_impossible = true;
2138 if (DR_IS_WRITE (data_ref))
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2142 "interleaved store with gaps\n");
2143 return false;
2146 gaps += diff - 1;
2149 last_accessed_element += diff;
2151 /* Store the gap from the previous member of the group. If there is no
2152 gap in the access, GROUP_GAP is always 1. */
2153 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2155 prev_init = DR_INIT (data_ref);
2156 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2157 /* Count the number of data-refs in the chain. */
2158 count++;
2161 /* COUNT is the number of accesses found, we multiply it by the size of
2162 the type to get COUNT_IN_BYTES. */
2163 count_in_bytes = type_size * count;
2165 /* Check that the size of the interleaving (including gaps) is not
2166 greater than STEP. */
2167 if (dr_step != 0
2168 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2170 if (dump_enabled_p ())
2172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173 "interleaving size is greater than step for ");
2174 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2175 DR_REF (dr));
2176 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2178 return false;
2181 /* Check that the size of the interleaving is equal to STEP for stores,
2182 i.e., that there are no gaps. */
2183 if (dr_step != 0
2184 && absu_hwi (dr_step) != count_in_bytes)
2186 if (DR_IS_READ (dr))
2188 slp_impossible = true;
2189 /* There is a gap after the last load in the group. This gap is a
2190 difference between the groupsize and the number of elements.
2191 When there is no gap, this difference should be 0. */
2192 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2194 else
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "interleaved store with gaps\n");
2199 return false;
2203 /* Check that STEP is a multiple of type size. */
2204 if (dr_step != 0
2205 && (dr_step % type_size) != 0)
2207 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2210 "step is not a multiple of type size: step ");
2211 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2212 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2213 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2214 TYPE_SIZE_UNIT (scalar_type));
2215 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2217 return false;
2220 if (groupsize == 0)
2221 groupsize = count;
2223 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_NOTE, vect_location,
2226 "Detected interleaving of size %d\n", (int)groupsize);
2228 /* SLP: create an SLP data structure for every interleaving group of
2229 stores for further analysis in vect_analyse_slp. */
2230 if (DR_IS_WRITE (dr) && !slp_impossible)
2232 if (loop_vinfo)
2233 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2234 if (bb_vinfo)
2235 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2238 /* There is a gap in the end of the group. */
2239 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "Data access with gaps requires scalar "
2244 "epilogue loop\n");
2245 if (loop->inner)
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "Peeling for outer loop is not supported\n");
2250 return false;
2253 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2257 return true;
2261 /* Analyze the access pattern of the data-reference DR.
2262 In case of non-consecutive accesses call vect_analyze_group_access() to
2263 analyze groups of accesses. */
2265 static bool
2266 vect_analyze_data_ref_access (struct data_reference *dr)
2268 tree step = DR_STEP (dr);
2269 tree scalar_type = TREE_TYPE (DR_REF (dr));
2270 gimple stmt = DR_STMT (dr);
2271 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2272 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2273 struct loop *loop = NULL;
2275 if (loop_vinfo)
2276 loop = LOOP_VINFO_LOOP (loop_vinfo);
2278 if (loop_vinfo && !step)
2280 if (dump_enabled_p ())
2281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2282 "bad data-ref access in loop\n");
2283 return false;
2286 /* Allow invariant loads in not nested loops. */
2287 if (loop_vinfo && integer_zerop (step))
2289 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2290 if (nested_in_vect_loop_p (loop, stmt))
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "zero step in inner loop of nest\n");
2295 return false;
2297 return DR_IS_READ (dr);
2300 if (loop && nested_in_vect_loop_p (loop, stmt))
2302 /* Interleaved accesses are not yet supported within outer-loop
2303 vectorization for references in the inner-loop. */
2304 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2306 /* For the rest of the analysis we use the outer-loop step. */
2307 step = STMT_VINFO_DR_STEP (stmt_info);
2308 if (integer_zerop (step))
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "zero step in outer loop.\n");
2313 if (DR_IS_READ (dr))
2314 return true;
2315 else
2316 return false;
2320 /* Consecutive? */
2321 if (TREE_CODE (step) == INTEGER_CST)
2323 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2324 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2325 || (dr_step < 0
2326 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2328 /* Mark that it is not interleaving. */
2329 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2330 return true;
2334 if (loop && nested_in_vect_loop_p (loop, stmt))
2336 if (dump_enabled_p ())
2337 dump_printf_loc (MSG_NOTE, vect_location,
2338 "grouped access in outer loop.\n");
2339 return false;
2342 /* Assume this is a DR handled by non-constant strided load case. */
2343 if (TREE_CODE (step) != INTEGER_CST)
2344 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2346 /* Not consecutive access - check if it's a part of interleaving group. */
2347 return vect_analyze_group_access (dr);
2352 /* A helper function used in the comparator function to sort data
2353 references. T1 and T2 are two data references to be compared.
2354 The function returns -1, 0, or 1. */
2356 static int
2357 compare_tree (tree t1, tree t2)
2359 int i, cmp;
2360 enum tree_code code;
2361 char tclass;
2363 if (t1 == t2)
2364 return 0;
2365 if (t1 == NULL)
2366 return -1;
2367 if (t2 == NULL)
2368 return 1;
2371 if (TREE_CODE (t1) != TREE_CODE (t2))
2372 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2374 code = TREE_CODE (t1);
2375 switch (code)
2377 /* For const values, we can just use hash values for comparisons. */
2378 case INTEGER_CST:
2379 case REAL_CST:
2380 case FIXED_CST:
2381 case STRING_CST:
2382 case COMPLEX_CST:
2383 case VECTOR_CST:
2385 hashval_t h1 = iterative_hash_expr (t1, 0);
2386 hashval_t h2 = iterative_hash_expr (t2, 0);
2387 if (h1 != h2)
2388 return h1 < h2 ? -1 : 1;
2389 break;
2392 case SSA_NAME:
2393 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2394 if (cmp != 0)
2395 return cmp;
2397 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2398 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2399 break;
2401 default:
2402 tclass = TREE_CODE_CLASS (code);
2404 /* For var-decl, we could compare their UIDs. */
2405 if (tclass == tcc_declaration)
2407 if (DECL_UID (t1) != DECL_UID (t2))
2408 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2409 break;
2412 /* For expressions with operands, compare their operands recursively. */
2413 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2415 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2416 if (cmp != 0)
2417 return cmp;
2421 return 0;
2425 /* Compare two data-references DRA and DRB to group them into chunks
2426 suitable for grouping. */
2428 static int
2429 dr_group_sort_cmp (const void *dra_, const void *drb_)
2431 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2432 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2433 int cmp;
2435 /* Stabilize sort. */
2436 if (dra == drb)
2437 return 0;
2439 /* Ordering of DRs according to base. */
2440 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2442 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2443 if (cmp != 0)
2444 return cmp;
2447 /* And according to DR_OFFSET. */
2448 if (!dr_equal_offsets_p (dra, drb))
2450 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2451 if (cmp != 0)
2452 return cmp;
2455 /* Put reads before writes. */
2456 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2457 return DR_IS_READ (dra) ? -1 : 1;
2459 /* Then sort after access size. */
2460 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2461 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2463 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2464 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2465 if (cmp != 0)
2466 return cmp;
2469 /* And after step. */
2470 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2472 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2473 if (cmp != 0)
2474 return cmp;
2477 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2478 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2479 if (cmp == 0)
2480 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2481 return cmp;
2484 /* Function vect_analyze_data_ref_accesses.
2486 Analyze the access pattern of all the data references in the loop.
2488 FORNOW: the only access pattern that is considered vectorizable is a
2489 simple step 1 (consecutive) access.
2491 FORNOW: handle only arrays and pointer accesses. */
2493 bool
2494 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2496 unsigned int i;
2497 vec<data_reference_p> datarefs;
2498 struct data_reference *dr;
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE, vect_location,
2502 "=== vect_analyze_data_ref_accesses ===\n");
2504 if (loop_vinfo)
2505 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2506 else
2507 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2509 if (datarefs.is_empty ())
2510 return true;
2512 /* Sort the array of datarefs to make building the interleaving chains
2513 linear. Don't modify the original vector's order, it is needed for
2514 determining what dependencies are reversed. */
2515 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2516 datarefs_copy.qsort (dr_group_sort_cmp);
2518 /* Build the interleaving chains. */
2519 for (i = 0; i < datarefs_copy.length () - 1;)
2521 data_reference_p dra = datarefs_copy[i];
2522 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2523 stmt_vec_info lastinfo = NULL;
2524 for (i = i + 1; i < datarefs_copy.length (); ++i)
2526 data_reference_p drb = datarefs_copy[i];
2527 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2529 /* ??? Imperfect sorting (non-compatible types, non-modulo
2530 accesses, same accesses) can lead to a group to be artificially
2531 split here as we don't just skip over those. If it really
2532 matters we can push those to a worklist and re-iterate
2533 over them. The we can just skip ahead to the next DR here. */
2535 /* Check that the data-refs have same first location (except init)
2536 and they are both either store or load (not load and store). */
2537 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2538 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2539 DR_BASE_ADDRESS (drb), 0)
2540 || !dr_equal_offsets_p (dra, drb))
2541 break;
2543 /* Check that the data-refs have the same constant size and step. */
2544 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2545 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2546 if (!tree_fits_uhwi_p (sza)
2547 || !tree_fits_uhwi_p (szb)
2548 || !tree_int_cst_equal (sza, szb)
2549 || !tree_fits_shwi_p (DR_STEP (dra))
2550 || !tree_fits_shwi_p (DR_STEP (drb))
2551 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2552 break;
2554 /* Do not place the same access in the interleaving chain twice. */
2555 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2556 break;
2558 /* Check the types are compatible.
2559 ??? We don't distinguish this during sorting. */
2560 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2561 TREE_TYPE (DR_REF (drb))))
2562 break;
2564 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2565 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2566 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2567 gcc_assert (init_a < init_b);
2569 /* If init_b == init_a + the size of the type * k, we have an
2570 interleaving, and DRA is accessed before DRB. */
2571 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2572 if ((init_b - init_a) % type_size_a != 0)
2573 break;
2575 /* The step (if not zero) is greater than the difference between
2576 data-refs' inits. This splits groups into suitable sizes. */
2577 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2578 if (step != 0 && step <= (init_b - init_a))
2579 break;
2581 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_NOTE, vect_location,
2584 "Detected interleaving ");
2585 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2586 dump_printf (MSG_NOTE, " and ");
2587 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2588 dump_printf (MSG_NOTE, "\n");
2591 /* Link the found element into the group list. */
2592 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2594 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2595 lastinfo = stmtinfo_a;
2597 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2598 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2599 lastinfo = stmtinfo_b;
2603 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2604 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2605 && !vect_analyze_data_ref_access (dr))
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 "not vectorized: complicated access pattern.\n");
2611 if (bb_vinfo)
2613 /* Mark the statement as not vectorizable. */
2614 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2615 continue;
2617 else
2619 datarefs_copy.release ();
2620 return false;
2624 datarefs_copy.release ();
2625 return true;
2629 /* Operator == between two dr_with_seg_len objects.
2631 This equality operator is used to make sure two data refs
2632 are the same one so that we will consider to combine the
2633 aliasing checks of those two pairs of data dependent data
2634 refs. */
2636 static bool
2637 operator == (const dr_with_seg_len& d1,
2638 const dr_with_seg_len& d2)
2640 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2641 DR_BASE_ADDRESS (d2.dr), 0)
2642 && compare_tree (d1.offset, d2.offset) == 0
2643 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2646 /* Function comp_dr_with_seg_len_pair.
2648 Comparison function for sorting objects of dr_with_seg_len_pair_t
2649 so that we can combine aliasing checks in one scan. */
2651 static int
2652 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2654 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2655 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2657 const dr_with_seg_len &p11 = p1->first,
2658 &p12 = p1->second,
2659 &p21 = p2->first,
2660 &p22 = p2->second;
2662 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2663 if a and c have the same basic address snd step, and b and d have the same
2664 address and step. Therefore, if any a&c or b&d don't have the same address
2665 and step, we don't care the order of those two pairs after sorting. */
2666 int comp_res;
2668 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2669 DR_BASE_ADDRESS (p21.dr))) != 0)
2670 return comp_res;
2671 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2672 DR_BASE_ADDRESS (p22.dr))) != 0)
2673 return comp_res;
2674 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2675 return comp_res;
2676 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2677 return comp_res;
2678 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2679 return comp_res;
2680 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2681 return comp_res;
2683 return 0;
2686 template <class T> static void
2687 swap (T& a, T& b)
2689 T c (a);
2690 a = b;
2691 b = c;
2694 /* Function vect_vfa_segment_size.
2696 Create an expression that computes the size of segment
2697 that will be accessed for a data reference. The functions takes into
2698 account that realignment loads may access one more vector.
2700 Input:
2701 DR: The data reference.
2702 LENGTH_FACTOR: segment length to consider.
2704 Return an expression whose value is the size of segment which will be
2705 accessed by DR. */
2707 static tree
2708 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2710 tree segment_length;
2712 if (integer_zerop (DR_STEP (dr)))
2713 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2714 else
2715 segment_length = size_binop (MULT_EXPR,
2716 fold_convert (sizetype, DR_STEP (dr)),
2717 fold_convert (sizetype, length_factor));
2719 if (vect_supportable_dr_alignment (dr, false)
2720 == dr_explicit_realign_optimized)
2722 tree vector_size = TYPE_SIZE_UNIT
2723 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2725 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2727 return segment_length;
2730 /* Function vect_prune_runtime_alias_test_list.
2732 Prune a list of ddrs to be tested at run-time by versioning for alias.
2733 Merge several alias checks into one if possible.
2734 Return FALSE if resulting list of ddrs is longer then allowed by
2735 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2737 bool
2738 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2740 vec<ddr_p> may_alias_ddrs =
2741 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2742 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2743 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2744 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2745 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2747 ddr_p ddr;
2748 unsigned int i;
2749 tree length_factor;
2751 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location,
2753 "=== vect_prune_runtime_alias_test_list ===\n");
2755 if (may_alias_ddrs.is_empty ())
2756 return true;
2758 /* Basically, for each pair of dependent data refs store_ptr_0
2759 and load_ptr_0, we create an expression:
2761 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2762 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2764 for aliasing checks. However, in some cases we can decrease
2765 the number of checks by combining two checks into one. For
2766 example, suppose we have another pair of data refs store_ptr_0
2767 and load_ptr_1, and if the following condition is satisfied:
2769 load_ptr_0 < load_ptr_1 &&
2770 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2772 (this condition means, in each iteration of vectorized loop,
2773 the accessed memory of store_ptr_0 cannot be between the memory
2774 of load_ptr_0 and load_ptr_1.)
2776 we then can use only the following expression to finish the
2777 alising checks between store_ptr_0 & load_ptr_0 and
2778 store_ptr_0 & load_ptr_1:
2780 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2781 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2783 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2784 same basic address. */
2786 comp_alias_ddrs.create (may_alias_ddrs.length ());
2788 /* First, we collect all data ref pairs for aliasing checks. */
2789 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2791 struct data_reference *dr_a, *dr_b;
2792 gimple dr_group_first_a, dr_group_first_b;
2793 tree segment_length_a, segment_length_b;
2794 gimple stmt_a, stmt_b;
2796 dr_a = DDR_A (ddr);
2797 stmt_a = DR_STMT (DDR_A (ddr));
2798 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2799 if (dr_group_first_a)
2801 stmt_a = dr_group_first_a;
2802 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2805 dr_b = DDR_B (ddr);
2806 stmt_b = DR_STMT (DDR_B (ddr));
2807 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2808 if (dr_group_first_b)
2810 stmt_b = dr_group_first_b;
2811 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2814 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2815 length_factor = scalar_loop_iters;
2816 else
2817 length_factor = size_int (vect_factor);
2818 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2819 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2821 dr_with_seg_len_pair_t dr_with_seg_len_pair
2822 (dr_with_seg_len (dr_a, segment_length_a),
2823 dr_with_seg_len (dr_b, segment_length_b));
2825 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2826 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2828 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2831 /* Second, we sort the collected data ref pairs so that we can scan
2832 them once to combine all possible aliasing checks. */
2833 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2835 /* Third, we scan the sorted dr pairs and check if we can combine
2836 alias checks of two neighbouring dr pairs. */
2837 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2839 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2840 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2841 *dr_b1 = &comp_alias_ddrs[i-1].second,
2842 *dr_a2 = &comp_alias_ddrs[i].first,
2843 *dr_b2 = &comp_alias_ddrs[i].second;
2845 /* Remove duplicate data ref pairs. */
2846 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2848 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE, vect_location,
2851 "found equal ranges ");
2852 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2853 DR_REF (dr_a1->dr));
2854 dump_printf (MSG_NOTE, ", ");
2855 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2856 DR_REF (dr_b1->dr));
2857 dump_printf (MSG_NOTE, " and ");
2858 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2859 DR_REF (dr_a2->dr));
2860 dump_printf (MSG_NOTE, ", ");
2861 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2862 DR_REF (dr_b2->dr));
2863 dump_printf (MSG_NOTE, "\n");
2866 comp_alias_ddrs.ordered_remove (i--);
2867 continue;
2870 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2872 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2873 and DR_A1 and DR_A2 are two consecutive memrefs. */
2874 if (*dr_a1 == *dr_a2)
2876 swap (dr_a1, dr_b1);
2877 swap (dr_a2, dr_b2);
2880 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2881 DR_BASE_ADDRESS (dr_a2->dr),
2883 || !tree_fits_shwi_p (dr_a1->offset)
2884 || !tree_fits_shwi_p (dr_a2->offset))
2885 continue;
2887 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2888 - tree_to_shwi (dr_a1->offset));
2891 /* Now we check if the following condition is satisfied:
2893 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2895 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2896 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2897 have to make a best estimation. We can get the minimum value
2898 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2899 then either of the following two conditions can guarantee the
2900 one above:
2902 1: DIFF <= MIN_SEG_LEN_B
2903 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2907 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2908 ? tree_to_shwi (dr_b1->seg_len)
2909 : vect_factor);
2911 if (diff <= min_seg_len_b
2912 || (tree_fits_shwi_p (dr_a1->seg_len)
2913 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2915 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "merging ranges for ");
2919 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2920 DR_REF (dr_a1->dr));
2921 dump_printf (MSG_NOTE, ", ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 DR_REF (dr_b1->dr));
2924 dump_printf (MSG_NOTE, " and ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 DR_REF (dr_a2->dr));
2927 dump_printf (MSG_NOTE, ", ");
2928 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2929 DR_REF (dr_b2->dr));
2930 dump_printf (MSG_NOTE, "\n");
2933 dr_a1->seg_len = size_binop (PLUS_EXPR,
2934 dr_a2->seg_len, size_int (diff));
2935 comp_alias_ddrs.ordered_remove (i--);
2940 dump_printf_loc (MSG_NOTE, vect_location,
2941 "improved number of alias checks from %d to %d\n",
2942 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2943 if ((int) comp_alias_ddrs.length () >
2944 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2945 return false;
2947 return true;
2950 /* Check whether a non-affine read in stmt is suitable for gather load
2951 and if so, return a builtin decl for that operation. */
2953 tree
2954 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2955 tree *offp, int *scalep)
2957 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2958 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2959 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2960 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2961 tree offtype = NULL_TREE;
2962 tree decl, base, off;
2963 enum machine_mode pmode;
2964 int punsignedp, pvolatilep;
2966 base = DR_REF (dr);
2967 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2968 see if we can use the def stmt of the address. */
2969 if (is_gimple_call (stmt)
2970 && gimple_call_internal_p (stmt)
2971 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2972 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2973 && TREE_CODE (base) == MEM_REF
2974 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2975 && integer_zerop (TREE_OPERAND (base, 1))
2976 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2978 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2979 if (is_gimple_assign (def_stmt)
2980 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2981 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2984 /* The gather builtins need address of the form
2985 loop_invariant + vector * {1, 2, 4, 8}
2987 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2988 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2989 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2990 multiplications and additions in it. To get a vector, we need
2991 a single SSA_NAME that will be defined in the loop and will
2992 contain everything that is not loop invariant and that can be
2993 vectorized. The following code attempts to find such a preexistng
2994 SSA_NAME OFF and put the loop invariants into a tree BASE
2995 that can be gimplified before the loop. */
2996 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2997 &pmode, &punsignedp, &pvolatilep, false);
2998 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3000 if (TREE_CODE (base) == MEM_REF)
3002 if (!integer_zerop (TREE_OPERAND (base, 1)))
3004 if (off == NULL_TREE)
3006 offset_int moff = mem_ref_offset (base);
3007 off = wide_int_to_tree (sizetype, moff);
3009 else
3010 off = size_binop (PLUS_EXPR, off,
3011 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3013 base = TREE_OPERAND (base, 0);
3015 else
3016 base = build_fold_addr_expr (base);
3018 if (off == NULL_TREE)
3019 off = size_zero_node;
3021 /* If base is not loop invariant, either off is 0, then we start with just
3022 the constant offset in the loop invariant BASE and continue with base
3023 as OFF, otherwise give up.
3024 We could handle that case by gimplifying the addition of base + off
3025 into some SSA_NAME and use that as off, but for now punt. */
3026 if (!expr_invariant_in_loop_p (loop, base))
3028 if (!integer_zerop (off))
3029 return NULL_TREE;
3030 off = base;
3031 base = size_int (pbitpos / BITS_PER_UNIT);
3033 /* Otherwise put base + constant offset into the loop invariant BASE
3034 and continue with OFF. */
3035 else
3037 base = fold_convert (sizetype, base);
3038 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3041 /* OFF at this point may be either a SSA_NAME or some tree expression
3042 from get_inner_reference. Try to peel off loop invariants from it
3043 into BASE as long as possible. */
3044 STRIP_NOPS (off);
3045 while (offtype == NULL_TREE)
3047 enum tree_code code;
3048 tree op0, op1, add = NULL_TREE;
3050 if (TREE_CODE (off) == SSA_NAME)
3052 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3054 if (expr_invariant_in_loop_p (loop, off))
3055 return NULL_TREE;
3057 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3058 break;
3060 op0 = gimple_assign_rhs1 (def_stmt);
3061 code = gimple_assign_rhs_code (def_stmt);
3062 op1 = gimple_assign_rhs2 (def_stmt);
3064 else
3066 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3067 return NULL_TREE;
3068 code = TREE_CODE (off);
3069 extract_ops_from_tree (off, &code, &op0, &op1);
3071 switch (code)
3073 case POINTER_PLUS_EXPR:
3074 case PLUS_EXPR:
3075 if (expr_invariant_in_loop_p (loop, op0))
3077 add = op0;
3078 off = op1;
3079 do_add:
3080 add = fold_convert (sizetype, add);
3081 if (scale != 1)
3082 add = size_binop (MULT_EXPR, add, size_int (scale));
3083 base = size_binop (PLUS_EXPR, base, add);
3084 continue;
3086 if (expr_invariant_in_loop_p (loop, op1))
3088 add = op1;
3089 off = op0;
3090 goto do_add;
3092 break;
3093 case MINUS_EXPR:
3094 if (expr_invariant_in_loop_p (loop, op1))
3096 add = fold_convert (sizetype, op1);
3097 add = size_binop (MINUS_EXPR, size_zero_node, add);
3098 off = op0;
3099 goto do_add;
3101 break;
3102 case MULT_EXPR:
3103 if (scale == 1 && tree_fits_shwi_p (op1))
3105 scale = tree_to_shwi (op1);
3106 off = op0;
3107 continue;
3109 break;
3110 case SSA_NAME:
3111 off = op0;
3112 continue;
3113 CASE_CONVERT:
3114 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3115 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3116 break;
3117 if (TYPE_PRECISION (TREE_TYPE (op0))
3118 == TYPE_PRECISION (TREE_TYPE (off)))
3120 off = op0;
3121 continue;
3123 if (TYPE_PRECISION (TREE_TYPE (op0))
3124 < TYPE_PRECISION (TREE_TYPE (off)))
3126 off = op0;
3127 offtype = TREE_TYPE (off);
3128 STRIP_NOPS (off);
3129 continue;
3131 break;
3132 default:
3133 break;
3135 break;
3138 /* If at the end OFF still isn't a SSA_NAME or isn't
3139 defined in the loop, punt. */
3140 if (TREE_CODE (off) != SSA_NAME
3141 || expr_invariant_in_loop_p (loop, off))
3142 return NULL_TREE;
3144 if (offtype == NULL_TREE)
3145 offtype = TREE_TYPE (off);
3147 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3148 offtype, scale);
3149 if (decl == NULL_TREE)
3150 return NULL_TREE;
3152 if (basep)
3153 *basep = base;
3154 if (offp)
3155 *offp = off;
3156 if (scalep)
3157 *scalep = scale;
3158 return decl;
3161 /* Function vect_analyze_data_refs.
3163 Find all the data references in the loop or basic block.
3165 The general structure of the analysis of data refs in the vectorizer is as
3166 follows:
3167 1- vect_analyze_data_refs(loop/bb): call
3168 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3169 in the loop/bb and their dependences.
3170 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3171 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3172 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3176 bool
3177 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3178 bb_vec_info bb_vinfo,
3179 int *min_vf, unsigned *n_stmts)
3181 struct loop *loop = NULL;
3182 basic_block bb = NULL;
3183 unsigned int i;
3184 vec<data_reference_p> datarefs;
3185 struct data_reference *dr;
3186 tree scalar_type;
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "=== vect_analyze_data_refs ===\n");
3192 if (loop_vinfo)
3194 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3196 loop = LOOP_VINFO_LOOP (loop_vinfo);
3197 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3198 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3202 "not vectorized: loop contains function calls"
3203 " or data references that cannot be analyzed\n");
3204 return false;
3207 for (i = 0; i < loop->num_nodes; i++)
3209 gimple_stmt_iterator gsi;
3211 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3213 gimple stmt = gsi_stmt (gsi);
3214 if (is_gimple_debug (stmt))
3215 continue;
3216 ++*n_stmts;
3217 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3219 if (is_gimple_call (stmt) && loop->safelen)
3221 tree fndecl = gimple_call_fndecl (stmt), op;
3222 if (fndecl != NULL_TREE)
3224 struct cgraph_node *node = cgraph_get_node (fndecl);
3225 if (node != NULL && node->simd_clones != NULL)
3227 unsigned int j, n = gimple_call_num_args (stmt);
3228 for (j = 0; j < n; j++)
3230 op = gimple_call_arg (stmt, j);
3231 if (DECL_P (op)
3232 || (REFERENCE_CLASS_P (op)
3233 && get_base_address (op)))
3234 break;
3236 op = gimple_call_lhs (stmt);
3237 /* Ignore #pragma omp declare simd functions
3238 if they don't have data references in the
3239 call stmt itself. */
3240 if (j == n
3241 && !(op
3242 && (DECL_P (op)
3243 || (REFERENCE_CLASS_P (op)
3244 && get_base_address (op)))))
3245 continue;
3249 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3250 if (dump_enabled_p ())
3251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3252 "not vectorized: loop contains function "
3253 "calls or data references that cannot "
3254 "be analyzed\n");
3255 return false;
3260 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3262 else
3264 gimple_stmt_iterator gsi;
3266 bb = BB_VINFO_BB (bb_vinfo);
3267 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3269 gimple stmt = gsi_stmt (gsi);
3270 if (is_gimple_debug (stmt))
3271 continue;
3272 ++*n_stmts;
3273 if (!find_data_references_in_stmt (NULL, stmt,
3274 &BB_VINFO_DATAREFS (bb_vinfo)))
3276 /* Mark the rest of the basic-block as unvectorizable. */
3277 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3279 stmt = gsi_stmt (gsi);
3280 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3282 break;
3286 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3289 /* Go through the data-refs, check that the analysis succeeded. Update
3290 pointer from stmt_vec_info struct to DR and vectype. */
3292 FOR_EACH_VEC_ELT (datarefs, i, dr)
3294 gimple stmt;
3295 stmt_vec_info stmt_info;
3296 tree base, offset, init;
3297 bool gather = false;
3298 bool simd_lane_access = false;
3299 int vf;
3301 again:
3302 if (!dr || !DR_REF (dr))
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3306 "not vectorized: unhandled data-ref\n");
3307 return false;
3310 stmt = DR_STMT (dr);
3311 stmt_info = vinfo_for_stmt (stmt);
3313 /* Discard clobbers from the dataref vector. We will remove
3314 clobber stmts during vectorization. */
3315 if (gimple_clobber_p (stmt))
3317 free_data_ref (dr);
3318 if (i == datarefs.length () - 1)
3320 datarefs.pop ();
3321 break;
3323 datarefs.ordered_remove (i);
3324 dr = datarefs[i];
3325 goto again;
3328 /* Check that analysis of the data-ref succeeded. */
3329 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3330 || !DR_STEP (dr))
3332 bool maybe_gather
3333 = DR_IS_READ (dr)
3334 && !TREE_THIS_VOLATILE (DR_REF (dr))
3335 && targetm.vectorize.builtin_gather != NULL;
3336 bool maybe_simd_lane_access
3337 = loop_vinfo && loop->simduid;
3339 /* If target supports vector gather loads, or if this might be
3340 a SIMD lane access, see if they can't be used. */
3341 if (loop_vinfo
3342 && (maybe_gather || maybe_simd_lane_access)
3343 && !nested_in_vect_loop_p (loop, stmt))
3345 struct data_reference *newdr
3346 = create_data_ref (NULL, loop_containing_stmt (stmt),
3347 DR_REF (dr), stmt, true);
3348 gcc_assert (newdr != NULL && DR_REF (newdr));
3349 if (DR_BASE_ADDRESS (newdr)
3350 && DR_OFFSET (newdr)
3351 && DR_INIT (newdr)
3352 && DR_STEP (newdr)
3353 && integer_zerop (DR_STEP (newdr)))
3355 if (maybe_simd_lane_access)
3357 tree off = DR_OFFSET (newdr);
3358 STRIP_NOPS (off);
3359 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3360 && TREE_CODE (off) == MULT_EXPR
3361 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3363 tree step = TREE_OPERAND (off, 1);
3364 off = TREE_OPERAND (off, 0);
3365 STRIP_NOPS (off);
3366 if (CONVERT_EXPR_P (off)
3367 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3368 0)))
3369 < TYPE_PRECISION (TREE_TYPE (off)))
3370 off = TREE_OPERAND (off, 0);
3371 if (TREE_CODE (off) == SSA_NAME)
3373 gimple def = SSA_NAME_DEF_STMT (off);
3374 tree reft = TREE_TYPE (DR_REF (newdr));
3375 if (is_gimple_call (def)
3376 && gimple_call_internal_p (def)
3377 && (gimple_call_internal_fn (def)
3378 == IFN_GOMP_SIMD_LANE))
3380 tree arg = gimple_call_arg (def, 0);
3381 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3382 arg = SSA_NAME_VAR (arg);
3383 if (arg == loop->simduid
3384 /* For now. */
3385 && tree_int_cst_equal
3386 (TYPE_SIZE_UNIT (reft),
3387 step))
3389 DR_OFFSET (newdr) = ssize_int (0);
3390 DR_STEP (newdr) = step;
3391 DR_ALIGNED_TO (newdr)
3392 = size_int (BIGGEST_ALIGNMENT);
3393 dr = newdr;
3394 simd_lane_access = true;
3400 if (!simd_lane_access && maybe_gather)
3402 dr = newdr;
3403 gather = true;
3406 if (!gather && !simd_lane_access)
3407 free_data_ref (newdr);
3410 if (!gather && !simd_lane_access)
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3415 "not vectorized: data ref analysis "
3416 "failed ");
3417 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3418 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3421 if (bb_vinfo)
3422 break;
3424 return false;
3428 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3430 if (dump_enabled_p ())
3431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3432 "not vectorized: base addr of dr is a "
3433 "constant\n");
3435 if (bb_vinfo)
3436 break;
3438 if (gather || simd_lane_access)
3439 free_data_ref (dr);
3440 return false;
3443 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3445 if (dump_enabled_p ())
3447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3448 "not vectorized: volatile type ");
3449 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3450 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3453 if (bb_vinfo)
3454 break;
3456 return false;
3459 if (stmt_can_throw_internal (stmt))
3461 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3464 "not vectorized: statement can throw an "
3465 "exception ");
3466 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3467 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3470 if (bb_vinfo)
3471 break;
3473 if (gather || simd_lane_access)
3474 free_data_ref (dr);
3475 return false;
3478 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3479 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3481 if (dump_enabled_p ())
3483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3484 "not vectorized: statement is bitfield "
3485 "access ");
3486 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3487 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3490 if (bb_vinfo)
3491 break;
3493 if (gather || simd_lane_access)
3494 free_data_ref (dr);
3495 return false;
3498 base = unshare_expr (DR_BASE_ADDRESS (dr));
3499 offset = unshare_expr (DR_OFFSET (dr));
3500 init = unshare_expr (DR_INIT (dr));
3502 if (is_gimple_call (stmt)
3503 && (!gimple_call_internal_p (stmt)
3504 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3505 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3507 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3510 "not vectorized: dr in a call ");
3511 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3512 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3515 if (bb_vinfo)
3516 break;
3518 if (gather || simd_lane_access)
3519 free_data_ref (dr);
3520 return false;
3523 /* Update DR field in stmt_vec_info struct. */
3525 /* If the dataref is in an inner-loop of the loop that is considered for
3526 for vectorization, we also want to analyze the access relative to
3527 the outer-loop (DR contains information only relative to the
3528 inner-most enclosing loop). We do that by building a reference to the
3529 first location accessed by the inner-loop, and analyze it relative to
3530 the outer-loop. */
3531 if (loop && nested_in_vect_loop_p (loop, stmt))
3533 tree outer_step, outer_base, outer_init;
3534 HOST_WIDE_INT pbitsize, pbitpos;
3535 tree poffset;
3536 enum machine_mode pmode;
3537 int punsignedp, pvolatilep;
3538 affine_iv base_iv, offset_iv;
3539 tree dinit;
3541 /* Build a reference to the first location accessed by the
3542 inner-loop: *(BASE+INIT). (The first location is actually
3543 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3544 tree inner_base = build_fold_indirect_ref
3545 (fold_build_pointer_plus (base, init));
3547 if (dump_enabled_p ())
3549 dump_printf_loc (MSG_NOTE, vect_location,
3550 "analyze in outer-loop: ");
3551 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3552 dump_printf (MSG_NOTE, "\n");
3555 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3556 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3557 gcc_assert (outer_base != NULL_TREE);
3559 if (pbitpos % BITS_PER_UNIT != 0)
3561 if (dump_enabled_p ())
3562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3563 "failed: bit offset alignment.\n");
3564 return false;
3567 outer_base = build_fold_addr_expr (outer_base);
3568 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3569 &base_iv, false))
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3573 "failed: evolution of base is not affine.\n");
3574 return false;
3577 if (offset)
3579 if (poffset)
3580 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3581 poffset);
3582 else
3583 poffset = offset;
3586 if (!poffset)
3588 offset_iv.base = ssize_int (0);
3589 offset_iv.step = ssize_int (0);
3591 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3592 &offset_iv, false))
3594 if (dump_enabled_p ())
3595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3596 "evolution of offset is not affine.\n");
3597 return false;
3600 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3601 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3602 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3603 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3604 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3606 outer_step = size_binop (PLUS_EXPR,
3607 fold_convert (ssizetype, base_iv.step),
3608 fold_convert (ssizetype, offset_iv.step));
3610 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3611 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3612 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3613 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3614 STMT_VINFO_DR_OFFSET (stmt_info) =
3615 fold_convert (ssizetype, offset_iv.base);
3616 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3617 size_int (highest_pow2_factor (offset_iv.base));
3619 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_NOTE, vect_location,
3622 "\touter base_address: ");
3623 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3624 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3625 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3626 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3627 STMT_VINFO_DR_OFFSET (stmt_info));
3628 dump_printf (MSG_NOTE,
3629 "\n\touter constant offset from base address: ");
3630 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3631 STMT_VINFO_DR_INIT (stmt_info));
3632 dump_printf (MSG_NOTE, "\n\touter step: ");
3633 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3634 STMT_VINFO_DR_STEP (stmt_info));
3635 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3636 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3637 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3638 dump_printf (MSG_NOTE, "\n");
3642 if (STMT_VINFO_DATA_REF (stmt_info))
3644 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3647 "not vectorized: more than one data ref "
3648 "in stmt: ");
3649 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3650 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3653 if (bb_vinfo)
3654 break;
3656 if (gather || simd_lane_access)
3657 free_data_ref (dr);
3658 return false;
3661 STMT_VINFO_DATA_REF (stmt_info) = dr;
3662 if (simd_lane_access)
3664 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3665 free_data_ref (datarefs[i]);
3666 datarefs[i] = dr;
3669 /* Set vectype for STMT. */
3670 scalar_type = TREE_TYPE (DR_REF (dr));
3671 STMT_VINFO_VECTYPE (stmt_info)
3672 = get_vectype_for_scalar_type (scalar_type);
3673 if (!STMT_VINFO_VECTYPE (stmt_info))
3675 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3678 "not vectorized: no vectype for stmt: ");
3679 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3680 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3681 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3682 scalar_type);
3683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3686 if (bb_vinfo)
3687 break;
3689 if (gather || simd_lane_access)
3691 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3692 if (gather)
3693 free_data_ref (dr);
3695 return false;
3697 else
3699 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE, vect_location,
3702 "got vectype for stmt: ");
3703 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3704 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3705 STMT_VINFO_VECTYPE (stmt_info));
3706 dump_printf (MSG_NOTE, "\n");
3710 /* Adjust the minimal vectorization factor according to the
3711 vector type. */
3712 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3713 if (vf > *min_vf)
3714 *min_vf = vf;
3716 if (gather)
3718 tree off;
3720 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3721 if (gather
3722 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3723 gather = false;
3724 if (!gather)
3726 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3727 free_data_ref (dr);
3728 if (dump_enabled_p ())
3730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3731 "not vectorized: not suitable for gather "
3732 "load ");
3733 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3734 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3736 return false;
3739 datarefs[i] = dr;
3740 STMT_VINFO_GATHER_P (stmt_info) = true;
3742 else if (loop_vinfo
3743 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3745 if (nested_in_vect_loop_p (loop, stmt)
3746 || !DR_IS_READ (dr))
3748 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3751 "not vectorized: not suitable for strided "
3752 "load ");
3753 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3754 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3756 return false;
3758 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3762 /* If we stopped analysis at the first dataref we could not analyze
3763 when trying to vectorize a basic-block mark the rest of the datarefs
3764 as not vectorizable and truncate the vector of datarefs. That
3765 avoids spending useless time in analyzing their dependence. */
3766 if (i != datarefs.length ())
3768 gcc_assert (bb_vinfo != NULL);
3769 for (unsigned j = i; j < datarefs.length (); ++j)
3771 data_reference_p dr = datarefs[j];
3772 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3773 free_data_ref (dr);
3775 datarefs.truncate (i);
3778 return true;
3782 /* Function vect_get_new_vect_var.
3784 Returns a name for a new variable. The current naming scheme appends the
3785 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3786 the name of vectorizer generated variables, and appends that to NAME if
3787 provided. */
3789 tree
3790 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3792 const char *prefix;
3793 tree new_vect_var;
3795 switch (var_kind)
3797 case vect_simple_var:
3798 prefix = "vect";
3799 break;
3800 case vect_scalar_var:
3801 prefix = "stmp";
3802 break;
3803 case vect_pointer_var:
3804 prefix = "vectp";
3805 break;
3806 default:
3807 gcc_unreachable ();
3810 if (name)
3812 char* tmp = concat (prefix, "_", name, NULL);
3813 new_vect_var = create_tmp_reg (type, tmp);
3814 free (tmp);
3816 else
3817 new_vect_var = create_tmp_reg (type, prefix);
3819 return new_vect_var;
3823 /* Function vect_create_addr_base_for_vector_ref.
3825 Create an expression that computes the address of the first memory location
3826 that will be accessed for a data reference.
3828 Input:
3829 STMT: The statement containing the data reference.
3830 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3831 OFFSET: Optional. If supplied, it is be added to the initial address.
3832 LOOP: Specify relative to which loop-nest should the address be computed.
3833 For example, when the dataref is in an inner-loop nested in an
3834 outer-loop that is now being vectorized, LOOP can be either the
3835 outer-loop, or the inner-loop. The first memory location accessed
3836 by the following dataref ('in' points to short):
3838 for (i=0; i<N; i++)
3839 for (j=0; j<M; j++)
3840 s += in[i+j]
3842 is as follows:
3843 if LOOP=i_loop: &in (relative to i_loop)
3844 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3846 Output:
3847 1. Return an SSA_NAME whose value is the address of the memory location of
3848 the first vector of the data reference.
3849 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3850 these statement(s) which define the returned SSA_NAME.
3852 FORNOW: We are only handling array accesses with step 1. */
3854 tree
3855 vect_create_addr_base_for_vector_ref (gimple stmt,
3856 gimple_seq *new_stmt_list,
3857 tree offset,
3858 struct loop *loop)
3860 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3861 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3862 tree data_ref_base;
3863 const char *base_name;
3864 tree addr_base;
3865 tree dest;
3866 gimple_seq seq = NULL;
3867 tree base_offset;
3868 tree init;
3869 tree vect_ptr_type;
3870 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3871 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3873 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3875 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3877 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3879 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3880 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3881 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3883 else
3885 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3886 base_offset = unshare_expr (DR_OFFSET (dr));
3887 init = unshare_expr (DR_INIT (dr));
3890 if (loop_vinfo)
3891 base_name = get_name (data_ref_base);
3892 else
3894 base_offset = ssize_int (0);
3895 init = ssize_int (0);
3896 base_name = get_name (DR_REF (dr));
3899 /* Create base_offset */
3900 base_offset = size_binop (PLUS_EXPR,
3901 fold_convert (sizetype, base_offset),
3902 fold_convert (sizetype, init));
3904 if (offset)
3906 offset = fold_build2 (MULT_EXPR, sizetype,
3907 fold_convert (sizetype, offset), step);
3908 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3909 base_offset, offset);
3912 /* base + base_offset */
3913 if (loop_vinfo)
3914 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3915 else
3917 addr_base = build1 (ADDR_EXPR,
3918 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3919 unshare_expr (DR_REF (dr)));
3922 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3923 addr_base = fold_convert (vect_ptr_type, addr_base);
3924 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3925 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3926 gimple_seq_add_seq (new_stmt_list, seq);
3928 if (DR_PTR_INFO (dr)
3929 && TREE_CODE (addr_base) == SSA_NAME)
3931 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3932 if (offset)
3933 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3939 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3940 dump_printf (MSG_NOTE, "\n");
3943 return addr_base;
3947 /* Function vect_create_data_ref_ptr.
3949 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3950 location accessed in the loop by STMT, along with the def-use update
3951 chain to appropriately advance the pointer through the loop iterations.
3952 Also set aliasing information for the pointer. This pointer is used by
3953 the callers to this function to create a memory reference expression for
3954 vector load/store access.
3956 Input:
3957 1. STMT: a stmt that references memory. Expected to be of the form
3958 GIMPLE_ASSIGN <name, data-ref> or
3959 GIMPLE_ASSIGN <data-ref, name>.
3960 2. AGGR_TYPE: the type of the reference, which should be either a vector
3961 or an array.
3962 3. AT_LOOP: the loop where the vector memref is to be created.
3963 4. OFFSET (optional): an offset to be added to the initial address accessed
3964 by the data-ref in STMT.
3965 5. BSI: location where the new stmts are to be placed if there is no loop
3966 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3967 pointing to the initial address.
3969 Output:
3970 1. Declare a new ptr to vector_type, and have it point to the base of the
3971 data reference (initial addressed accessed by the data reference).
3972 For example, for vector of type V8HI, the following code is generated:
3974 v8hi *ap;
3975 ap = (v8hi *)initial_address;
3977 if OFFSET is not supplied:
3978 initial_address = &a[init];
3979 if OFFSET is supplied:
3980 initial_address = &a[init + OFFSET];
3982 Return the initial_address in INITIAL_ADDRESS.
3984 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3985 update the pointer in each iteration of the loop.
3987 Return the increment stmt that updates the pointer in PTR_INCR.
3989 3. Set INV_P to true if the access pattern of the data reference in the
3990 vectorized loop is invariant. Set it to false otherwise.
3992 4. Return the pointer. */
3994 tree
3995 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3996 tree offset, tree *initial_address,
3997 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3998 bool only_init, bool *inv_p)
4000 const char *base_name;
4001 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4003 struct loop *loop = NULL;
4004 bool nested_in_vect_loop = false;
4005 struct loop *containing_loop = NULL;
4006 tree aggr_ptr_type;
4007 tree aggr_ptr;
4008 tree new_temp;
4009 gimple vec_stmt;
4010 gimple_seq new_stmt_list = NULL;
4011 edge pe = NULL;
4012 basic_block new_bb;
4013 tree aggr_ptr_init;
4014 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4015 tree aptr;
4016 gimple_stmt_iterator incr_gsi;
4017 bool insert_after;
4018 tree indx_before_incr, indx_after_incr;
4019 gimple incr;
4020 tree step;
4021 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4023 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4024 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4026 if (loop_vinfo)
4028 loop = LOOP_VINFO_LOOP (loop_vinfo);
4029 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4030 containing_loop = (gimple_bb (stmt))->loop_father;
4031 pe = loop_preheader_edge (loop);
4033 else
4035 gcc_assert (bb_vinfo);
4036 only_init = true;
4037 *ptr_incr = NULL;
4040 /* Check the step (evolution) of the load in LOOP, and record
4041 whether it's invariant. */
4042 if (nested_in_vect_loop)
4043 step = STMT_VINFO_DR_STEP (stmt_info);
4044 else
4045 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4047 if (integer_zerop (step))
4048 *inv_p = true;
4049 else
4050 *inv_p = false;
4052 /* Create an expression for the first address accessed by this load
4053 in LOOP. */
4054 base_name = get_name (DR_BASE_ADDRESS (dr));
4056 if (dump_enabled_p ())
4058 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4059 dump_printf_loc (MSG_NOTE, vect_location,
4060 "create %s-pointer variable to type: ",
4061 get_tree_code_name (TREE_CODE (aggr_type)));
4062 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4063 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4064 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4065 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4066 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4067 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4068 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4069 else
4070 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4071 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4072 dump_printf (MSG_NOTE, "\n");
4075 /* (1) Create the new aggregate-pointer variable.
4076 Vector and array types inherit the alias set of their component
4077 type by default so we need to use a ref-all pointer if the data
4078 reference does not conflict with the created aggregated data
4079 reference because it is not addressable. */
4080 bool need_ref_all = false;
4081 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4082 get_alias_set (DR_REF (dr))))
4083 need_ref_all = true;
4084 /* Likewise for any of the data references in the stmt group. */
4085 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4087 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4090 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4091 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4092 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4093 get_alias_set (DR_REF (sdr))))
4095 need_ref_all = true;
4096 break;
4098 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4100 while (orig_stmt);
4102 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4103 need_ref_all);
4104 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4107 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4108 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4109 def-use update cycles for the pointer: one relative to the outer-loop
4110 (LOOP), which is what steps (3) and (4) below do. The other is relative
4111 to the inner-loop (which is the inner-most loop containing the dataref),
4112 and this is done be step (5) below.
4114 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4115 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4116 redundant. Steps (3),(4) create the following:
4118 vp0 = &base_addr;
4119 LOOP: vp1 = phi(vp0,vp2)
4122 vp2 = vp1 + step
4123 goto LOOP
4125 If there is an inner-loop nested in loop, then step (5) will also be
4126 applied, and an additional update in the inner-loop will be created:
4128 vp0 = &base_addr;
4129 LOOP: vp1 = phi(vp0,vp2)
4131 inner: vp3 = phi(vp1,vp4)
4132 vp4 = vp3 + inner_step
4133 if () goto inner
4135 vp2 = vp1 + step
4136 if () goto LOOP */
4138 /* (2) Calculate the initial address of the aggregate-pointer, and set
4139 the aggregate-pointer to point to it before the loop. */
4141 /* Create: (&(base[init_val+offset]) in the loop preheader. */
4143 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4144 offset, loop);
4145 if (new_stmt_list)
4147 if (pe)
4149 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4150 gcc_assert (!new_bb);
4152 else
4153 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4156 *initial_address = new_temp;
4158 /* Create: p = (aggr_type *) initial_base */
4159 if (TREE_CODE (new_temp) != SSA_NAME
4160 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4162 vec_stmt = gimple_build_assign (aggr_ptr,
4163 fold_convert (aggr_ptr_type, new_temp));
4164 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4165 /* Copy the points-to information if it exists. */
4166 if (DR_PTR_INFO (dr))
4167 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4168 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4169 if (pe)
4171 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4172 gcc_assert (!new_bb);
4174 else
4175 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4177 else
4178 aggr_ptr_init = new_temp;
4180 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4181 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4182 inner-loop nested in LOOP (during outer-loop vectorization). */
4184 /* No update in loop is required. */
4185 if (only_init && (!loop_vinfo || at_loop == loop))
4186 aptr = aggr_ptr_init;
4187 else
4189 /* The step of the aggregate pointer is the type size. */
4190 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4191 /* One exception to the above is when the scalar step of the load in
4192 LOOP is zero. In this case the step here is also zero. */
4193 if (*inv_p)
4194 iv_step = size_zero_node;
4195 else if (tree_int_cst_sgn (step) == -1)
4196 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4198 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4200 create_iv (aggr_ptr_init,
4201 fold_convert (aggr_ptr_type, iv_step),
4202 aggr_ptr, loop, &incr_gsi, insert_after,
4203 &indx_before_incr, &indx_after_incr);
4204 incr = gsi_stmt (incr_gsi);
4205 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4207 /* Copy the points-to information if it exists. */
4208 if (DR_PTR_INFO (dr))
4210 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4211 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4213 if (ptr_incr)
4214 *ptr_incr = incr;
4216 aptr = indx_before_incr;
4219 if (!nested_in_vect_loop || only_init)
4220 return aptr;
4223 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4224 nested in LOOP, if exists. */
4226 gcc_assert (nested_in_vect_loop);
4227 if (!only_init)
4229 standard_iv_increment_position (containing_loop, &incr_gsi,
4230 &insert_after);
4231 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4232 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4233 &indx_after_incr);
4234 incr = gsi_stmt (incr_gsi);
4235 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4237 /* Copy the points-to information if it exists. */
4238 if (DR_PTR_INFO (dr))
4240 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4241 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4243 if (ptr_incr)
4244 *ptr_incr = incr;
4246 return indx_before_incr;
4248 else
4249 gcc_unreachable ();
4253 /* Function bump_vector_ptr
4255 Increment a pointer (to a vector type) by vector-size. If requested,
4256 i.e. if PTR-INCR is given, then also connect the new increment stmt
4257 to the existing def-use update-chain of the pointer, by modifying
4258 the PTR_INCR as illustrated below:
4260 The pointer def-use update-chain before this function:
4261 DATAREF_PTR = phi (p_0, p_2)
4262 ....
4263 PTR_INCR: p_2 = DATAREF_PTR + step
4265 The pointer def-use update-chain after this function:
4266 DATAREF_PTR = phi (p_0, p_2)
4267 ....
4268 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4269 ....
4270 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4272 Input:
4273 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4274 in the loop.
4275 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4276 the loop. The increment amount across iterations is expected
4277 to be vector_size.
4278 BSI - location where the new update stmt is to be placed.
4279 STMT - the original scalar memory-access stmt that is being vectorized.
4280 BUMP - optional. The offset by which to bump the pointer. If not given,
4281 the offset is assumed to be vector_size.
4283 Output: Return NEW_DATAREF_PTR as illustrated above.
4287 tree
4288 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4289 gimple stmt, tree bump)
4291 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4292 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4293 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4294 tree update = TYPE_SIZE_UNIT (vectype);
4295 gimple incr_stmt;
4296 ssa_op_iter iter;
4297 use_operand_p use_p;
4298 tree new_dataref_ptr;
4300 if (bump)
4301 update = bump;
4303 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4304 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4305 dataref_ptr, update);
4306 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4308 /* Copy the points-to information if it exists. */
4309 if (DR_PTR_INFO (dr))
4311 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4312 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4315 if (!ptr_incr)
4316 return new_dataref_ptr;
4318 /* Update the vector-pointer's cross-iteration increment. */
4319 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4321 tree use = USE_FROM_PTR (use_p);
4323 if (use == dataref_ptr)
4324 SET_USE (use_p, new_dataref_ptr);
4325 else
4326 gcc_assert (tree_int_cst_compare (use, update) == 0);
4329 return new_dataref_ptr;
4333 /* Function vect_create_destination_var.
4335 Create a new temporary of type VECTYPE. */
4337 tree
4338 vect_create_destination_var (tree scalar_dest, tree vectype)
4340 tree vec_dest;
4341 const char *name;
4342 char *new_name;
4343 tree type;
4344 enum vect_var_kind kind;
4346 kind = vectype ? vect_simple_var : vect_scalar_var;
4347 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4349 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4351 name = get_name (scalar_dest);
4352 if (name)
4353 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4354 else
4355 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4356 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4357 free (new_name);
4359 return vec_dest;
4362 /* Function vect_grouped_store_supported.
4364 Returns TRUE if interleave high and interleave low permutations
4365 are supported, and FALSE otherwise. */
4367 bool
4368 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4370 enum machine_mode mode = TYPE_MODE (vectype);
4372 /* vect_permute_store_chain requires the group size to be equal to 3 or
4373 be a power of two. */
4374 if (count != 3 && exact_log2 (count) == -1)
4376 if (dump_enabled_p ())
4377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4378 "the size of the group of accesses"
4379 " is not a power of 2 or not eqaul to 3\n");
4380 return false;
4383 /* Check that the permutation is supported. */
4384 if (VECTOR_MODE_P (mode))
4386 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4387 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4389 if (count == 3)
4391 unsigned int j0 = 0, j1 = 0, j2 = 0;
4392 unsigned int i, j;
4394 for (j = 0; j < 3; j++)
4396 int nelt0 = ((3 - j) * nelt) % 3;
4397 int nelt1 = ((3 - j) * nelt + 1) % 3;
4398 int nelt2 = ((3 - j) * nelt + 2) % 3;
4399 for (i = 0; i < nelt; i++)
4401 if (3 * i + nelt0 < nelt)
4402 sel[3 * i + nelt0] = j0++;
4403 if (3 * i + nelt1 < nelt)
4404 sel[3 * i + nelt1] = nelt + j1++;
4405 if (3 * i + nelt2 < nelt)
4406 sel[3 * i + nelt2] = 0;
4408 if (!can_vec_perm_p (mode, false, sel))
4410 if (dump_enabled_p ())
4411 dump_printf (MSG_MISSED_OPTIMIZATION,
4412 "permutaion op not supported by target.\n");
4413 return false;
4416 for (i = 0; i < nelt; i++)
4418 if (3 * i + nelt0 < nelt)
4419 sel[3 * i + nelt0] = 3 * i + nelt0;
4420 if (3 * i + nelt1 < nelt)
4421 sel[3 * i + nelt1] = 3 * i + nelt1;
4422 if (3 * i + nelt2 < nelt)
4423 sel[3 * i + nelt2] = nelt + j2++;
4425 if (!can_vec_perm_p (mode, false, sel))
4427 if (dump_enabled_p ())
4428 dump_printf (MSG_MISSED_OPTIMIZATION,
4429 "permutaion op not supported by target.\n");
4430 return false;
4433 return true;
4435 else
4437 /* If length is not equal to 3 then only power of 2 is supported. */
4438 gcc_assert (exact_log2 (count) != -1);
4440 for (i = 0; i < nelt / 2; i++)
4442 sel[i * 2] = i;
4443 sel[i * 2 + 1] = i + nelt;
4445 if (can_vec_perm_p (mode, false, sel))
4447 for (i = 0; i < nelt; i++)
4448 sel[i] += nelt / 2;
4449 if (can_vec_perm_p (mode, false, sel))
4450 return true;
4455 if (dump_enabled_p ())
4456 dump_printf (MSG_MISSED_OPTIMIZATION,
4457 "permutaion op not supported by target.\n");
4458 return false;
4462 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4463 type VECTYPE. */
4465 bool
4466 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4468 return vect_lanes_optab_supported_p ("vec_store_lanes",
4469 vec_store_lanes_optab,
4470 vectype, count);
4474 /* Function vect_permute_store_chain.
4476 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4477 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4478 the data correctly for the stores. Return the final references for stores
4479 in RESULT_CHAIN.
4481 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4482 The input is 4 vectors each containing 8 elements. We assign a number to
4483 each element, the input sequence is:
4485 1st vec: 0 1 2 3 4 5 6 7
4486 2nd vec: 8 9 10 11 12 13 14 15
4487 3rd vec: 16 17 18 19 20 21 22 23
4488 4th vec: 24 25 26 27 28 29 30 31
4490 The output sequence should be:
4492 1st vec: 0 8 16 24 1 9 17 25
4493 2nd vec: 2 10 18 26 3 11 19 27
4494 3rd vec: 4 12 20 28 5 13 21 30
4495 4th vec: 6 14 22 30 7 15 23 31
4497 i.e., we interleave the contents of the four vectors in their order.
4499 We use interleave_high/low instructions to create such output. The input of
4500 each interleave_high/low operation is two vectors:
4501 1st vec 2nd vec
4502 0 1 2 3 4 5 6 7
4503 the even elements of the result vector are obtained left-to-right from the
4504 high/low elements of the first vector. The odd elements of the result are
4505 obtained left-to-right from the high/low elements of the second vector.
4506 The output of interleave_high will be: 0 4 1 5
4507 and of interleave_low: 2 6 3 7
4510 The permutation is done in log LENGTH stages. In each stage interleave_high
4511 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4512 where the first argument is taken from the first half of DR_CHAIN and the
4513 second argument from it's second half.
4514 In our example,
4516 I1: interleave_high (1st vec, 3rd vec)
4517 I2: interleave_low (1st vec, 3rd vec)
4518 I3: interleave_high (2nd vec, 4th vec)
4519 I4: interleave_low (2nd vec, 4th vec)
4521 The output for the first stage is:
4523 I1: 0 16 1 17 2 18 3 19
4524 I2: 4 20 5 21 6 22 7 23
4525 I3: 8 24 9 25 10 26 11 27
4526 I4: 12 28 13 29 14 30 15 31
4528 The output of the second stage, i.e. the final result is:
4530 I1: 0 8 16 24 1 9 17 25
4531 I2: 2 10 18 26 3 11 19 27
4532 I3: 4 12 20 28 5 13 21 30
4533 I4: 6 14 22 30 7 15 23 31. */
4535 void
4536 vect_permute_store_chain (vec<tree> dr_chain,
4537 unsigned int length,
4538 gimple stmt,
4539 gimple_stmt_iterator *gsi,
4540 vec<tree> *result_chain)
4542 tree vect1, vect2, high, low;
4543 gimple perm_stmt;
4544 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4545 tree perm_mask_low, perm_mask_high;
4546 tree data_ref;
4547 tree perm3_mask_low, perm3_mask_high;
4548 unsigned int i, n, log_length = exact_log2 (length);
4549 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4550 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4552 result_chain->quick_grow (length);
4553 memcpy (result_chain->address (), dr_chain.address (),
4554 length * sizeof (tree));
4556 if (length == 3)
4558 unsigned int j0 = 0, j1 = 0, j2 = 0;
4560 for (j = 0; j < 3; j++)
4562 int nelt0 = ((3 - j) * nelt) % 3;
4563 int nelt1 = ((3 - j) * nelt + 1) % 3;
4564 int nelt2 = ((3 - j) * nelt + 2) % 3;
4566 for (i = 0; i < nelt; i++)
4568 if (3 * i + nelt0 < nelt)
4569 sel[3 * i + nelt0] = j0++;
4570 if (3 * i + nelt1 < nelt)
4571 sel[3 * i + nelt1] = nelt + j1++;
4572 if (3 * i + nelt2 < nelt)
4573 sel[3 * i + nelt2] = 0;
4575 perm3_mask_low = vect_gen_perm_mask (vectype, sel);
4576 gcc_assert (perm3_mask_low != NULL);
4578 for (i = 0; i < nelt; i++)
4580 if (3 * i + nelt0 < nelt)
4581 sel[3 * i + nelt0] = 3 * i + nelt0;
4582 if (3 * i + nelt1 < nelt)
4583 sel[3 * i + nelt1] = 3 * i + nelt1;
4584 if (3 * i + nelt2 < nelt)
4585 sel[3 * i + nelt2] = nelt + j2++;
4587 perm3_mask_high = vect_gen_perm_mask (vectype, sel);
4588 gcc_assert (perm3_mask_high != NULL);
4590 vect1 = dr_chain[0];
4591 vect2 = dr_chain[1];
4593 /* Create interleaving stmt:
4594 low = VEC_PERM_EXPR <vect1, vect2,
4595 {j, nelt, *, j + 1, nelt + j + 1, *,
4596 j + 2, nelt + j + 2, *, ...}> */
4597 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4598 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4599 vect1, vect2,
4600 perm3_mask_low);
4601 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4603 vect1 = data_ref;
4604 vect2 = dr_chain[2];
4605 /* Create interleaving stmt:
4606 low = VEC_PERM_EXPR <vect1, vect2,
4607 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4608 6, 7, nelt + j + 2, ...}> */
4609 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4610 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4611 vect1, vect2,
4612 perm3_mask_high);
4613 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4614 (*result_chain)[j] = data_ref;
4617 else
4619 /* If length is not equal to 3 then only power of 2 is supported. */
4620 gcc_assert (exact_log2 (length) != -1);
4622 for (i = 0, n = nelt / 2; i < n; i++)
4624 sel[i * 2] = i;
4625 sel[i * 2 + 1] = i + nelt;
4627 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4628 gcc_assert (perm_mask_high != NULL);
4630 for (i = 0; i < nelt; i++)
4631 sel[i] += nelt / 2;
4632 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4633 gcc_assert (perm_mask_low != NULL);
4635 for (i = 0, n = log_length; i < n; i++)
4637 for (j = 0; j < length/2; j++)
4639 vect1 = dr_chain[j];
4640 vect2 = dr_chain[j+length/2];
4642 /* Create interleaving stmt:
4643 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4644 ...}> */
4645 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4646 perm_stmt
4647 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4648 vect1, vect2, perm_mask_high);
4649 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4650 (*result_chain)[2*j] = high;
4652 /* Create interleaving stmt:
4653 low = VEC_PERM_EXPR <vect1, vect2,
4654 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4655 ...}> */
4656 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4657 perm_stmt
4658 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4659 vect1, vect2, perm_mask_low);
4660 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4661 (*result_chain)[2*j+1] = low;
4663 memcpy (dr_chain.address (), result_chain->address (),
4664 length * sizeof (tree));
4669 /* Function vect_setup_realignment
4671 This function is called when vectorizing an unaligned load using
4672 the dr_explicit_realign[_optimized] scheme.
4673 This function generates the following code at the loop prolog:
4675 p = initial_addr;
4676 x msq_init = *(floor(p)); # prolog load
4677 realignment_token = call target_builtin;
4678 loop:
4679 x msq = phi (msq_init, ---)
4681 The stmts marked with x are generated only for the case of
4682 dr_explicit_realign_optimized.
4684 The code above sets up a new (vector) pointer, pointing to the first
4685 location accessed by STMT, and a "floor-aligned" load using that pointer.
4686 It also generates code to compute the "realignment-token" (if the relevant
4687 target hook was defined), and creates a phi-node at the loop-header bb
4688 whose arguments are the result of the prolog-load (created by this
4689 function) and the result of a load that takes place in the loop (to be
4690 created by the caller to this function).
4692 For the case of dr_explicit_realign_optimized:
4693 The caller to this function uses the phi-result (msq) to create the
4694 realignment code inside the loop, and sets up the missing phi argument,
4695 as follows:
4696 loop:
4697 msq = phi (msq_init, lsq)
4698 lsq = *(floor(p')); # load in loop
4699 result = realign_load (msq, lsq, realignment_token);
4701 For the case of dr_explicit_realign:
4702 loop:
4703 msq = *(floor(p)); # load in loop
4704 p' = p + (VS-1);
4705 lsq = *(floor(p')); # load in loop
4706 result = realign_load (msq, lsq, realignment_token);
4708 Input:
4709 STMT - (scalar) load stmt to be vectorized. This load accesses
4710 a memory location that may be unaligned.
4711 BSI - place where new code is to be inserted.
4712 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4713 is used.
4715 Output:
4716 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4717 target hook, if defined.
4718 Return value - the result of the loop-header phi node. */
4720 tree
4721 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4722 tree *realignment_token,
4723 enum dr_alignment_support alignment_support_scheme,
4724 tree init_addr,
4725 struct loop **at_loop)
4727 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4728 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4729 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4730 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4731 struct loop *loop = NULL;
4732 edge pe = NULL;
4733 tree scalar_dest = gimple_assign_lhs (stmt);
4734 tree vec_dest;
4735 gimple inc;
4736 tree ptr;
4737 tree data_ref;
4738 gimple new_stmt;
4739 basic_block new_bb;
4740 tree msq_init = NULL_TREE;
4741 tree new_temp;
4742 gimple phi_stmt;
4743 tree msq = NULL_TREE;
4744 gimple_seq stmts = NULL;
4745 bool inv_p;
4746 bool compute_in_loop = false;
4747 bool nested_in_vect_loop = false;
4748 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4749 struct loop *loop_for_initial_load = NULL;
4751 if (loop_vinfo)
4753 loop = LOOP_VINFO_LOOP (loop_vinfo);
4754 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4757 gcc_assert (alignment_support_scheme == dr_explicit_realign
4758 || alignment_support_scheme == dr_explicit_realign_optimized);
4760 /* We need to generate three things:
4761 1. the misalignment computation
4762 2. the extra vector load (for the optimized realignment scheme).
4763 3. the phi node for the two vectors from which the realignment is
4764 done (for the optimized realignment scheme). */
4766 /* 1. Determine where to generate the misalignment computation.
4768 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4769 calculation will be generated by this function, outside the loop (in the
4770 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4771 caller, inside the loop.
4773 Background: If the misalignment remains fixed throughout the iterations of
4774 the loop, then both realignment schemes are applicable, and also the
4775 misalignment computation can be done outside LOOP. This is because we are
4776 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4777 are a multiple of VS (the Vector Size), and therefore the misalignment in
4778 different vectorized LOOP iterations is always the same.
4779 The problem arises only if the memory access is in an inner-loop nested
4780 inside LOOP, which is now being vectorized using outer-loop vectorization.
4781 This is the only case when the misalignment of the memory access may not
4782 remain fixed throughout the iterations of the inner-loop (as explained in
4783 detail in vect_supportable_dr_alignment). In this case, not only is the
4784 optimized realignment scheme not applicable, but also the misalignment
4785 computation (and generation of the realignment token that is passed to
4786 REALIGN_LOAD) have to be done inside the loop.
4788 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4789 or not, which in turn determines if the misalignment is computed inside
4790 the inner-loop, or outside LOOP. */
4792 if (init_addr != NULL_TREE || !loop_vinfo)
4794 compute_in_loop = true;
4795 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4799 /* 2. Determine where to generate the extra vector load.
4801 For the optimized realignment scheme, instead of generating two vector
4802 loads in each iteration, we generate a single extra vector load in the
4803 preheader of the loop, and in each iteration reuse the result of the
4804 vector load from the previous iteration. In case the memory access is in
4805 an inner-loop nested inside LOOP, which is now being vectorized using
4806 outer-loop vectorization, we need to determine whether this initial vector
4807 load should be generated at the preheader of the inner-loop, or can be
4808 generated at the preheader of LOOP. If the memory access has no evolution
4809 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4810 to be generated inside LOOP (in the preheader of the inner-loop). */
4812 if (nested_in_vect_loop)
4814 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4815 bool invariant_in_outerloop =
4816 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4817 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4819 else
4820 loop_for_initial_load = loop;
4821 if (at_loop)
4822 *at_loop = loop_for_initial_load;
4824 if (loop_for_initial_load)
4825 pe = loop_preheader_edge (loop_for_initial_load);
4827 /* 3. For the case of the optimized realignment, create the first vector
4828 load at the loop preheader. */
4830 if (alignment_support_scheme == dr_explicit_realign_optimized)
4832 /* Create msq_init = *(floor(p1)) in the loop preheader */
4834 gcc_assert (!compute_in_loop);
4835 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4836 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4837 NULL_TREE, &init_addr, NULL, &inc,
4838 true, &inv_p);
4839 new_temp = copy_ssa_name (ptr, NULL);
4840 new_stmt = gimple_build_assign_with_ops
4841 (BIT_AND_EXPR, new_temp, ptr,
4842 build_int_cst (TREE_TYPE (ptr),
4843 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4844 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4845 gcc_assert (!new_bb);
4846 data_ref
4847 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4848 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4849 new_stmt = gimple_build_assign (vec_dest, data_ref);
4850 new_temp = make_ssa_name (vec_dest, new_stmt);
4851 gimple_assign_set_lhs (new_stmt, new_temp);
4852 if (pe)
4854 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4855 gcc_assert (!new_bb);
4857 else
4858 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4860 msq_init = gimple_assign_lhs (new_stmt);
4863 /* 4. Create realignment token using a target builtin, if available.
4864 It is done either inside the containing loop, or before LOOP (as
4865 determined above). */
4867 if (targetm.vectorize.builtin_mask_for_load)
4869 tree builtin_decl;
4871 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4872 if (!init_addr)
4874 /* Generate the INIT_ADDR computation outside LOOP. */
4875 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4876 NULL_TREE, loop);
4877 if (loop)
4879 pe = loop_preheader_edge (loop);
4880 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4881 gcc_assert (!new_bb);
4883 else
4884 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4887 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4888 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4889 vec_dest =
4890 vect_create_destination_var (scalar_dest,
4891 gimple_call_return_type (new_stmt));
4892 new_temp = make_ssa_name (vec_dest, new_stmt);
4893 gimple_call_set_lhs (new_stmt, new_temp);
4895 if (compute_in_loop)
4896 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4897 else
4899 /* Generate the misalignment computation outside LOOP. */
4900 pe = loop_preheader_edge (loop);
4901 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4902 gcc_assert (!new_bb);
4905 *realignment_token = gimple_call_lhs (new_stmt);
4907 /* The result of the CALL_EXPR to this builtin is determined from
4908 the value of the parameter and no global variables are touched
4909 which makes the builtin a "const" function. Requiring the
4910 builtin to have the "const" attribute makes it unnecessary
4911 to call mark_call_clobbered. */
4912 gcc_assert (TREE_READONLY (builtin_decl));
4915 if (alignment_support_scheme == dr_explicit_realign)
4916 return msq;
4918 gcc_assert (!compute_in_loop);
4919 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4922 /* 5. Create msq = phi <msq_init, lsq> in loop */
4924 pe = loop_preheader_edge (containing_loop);
4925 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4926 msq = make_ssa_name (vec_dest, NULL);
4927 phi_stmt = create_phi_node (msq, containing_loop->header);
4928 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4930 return msq;
4934 /* Function vect_grouped_load_supported.
4936 Returns TRUE if even and odd permutations are supported,
4937 and FALSE otherwise. */
4939 bool
4940 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4942 enum machine_mode mode = TYPE_MODE (vectype);
4944 /* vect_permute_load_chain requires the group size to be equal to 3 or
4945 be a power of two. */
4946 if (count != 3 && exact_log2 (count) == -1)
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950 "the size of the group of accesses"
4951 " is not a power of 2 or not equal to 3\n");
4952 return false;
4955 /* Check that the permutation is supported. */
4956 if (VECTOR_MODE_P (mode))
4958 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4959 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4961 if (count == 3)
4963 unsigned int k;
4964 for (k = 0; k < 3; k++)
4966 for (i = 0; i < nelt; i++)
4967 if (3 * i + k < 2 * nelt)
4968 sel[i] = 3 * i + k;
4969 else
4970 sel[i] = 0;
4971 if (!can_vec_perm_p (mode, false, sel))
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4975 "shuffle of 3 loads is not supported by"
4976 " target\n");
4977 return false;
4979 for (i = 0, j = 0; i < nelt; i++)
4980 if (3 * i + k < 2 * nelt)
4981 sel[i] = i;
4982 else
4983 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4984 if (!can_vec_perm_p (mode, false, sel))
4986 if (dump_enabled_p ())
4987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4988 "shuffle of 3 loads is not supported by"
4989 " target\n");
4990 return false;
4993 return true;
4995 else
4997 /* If length is not equal to 3 then only power of 2 is supported. */
4998 gcc_assert (exact_log2 (count) != -1);
4999 for (i = 0; i < nelt; i++)
5000 sel[i] = i * 2;
5001 if (can_vec_perm_p (mode, false, sel))
5003 for (i = 0; i < nelt; i++)
5004 sel[i] = i * 2 + 1;
5005 if (can_vec_perm_p (mode, false, sel))
5006 return true;
5011 if (dump_enabled_p ())
5012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5013 "extract even/odd not supported by target\n");
5014 return false;
5017 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5018 type VECTYPE. */
5020 bool
5021 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5023 return vect_lanes_optab_supported_p ("vec_load_lanes",
5024 vec_load_lanes_optab,
5025 vectype, count);
5028 /* Function vect_permute_load_chain.
5030 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5031 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5032 the input data correctly. Return the final references for loads in
5033 RESULT_CHAIN.
5035 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5036 The input is 4 vectors each containing 8 elements. We assign a number to each
5037 element, the input sequence is:
5039 1st vec: 0 1 2 3 4 5 6 7
5040 2nd vec: 8 9 10 11 12 13 14 15
5041 3rd vec: 16 17 18 19 20 21 22 23
5042 4th vec: 24 25 26 27 28 29 30 31
5044 The output sequence should be:
5046 1st vec: 0 4 8 12 16 20 24 28
5047 2nd vec: 1 5 9 13 17 21 25 29
5048 3rd vec: 2 6 10 14 18 22 26 30
5049 4th vec: 3 7 11 15 19 23 27 31
5051 i.e., the first output vector should contain the first elements of each
5052 interleaving group, etc.
5054 We use extract_even/odd instructions to create such output. The input of
5055 each extract_even/odd operation is two vectors
5056 1st vec 2nd vec
5057 0 1 2 3 4 5 6 7
5059 and the output is the vector of extracted even/odd elements. The output of
5060 extract_even will be: 0 2 4 6
5061 and of extract_odd: 1 3 5 7
5064 The permutation is done in log LENGTH stages. In each stage extract_even
5065 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5066 their order. In our example,
5068 E1: extract_even (1st vec, 2nd vec)
5069 E2: extract_odd (1st vec, 2nd vec)
5070 E3: extract_even (3rd vec, 4th vec)
5071 E4: extract_odd (3rd vec, 4th vec)
5073 The output for the first stage will be:
5075 E1: 0 2 4 6 8 10 12 14
5076 E2: 1 3 5 7 9 11 13 15
5077 E3: 16 18 20 22 24 26 28 30
5078 E4: 17 19 21 23 25 27 29 31
5080 In order to proceed and create the correct sequence for the next stage (or
5081 for the correct output, if the second stage is the last one, as in our
5082 example), we first put the output of extract_even operation and then the
5083 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5084 The input for the second stage is:
5086 1st vec (E1): 0 2 4 6 8 10 12 14
5087 2nd vec (E3): 16 18 20 22 24 26 28 30
5088 3rd vec (E2): 1 3 5 7 9 11 13 15
5089 4th vec (E4): 17 19 21 23 25 27 29 31
5091 The output of the second stage:
5093 E1: 0 4 8 12 16 20 24 28
5094 E2: 2 6 10 14 18 22 26 30
5095 E3: 1 5 9 13 17 21 25 29
5096 E4: 3 7 11 15 19 23 27 31
5098 And RESULT_CHAIN after reordering:
5100 1st vec (E1): 0 4 8 12 16 20 24 28
5101 2nd vec (E3): 1 5 9 13 17 21 25 29
5102 3rd vec (E2): 2 6 10 14 18 22 26 30
5103 4th vec (E4): 3 7 11 15 19 23 27 31. */
5105 static void
5106 vect_permute_load_chain (vec<tree> dr_chain,
5107 unsigned int length,
5108 gimple stmt,
5109 gimple_stmt_iterator *gsi,
5110 vec<tree> *result_chain)
5112 tree data_ref, first_vect, second_vect;
5113 tree perm_mask_even, perm_mask_odd;
5114 tree perm3_mask_low, perm3_mask_high;
5115 gimple perm_stmt;
5116 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5117 unsigned int i, j, log_length = exact_log2 (length);
5118 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5119 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5121 result_chain->quick_grow (length);
5122 memcpy (result_chain->address (), dr_chain.address (),
5123 length * sizeof (tree));
5125 if (length == 3)
5127 unsigned int k;
5129 for (k = 0; k < 3; k++)
5131 for (i = 0; i < nelt; i++)
5132 if (3 * i + k < 2 * nelt)
5133 sel[i] = 3 * i + k;
5134 else
5135 sel[i] = 0;
5136 perm3_mask_low = vect_gen_perm_mask (vectype, sel);
5137 gcc_assert (perm3_mask_low != NULL);
5139 for (i = 0, j = 0; i < nelt; i++)
5140 if (3 * i + k < 2 * nelt)
5141 sel[i] = i;
5142 else
5143 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5145 perm3_mask_high = vect_gen_perm_mask (vectype, sel);
5146 gcc_assert (perm3_mask_high != NULL);
5148 first_vect = dr_chain[0];
5149 second_vect = dr_chain[1];
5151 /* Create interleaving stmt (low part of):
5152 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5153 ...}> */
5154 data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
5155 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5156 first_vect, second_vect,
5157 perm3_mask_low);
5158 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5160 /* Create interleaving stmt (high part of):
5161 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5162 ...}> */
5163 first_vect = data_ref;
5164 second_vect = dr_chain[2];
5165 data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
5166 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5167 first_vect, second_vect,
5168 perm3_mask_high);
5169 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5170 (*result_chain)[k] = data_ref;
5173 else
5175 /* If length is not equal to 3 then only power of 2 is supported. */
5176 gcc_assert (exact_log2 (length) != -1);
5178 for (i = 0; i < nelt; ++i)
5179 sel[i] = i * 2;
5180 perm_mask_even = vect_gen_perm_mask (vectype, sel);
5181 gcc_assert (perm_mask_even != NULL);
5183 for (i = 0; i < nelt; ++i)
5184 sel[i] = i * 2 + 1;
5185 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
5186 gcc_assert (perm_mask_odd != NULL);
5188 for (i = 0; i < log_length; i++)
5190 for (j = 0; j < length; j += 2)
5192 first_vect = dr_chain[j];
5193 second_vect = dr_chain[j+1];
5195 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5196 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5197 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5198 first_vect, second_vect,
5199 perm_mask_even);
5200 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5201 (*result_chain)[j/2] = data_ref;
5203 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5205 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5206 first_vect, second_vect,
5207 perm_mask_odd);
5208 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5209 (*result_chain)[j/2+length/2] = data_ref;
5211 memcpy (dr_chain.address (), result_chain->address (),
5212 length * sizeof (tree));
5217 /* Function vect_shift_permute_load_chain.
5219 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5220 sequence of stmts to reorder the input data accordingly.
5221 Return the final references for loads in RESULT_CHAIN.
5222 Return true if successed, false otherwise.
5224 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5225 The input is 3 vectors each containing 8 elements. We assign a
5226 number to each element, the input sequence is:
5228 1st vec: 0 1 2 3 4 5 6 7
5229 2nd vec: 8 9 10 11 12 13 14 15
5230 3rd vec: 16 17 18 19 20 21 22 23
5232 The output sequence should be:
5234 1st vec: 0 3 6 9 12 15 18 21
5235 2nd vec: 1 4 7 10 13 16 19 22
5236 3rd vec: 2 5 8 11 14 17 20 23
5238 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5240 First we shuffle all 3 vectors to get correct elements order:
5242 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5243 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5244 3rd vec: (16 19 22) (17 20 23) (18 21)
5246 Next we unite and shift vector 3 times:
5248 1st step:
5249 shift right by 6 the concatenation of:
5250 "1st vec" and "2nd vec"
5251 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5252 "2nd vec" and "3rd vec"
5253 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5254 "3rd vec" and "1st vec"
5255 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5256 | New vectors |
5258 So that now new vectors are:
5260 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5261 2nd vec: (10 13) (16 19 22) (17 20 23)
5262 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5264 2nd step:
5265 shift right by 5 the concatenation of:
5266 "1st vec" and "3rd vec"
5267 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5268 "2nd vec" and "1st vec"
5269 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5270 "3rd vec" and "2nd vec"
5271 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5272 | New vectors |
5274 So that now new vectors are:
5276 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5277 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5278 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5280 3rd step:
5281 shift right by 5 the concatenation of:
5282 "1st vec" and "1st vec"
5283 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5284 shift right by 3 the concatenation of:
5285 "2nd vec" and "2nd vec"
5286 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5287 | New vectors |
5289 So that now all vectors are READY:
5290 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5291 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5292 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5294 This algorithm is faster than one in vect_permute_load_chain if:
5295 1. "shift of a concatination" is faster than general permutation.
5296 This is usually so.
5297 2. The TARGET machine can't execute vector instructions in parallel.
5298 This is because each step of the algorithm depends on previous.
5299 The algorithm in vect_permute_load_chain is much more parallel.
5301 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5304 static bool
5305 vect_shift_permute_load_chain (vec<tree> dr_chain,
5306 unsigned int length,
5307 gimple stmt,
5308 gimple_stmt_iterator *gsi,
5309 vec<tree> *result_chain)
5311 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5312 tree perm2_mask1, perm2_mask2, perm3_mask;
5313 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5314 gimple perm_stmt;
5316 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5317 unsigned int i;
5318 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5319 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5320 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5321 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5323 result_chain->quick_grow (length);
5324 memcpy (result_chain->address (), dr_chain.address (),
5325 length * sizeof (tree));
5327 if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5329 for (i = 0; i < nelt / 2; ++i)
5330 sel[i] = i * 2;
5331 for (i = 0; i < nelt / 2; ++i)
5332 sel[nelt / 2 + i] = i * 2 + 1;
5333 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5337 "shuffle of 2 fields structure is not \
5338 supported by target\n");
5339 return false;
5341 perm2_mask1 = vect_gen_perm_mask (vectype, sel);
5342 gcc_assert (perm2_mask1 != NULL);
5344 for (i = 0; i < nelt / 2; ++i)
5345 sel[i] = i * 2 + 1;
5346 for (i = 0; i < nelt / 2; ++i)
5347 sel[nelt / 2 + i] = i * 2;
5348 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5350 if (dump_enabled_p ())
5351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5352 "shuffle of 2 fields structure is not \
5353 supported by target\n");
5354 return false;
5356 perm2_mask2 = vect_gen_perm_mask (vectype, sel);
5357 gcc_assert (perm2_mask2 != NULL);
5359 /* Generating permutation constant to shift all elements.
5360 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5361 for (i = 0; i < nelt; i++)
5362 sel[i] = nelt / 2 + i;
5363 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5365 if (dump_enabled_p ())
5366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5367 "shift permutation is not supported by target\n");
5368 return false;
5370 shift1_mask = vect_gen_perm_mask (vectype, sel);
5371 gcc_assert (shift1_mask != NULL);
5373 /* Generating permutation constant to select vector from 2.
5374 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5375 for (i = 0; i < nelt / 2; i++)
5376 sel[i] = i;
5377 for (i = nelt / 2; i < nelt; i++)
5378 sel[i] = nelt + i;
5379 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5381 if (dump_enabled_p ())
5382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5383 "select is not supported by target\n");
5384 return false;
5386 select_mask = vect_gen_perm_mask (vectype, sel);
5387 gcc_assert (select_mask != NULL);
5389 first_vect = dr_chain[0];
5390 second_vect = dr_chain[1];
5392 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5393 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5394 first_vect, first_vect,
5395 perm2_mask1);
5396 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5397 vect[0] = data_ref;
5399 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5400 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5401 second_vect, second_vect,
5402 perm2_mask2);
5403 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5404 vect[1] = data_ref;
5406 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5407 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5408 vect[0], vect[1],
5409 shift1_mask);
5410 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5411 (*result_chain)[1] = data_ref;
5413 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5414 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5415 vect[0], vect[1],
5416 select_mask);
5417 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5418 (*result_chain)[0] = data_ref;
5420 return true;
5422 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5424 unsigned int k = 0, l = 0;
5426 /* Generating permutation constant to get all elements in rigth order.
5427 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5428 for (i = 0; i < nelt; i++)
5430 if (3 * k + (l % 3) >= nelt)
5432 k = 0;
5433 l += (3 - (nelt % 3));
5435 sel[i] = 3 * k + (l % 3);
5436 k++;
5438 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5440 if (dump_enabled_p ())
5441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5442 "shuffle of 3 fields structure is not \
5443 supported by target\n");
5444 return false;
5446 perm3_mask = vect_gen_perm_mask (vectype, sel);
5447 gcc_assert (perm3_mask != NULL);
5449 /* Generating permutation constant to shift all elements.
5450 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5451 for (i = 0; i < nelt; i++)
5452 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5453 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5455 if (dump_enabled_p ())
5456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5457 "shift permutation is not supported by target\n");
5458 return false;
5460 shift1_mask = vect_gen_perm_mask (vectype, sel);
5461 gcc_assert (shift1_mask != NULL);
5463 /* Generating permutation constant to shift all elements.
5464 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5465 for (i = 0; i < nelt; i++)
5466 sel[i] = 2 * (nelt / 3) + 1 + i;
5467 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5469 if (dump_enabled_p ())
5470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5471 "shift permutation is not supported by target\n");
5472 return false;
5474 shift2_mask = vect_gen_perm_mask (vectype, sel);
5475 gcc_assert (shift2_mask != NULL);
5477 /* Generating permutation constant to shift all elements.
5478 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5479 for (i = 0; i < nelt; i++)
5480 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5481 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5483 if (dump_enabled_p ())
5484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5485 "shift permutation is not supported by target\n");
5486 return false;
5488 shift3_mask = vect_gen_perm_mask (vectype, sel);
5489 gcc_assert (shift3_mask != NULL);
5491 /* Generating permutation constant to shift all elements.
5492 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5493 for (i = 0; i < nelt; i++)
5494 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5495 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5497 if (dump_enabled_p ())
5498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5499 "shift permutation is not supported by target\n");
5500 return false;
5502 shift4_mask = vect_gen_perm_mask (vectype, sel);
5503 gcc_assert (shift4_mask != NULL);
5505 for (k = 0; k < 3; k++)
5507 data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3");
5508 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5509 dr_chain[k], dr_chain[k],
5510 perm3_mask);
5511 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5512 vect[k] = data_ref;
5515 for (k = 0; k < 3; k++)
5517 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5518 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5519 vect[k % 3],
5520 vect[(k + 1) % 3],
5521 shift1_mask);
5522 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5523 vect_shift[k] = data_ref;
5526 for (k = 0; k < 3; k++)
5528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5529 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5530 vect_shift[(4 - k) % 3],
5531 vect_shift[(3 - k) % 3],
5532 shift2_mask);
5533 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5534 vect[k] = data_ref;
5537 (*result_chain)[3 - (nelt % 3)] = vect[2];
5539 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5540 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5541 vect[0], vect[0],
5542 shift3_mask);
5543 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5544 (*result_chain)[nelt % 3] = data_ref;
5546 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5547 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5548 vect[1], vect[1],
5549 shift4_mask);
5550 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5551 (*result_chain)[0] = data_ref;
5552 return true;
5554 return false;
5557 /* Function vect_transform_grouped_load.
5559 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5560 to perform their permutation and ascribe the result vectorized statements to
5561 the scalar statements.
5564 void
5565 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5566 gimple_stmt_iterator *gsi)
5568 enum machine_mode mode;
5569 vec<tree> result_chain = vNULL;
5571 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5572 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5573 vectors, that are ready for vector computation. */
5574 result_chain.create (size);
5576 /* If reassociation width for vector type is 2 or greater target machine can
5577 execute 2 or more vector instructions in parallel. Otherwise try to
5578 get chain for loads group using vect_shift_permute_load_chain. */
5579 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5580 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5581 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5582 gsi, &result_chain))
5583 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5584 vect_record_grouped_load_vectors (stmt, result_chain);
5585 result_chain.release ();
5588 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5589 generated as part of the vectorization of STMT. Assign the statement
5590 for each vector to the associated scalar statement. */
5592 void
5593 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5595 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5596 gimple next_stmt, new_stmt;
5597 unsigned int i, gap_count;
5598 tree tmp_data_ref;
5600 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5601 Since we scan the chain starting from it's first node, their order
5602 corresponds the order of data-refs in RESULT_CHAIN. */
5603 next_stmt = first_stmt;
5604 gap_count = 1;
5605 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5607 if (!next_stmt)
5608 break;
5610 /* Skip the gaps. Loads created for the gaps will be removed by dead
5611 code elimination pass later. No need to check for the first stmt in
5612 the group, since it always exists.
5613 GROUP_GAP is the number of steps in elements from the previous
5614 access (if there is no gap GROUP_GAP is 1). We skip loads that
5615 correspond to the gaps. */
5616 if (next_stmt != first_stmt
5617 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5619 gap_count++;
5620 continue;
5623 while (next_stmt)
5625 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5626 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5627 copies, and we put the new vector statement in the first available
5628 RELATED_STMT. */
5629 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5630 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5631 else
5633 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5635 gimple prev_stmt =
5636 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5637 gimple rel_stmt =
5638 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5639 while (rel_stmt)
5641 prev_stmt = rel_stmt;
5642 rel_stmt =
5643 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5646 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5647 new_stmt;
5651 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5652 gap_count = 1;
5653 /* If NEXT_STMT accesses the same DR as the previous statement,
5654 put the same TMP_DATA_REF as its vectorized statement; otherwise
5655 get the next data-ref from RESULT_CHAIN. */
5656 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5657 break;
5662 /* Function vect_force_dr_alignment_p.
5664 Returns whether the alignment of a DECL can be forced to be aligned
5665 on ALIGNMENT bit boundary. */
5667 bool
5668 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5670 if (TREE_CODE (decl) != VAR_DECL)
5671 return false;
5673 /* With -fno-toplevel-reorder we may have already output the constant. */
5674 if (TREE_ASM_WRITTEN (decl))
5675 return false;
5677 /* Constant pool entries may be shared and not properly merged by LTO. */
5678 if (DECL_IN_CONSTANT_POOL (decl))
5679 return false;
5681 if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5683 symtab_node *snode;
5685 /* We cannot change alignment of symbols that may bind to symbols
5686 in other translation unit that may contain a definition with lower
5687 alignment. */
5688 if (!decl_binds_to_current_def_p (decl))
5689 return false;
5691 /* When compiling partition, be sure the symbol is not output by other
5692 partition. */
5693 snode = symtab_get_node (decl);
5694 if (flag_ltrans
5695 && (snode->in_other_partition
5696 || symtab_get_symbol_partitioning_class (snode) == SYMBOL_DUPLICATE))
5697 return false;
5700 /* Do not override the alignment as specified by the ABI when the used
5701 attribute is set. */
5702 if (DECL_PRESERVE_P (decl))
5703 return false;
5705 /* Do not override explicit alignment set by the user when an explicit
5706 section name is also used. This is a common idiom used by many
5707 software projects. */
5708 if (TREE_STATIC (decl)
5709 && DECL_SECTION_NAME (decl) != NULL
5710 && !symtab_get_node (decl)->implicit_section)
5711 return false;
5713 /* If symbol is an alias, we need to check that target is OK. */
5714 if (TREE_STATIC (decl))
5716 tree target = symtab_alias_ultimate_target (symtab_get_node (decl))->decl;
5717 if (target != decl)
5719 if (DECL_PRESERVE_P (target))
5720 return false;
5721 decl = target;
5725 if (TREE_STATIC (decl))
5726 return (alignment <= MAX_OFILE_ALIGNMENT);
5727 else
5728 return (alignment <= MAX_STACK_ALIGNMENT);
5732 /* Return whether the data reference DR is supported with respect to its
5733 alignment.
5734 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5735 it is aligned, i.e., check if it is possible to vectorize it with different
5736 alignment. */
5738 enum dr_alignment_support
5739 vect_supportable_dr_alignment (struct data_reference *dr,
5740 bool check_aligned_accesses)
5742 gimple stmt = DR_STMT (dr);
5743 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5744 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5745 enum machine_mode mode = TYPE_MODE (vectype);
5746 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5747 struct loop *vect_loop = NULL;
5748 bool nested_in_vect_loop = false;
5750 if (aligned_access_p (dr) && !check_aligned_accesses)
5751 return dr_aligned;
5753 /* For now assume all conditional loads/stores support unaligned
5754 access without any special code. */
5755 if (is_gimple_call (stmt)
5756 && gimple_call_internal_p (stmt)
5757 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5758 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5759 return dr_unaligned_supported;
5761 if (loop_vinfo)
5763 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5764 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5767 /* Possibly unaligned access. */
5769 /* We can choose between using the implicit realignment scheme (generating
5770 a misaligned_move stmt) and the explicit realignment scheme (generating
5771 aligned loads with a REALIGN_LOAD). There are two variants to the
5772 explicit realignment scheme: optimized, and unoptimized.
5773 We can optimize the realignment only if the step between consecutive
5774 vector loads is equal to the vector size. Since the vector memory
5775 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5776 is guaranteed that the misalignment amount remains the same throughout the
5777 execution of the vectorized loop. Therefore, we can create the
5778 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5779 at the loop preheader.
5781 However, in the case of outer-loop vectorization, when vectorizing a
5782 memory access in the inner-loop nested within the LOOP that is now being
5783 vectorized, while it is guaranteed that the misalignment of the
5784 vectorized memory access will remain the same in different outer-loop
5785 iterations, it is *not* guaranteed that is will remain the same throughout
5786 the execution of the inner-loop. This is because the inner-loop advances
5787 with the original scalar step (and not in steps of VS). If the inner-loop
5788 step happens to be a multiple of VS, then the misalignment remains fixed
5789 and we can use the optimized realignment scheme. For example:
5791 for (i=0; i<N; i++)
5792 for (j=0; j<M; j++)
5793 s += a[i+j];
5795 When vectorizing the i-loop in the above example, the step between
5796 consecutive vector loads is 1, and so the misalignment does not remain
5797 fixed across the execution of the inner-loop, and the realignment cannot
5798 be optimized (as illustrated in the following pseudo vectorized loop):
5800 for (i=0; i<N; i+=4)
5801 for (j=0; j<M; j++){
5802 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5803 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5804 // (assuming that we start from an aligned address).
5807 We therefore have to use the unoptimized realignment scheme:
5809 for (i=0; i<N; i+=4)
5810 for (j=k; j<M; j+=4)
5811 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5812 // that the misalignment of the initial address is
5813 // 0).
5815 The loop can then be vectorized as follows:
5817 for (k=0; k<4; k++){
5818 rt = get_realignment_token (&vp[k]);
5819 for (i=0; i<N; i+=4){
5820 v1 = vp[i+k];
5821 for (j=k; j<M; j+=4){
5822 v2 = vp[i+j+VS-1];
5823 va = REALIGN_LOAD <v1,v2,rt>;
5824 vs += va;
5825 v1 = v2;
5828 } */
5830 if (DR_IS_READ (dr))
5832 bool is_packed = false;
5833 tree type = (TREE_TYPE (DR_REF (dr)));
5835 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5836 && (!targetm.vectorize.builtin_mask_for_load
5837 || targetm.vectorize.builtin_mask_for_load ()))
5839 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5840 if ((nested_in_vect_loop
5841 && (TREE_INT_CST_LOW (DR_STEP (dr))
5842 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5843 || !loop_vinfo)
5844 return dr_explicit_realign;
5845 else
5846 return dr_explicit_realign_optimized;
5848 if (!known_alignment_for_access_p (dr))
5849 is_packed = not_size_aligned (DR_REF (dr));
5851 if ((TYPE_USER_ALIGN (type) && !is_packed)
5852 || targetm.vectorize.
5853 support_vector_misalignment (mode, type,
5854 DR_MISALIGNMENT (dr), is_packed))
5855 /* Can't software pipeline the loads, but can at least do them. */
5856 return dr_unaligned_supported;
5858 else
5860 bool is_packed = false;
5861 tree type = (TREE_TYPE (DR_REF (dr)));
5863 if (!known_alignment_for_access_p (dr))
5864 is_packed = not_size_aligned (DR_REF (dr));
5866 if ((TYPE_USER_ALIGN (type) && !is_packed)
5867 || targetm.vectorize.
5868 support_vector_misalignment (mode, type,
5869 DR_MISALIGNMENT (dr), is_packed))
5870 return dr_unaligned_supported;
5873 /* Unsupported. */
5874 return dr_unaligned_unsupported;