/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "tree.h"
#include "stor-layout.h"
#include "tm_p.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "tree-eh.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "dumpfile.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "diagnostic-core.h"
#include "cgraph.h"
/* Need to include rtl.h, expr.h, etc. for optabs.  */
#include "expr.h"
#include "optabs.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  enum machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
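
/* Illustrative note (added annotation, not part of the upstream source):
   for a V4SI VECTYPE and COUNT == 2 this asks whether the target provides
   a 256-bit integer array mode and whether the given lanes optab (e.g.
   vec_load_lanes) has a handler converting between that array mode and
   V4SImode; if either query fails, load/store-lanes cannot be used.  */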

/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
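
/* Illustrative example (added annotation, not part of the upstream source):
   for the promotion statement
       int_x = (int) short_y;
   the lhs type is 'int' and the rhs type is 'short int', so with 4-byte
   ints and 2-byte shorts the function returns 'short int' and sets
   *LHS_SIZE_UNIT to 4 and *RHS_SIZE_UNIT to 2.  */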

/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"
                         " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "
                         "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
               .. = a[i];
               .. = a[i+1];
               a[i] = ..;
               a[i+1] = ..;
               *p = ..;
               .. = a[i];
               .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.  */
          if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
               && GROUP_SAME_DR_STMT (stmtinfo_a))
              || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
                  && GROUP_SAME_DR_STMT (stmtinfo_b)))
            {
              gimple earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}
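
/* Illustrative example (added annotation, not part of the upstream source):
   for a loop containing
       a[i+3] = a[i] + 1;
   the dependence distance is 3.  With an initial *MAX_VF of 8 the code
   above reduces *MAX_VF to 3, so that at most 3 scalar iterations run in
   parallel and the store to a[i+3] still happens after the load of a[i];
   the function then returns false (no blocking dependence).  */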

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
      return false;

  return true;
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  /* We do not vectorize basic blocks with write-write dependencies.  */
  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
    return true;

  /* If we have a read-write dependence check that the load is before the
     store.  When we vectorize basic blocks, vector load can be only before
     corresponding scalar load, and vector store can be only after its
     corresponding scalar store.  So the order of the accesses is preserved in
     case the load is before the store.  */
  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
    {
      /* That only holds for load-store pairs taking part in vectorization.  */
      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
          && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
        return false;
    }

  return true;
}

/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
{
  struct data_dependence_relation *ddr;
  unsigned int i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
                                &BB_VINFO_DDRS (bb_vinfo),
                                vNULL, true))
    return false;

  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
    if (vect_slp_analyze_data_ref_dependence (ddr))
      return false;

  return true;
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  bool base_aligned;
  tree misalign;
  tree aligned_to, alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  /* Strided loads perform only component accesses, misalignment information
     is irrelevant for them.  */
  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
    return true;

  misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* Similarly, if we're doing basic-block vectorization, we can only use
     base and misalignment information relative to an innermost loop if the
     misalignment stays the same throughout the execution of the loop.
     As above, this is the case if the stride of the dataref evenly divides
     by the vector size.  */
  if (!loop)
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "SLP: step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  base = build_fold_indirect_ref (base_addr);
  alignment = ssize_int (TYPE_ALIGN (vectype) / BITS_PER_UNIT);

  if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if ((DECL_P (base)
       && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
                                alignment) >= 0)
      || (TREE_CODE (base_addr) == SSA_NAME
          && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
                                                  TREE_TYPE (base_addr)))),
                                   alignment) >= 0)
      || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
    base_aligned = true;
  else
    base_aligned = false;

  if (!base_aligned)
    {
      /* Do not change the alignment of global variables here if
         flag_section_anchors is enabled as we already generated
         RTL for other functions.  Most global variables should
         have been aligned during the IPA increase_alignment pass.  */
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
          || (TREE_STATIC (base) && flag_section_anchors))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      ((dataref_aux *)dr->aux)->base_decl = base;
      ((dataref_aux *)dr->aux)->base_misaligned = true;
    }

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
      /* PLUS because DR_STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  /* Modulo alignment.  */
  misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);

  if (!tree_fits_uhwi_p (misalign))
    {
      /* Negative or overflowed misalignment value.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected misalign value\n");
      return false;
    }

  SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
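
/* Illustrative example (added annotation, not part of the upstream source):
   for a V4SI vectype (16-byte vector alignment) whose base address is known
   to be at least 16-byte aligned, a DR_INIT of 4 gives a misalignment of
   4 % 16 == 4 bytes, while a DR_INIT of 16 gives a misalignment of 0.  */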

/* Function vect_compute_data_refs_alignment

   Compute the misalignment of data references in the loop.
   Return FALSE if a data reference is found that cannot be vectorized.  */

static bool
vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
                                  bb_vec_info bb_vinfo)
{
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  unsigned int i;

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
        && !vect_compute_data_ref_alignment (dr))
      {
        if (bb_vinfo)
          {
            /* Mark unsupported statement as unvectorizable.  */
            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
            continue;
          }
        else
          return false;
      }

  return true;
}

/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
                  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
}
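
/* Illustrative example (added annotation, not part of the upstream source):
   with a 4-byte element, a known misalignment of 4 bytes, a forward step
   and NPEEL == 3, the updated misalignment for a 16-byte aligned vectype is
   (4 + 3 * 4) & (16 - 1) == 0, i.e. DR also becomes aligned by the peel.  */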

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
  vec<data_reference_p> datarefs;
  struct data_reference *dr;
  enum dr_alignment_support supportable_dr_alignment;
  unsigned int i;

  if (loop_vinfo)
    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  else
    datarefs = BB_VINFO_DATAREFS (bb_vinfo);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.
         Skip statements marked as not vectorizable.  */
      if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
          || !STMT_VINFO_VECTORIZABLE (stmt_info))
        continue;

      /* Strided loads perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      if (!supportable_dr_alignment)
        {
          if (dump_enabled_p ())
            {
              if (DR_IS_READ (dr))
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned load.");
              else
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported unaligned "
                                 "store.");

              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dr));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return false;
        }
      if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Vectorizing an unaligned access.\n");
    }
  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, is_packed = %d\n", is_packed);
      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.vector_alignment_reachable (type, is_packed))
        return true;
      else
        return false;
    }

  return true;
}
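
/* Illustrative example (added annotation, not part of the upstream source):
   for a V4SI vectype (4 elements) and a grouped access with GROUP_SIZE 2
   misaligned by 2 elements, the prologue would peel 4 - 2 == 2 iterations,
   which is a multiple of the group size, so alignment is reachable; with a
   GROUP_SIZE of 3 it would not be.  */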

/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int ncopies = vf / nunits;

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}

/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}
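
/* Illustrative note (added annotation, not part of the upstream source):
   if two data-refs both need e.g. 3 peeled iterations to become aligned,
   the entry with NPEEL == 3 ends up with count == 2.  Entries whose data-ref
   is not supportable when unaligned additionally get VECT_MAX_COST added
   under the unlimited cost model, so that
   vect_peeling_hash_get_most_frequent strongly prefers them.  */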

/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
  int single_iter_cost;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
                                 &body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }

  single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
  outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
                                               &dummy, single_iter_cost,
                                               &prologue_cost_vec,
                                               &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
    }
  else
    body_cost_vec.release ();

  return 1;
}

/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct data_reference *
vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
                                       unsigned int *npeel,
                                       stmt_vector_for_cost *body_cost_vec)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;
  res.body_cost_vec = stmt_vector_for_cost ();

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      LOOP_VINFO_PEELING_HTAB (loop_vinfo)
          .traverse <_vect_peel_extended_info *,
                     vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      LOOP_VINFO_PEELING_HTAB (loop_vinfo)
          .traverse <_vect_peel_extended_info *,
                     vect_peeling_hash_get_most_frequent> (&res);
    }

  *npeel = res.peel_info.npeel;
  *body_cost_vec = res.body_cost_vec;
  return res.peel_info.dr;
}

/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

  These loops are later passed to loop_transform to be vectorized.  The
  vectorizer will use the alignment information to guide the transformation
  (whether to generate regular loads/stores, or with special handling for
  misalignment).  */
1299 bool
1300 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1302 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1303 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1304 enum dr_alignment_support supportable_dr_alignment;
1305 struct data_reference *dr0 = NULL, *first_store = NULL;
1306 struct data_reference *dr;
1307 unsigned int i, j;
1308 bool do_peeling = false;
1309 bool do_versioning = false;
1310 bool stat;
1311 gimple stmt;
1312 stmt_vec_info stmt_info;
1313 unsigned int npeel = 0;
1314 bool all_misalignments_unknown = true;
1315 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1316 unsigned possible_npeel_number = 1;
1317 tree vectype;
1318 unsigned int nelements, mis, same_align_drs_max = 0;
1319 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE, vect_location,
1323 "=== vect_enhance_data_refs_alignment ===\n");
1325 /* While cost model enhancements are expected in the future, the high level
1326 view of the code at this time is as follows:
1328 A) If there is a misaligned access then see if peeling to align
1329 this access can make all data references satisfy
1330 vect_supportable_dr_alignment. If so, update data structures
1331 as needed and return true.
1333 B) If peeling wasn't possible and there is a data reference with an
1334 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1335 then see if loop versioning checks can be used to make all data
1336 references satisfy vect_supportable_dr_alignment. If so, update
1337 data structures as needed and return true.
1339 C) If neither peeling nor versioning were successful then return false if
1340 any data reference does not satisfy vect_supportable_dr_alignment.
1342 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1344 Note, Possibility 3 above (which is peeling and versioning together) is not
1345 being done at this time. */
1347 /* (1) Peeling to force alignment. */
1349 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1350 Considerations:
1351 + How many accesses will become aligned due to the peeling
1352 - How many accesses will become unaligned due to the peeling,
1353 and the cost of misaligned accesses.
1354 - The cost of peeling (the extra runtime checks, the increase
1355 in code size). */
1357 FOR_EACH_VEC_ELT (datarefs, i, dr)
1359 stmt = DR_STMT (dr);
1360 stmt_info = vinfo_for_stmt (stmt);
1362 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1363 continue;
1365 /* For interleaving, only the alignment of the first access
1366 matters. */
1367 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1368 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1369 continue;
1371 /* For invariant accesses there is nothing to enhance. */
1372 if (integer_zerop (DR_STEP (dr)))
1373 continue;
1375 /* Strided loads perform only component accesses, alignment is
1376 irrelevant for them. */
1377 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1378 continue;
1380 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1381 do_peeling = vector_alignment_reachable_p (dr);
1382 if (do_peeling)
1384 if (known_alignment_for_access_p (dr))
1386 unsigned int npeel_tmp;
1387 bool negative = tree_int_cst_compare (DR_STEP (dr),
1388 size_zero_node) < 0;
1390 /* Save info about DR in the hash table. */
1391 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1392 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1394 vectype = STMT_VINFO_VECTYPE (stmt_info);
1395 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1396 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1397 TREE_TYPE (DR_REF (dr))));
1398 npeel_tmp = (negative
1399 ? (mis - nelements) : (nelements - mis))
1400 & (nelements - 1);
1402 /* For multiple types, it is possible that the bigger type access
1403 will have more than one peeling option. E.g., a loop with two
1404 types: one of size (vector size / 4), and the other one of
1405 size (vector size / 8). Vectorization factor will 8. If both
1406 access are misaligned by 3, the first one needs one scalar
1407 iteration to be aligned, and the second one needs 5. But the
1408 the first one will be aligned also by peeling 5 scalar
1409 iterations, and in that case both accesses will be aligned.
1410 Hence, except for the immediate peeling amount, we also want
1411 to try to add full vector size, while we don't exceed
1412 vectorization factor.
1413 We do this automtically for cost model, since we calculate cost
1414 for every peeling option. */
1415 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1416 possible_npeel_number = vf /nelements;
1418 /* Handle the aligned case. We may decide to align some other
1419 access, making DR unaligned. */
1420 if (DR_MISALIGNMENT (dr) == 0)
1422 npeel_tmp = 0;
1423 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1424 possible_npeel_number++;
1427 for (j = 0; j < possible_npeel_number; j++)
1429 gcc_assert (npeel_tmp <= vf);
1430 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1431 npeel_tmp += nelements;
1434 all_misalignments_unknown = false;
1435 /* Data-ref that was chosen for the case that all the
1436 misalignments are unknown is not relevant anymore, since we
1437 have a data-ref with known alignment. */
1438 dr0 = NULL;
1440 else
1442 /* If we don't know any misalignment values, we prefer
1443 peeling for data-ref that has the maximum number of data-refs
1444 with the same alignment, unless the target prefers to align
1445 stores over load. */
1446 if (all_misalignments_unknown)
1448 unsigned same_align_drs
1449 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1450 if (!dr0
1451 || same_align_drs_max < same_align_drs)
1453 same_align_drs_max = same_align_drs;
1454 dr0 = dr;
1456 /* For data-refs with the same number of related
1457 accesses prefer the one where the misalign
1458 computation will be invariant in the outermost loop. */
1459 else if (same_align_drs_max == same_align_drs)
1461 struct loop *ivloop0, *ivloop;
1462 ivloop0 = outermost_invariant_loop_for_expr
1463 (loop, DR_BASE_ADDRESS (dr0));
1464 ivloop = outermost_invariant_loop_for_expr
1465 (loop, DR_BASE_ADDRESS (dr));
1466 if ((ivloop && !ivloop0)
1467 || (ivloop && ivloop0
1468 && flow_loop_nested_p (ivloop, ivloop0)))
1469 dr0 = dr;
1472 if (!first_store && DR_IS_WRITE (dr))
1473 first_store = dr;
1476 /* If there are both known and unknown misaligned accesses in the
1477 loop, we choose peeling amount according to the known
1478 accesses. */
1479 if (!supportable_dr_alignment)
1481 dr0 = dr;
1482 if (!first_store && DR_IS_WRITE (dr))
1483 first_store = dr;
1487 else
1489 if (!aligned_access_p (dr))
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "vector alignment may not be reachable\n");
1494 break;
1499 /* Check if we can possibly peel the loop. */
1500 if (!vect_can_advance_ivs_p (loop_vinfo)
1501 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1502 do_peeling = false;
1504 if (do_peeling && all_misalignments_unknown
1505 && vect_supportable_dr_alignment (dr0, false))
1508 /* Check if the target requires to prefer stores over loads, i.e., if
1509 misaligned stores are more expensive than misaligned loads (taking
1510 drs with same alignment into account). */
1511 if (first_store && DR_IS_READ (dr0))
1513 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1514 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1515 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1516 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1517 stmt_vector_for_cost dummy;
1518 dummy.create (2);
1520 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1521 &dummy);
1522 vect_get_data_access_cost (first_store, &store_inside_cost,
1523 &store_outside_cost, &dummy);
1525 dummy.release ();
1527 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1528 aligning the load DR0). */
1529 load_inside_penalty = store_inside_cost;
1530 load_outside_penalty = store_outside_cost;
1531 for (i = 0;
1532 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1533 DR_STMT (first_store))).iterate (i, &dr);
1534 i++)
1535 if (DR_IS_READ (dr))
1537 load_inside_penalty += load_inside_cost;
1538 load_outside_penalty += load_outside_cost;
1540 else
1542 load_inside_penalty += store_inside_cost;
1543 load_outside_penalty += store_outside_cost;
1546 /* Calculate the penalty for leaving DR0 unaligned (by
1547 aligning the FIRST_STORE). */
1548 store_inside_penalty = load_inside_cost;
1549 store_outside_penalty = load_outside_cost;
1550 for (i = 0;
1551 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1552 DR_STMT (dr0))).iterate (i, &dr);
1553 i++)
1554 if (DR_IS_READ (dr))
1556 store_inside_penalty += load_inside_cost;
1557 store_outside_penalty += load_outside_cost;
1559 else
1561 store_inside_penalty += store_inside_cost;
1562 store_outside_penalty += store_outside_cost;
1565 if (load_inside_penalty > store_inside_penalty
1566 || (load_inside_penalty == store_inside_penalty
1567 && load_outside_penalty > store_outside_penalty))
1568 dr0 = first_store;
1571 /* In case there are only loads with different unknown misalignments, use
1572 peeling only if it may help to align other accesses in the loop. */
1573 if (!first_store
1574 && !STMT_VINFO_SAME_ALIGN_REFS (
1575 vinfo_for_stmt (DR_STMT (dr0))).length ()
1576 && vect_supportable_dr_alignment (dr0, false)
1577 != dr_unaligned_supported)
1578 do_peeling = false;
1581 if (do_peeling && !dr0)
1583 /* Peeling is possible, but there is no data access that is not supported
1584 unless aligned. So we try to choose the best possible peeling. */
1586 /* We should get here only if there are drs with known misalignment. */
1587 gcc_assert (!all_misalignments_unknown);
1589 /* Choose the best peeling from the hash table. */
1590 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1591 &body_cost_vec);
1592 if (!dr0 || !npeel)
1593 do_peeling = false;
1596 if (do_peeling)
1598 stmt = DR_STMT (dr0);
1599 stmt_info = vinfo_for_stmt (stmt);
1600 vectype = STMT_VINFO_VECTYPE (stmt_info);
1601 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1603 if (known_alignment_for_access_p (dr0))
1605 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1606 size_zero_node) < 0;
1607 if (!npeel)
1609 /* Since it's known at compile time, compute the number of
1610 iterations in the peeled loop (the peeling factor) for use in
1611 updating DR_MISALIGNMENT values. The peeling factor is the
1612 vectorization factor minus the misalignment as an element
1613 count. */
1614 mis = DR_MISALIGNMENT (dr0);
1615 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1616 npeel = ((negative ? mis - nelements : nelements - mis)
1617 & (nelements - 1));
1620 /* For interleaved data access every iteration accesses all the
1621 members of the group, therefore we divide the number of iterations
1622 by the group size. */
1623 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1624 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1625 npeel /= GROUP_SIZE (stmt_info);
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_NOTE, vect_location,
1629 "Try peeling by %d\n", npeel);
1632 /* Ensure that all data refs can be vectorized after the peel. */
1633 FOR_EACH_VEC_ELT (datarefs, i, dr)
1635 int save_misalignment;
1637 if (dr == dr0)
1638 continue;
1640 stmt = DR_STMT (dr);
1641 stmt_info = vinfo_for_stmt (stmt);
1642 /* For interleaving, only the alignment of the first access
1643 matters. */
1644 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1645 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1646 continue;
1648 /* Strided loads perform only component accesses, alignment is
1649 irrelevant for them. */
1650 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1651 continue;
1653 save_misalignment = DR_MISALIGNMENT (dr);
1654 vect_update_misalignment_for_peel (dr, dr0, npeel);
1655 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1656 SET_DR_MISALIGNMENT (dr, save_misalignment);
1658 if (!supportable_dr_alignment)
1660 do_peeling = false;
1661 break;
1665 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1667 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1668 if (!stat)
1669 do_peeling = false;
1670 else
1672 body_cost_vec.release ();
1673 return stat;
1677 if (do_peeling)
1679 unsigned max_allowed_peel
1680 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1681 if (max_allowed_peel != (unsigned)-1)
1683 unsigned max_peel = npeel;
1684 if (max_peel == 0)
1686 gimple dr_stmt = DR_STMT (dr0);
1687 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1688 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1689 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1691 if (max_peel > max_allowed_peel)
1693 do_peeling = false;
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_NOTE, vect_location,
1696 "Disable peeling, max peels reached: %d\n", max_peel);
1701 if (do_peeling)
1703 stmt_info_for_cost *si;
1704 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1706 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1707 If the misalignment of DR_i is identical to that of dr0 then set
1708 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1709 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1710 by the peeling factor times the element size of DR_i (MOD the
1711 vectorization factor times the size). Otherwise, the
1712 misalignment of DR_i must be set to unknown. */
1713 FOR_EACH_VEC_ELT (datarefs, i, dr)
1714 if (dr != dr0)
1715 vect_update_misalignment_for_peel (dr, dr0, npeel);
1717 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1718 if (npeel)
1719 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1720 else
1721 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1722 = DR_MISALIGNMENT (dr0);
1723 SET_DR_MISALIGNMENT (dr0, 0);
1724 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "Alignment of access forced using peeling.\n");
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "Peeling for alignment will be applied.\n");
1731 /* We've delayed passing the inside-loop peeling costs to the
1732 target cost model until we were sure peeling would happen.
1733 Do so now. */
1734 if (body_cost_vec.exists ())
1736 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1738 struct _stmt_vec_info *stmt_info
1739 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1740 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1741 si->misalign, vect_body);
1743 body_cost_vec.release ();
1746 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1747 gcc_assert (stat);
1748 return stat;
1752 body_cost_vec.release ();
1754 /* (2) Versioning to force alignment. */
1756 /* Try versioning if:
1757 1) optimize loop for speed
1758 2) there is at least one unsupported misaligned data ref with an unknown
1759 misalignment, and
1760 3) all misaligned data refs with a known misalignment are supported, and
1761 4) the number of runtime alignment checks is within reason. */
1763 do_versioning =
1764 optimize_loop_nest_for_speed_p (loop)
1765 && (!loop->inner); /* FORNOW */
1767 if (do_versioning)
1769 FOR_EACH_VEC_ELT (datarefs, i, dr)
1771 stmt = DR_STMT (dr);
1772 stmt_info = vinfo_for_stmt (stmt);
1774 /* For interleaving, only the alignment of the first access
1775 matters. */
1776 if (aligned_access_p (dr)
1777 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1778 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1779 continue;
1781 /* Strided loads perform only component accesses, alignment is
1782 irrelevant for them. */
1783 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1784 continue;
1786 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1788 if (!supportable_dr_alignment)
1790 gimple stmt;
1791 int mask;
1792 tree vectype;
1794 if (known_alignment_for_access_p (dr)
1795 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1796 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1798 do_versioning = false;
1799 break;
1802 stmt = DR_STMT (dr);
1803 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1804 gcc_assert (vectype);
1806 /* The rightmost bits of an aligned address must be zeros.
1807 Construct the mask needed for this test. For example,
1808 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1809 mask must be 15 = 0xf. */
1810 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
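/* Sketch of the intent (an assumption based on the comments here, not on
   code in this function): the versioned loop is guarded by a run-time test
   of roughly the form ((addr_1 | ... | addr_n) & mask) == 0 over the
   addresses of the statements pushed to LOOP_VINFO_MAY_MISALIGN_STMTS
   below, so the vectorized copy only runs when all of them are aligned.  */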
1812 /* FORNOW: use the same mask to test all potentially unaligned
1813 references in the loop. The vectorizer currently supports
1814 a single vector size, see the reference to
1815 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1816 vectorization factor is computed. */
1817 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1818 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1819 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1820 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1821 DR_STMT (dr));
1825 /* Versioning requires at least one misaligned data reference. */
1826 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1827 do_versioning = false;
1828 else if (!do_versioning)
1829 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1832 if (do_versioning)
1834 vec<gimple> may_misalign_stmts
1835 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1836 gimple stmt;
1838 /* It can now be assumed that the data references in the statements
1839 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1840 of the loop being vectorized. */
1841 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1843 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1844 dr = STMT_VINFO_DATA_REF (stmt_info);
1845 SET_DR_MISALIGNMENT (dr, 0);
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location,
1848 "Alignment of access forced using versioning.\n");
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_NOTE, vect_location,
1853 "Versioning for alignment will be applied.\n");
1855 /* Peeling and versioning can't be done together at this time. */
1856 gcc_assert (! (do_peeling && do_versioning));
1858 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1859 gcc_assert (stat);
1860 return stat;
1863 /* This point is reached if neither peeling nor versioning is being done. */
1864 gcc_assert (! (do_peeling || do_versioning));
1866 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1867 return stat;
1871 /* Function vect_find_same_alignment_drs.
1873 Update group and alignment relations according to the chosen
1874 vectorization factor. */
1876 static void
1877 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1878 loop_vec_info loop_vinfo)
1880 unsigned int i;
1881 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1883 struct data_reference *dra = DDR_A (ddr);
1884 struct data_reference *drb = DDR_B (ddr);
1885 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1886 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1887 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1888 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1889 lambda_vector dist_v;
1890 unsigned int loop_depth;
1892 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1893 return;
1895 if (dra == drb)
1896 return;
1898 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1899 return;
1901 /* Loop-based vectorization and known data dependence. */
1902 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1903 return;
1905 /* Data-dependence analysis reports a distance vector of zero
1906 for data-references that overlap only in the first iteration
1907 but have steps of different sign (see PR45764).
1908 So as a sanity check require equal DR_STEP. */
1909 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1910 return;
1912 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1913 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1915 int dist = dist_v[loop_depth];
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE, vect_location,
1919 "dependence distance = %d.\n", dist);
1921 /* Same loop iteration. */
1922 if (dist == 0
1923 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1925 /* Two references with distance zero have the same alignment. */
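/* Likewise (illustrative example) for a distance that is a multiple of the
   vectorization factor: with VF == 4 and equal-sized 4-byte accesses,
   a[i] and a[i+4] are exactly one vector apart in every iteration, so
   their misalignments are necessarily equal as well.  */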
1926 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1927 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1928 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_NOTE, vect_location,
1931 "accesses have the same alignment.\n");
1932 dump_printf (MSG_NOTE,
1933 "dependence distance modulo vf == 0 between ");
1934 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1935 dump_printf (MSG_NOTE, " and ");
1936 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1937 dump_printf (MSG_NOTE, "\n");
1944 /* Function vect_analyze_data_refs_alignment
1946 Analyze the alignment of the data-references in the loop.
1947 Return FALSE if a data reference is found that cannot be vectorized. */
1949 bool
1950 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1951 bb_vec_info bb_vinfo)
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_NOTE, vect_location,
1955 "=== vect_analyze_data_refs_alignment ===\n");
1957 /* Mark groups of data references with same alignment using
1958 data dependence information. */
1959 if (loop_vinfo)
1961 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1962 struct data_dependence_relation *ddr;
1963 unsigned int i;
1965 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1966 vect_find_same_alignment_drs (ddr, loop_vinfo);
1969 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1971 if (dump_enabled_p ())
1972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1973 "not vectorized: can't calculate alignment "
1974 "for data ref.\n");
1975 return false;
1978 return true;
1982 /* Analyze groups of accesses: check that DR belongs to a group of
1983 accesses of legal size, step, etc. Detect gaps, single element
1984 interleaving, and other special cases. Set grouped access info.
1985 Collect groups of strided stores for further use in SLP analysis. */
1987 static bool
1988 vect_analyze_group_access (struct data_reference *dr)
1990 tree step = DR_STEP (dr);
1991 tree scalar_type = TREE_TYPE (DR_REF (dr));
1992 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1993 gimple stmt = DR_STMT (dr);
1994 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1995 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1996 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1997 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1998 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1999 bool slp_impossible = false;
2000 struct loop *loop = NULL;
2002 if (loop_vinfo)
2003 loop = LOOP_VINFO_LOOP (loop_vinfo);
2005 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2006 size of the interleaving group (including gaps). */
2007 groupsize = absu_hwi (dr_step) / type_size;
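/* E.g. (illustrative): a 4-byte access with DR_STEP 16 yields GROUPSIZE 4,
   i.e. room for four elements per scalar iteration, gaps included.  */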
2009 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2010 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2012 /* Check if this DR is part of an interleaving group, and is a single
2013 element of the group that is accessed in the loop. */
2015 /* Gaps are supported only for loads. STEP must be a multiple of the type
2016 size. The size of the group must be a power of 2. */
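/* A minimal example of the case handled here (made-up code):

     for (i = 0; i < n; i++)
       sum += a[2 * i];

   For 4-byte elements DR_STEP is 8, GROUPSIZE is 2 (a power of two) and
   only the first element of each group is read, so the load is treated as
   single element interleaving with a gap and, in a loop, requires the
   scalar epilogue requested below.  */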
2017 if (DR_IS_READ (dr)
2018 && (dr_step % type_size) == 0
2019 && groupsize > 0
2020 && exact_log2 (groupsize) != -1)
2022 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2023 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2024 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Detected single element interleaving ");
2028 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2029 dump_printf (MSG_NOTE, " step ");
2030 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2031 dump_printf (MSG_NOTE, "\n");
2034 if (loop_vinfo)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "Data access with gaps requires scalar "
2039 "epilogue loop\n");
2040 if (loop->inner)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2044 "Peeling for outer loop is not"
2045 " supported\n");
2046 return false;
2049 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2052 return true;
2055 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not consecutive access ");
2059 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2060 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2063 if (bb_vinfo)
2065 /* Mark the statement as unvectorizable. */
2066 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2067 return true;
2070 return false;
2073 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2075 /* First stmt in the interleaving chain. Check the chain. */
2076 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2077 struct data_reference *data_ref = dr;
2078 unsigned int count = 1;
2079 tree prev_init = DR_INIT (data_ref);
2080 gimple prev = stmt;
2081 HOST_WIDE_INT diff, gaps = 0;
2082 unsigned HOST_WIDE_INT count_in_bytes;
2084 while (next)
2086 /* Skip same data-refs. If two or more stmts share a data-ref
2087 (supported only for loads), we vectorize only the first
2088 stmt, and the rest get their vectorized loads from the first
2089 one. */
2090 if (!tree_int_cst_compare (DR_INIT (data_ref),
2091 DR_INIT (STMT_VINFO_DATA_REF (
2092 vinfo_for_stmt (next)))))
2094 if (DR_IS_WRITE (data_ref))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "Two store stmts share the same dr.\n");
2099 return false;
2102 /* For load use the same data-ref load. */
2103 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2105 prev = next;
2106 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2107 continue;
2110 prev = next;
2111 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2113 /* All group members have the same STEP by construction. */
2114 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2116 /* Check that the distance between two accesses is equal to the type
2117 size. Otherwise, we have gaps. */
2118 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2119 - TREE_INT_CST_LOW (prev_init)) / type_size;
2120 if (diff != 1)
2122 /* FORNOW: SLP of accesses with gaps is not supported. */
2123 slp_impossible = true;
2124 if (DR_IS_WRITE (data_ref))
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "interleaved store with gaps\n");
2129 return false;
2132 gaps += diff - 1;
2135 last_accessed_element += diff;
2137 /* Store the gap from the previous member of the group. If there is no
2138 gap in the access, GROUP_GAP is always 1. */
2139 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2141 prev_init = DR_INIT (data_ref);
2142 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2143 /* Count the number of data-refs in the chain. */
2144 count++;
2147 /* COUNT is the number of accesses found; we multiply it by the size of
2148 the type to get COUNT_IN_BYTES. */
2149 count_in_bytes = type_size * count;
2151 /* Check that the size of the interleaving (including gaps) is not
2152 greater than STEP. */
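/* Worked example (made-up numbers): 4-byte loads at offsets 0, 4 and 12
   within a DR_STEP of 16 give COUNT = 3, GAPS = 1 and
   COUNT_IN_BYTES + GAPS * TYPE_SIZE = 12 + 4 = 16, which does not exceed
   the step, so the group is accepted by this check.  */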
2153 if (dr_step != 0
2154 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2156 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2159 "interleaving size is greater than step for ");
2160 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2161 DR_REF (dr));
2162 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2164 return false;
2167 /* Check that the size of the interleaving is equal to STEP for stores,
2168 i.e., that there are no gaps. */
2169 if (dr_step != 0
2170 && absu_hwi (dr_step) != count_in_bytes)
2172 if (DR_IS_READ (dr))
2174 slp_impossible = true;
2175 /* There is a gap after the last load in the group. This gap is the
2176 difference between the groupsize and the number of elements.
2177 When there is no gap, this difference should be 0. */
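/* E.g. (illustrative): 4-byte loads at offsets 0, 4 and 8 within a
   DR_STEP of 16 give GROUPSIZE 4 but COUNT 3, so a gap of one element
   is recorded on the first statement of the group.  */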
2178 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2180 else
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "interleaved store with gaps\n");
2185 return false;
2189 /* Check that STEP is a multiple of type size. */
2190 if (dr_step != 0
2191 && (dr_step % type_size) != 0)
2193 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2196 "step is not a multiple of type size: step ");
2197 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2198 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2199 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2200 TYPE_SIZE_UNIT (scalar_type));
2201 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2203 return false;
2206 if (groupsize == 0)
2207 groupsize = count;
2209 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_NOTE, vect_location,
2212 "Detected interleaving of size %d\n", (int)groupsize);
2214 /* SLP: create an SLP data structure for every interleaving group of
2215 stores for further analysis in vect_analyze_slp. */
2216 if (DR_IS_WRITE (dr) && !slp_impossible)
2218 if (loop_vinfo)
2219 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2220 if (bb_vinfo)
2221 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2224 /* There is a gap in the end of the group. */
2225 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "Data access with gaps requires scalar "
2230 "epilogue loop\n");
2231 if (loop->inner)
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235 "Peeling for outer loop is not supported\n");
2236 return false;
2239 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2243 return true;
2247 /* Analyze the access pattern of the data-reference DR.
2248 In case of non-consecutive accesses call vect_analyze_group_access() to
2249 analyze groups of accesses. */
2251 static bool
2252 vect_analyze_data_ref_access (struct data_reference *dr)
2254 tree step = DR_STEP (dr);
2255 tree scalar_type = TREE_TYPE (DR_REF (dr));
2256 gimple stmt = DR_STMT (dr);
2257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2259 struct loop *loop = NULL;
2261 if (loop_vinfo)
2262 loop = LOOP_VINFO_LOOP (loop_vinfo);
2264 if (loop_vinfo && !step)
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 "bad data-ref access in loop\n");
2269 return false;
2272 /* Allow invariant loads in not nested loops. */
2273 if (loop_vinfo && integer_zerop (step))
2275 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2276 if (nested_in_vect_loop_p (loop, stmt))
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE, vect_location,
2280 "zero step in inner loop of nest\n");
2281 return false;
2283 return DR_IS_READ (dr);
2286 if (loop && nested_in_vect_loop_p (loop, stmt))
2288 /* Interleaved accesses are not yet supported within outer-loop
2289 vectorization for references in the inner-loop. */
2290 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2292 /* For the rest of the analysis we use the outer-loop step. */
2293 step = STMT_VINFO_DR_STEP (stmt_info);
2294 if (integer_zerop (step))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "zero step in outer loop.\n");
2299 if (DR_IS_READ (dr))
2300 return true;
2301 else
2302 return false;
2306 /* Consecutive? */
2307 if (TREE_CODE (step) == INTEGER_CST)
2309 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2310 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2311 || (dr_step < 0
2312 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2314 /* Mark that it is not interleaving. */
2315 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2316 return true;
2320 if (loop && nested_in_vect_loop_p (loop, stmt))
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE, vect_location,
2324 "grouped access in outer loop.\n");
2325 return false;
2328 /* Assume this is a DR handled by the non-constant strided load case. */
2329 if (TREE_CODE (step) != INTEGER_CST)
2330 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2332 /* Not a consecutive access - check if it is part of an interleaving group. */
2333 return vect_analyze_group_access (dr);
2338 /* A helper function used in the comparator function to sort data
2339 references. T1 and T2 are two data references to be compared.
2340 The function returns -1, 0, or 1. */
2342 static int
2343 compare_tree (tree t1, tree t2)
2345 int i, cmp;
2346 enum tree_code code;
2347 char tclass;
2349 if (t1 == t2)
2350 return 0;
2351 if (t1 == NULL)
2352 return -1;
2353 if (t2 == NULL)
2354 return 1;
2357 if (TREE_CODE (t1) != TREE_CODE (t2))
2358 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2360 code = TREE_CODE (t1);
2361 switch (code)
2363 /* For const values, we can just use hash values for comparisons. */
2364 case INTEGER_CST:
2365 case REAL_CST:
2366 case FIXED_CST:
2367 case STRING_CST:
2368 case COMPLEX_CST:
2369 case VECTOR_CST:
2371 hashval_t h1 = iterative_hash_expr (t1, 0);
2372 hashval_t h2 = iterative_hash_expr (t2, 0);
2373 if (h1 != h2)
2374 return h1 < h2 ? -1 : 1;
2375 break;
2378 case SSA_NAME:
2379 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2380 if (cmp != 0)
2381 return cmp;
2383 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2384 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2385 break;
2387 default:
2388 tclass = TREE_CODE_CLASS (code);
2390 /* For var-decl, we could compare their UIDs. */
2391 if (tclass == tcc_declaration)
2393 if (DECL_UID (t1) != DECL_UID (t2))
2394 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2395 break;
2398 /* For expressions with operands, compare their operands recursively. */
2399 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2401 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2402 if (cmp != 0)
2403 return cmp;
2407 return 0;
2411 /* Compare two data-references DRA and DRB to group them into chunks
2412 suitable for grouping. */
2414 static int
2415 dr_group_sort_cmp (const void *dra_, const void *drb_)
2417 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2418 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2419 int cmp;
2421 /* Stabilize sort. */
2422 if (dra == drb)
2423 return 0;
2425 /* Ordering of DRs according to base. */
2426 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2428 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2429 if (cmp != 0)
2430 return cmp;
2433 /* And according to DR_OFFSET. */
2434 if (!dr_equal_offsets_p (dra, drb))
2436 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2437 if (cmp != 0)
2438 return cmp;
2441 /* Put reads before writes. */
2442 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2443 return DR_IS_READ (dra) ? -1 : 1;
2445 /* Then sort after access size. */
2446 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2447 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2449 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2450 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2451 if (cmp != 0)
2452 return cmp;
2455 /* And after step. */
2456 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2458 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2459 if (cmp != 0)
2460 return cmp;
2463 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2464 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2465 if (cmp == 0)
2466 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2467 return cmp;
2470 /* Function vect_analyze_data_ref_accesses.
2472 Analyze the access pattern of all the data references in the loop.
2474 FORNOW: the only access pattern that is considered vectorizable is a
2475 simple step 1 (consecutive) access.
2477 FORNOW: handle only arrays and pointer accesses. */
2479 bool
2480 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2482 unsigned int i;
2483 vec<data_reference_p> datarefs;
2484 struct data_reference *dr;
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "=== vect_analyze_data_ref_accesses ===\n");
2490 if (loop_vinfo)
2491 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2492 else
2493 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2495 if (datarefs.is_empty ())
2496 return true;
2498 /* Sort the array of datarefs to make building the interleaving chains
2499 linear. Don't modify the original vector's order, it is needed for
2500 determining what dependencies are reversed. */
2501 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2502 qsort (datarefs_copy.address (), datarefs_copy.length (),
2503 sizeof (data_reference_p), dr_group_sort_cmp);
2505 /* Build the interleaving chains. */
2506 for (i = 0; i < datarefs_copy.length () - 1;)
2508 data_reference_p dra = datarefs_copy[i];
2509 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2510 stmt_vec_info lastinfo = NULL;
2511 for (i = i + 1; i < datarefs_copy.length (); ++i)
2513 data_reference_p drb = datarefs_copy[i];
2514 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2516 /* ??? Imperfect sorting (non-compatible types, non-modulo
2517 accesses, same accesses) can lead to a group being artificially
2518 split here as we don't just skip over those. If it really
2519 matters we can push those to a worklist and re-iterate
2520 over them. Then we can just skip ahead to the next DR here. */
2522 /* Check that the data-refs have same first location (except init)
2523 and they are both either store or load (not load and store). */
2524 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2525 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2526 DR_BASE_ADDRESS (drb), 0)
2527 || !dr_equal_offsets_p (dra, drb))
2528 break;
2530 /* Check that the data-refs have the same constant size and step. */
2531 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2532 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2533 if (!tree_fits_uhwi_p (sza)
2534 || !tree_fits_uhwi_p (szb)
2535 || !tree_int_cst_equal (sza, szb)
2536 || !tree_fits_shwi_p (DR_STEP (dra))
2537 || !tree_fits_shwi_p (DR_STEP (drb))
2538 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2539 break;
2541 /* Do not place the same access in the interleaving chain twice. */
2542 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2543 break;
2545 /* Check the types are compatible.
2546 ??? We don't distinguish this during sorting. */
2547 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2548 TREE_TYPE (DR_REF (drb))))
2549 break;
2551 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2552 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2553 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2554 gcc_assert (init_a < init_b);
2556 /* If init_b == init_a + the size of the type * k, we have an
2557 interleaving, and DRA is accessed before DRB. */
2558 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2559 if ((init_b - init_a) % type_size_a != 0)
2560 break;
2562 /* The step (if not zero) is greater than the difference between
2563 data-refs' inits. This splits groups into suitable sizes. */
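/* E.g. (illustrative): with 4-byte elements and DR_STEP 16, accesses at
   inits 0, 4, 8 and 12 all stay in one chain, while an access at init 16
   fails this test and starts a new group.  */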
2564 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2565 if (step != 0 && step <= (init_b - init_a))
2566 break;
2568 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_NOTE, vect_location,
2571 "Detected interleaving ");
2572 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2573 dump_printf (MSG_NOTE, " and ");
2574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2575 dump_printf (MSG_NOTE, "\n");
2578 /* Link the found element into the group list. */
2579 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2581 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2582 lastinfo = stmtinfo_a;
2584 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2585 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2586 lastinfo = stmtinfo_b;
2590 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2591 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2592 && !vect_analyze_data_ref_access (dr))
2594 if (dump_enabled_p ())
2595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2596 "not vectorized: complicated access pattern.\n");
2598 if (bb_vinfo)
2600 /* Mark the statement as not vectorizable. */
2601 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2602 continue;
2604 else
2606 datarefs_copy.release ();
2607 return false;
2611 datarefs_copy.release ();
2612 return true;
2616 /* Operator == between two dr_with_seg_len objects.
2618 This equality operator is used to make sure two data refs
2619 are the same so that we will consider combining the
2620 aliasing checks of those two pairs of data-dependent data
2621 refs. */
2623 static bool
2624 operator == (const dr_with_seg_len& d1,
2625 const dr_with_seg_len& d2)
2627 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2628 DR_BASE_ADDRESS (d2.dr), 0)
2629 && compare_tree (d1.offset, d2.offset) == 0
2630 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2633 /* Function comp_dr_with_seg_len_pair.
2635 Comparison function for sorting objects of dr_with_seg_len_pair_t
2636 so that we can combine aliasing checks in one scan. */
2638 static int
2639 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2641 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2642 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2644 const dr_with_seg_len &p11 = p1->first,
2645 &p12 = p1->second,
2646 &p21 = p2->first,
2647 &p22 = p2->second;
2649 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2650 if a and c have the same base address and step, and b and d have the same
2651 address and step. Therefore, if either a&c or b&d don't have the same address
2652 and step, we don't care about the order of those two pairs after sorting. */
2653 int comp_res;
2655 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2656 DR_BASE_ADDRESS (p21.dr))) != 0)
2657 return comp_res;
2658 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2659 DR_BASE_ADDRESS (p22.dr))) != 0)
2660 return comp_res;
2661 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2662 return comp_res;
2663 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2664 return comp_res;
2665 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2666 return comp_res;
2667 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2668 return comp_res;
2670 return 0;
2673 template <class T> static void
2674 swap (T& a, T& b)
2676 T c (a);
2677 a = b;
2678 b = c;
2681 /* Function vect_vfa_segment_size.
2683 Create an expression that computes the size of the segment
2684 that will be accessed for a data reference. The function takes into
2685 account that realignment loads may access one more vector.
2687 Input:
2688 DR: The data reference.
2689 LENGTH_FACTOR: segment length to consider.
2691 Return an expression whose value is the size of segment which will be
2692 accessed by DR. */
2694 static tree
2695 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2697 tree segment_length;
2699 if (integer_zerop (DR_STEP (dr)))
2700 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2701 else
2702 segment_length = size_binop (MULT_EXPR,
2703 fold_convert (sizetype, DR_STEP (dr)),
2704 fold_convert (sizetype, length_factor));
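/* E.g. (illustrative): for a 4-byte access with DR_STEP 4 and a
   LENGTH_FACTOR equal to a vectorization factor of 4, the segment is
   16 bytes; one extra vector size is added below when the
   dr_explicit_realign_optimized scheme may read past it.  */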
2706 if (vect_supportable_dr_alignment (dr, false)
2707 == dr_explicit_realign_optimized)
2709 tree vector_size = TYPE_SIZE_UNIT
2710 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2712 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2714 return segment_length;
2717 /* Function vect_prune_runtime_alias_test_list.
2719 Prune a list of ddrs to be tested at run-time by versioning for alias.
2720 Merge several alias checks into one if possible.
2721 Return FALSE if the resulting list of ddrs is longer than allowed by
2722 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2724 bool
2725 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2727 vec<ddr_p> may_alias_ddrs =
2728 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2729 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2730 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2731 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2732 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2734 ddr_p ddr;
2735 unsigned int i;
2736 tree length_factor;
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "=== vect_prune_runtime_alias_test_list ===\n");
2742 if (may_alias_ddrs.is_empty ())
2743 return true;
2745 /* Basically, for each pair of dependent data refs store_ptr_0
2746 and load_ptr_0, we create an expression:
2748 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2749 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2751 for aliasing checks. However, in some cases we can decrease
2752 the number of checks by combining two checks into one. For
2753 example, suppose we have another pair of data refs store_ptr_0
2754 and load_ptr_1, and if the following condition is satisfied:
2756 load_ptr_0 < load_ptr_1 &&
2757 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2759 (this condition means, in each iteration of vectorized loop,
2760 the accessed memory of store_ptr_0 cannot be between the memory
2761 of load_ptr_0 and load_ptr_1.)
2763 we can then use only the following expression to finish the
2764 aliasing checks between store_ptr_0 & load_ptr_0 and
2765 store_ptr_0 & load_ptr_1:
2767 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2768 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2770 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2771 same basic address. */
2773 comp_alias_ddrs.create (may_alias_ddrs.length ());
2775 /* First, we collect all data ref pairs for aliasing checks. */
2776 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2778 struct data_reference *dr_a, *dr_b;
2779 gimple dr_group_first_a, dr_group_first_b;
2780 tree segment_length_a, segment_length_b;
2781 gimple stmt_a, stmt_b;
2783 dr_a = DDR_A (ddr);
2784 stmt_a = DR_STMT (DDR_A (ddr));
2785 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2786 if (dr_group_first_a)
2788 stmt_a = dr_group_first_a;
2789 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2792 dr_b = DDR_B (ddr);
2793 stmt_b = DR_STMT (DDR_B (ddr));
2794 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2795 if (dr_group_first_b)
2797 stmt_b = dr_group_first_b;
2798 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2801 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2802 length_factor = scalar_loop_iters;
2803 else
2804 length_factor = size_int (vect_factor);
2805 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2806 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2808 dr_with_seg_len_pair_t dr_with_seg_len_pair
2809 (dr_with_seg_len (dr_a, segment_length_a),
2810 dr_with_seg_len (dr_b, segment_length_b));
2812 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2813 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2815 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2818 /* Second, we sort the collected data ref pairs so that we can scan
2819 them once to combine all possible aliasing checks. */
2820 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2822 /* Third, we scan the sorted dr pairs and check if we can combine
2823 alias checks of two neighbouring dr pairs. */
2824 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2826 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2827 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2828 *dr_b1 = &comp_alias_ddrs[i-1].second,
2829 *dr_a2 = &comp_alias_ddrs[i].first,
2830 *dr_b2 = &comp_alias_ddrs[i].second;
2832 /* Remove duplicate data ref pairs. */
2833 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2835 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_NOTE, vect_location,
2838 "found equal ranges ");
2839 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2840 DR_REF (dr_a1->dr));
2841 dump_printf (MSG_NOTE, ", ");
2842 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2843 DR_REF (dr_b1->dr));
2844 dump_printf (MSG_NOTE, " and ");
2845 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2846 DR_REF (dr_a2->dr));
2847 dump_printf (MSG_NOTE, ", ");
2848 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2849 DR_REF (dr_b2->dr));
2850 dump_printf (MSG_NOTE, "\n");
2853 comp_alias_ddrs.ordered_remove (i--);
2854 continue;
2857 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2859 /* We consider the case that DR_B1 and DR_B2 are the same memref,
2860 and DR_A1 and DR_A2 are two consecutive memrefs. */
2861 if (*dr_a1 == *dr_a2)
2863 swap (dr_a1, dr_b1);
2864 swap (dr_a2, dr_b2);
2867 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2868 DR_BASE_ADDRESS (dr_a2->dr),
2870 || !tree_fits_shwi_p (dr_a1->offset)
2871 || !tree_fits_shwi_p (dr_a2->offset))
2872 continue;
2874 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2875 - tree_to_shwi (dr_a1->offset));
2878 /* Now we check if the following condition is satisfied:
2880 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2882 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2883 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2884 have to make a best estimate. We can get the minimum value
2885 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2886 then either of the following two conditions can guarantee the
2887 one above:
2889 1: DIFF <= MIN_SEG_LEN_B
2890 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
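     For illustration (made-up numbers): with 16-byte segments on both
     sides and DIFF = 16, condition 1 holds (16 <= 16), so the two checks
     are merged below by growing DR_A1's segment to DR_A2's segment length
     plus DIFF.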
2894 HOST_WIDE_INT
2895 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2896 TREE_INT_CST_LOW (dr_b1->seg_len) :
2897 vect_factor;
2899 if (diff <= min_seg_len_b
2900 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2901 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2902 min_seg_len_b))
2904 dr_a1->seg_len = size_binop (PLUS_EXPR,
2905 dr_a2->seg_len, size_int (diff));
2906 comp_alias_ddrs.ordered_remove (i--);
2911 if ((int) comp_alias_ddrs.length () >
2912 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2914 if (dump_enabled_p ())
2916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2917 "disable versioning for alias - max number of "
2918 "generated checks exceeded.\n");
2921 return false;
2924 return true;
2927 /* Check whether a non-affine read in stmt is suitable for gather load
2928 and if so, return a builtin decl for that operation. */
2930 tree
2931 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2932 tree *offp, int *scalep)
2934 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2935 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2936 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2937 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2938 tree offtype = NULL_TREE;
2939 tree decl, base, off;
2940 enum machine_mode pmode;
2941 int punsignedp, pvolatilep;
2943 base = DR_REF (dr);
2944 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2945 see if we can use the def stmt of the address. */
2946 if (is_gimple_call (stmt)
2947 && gimple_call_internal_p (stmt)
2948 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2949 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2950 && TREE_CODE (base) == MEM_REF
2951 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2952 && integer_zerop (TREE_OPERAND (base, 1))
2953 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2955 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2956 if (is_gimple_assign (def_stmt)
2957 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2958 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2961 /* The gather builtins need address of the form
2962 loop_invariant + vector * {1, 2, 4, 8}
2964 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2965 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2966 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2967 multiplications and additions in it. To get a vector, we need
2968 a single SSA_NAME that will be defined in the loop and will
2969 contain everything that is not loop invariant and that can be
2970 vectorized. The following code attempts to find such a preexisting
2971 SSA_NAME OFF and put the loop invariants into a tree BASE
2972 that can be gimplified before the loop. */
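/* Illustrative example (an assumption about typical input, not taken from
   this file): for a read a[b[i]] with 4-byte elements, the loop below is
   expected to end up with BASE holding &a plus any constant offset, OFF
   the SSA_NAME carrying b[i] (possibly after a widening conversion) and
   SCALE 4, matching the loop_invariant + vector * {1, 2, 4, 8} form
   described above.  */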
2973 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2974 &pmode, &punsignedp, &pvolatilep, false);
2975 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2977 if (TREE_CODE (base) == MEM_REF)
2979 if (!integer_zerop (TREE_OPERAND (base, 1)))
2981 if (off == NULL_TREE)
2983 double_int moff = mem_ref_offset (base);
2984 off = double_int_to_tree (sizetype, moff);
2986 else
2987 off = size_binop (PLUS_EXPR, off,
2988 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2990 base = TREE_OPERAND (base, 0);
2992 else
2993 base = build_fold_addr_expr (base);
2995 if (off == NULL_TREE)
2996 off = size_zero_node;
2998 /* If base is not loop invariant, then if off is 0 we start with just
2999 the constant offset in the loop-invariant BASE and continue with base
3000 as OFF; otherwise give up.
3001 We could handle that case by gimplifying the addition of base + off
3002 into some SSA_NAME and using that as off, but for now punt. */
3003 if (!expr_invariant_in_loop_p (loop, base))
3005 if (!integer_zerop (off))
3006 return NULL_TREE;
3007 off = base;
3008 base = size_int (pbitpos / BITS_PER_UNIT);
3010 /* Otherwise put base + constant offset into the loop invariant BASE
3011 and continue with OFF. */
3012 else
3014 base = fold_convert (sizetype, base);
3015 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3018 /* OFF at this point may be either a SSA_NAME or some tree expression
3019 from get_inner_reference. Try to peel off loop invariants from it
3020 into BASE as long as possible. */
3021 STRIP_NOPS (off);
3022 while (offtype == NULL_TREE)
3024 enum tree_code code;
3025 tree op0, op1, add = NULL_TREE;
3027 if (TREE_CODE (off) == SSA_NAME)
3029 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3031 if (expr_invariant_in_loop_p (loop, off))
3032 return NULL_TREE;
3034 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3035 break;
3037 op0 = gimple_assign_rhs1 (def_stmt);
3038 code = gimple_assign_rhs_code (def_stmt);
3039 op1 = gimple_assign_rhs2 (def_stmt);
3041 else
3043 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3044 return NULL_TREE;
3045 code = TREE_CODE (off);
3046 extract_ops_from_tree (off, &code, &op0, &op1);
3048 switch (code)
3050 case POINTER_PLUS_EXPR:
3051 case PLUS_EXPR:
3052 if (expr_invariant_in_loop_p (loop, op0))
3054 add = op0;
3055 off = op1;
3056 do_add:
3057 add = fold_convert (sizetype, add);
3058 if (scale != 1)
3059 add = size_binop (MULT_EXPR, add, size_int (scale));
3060 base = size_binop (PLUS_EXPR, base, add);
3061 continue;
3063 if (expr_invariant_in_loop_p (loop, op1))
3065 add = op1;
3066 off = op0;
3067 goto do_add;
3069 break;
3070 case MINUS_EXPR:
3071 if (expr_invariant_in_loop_p (loop, op1))
3073 add = fold_convert (sizetype, op1);
3074 add = size_binop (MINUS_EXPR, size_zero_node, add);
3075 off = op0;
3076 goto do_add;
3078 break;
3079 case MULT_EXPR:
3080 if (scale == 1 && tree_fits_shwi_p (op1))
3082 scale = tree_to_shwi (op1);
3083 off = op0;
3084 continue;
3086 break;
3087 case SSA_NAME:
3088 off = op0;
3089 continue;
3090 CASE_CONVERT:
3091 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3092 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3093 break;
3094 if (TYPE_PRECISION (TREE_TYPE (op0))
3095 == TYPE_PRECISION (TREE_TYPE (off)))
3097 off = op0;
3098 continue;
3100 if (TYPE_PRECISION (TREE_TYPE (op0))
3101 < TYPE_PRECISION (TREE_TYPE (off)))
3103 off = op0;
3104 offtype = TREE_TYPE (off);
3105 STRIP_NOPS (off);
3106 continue;
3108 break;
3109 default:
3110 break;
3112 break;
3115 /* If at the end OFF still isn't a SSA_NAME or isn't
3116 defined in the loop, punt. */
3117 if (TREE_CODE (off) != SSA_NAME
3118 || expr_invariant_in_loop_p (loop, off))
3119 return NULL_TREE;
3121 if (offtype == NULL_TREE)
3122 offtype = TREE_TYPE (off);
3124 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3125 offtype, scale);
3126 if (decl == NULL_TREE)
3127 return NULL_TREE;
3129 if (basep)
3130 *basep = base;
3131 if (offp)
3132 *offp = off;
3133 if (scalep)
3134 *scalep = scale;
3135 return decl;
3138 /* Function vect_analyze_data_refs.
3140 Find all the data references in the loop or basic block.
3142 The general structure of the analysis of data refs in the vectorizer is as
3143 follows:
3144 1- vect_analyze_data_refs(loop/bb): call
3145 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3146 in the loop/bb and their dependences.
3147 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3148 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3149 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3153 bool
3154 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3155 bb_vec_info bb_vinfo,
3156 int *min_vf)
3158 struct loop *loop = NULL;
3159 basic_block bb = NULL;
3160 unsigned int i;
3161 vec<data_reference_p> datarefs;
3162 struct data_reference *dr;
3163 tree scalar_type;
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_NOTE, vect_location,
3167 "=== vect_analyze_data_refs ===\n");
3169 if (loop_vinfo)
3171 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3173 loop = LOOP_VINFO_LOOP (loop_vinfo);
3174 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3175 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3179 "not vectorized: loop contains function calls"
3180 " or data references that cannot be analyzed\n");
3181 return false;
3184 for (i = 0; i < loop->num_nodes; i++)
3186 gimple_stmt_iterator gsi;
3188 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3190 gimple stmt = gsi_stmt (gsi);
3191 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3193 if (is_gimple_call (stmt) && loop->safelen)
3195 tree fndecl = gimple_call_fndecl (stmt), op;
3196 if (fndecl != NULL_TREE)
3198 struct cgraph_node *node = cgraph_get_node (fndecl);
3199 if (node != NULL && node->simd_clones != NULL)
3201 unsigned int j, n = gimple_call_num_args (stmt);
3202 for (j = 0; j < n; j++)
3204 op = gimple_call_arg (stmt, j);
3205 if (DECL_P (op)
3206 || (REFERENCE_CLASS_P (op)
3207 && get_base_address (op)))
3208 break;
3210 op = gimple_call_lhs (stmt);
3211 /* Ignore #pragma omp declare simd functions
3212 if they don't have data references in the
3213 call stmt itself. */
3214 if (j == n
3215 && !(op
3216 && (DECL_P (op)
3217 || (REFERENCE_CLASS_P (op)
3218 && get_base_address (op)))))
3219 continue;
3223 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3226 "not vectorized: loop contains function "
3227 "calls or data references that cannot "
3228 "be analyzed\n");
3229 return false;
3234 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3236 else
3238 gimple_stmt_iterator gsi;
3240 bb = BB_VINFO_BB (bb_vinfo);
3241 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3243 gimple stmt = gsi_stmt (gsi);
3244 if (!find_data_references_in_stmt (NULL, stmt,
3245 &BB_VINFO_DATAREFS (bb_vinfo)))
3247 /* Mark the rest of the basic-block as unvectorizable. */
3248 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3250 stmt = gsi_stmt (gsi);
3251 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3253 break;
3257 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3260 /* Go through the data-refs, check that the analysis succeeded. Update
3261 pointer from stmt_vec_info struct to DR and vectype. */
3263 FOR_EACH_VEC_ELT (datarefs, i, dr)
3265 gimple stmt;
3266 stmt_vec_info stmt_info;
3267 tree base, offset, init;
3268 bool gather = false;
3269 bool simd_lane_access = false;
3270 int vf;
3272 again:
3273 if (!dr || !DR_REF (dr))
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277 "not vectorized: unhandled data-ref\n");
3278 return false;
3281 stmt = DR_STMT (dr);
3282 stmt_info = vinfo_for_stmt (stmt);
3284 /* Discard clobbers from the dataref vector. We will remove
3285 clobber stmts during vectorization. */
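/* Note on the mechanics below: a clobber in the middle of the vector is
   dropped by moving the last element into slot I and re-analyzing that
   element via the goto, while a trailing clobber is simply popped.  */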
3286 if (gimple_clobber_p (stmt))
3288 if (i == datarefs.length () - 1)
3290 datarefs.pop ();
3291 break;
3293 datarefs[i] = datarefs.pop ();
3294 goto again;
3297 /* Check that analysis of the data-ref succeeded. */
3298 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3299 || !DR_STEP (dr))
3301 bool maybe_gather
3302 = DR_IS_READ (dr)
3303 && !TREE_THIS_VOLATILE (DR_REF (dr))
3304 && targetm.vectorize.builtin_gather != NULL;
3305 bool maybe_simd_lane_access
3306 = loop_vinfo && loop->simduid;
3308 /* If the target supports vector gather loads, or if this might be
3309 a SIMD lane access, see if we can use them. */
3310 if (loop_vinfo
3311 && (maybe_gather || maybe_simd_lane_access)
3312 && !nested_in_vect_loop_p (loop, stmt))
3314 struct data_reference *newdr
3315 = create_data_ref (NULL, loop_containing_stmt (stmt),
3316 DR_REF (dr), stmt, true);
3317 gcc_assert (newdr != NULL && DR_REF (newdr));
3318 if (DR_BASE_ADDRESS (newdr)
3319 && DR_OFFSET (newdr)
3320 && DR_INIT (newdr)
3321 && DR_STEP (newdr)
3322 && integer_zerop (DR_STEP (newdr)))
3324 if (maybe_simd_lane_access)
3326 tree off = DR_OFFSET (newdr);
3327 STRIP_NOPS (off);
3328 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3329 && TREE_CODE (off) == MULT_EXPR
3330 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3332 tree step = TREE_OPERAND (off, 1);
3333 off = TREE_OPERAND (off, 0);
3334 STRIP_NOPS (off);
3335 if (CONVERT_EXPR_P (off)
3336 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3337 0)))
3338 < TYPE_PRECISION (TREE_TYPE (off)))
3339 off = TREE_OPERAND (off, 0);
3340 if (TREE_CODE (off) == SSA_NAME)
3342 gimple def = SSA_NAME_DEF_STMT (off);
3343 tree reft = TREE_TYPE (DR_REF (newdr));
3344 if (is_gimple_call (def)
3345 && gimple_call_internal_p (def)
3346 && (gimple_call_internal_fn (def)
3347 == IFN_GOMP_SIMD_LANE))
3349 tree arg = gimple_call_arg (def, 0);
3350 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3351 arg = SSA_NAME_VAR (arg);
3352 if (arg == loop->simduid
3353 /* For now. */
3354 && tree_int_cst_equal
3355 (TYPE_SIZE_UNIT (reft),
3356 step))
3358 DR_OFFSET (newdr) = ssize_int (0);
3359 DR_STEP (newdr) = step;
3360 DR_ALIGNED_TO (newdr)
3361 = size_int (BIGGEST_ALIGNMENT);
3362 dr = newdr;
3363 simd_lane_access = true;
3369 if (!simd_lane_access && maybe_gather)
3371 dr = newdr;
3372 gather = true;
3375 if (!gather && !simd_lane_access)
3376 free_data_ref (newdr);
3379 if (!gather && !simd_lane_access)
3381 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "not vectorized: data ref analysis "
3385 "failed ");
3386 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3387 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3390 if (bb_vinfo)
3391 break;
3393 return false;
3397 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3401 "not vectorized: base addr of dr is a "
3402 "constant\n");
3404 if (bb_vinfo)
3405 break;
3407 if (gather || simd_lane_access)
3408 free_data_ref (dr);
3409 return false;
3412 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3417 "not vectorized: volatile type ");
3418 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3419 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3422 if (bb_vinfo)
3423 break;
3425 return false;
3428 if (stmt_can_throw_internal (stmt))
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3433 "not vectorized: statement can throw an "
3434 "exception ");
3435 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3436 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3439 if (bb_vinfo)
3440 break;
3442 if (gather || simd_lane_access)
3443 free_data_ref (dr);
3444 return false;
3447 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3448 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3450 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3453 "not vectorized: statement is bitfield "
3454 "access ");
3455 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3456 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3459 if (bb_vinfo)
3460 break;
3462 if (gather || simd_lane_access)
3463 free_data_ref (dr);
3464 return false;
3467 base = unshare_expr (DR_BASE_ADDRESS (dr));
3468 offset = unshare_expr (DR_OFFSET (dr));
3469 init = unshare_expr (DR_INIT (dr));
3471 if (is_gimple_call (stmt)
3472 && (!gimple_call_internal_p (stmt)
3473 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3474 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3476 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3479 "not vectorized: dr in a call ");
3480 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3484 if (bb_vinfo)
3485 break;
3487 if (gather || simd_lane_access)
3488 free_data_ref (dr);
3489 return false;
3492 /* Update DR field in stmt_vec_info struct. */
3494 /* If the dataref is in an inner-loop of the loop that is considered
3495 for vectorization, we also want to analyze the access relative to
3496 the outer-loop (DR contains information only relative to the
3497 inner-most enclosing loop). We do that by building a reference to the
3498 first location accessed by the inner-loop, and analyze it relative to
3499 the outer-loop. */
3500 if (loop && nested_in_vect_loop_p (loop, stmt))
3502 tree outer_step, outer_base, outer_init;
3503 HOST_WIDE_INT pbitsize, pbitpos;
3504 tree poffset;
3505 enum machine_mode pmode;
3506 int punsignedp, pvolatilep;
3507 affine_iv base_iv, offset_iv;
3508 tree dinit;
3510 /* Build a reference to the first location accessed by the
3511 inner-loop: *(BASE+INIT). (The first location is actually
3512 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3513 tree inner_base = build_fold_indirect_ref
3514 (fold_build_pointer_plus (base, init));
3516 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "analyze in outer-loop: ");
3520 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3521 dump_printf (MSG_NOTE, "\n");
3524 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3525 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3526 gcc_assert (outer_base != NULL_TREE);
3528 if (pbitpos % BITS_PER_UNIT != 0)
3530 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3532 "failed: bit offset alignment.\n");
3533 return false;
3536 outer_base = build_fold_addr_expr (outer_base);
3537 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3538 &base_iv, false))
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "failed: evolution of base is not affine.\n");
3543 return false;
3546 if (offset)
3548 if (poffset)
3549 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3550 poffset);
3551 else
3552 poffset = offset;
3555 if (!poffset)
3557 offset_iv.base = ssize_int (0);
3558 offset_iv.step = ssize_int (0);
3560 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3561 &offset_iv, false))
3563 if (dump_enabled_p ())
3564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3565 "evolution of offset is not affine.\n");
3566 return false;
3569 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3570 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3571 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3572 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3573 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3575 outer_step = size_binop (PLUS_EXPR,
3576 fold_convert (ssizetype, base_iv.step),
3577 fold_convert (ssizetype, offset_iv.step));
3579 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3580 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3581 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3582 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3583 STMT_VINFO_DR_OFFSET (stmt_info) =
3584 fold_convert (ssizetype, offset_iv.base);
3585 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3586 size_int (highest_pow2_factor (offset_iv.base));
3588 if (dump_enabled_p ())
3590 dump_printf_loc (MSG_NOTE, vect_location,
3591 "\touter base_address: ");
3592 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3593 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3594 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3595 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3596 STMT_VINFO_DR_OFFSET (stmt_info));
3597 dump_printf (MSG_NOTE,
3598 "\n\touter constant offset from base address: ");
3599 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3600 STMT_VINFO_DR_INIT (stmt_info));
3601 dump_printf (MSG_NOTE, "\n\touter step: ");
3602 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3603 STMT_VINFO_DR_STEP (stmt_info));
3604 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3605 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3606 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3607 dump_printf (MSG_NOTE, "\n");
3611 if (STMT_VINFO_DATA_REF (stmt_info))
3613 if (dump_enabled_p ())
3615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3616 "not vectorized: more than one data ref "
3617 "in stmt: ");
3618 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3619 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3622 if (bb_vinfo)
3623 break;
3625 if (gather || simd_lane_access)
3626 free_data_ref (dr);
3627 return false;
3630 STMT_VINFO_DATA_REF (stmt_info) = dr;
3631 if (simd_lane_access)
3633 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3634 datarefs[i] = dr;
3637 /* Set vectype for STMT. */
3638 scalar_type = TREE_TYPE (DR_REF (dr));
3639 STMT_VINFO_VECTYPE (stmt_info) =
3640 get_vectype_for_scalar_type (scalar_type);
3641 if (!STMT_VINFO_VECTYPE (stmt_info))
3643 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646 "not vectorized: no vectype for stmt: ");
3647 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3648 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3649 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3650 scalar_type);
3651 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3654 if (bb_vinfo)
3655 break;
3657 if (gather || simd_lane_access)
3659 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3660 free_data_ref (dr);
3662 return false;
3664 else
3666 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_NOTE, vect_location,
3669 "got vectype for stmt: ");
3670 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3671 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3672 STMT_VINFO_VECTYPE (stmt_info));
3673 dump_printf (MSG_NOTE, "\n");
3677 /* Adjust the minimal vectorization factor according to the
3678 vector type. */
3679 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3680 if (vf > *min_vf)
3681 *min_vf = vf;
3683 if (gather)
3685 tree off;
3687 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3688 if (gather
3689 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3690 gather = false;
3691 if (!gather)
3693 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3694 free_data_ref (dr);
3695 if (dump_enabled_p ())
3697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3698 "not vectorized: not suitable for gather "
3699 "load ");
3700 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3701 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3703 return false;
3706 datarefs[i] = dr;
3707 STMT_VINFO_GATHER_P (stmt_info) = true;
3709 else if (loop_vinfo
3710 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3712 if (nested_in_vect_loop_p (loop, stmt)
3713 || !DR_IS_READ (dr))
3715 if (dump_enabled_p ())
3717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3718 "not vectorized: not suitable for strided "
3719 "load ");
3720 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3721 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3723 return false;
3725 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3729 /* If, while trying to vectorize a basic block, we stopped the analysis
3730 at the first dataref we could not analyze, mark the remaining datarefs
3731 as not vectorizable and truncate the vector of datarefs. That
3732 avoids spending useless time analyzing their dependences. */
3733 if (i != datarefs.length ())
3735 gcc_assert (bb_vinfo != NULL);
3736 for (unsigned j = i; j < datarefs.length (); ++j)
3738 data_reference_p dr = datarefs[j];
3739 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3740 free_data_ref (dr);
3742 datarefs.truncate (i);
3745 return true;
3749 /* Function vect_get_new_vect_var.
3751 Returns a name for a new variable. The current naming scheme prefixes
3752 vectorizer-generated variables with "vect_", "stmp_" or "vectp_"
3753 (depending on the value of VAR_KIND), and appends NAME to that prefix
3754 if provided. */
3756 tree
3757 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3759 const char *prefix;
3760 tree new_vect_var;
3762 switch (var_kind)
3764 case vect_simple_var:
3765 prefix = "vect";
3766 break;
3767 case vect_scalar_var:
3768 prefix = "stmp";
3769 break;
3770 case vect_pointer_var:
3771 prefix = "vectp";
3772 break;
3773 default:
3774 gcc_unreachable ();
3777 if (name)
3779 char* tmp = concat (prefix, "_", name, NULL);
3780 new_vect_var = create_tmp_reg (type, tmp);
3781 free (tmp);
3783 else
3784 new_vect_var = create_tmp_reg (type, prefix);
3786 return new_vect_var;
3790 /* Function vect_create_addr_base_for_vector_ref.
3792 Create an expression that computes the address of the first memory location
3793 that will be accessed for a data reference.
3795 Input:
3796 STMT: The statement containing the data reference.
3797 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3798 OFFSET: Optional. If supplied, it will be added to the initial address.
3799 LOOP: Specify relative to which loop-nest the address should be computed.
3800 For example, when the dataref is in an inner-loop nested in an
3801 outer-loop that is now being vectorized, LOOP can be either the
3802 outer-loop, or the inner-loop. The first memory location accessed
3803 by the following dataref ('in' points to short):
3805 for (i=0; i<N; i++)
3806 for (j=0; j<M; j++)
3807 s += in[i+j]
3809 is as follows:
3810 if LOOP=i_loop: &in (relative to i_loop)
3811 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3813 Output:
3814 1. Return an SSA_NAME whose value is the address of the memory location of
3815 the first vector of the data reference.
3816 2. If NEW_STMT_LIST is not NULL_TREE after return, the caller must insert
3817 the statement(s) it contains, which define the returned SSA_NAME.
3819 FORNOW: We are only handling array accesses with step 1. */
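/* A minimal illustrative sketch, not vectorizer code: the helper below is
   hypothetical and only shows the arithmetic this function emits, namely
   base + (DR_OFFSET + DR_INIT) [+ OFFSET * step].  For the j_loop case of
   the example above ('in' points to short, OFFSET == i) this is &in plus
   i elements of 2 bytes each.  */

static inline short *
example_addr_base_for_j_loop (short *in, long i)
{
  long base_offset = 0 /* DR_OFFSET */ + 0 /* DR_INIT */;
  long step = sizeof (short);   /* TYPE_SIZE_UNIT of the accessed type.  */
  return (short *) ((char *) in + base_offset + i * step);
}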
3821 tree
3822 vect_create_addr_base_for_vector_ref (gimple stmt,
3823 gimple_seq *new_stmt_list,
3824 tree offset,
3825 struct loop *loop)
3827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3828 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3829 tree data_ref_base;
3830 const char *base_name;
3831 tree addr_base;
3832 tree dest;
3833 gimple_seq seq = NULL;
3834 tree base_offset;
3835 tree init;
3836 tree vect_ptr_type;
3837 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3838 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3840 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3842 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3844 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3846 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3847 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3848 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3850 else
3852 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3853 base_offset = unshare_expr (DR_OFFSET (dr));
3854 init = unshare_expr (DR_INIT (dr));
3857 if (loop_vinfo)
3858 base_name = get_name (data_ref_base);
3859 else
3861 base_offset = ssize_int (0);
3862 init = ssize_int (0);
3863 base_name = get_name (DR_REF (dr));
3866 /* Create base_offset */
3867 base_offset = size_binop (PLUS_EXPR,
3868 fold_convert (sizetype, base_offset),
3869 fold_convert (sizetype, init));
3871 if (offset)
3873 offset = fold_build2 (MULT_EXPR, sizetype,
3874 fold_convert (sizetype, offset), step);
3875 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3876 base_offset, offset);
3879 /* base + base_offset */
3880 if (loop_vinfo)
3881 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3882 else
3884 addr_base = build1 (ADDR_EXPR,
3885 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3886 unshare_expr (DR_REF (dr)));
3889 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3890 addr_base = fold_convert (vect_ptr_type, addr_base);
3891 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3892 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3893 gimple_seq_add_seq (new_stmt_list, seq);
3895 if (DR_PTR_INFO (dr)
3896 && TREE_CODE (addr_base) == SSA_NAME)
3898 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3899 if (offset)
3900 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3903 if (dump_enabled_p ())
3905 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3906 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3907 dump_printf (MSG_NOTE, "\n");
3910 return addr_base;
3914 /* Function vect_create_data_ref_ptr.
3916 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3917 location accessed in the loop by STMT, along with the def-use update
3918 chain to appropriately advance the pointer through the loop iterations.
3919 Also set aliasing information for the pointer. This pointer is used by
3920 the callers to this function to create a memory reference expression for
3921 vector load/store access.
3923 Input:
3924 1. STMT: a stmt that references memory. Expected to be of the form
3925 GIMPLE_ASSIGN <name, data-ref> or
3926 GIMPLE_ASSIGN <data-ref, name>.
3927 2. AGGR_TYPE: the type of the reference, which should be either a vector
3928 or an array.
3929 3. AT_LOOP: the loop where the vector memref is to be created.
3930 4. OFFSET (optional): an offset to be added to the initial address accessed
3931 by the data-ref in STMT.
3932 5. BSI: location where the new stmts are to be placed if there is no loop
3933 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3934 pointing to the initial address.
3936 Output:
3937 1. Declare a new ptr to vector_type, and have it point to the base of the
3938 data reference (initial address accessed by the data reference).
3939 For example, for vector of type V8HI, the following code is generated:
3941 v8hi *ap;
3942 ap = (v8hi *)initial_address;
3944 if OFFSET is not supplied:
3945 initial_address = &a[init];
3946 if OFFSET is supplied:
3947 initial_address = &a[init + OFFSET];
3949 Return the initial_address in INITIAL_ADDRESS.
3951 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3952 update the pointer in each iteration of the loop.
3954 Return the increment stmt that updates the pointer in PTR_INCR.
3956 3. Set INV_P to true if the access pattern of the data reference in the
3957 vectorized loop is invariant. Set it to false otherwise.
3959 4. Return the pointer. */
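/* A minimal illustrative sketch, not vectorizer output: the typedef and
   helper below are hypothetical and use the GNU vector extension only to
   show what the generated code amounts to when ONLY_INIT is false - declare
   the aggregate pointer, point it at the initial address, and bump it by
   the vector size every iteration.  */

typedef short example_v8hi __attribute__ ((vector_size (16)));

static example_v8hi
example_data_ref_ptr_walk (short *initial_address, int nvectors)
{
  example_v8hi sum = { 0 };
  example_v8hi *ap = (example_v8hi *) initial_address; /* ap = (v8hi *) initial_address.  */
  for (int i = 0; i < nvectors; i++)
    {
      sum += *ap;  /* Vector access through the aggregate pointer.  */
      ap += 1;     /* Advance by the vector size each iteration.  */
    }
  return sum;
}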
3961 tree
3962 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3963 tree offset, tree *initial_address,
3964 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3965 bool only_init, bool *inv_p)
3967 const char *base_name;
3968 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3969 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3970 struct loop *loop = NULL;
3971 bool nested_in_vect_loop = false;
3972 struct loop *containing_loop = NULL;
3973 tree aggr_ptr_type;
3974 tree aggr_ptr;
3975 tree new_temp;
3976 gimple vec_stmt;
3977 gimple_seq new_stmt_list = NULL;
3978 edge pe = NULL;
3979 basic_block new_bb;
3980 tree aggr_ptr_init;
3981 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3982 tree aptr;
3983 gimple_stmt_iterator incr_gsi;
3984 bool insert_after;
3985 tree indx_before_incr, indx_after_incr;
3986 gimple incr;
3987 tree step;
3988 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3990 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3991 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3993 if (loop_vinfo)
3995 loop = LOOP_VINFO_LOOP (loop_vinfo);
3996 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3997 containing_loop = (gimple_bb (stmt))->loop_father;
3998 pe = loop_preheader_edge (loop);
4000 else
4002 gcc_assert (bb_vinfo);
4003 only_init = true;
4004 *ptr_incr = NULL;
4007 /* Check the step (evolution) of the load in LOOP, and record
4008 whether it's invariant. */
4009 if (nested_in_vect_loop)
4010 step = STMT_VINFO_DR_STEP (stmt_info);
4011 else
4012 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4014 if (integer_zerop (step))
4015 *inv_p = true;
4016 else
4017 *inv_p = false;
4019 /* Create an expression for the first address accessed by this load
4020 in LOOP. */
4021 base_name = get_name (DR_BASE_ADDRESS (dr));
4023 if (dump_enabled_p ())
4025 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4026 dump_printf_loc (MSG_NOTE, vect_location,
4027 "create %s-pointer variable to type: ",
4028 get_tree_code_name (TREE_CODE (aggr_type)));
4029 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4030 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4031 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4032 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4033 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4034 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4035 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4036 else
4037 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4038 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4039 dump_printf (MSG_NOTE, "\n");
4042 /* (1) Create the new aggregate-pointer variable.
4043 Vector and array types inherit the alias set of their component
4044 type by default so we need to use a ref-all pointer if the data
4045 reference does not conflict with the created aggregated data
4046 reference because it is not addressable. */
4047 bool need_ref_all = false;
4048 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4049 get_alias_set (DR_REF (dr))))
4050 need_ref_all = true;
4051 /* Likewise for any of the data references in the stmt group. */
4052 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4054 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4057 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4058 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4059 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4060 get_alias_set (DR_REF (sdr))))
4062 need_ref_all = true;
4063 break;
4065 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4067 while (orig_stmt);
4069 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4070 need_ref_all);
4071 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4074 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4075 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4076 def-use update cycles for the pointer: one relative to the outer-loop
4077 (LOOP), which is what steps (3) and (4) below do. The other is relative
4078 to the inner-loop (which is the inner-most loop containing the dataref),
4079 and this is done by step (5) below.
4081 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4082 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4083 redundant. Steps (3),(4) create the following:
4085 vp0 = &base_addr;
4086 LOOP: vp1 = phi(vp0,vp2)
4089 vp2 = vp1 + step
4090 goto LOOP
4092 If there is an inner-loop nested in loop, then step (5) will also be
4093 applied, and an additional update in the inner-loop will be created:
4095 vp0 = &base_addr;
4096 LOOP: vp1 = phi(vp0,vp2)
4098 inner: vp3 = phi(vp1,vp4)
4099 vp4 = vp3 + inner_step
4100 if () goto inner
4102 vp2 = vp1 + step
4103 if () goto LOOP */
4105 /* (2) Calculate the initial address of the aggregate-pointer, and set
4106 the aggregate-pointer to point to it before the loop. */
4108 /* Create: (&(base[init_val+offset]) in the loop preheader. */
4110 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4111 offset, loop);
4112 if (new_stmt_list)
4114 if (pe)
4116 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4117 gcc_assert (!new_bb);
4119 else
4120 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4123 *initial_address = new_temp;
4125 /* Create: p = (aggr_type *) initial_base */
4126 if (TREE_CODE (new_temp) != SSA_NAME
4127 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4129 vec_stmt = gimple_build_assign (aggr_ptr,
4130 fold_convert (aggr_ptr_type, new_temp));
4131 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4132 /* Copy the points-to information if it exists. */
4133 if (DR_PTR_INFO (dr))
4134 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4135 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4136 if (pe)
4138 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4139 gcc_assert (!new_bb);
4141 else
4142 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4144 else
4145 aggr_ptr_init = new_temp;
4147 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4148 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4149 inner-loop nested in LOOP (during outer-loop vectorization). */
4151 /* No update in loop is required. */
4152 if (only_init && (!loop_vinfo || at_loop == loop))
4153 aptr = aggr_ptr_init;
4154 else
4156 /* The step of the aggregate pointer is the type size. */
4157 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4158 /* One exception to the above is when the scalar step of the load in
4159 LOOP is zero. In this case the step here is also zero. */
4160 if (*inv_p)
4161 iv_step = size_zero_node;
4162 else if (tree_int_cst_sgn (step) == -1)
4163 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4165 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4167 create_iv (aggr_ptr_init,
4168 fold_convert (aggr_ptr_type, iv_step),
4169 aggr_ptr, loop, &incr_gsi, insert_after,
4170 &indx_before_incr, &indx_after_incr);
4171 incr = gsi_stmt (incr_gsi);
4172 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4174 /* Copy the points-to information if it exists. */
4175 if (DR_PTR_INFO (dr))
4177 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4178 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4180 if (ptr_incr)
4181 *ptr_incr = incr;
4183 aptr = indx_before_incr;
4186 if (!nested_in_vect_loop || only_init)
4187 return aptr;
4190 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4191 nested in LOOP, if it exists.
4193 gcc_assert (nested_in_vect_loop);
4194 if (!only_init)
4196 standard_iv_increment_position (containing_loop, &incr_gsi,
4197 &insert_after);
4198 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4199 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4200 &indx_after_incr);
4201 incr = gsi_stmt (incr_gsi);
4202 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4204 /* Copy the points-to information if it exists. */
4205 if (DR_PTR_INFO (dr))
4207 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4208 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4210 if (ptr_incr)
4211 *ptr_incr = incr;
4213 return indx_before_incr;
4215 else
4216 gcc_unreachable ();
4220 /* Function bump_vector_ptr
4222 Increment a pointer (to a vector type) by vector-size. If requested,
4223 i.e. if PTR-INCR is given, then also connect the new increment stmt
4224 to the existing def-use update-chain of the pointer, by modifying
4225 the PTR_INCR as illustrated below:
4227 The pointer def-use update-chain before this function:
4228 DATAREF_PTR = phi (p_0, p_2)
4229 ....
4230 PTR_INCR: p_2 = DATAREF_PTR + step
4232 The pointer def-use update-chain after this function:
4233 DATAREF_PTR = phi (p_0, p_2)
4234 ....
4235 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4236 ....
4237 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4239 Input:
4240 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4241 in the loop.
4242 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4243 the loop. The increment amount across iterations is expected
4244 to be vector_size.
4245 BSI - location where the new update stmt is to be placed.
4246 STMT - the original scalar memory-access stmt that is being vectorized.
4247 BUMP - optional. The offset by which to bump the pointer. If not given,
4248 the offset is assumed to be vector_size.
4250 Output: Return NEW_DATAREF_PTR as illustrated above.
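/* A minimal illustrative sketch (hypothetical helper): stripped of the SSA
   bookkeeping, the bump itself is plain pointer arithmetic by BUMP bytes
   (the vector size when BUMP is not given).  */

static inline void *
example_bump_vector_ptr (void *dataref_ptr, __SIZE_TYPE__ bump)
{
  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP.  */
  return (char *) dataref_ptr + bump;
}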
4254 tree
4255 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4256 gimple stmt, tree bump)
4258 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4259 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4260 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4261 tree update = TYPE_SIZE_UNIT (vectype);
4262 gimple incr_stmt;
4263 ssa_op_iter iter;
4264 use_operand_p use_p;
4265 tree new_dataref_ptr;
4267 if (bump)
4268 update = bump;
4270 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4271 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4272 dataref_ptr, update);
4273 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4275 /* Copy the points-to information if it exists. */
4276 if (DR_PTR_INFO (dr))
4278 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4279 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4282 if (!ptr_incr)
4283 return new_dataref_ptr;
4285 /* Update the vector-pointer's cross-iteration increment. */
4286 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4288 tree use = USE_FROM_PTR (use_p);
4290 if (use == dataref_ptr)
4291 SET_USE (use_p, new_dataref_ptr);
4292 else
4293 gcc_assert (tree_int_cst_compare (use, update) == 0);
4296 return new_dataref_ptr;
4300 /* Function vect_create_destination_var.
4302 Create a new temporary of type VECTYPE. */
4304 tree
4305 vect_create_destination_var (tree scalar_dest, tree vectype)
4307 tree vec_dest;
4308 const char *name;
4309 char *new_name;
4310 tree type;
4311 enum vect_var_kind kind;
4313 kind = vectype ? vect_simple_var : vect_scalar_var;
4314 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4316 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4318 name = get_name (scalar_dest);
4319 if (name)
4320 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4321 else
4322 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4323 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4324 free (new_name);
4326 return vec_dest;
4329 /* Function vect_grouped_store_supported.
4331 Returns TRUE if interleave high and interleave low permutations
4332 are supported, and FALSE otherwise. */
4334 bool
4335 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4337 enum machine_mode mode = TYPE_MODE (vectype);
4339 /* vect_permute_store_chain requires the group size to be a power of two. */
4340 if (exact_log2 (count) == -1)
4342 if (dump_enabled_p ())
4343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4344 "the size of the group of accesses"
4345 " is not a power of 2\n");
4346 return false;
4349 /* Check that the permutation is supported. */
4350 if (VECTOR_MODE_P (mode))
4352 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4353 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4354 for (i = 0; i < nelt / 2; i++)
4356 sel[i * 2] = i;
4357 sel[i * 2 + 1] = i + nelt;
4359 if (can_vec_perm_p (mode, false, sel))
4361 for (i = 0; i < nelt; i++)
4362 sel[i] += nelt / 2;
4363 if (can_vec_perm_p (mode, false, sel))
4364 return true;
4368 if (dump_enabled_p ())
4369 dump_printf (MSG_MISSED_OPTIMIZATION,
4370 "interleave op not supported by target.\n");
4371 return false;
4375 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4376 type VECTYPE. */
4378 bool
4379 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4381 return vect_lanes_optab_supported_p ("vec_store_lanes",
4382 vec_store_lanes_optab,
4383 vectype, count);
4387 /* Function vect_permute_store_chain.
4389 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4390 a power of 2, generate interleave_high/low stmts to reorder the data
4391 correctly for the stores. Return the final references for stores in
4392 RESULT_CHAIN.
4394 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4395 The input is 4 vectors each containing 8 elements. We assign a number to
4396 each element, the input sequence is:
4398 1st vec: 0 1 2 3 4 5 6 7
4399 2nd vec: 8 9 10 11 12 13 14 15
4400 3rd vec: 16 17 18 19 20 21 22 23
4401 4th vec: 24 25 26 27 28 29 30 31
4403 The output sequence should be:
4405 1st vec: 0 8 16 24 1 9 17 25
4406 2nd vec: 2 10 18 26 3 11 19 27
4407 3rd vec: 4 12 20 28 5 13 21 29
4408 4th vec: 6 14 22 30 7 15 23 31
4410 i.e., we interleave the contents of the four vectors in their order.
4412 We use interleave_high/low instructions to create such output. The input of
4413 each interleave_high/low operation is two vectors:
4414 1st vec 2nd vec
4415 0 1 2 3 4 5 6 7
4416 the even elements of the result vector are obtained left-to-right from the
4417 high/low elements of the first vector. The odd elements of the result are
4418 obtained left-to-right from the high/low elements of the second vector.
4419 The output of interleave_high will be: 0 4 1 5
4420 and of interleave_low: 2 6 3 7
4423 The permutation is done in log LENGTH stages. In each stage interleave_high
4424 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4425 where the first argument is taken from the first half of DR_CHAIN and the
4426 second argument from its second half.
4427 In our example,
4429 I1: interleave_high (1st vec, 3rd vec)
4430 I2: interleave_low (1st vec, 3rd vec)
4431 I3: interleave_high (2nd vec, 4th vec)
4432 I4: interleave_low (2nd vec, 4th vec)
4434 The output for the first stage is:
4436 I1: 0 16 1 17 2 18 3 19
4437 I2: 4 20 5 21 6 22 7 23
4438 I3: 8 24 9 25 10 26 11 27
4439 I4: 12 28 13 29 14 30 15 31
4441 The output of the second stage, i.e. the final result is:
4443 I1: 0 8 16 24 1 9 17 25
4444 I2: 2 10 18 26 3 11 19 27
4445 I3: 4 12 20 28 5 13 21 29
4446 I4: 6 14 22 30 7 15 23 31. */
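/* A minimal illustrative sketch (hypothetical helper, scalar code): one
   interleave step on two 4-element vectors, i.e. the permutations
   VEC_PERM_EXPR <a, b, {0, nelt, 1, nelt+1}> and
   VEC_PERM_EXPR <a, b, {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1}> whose
   masks the function below builds for general nelt.  */

static void
example_interleave_step (const int a[4], const int b[4],
                         int high[4], int low[4])
{
  /* high = VEC_PERM_EXPR <a, b, {0, 4, 1, 5}>.  */
  high[0] = a[0]; high[1] = b[0]; high[2] = a[1]; high[3] = b[1];
  /* low = VEC_PERM_EXPR <a, b, {2, 6, 3, 7}>.  */
  low[0] = a[2]; low[1] = b[2]; low[2] = a[3]; low[3] = b[3];
}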
4448 void
4449 vect_permute_store_chain (vec<tree> dr_chain,
4450 unsigned int length,
4451 gimple stmt,
4452 gimple_stmt_iterator *gsi,
4453 vec<tree> *result_chain)
4455 tree vect1, vect2, high, low;
4456 gimple perm_stmt;
4457 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4458 tree perm_mask_low, perm_mask_high;
4459 unsigned int i, n;
4460 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4461 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4463 result_chain->quick_grow (length);
4464 memcpy (result_chain->address (), dr_chain.address (),
4465 length * sizeof (tree));
4467 for (i = 0, n = nelt / 2; i < n; i++)
4469 sel[i * 2] = i;
4470 sel[i * 2 + 1] = i + nelt;
4472 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4473 gcc_assert (perm_mask_high != NULL);
4475 for (i = 0; i < nelt; i++)
4476 sel[i] += nelt / 2;
4477 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4478 gcc_assert (perm_mask_low != NULL);
4480 for (i = 0, n = exact_log2 (length); i < n; i++)
4482 for (j = 0; j < length/2; j++)
4484 vect1 = dr_chain[j];
4485 vect2 = dr_chain[j+length/2];
4487 /* Create interleaving stmt:
4488 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4489 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4490 perm_stmt
4491 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4492 vect1, vect2, perm_mask_high);
4493 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4494 (*result_chain)[2*j] = high;
4496 /* Create interleaving stmt:
4497 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4498 nelt*3/2+1, ...}> */
4499 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4500 perm_stmt
4501 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4502 vect1, vect2, perm_mask_low);
4503 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4504 (*result_chain)[2*j+1] = low;
4506 memcpy (dr_chain.address (), result_chain->address (),
4507 length * sizeof (tree));
4511 /* Function vect_setup_realignment
4513 This function is called when vectorizing an unaligned load using
4514 the dr_explicit_realign[_optimized] scheme.
4515 This function generates the following code at the loop prolog:
4517 p = initial_addr;
4518 x msq_init = *(floor(p)); # prolog load
4519 realignment_token = call target_builtin;
4520 loop:
4521 x msq = phi (msq_init, ---)
4523 The stmts marked with x are generated only for the case of
4524 dr_explicit_realign_optimized.
4526 The code above sets up a new (vector) pointer, pointing to the first
4527 location accessed by STMT, and a "floor-aligned" load using that pointer.
4528 It also generates code to compute the "realignment-token" (if the relevant
4529 target hook was defined), and creates a phi-node at the loop-header bb
4530 whose arguments are the result of the prolog-load (created by this
4531 function) and the result of a load that takes place in the loop (to be
4532 created by the caller to this function).
4534 For the case of dr_explicit_realign_optimized:
4535 The caller to this function uses the phi-result (msq) to create the
4536 realignment code inside the loop, and sets up the missing phi argument,
4537 as follows:
4538 loop:
4539 msq = phi (msq_init, lsq)
4540 lsq = *(floor(p')); # load in loop
4541 result = realign_load (msq, lsq, realignment_token);
4543 For the case of dr_explicit_realign:
4544 loop:
4545 msq = *(floor(p)); # load in loop
4546 p' = p + (VS-1);
4547 lsq = *(floor(p')); # load in loop
4548 result = realign_load (msq, lsq, realignment_token);
4550 Input:
4551 STMT - (scalar) load stmt to be vectorized. This load accesses
4552 a memory location that may be unaligned.
4553 BSI - place where new code is to be inserted.
4554 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4555 is used.
4557 Output:
4558 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4559 target hook, if defined.
4560 Return value - the result of the loop-header phi node. */
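/* A minimal illustrative sketch (hypothetical helper, 16-byte vectors
   assumed): the explicit realignment scheme loads the two aligned chunks
   surrounding an unaligned address and glues the wanted bytes together,
   which is what REALIGN_LOAD <msq, lsq, realignment_token> does in one
   step.  */

static void
example_realign_load (const unsigned char *p, unsigned char out[16])
{
  const unsigned char *msq_addr
    = (const unsigned char *) ((__UINTPTR_TYPE__) p & ~(__UINTPTR_TYPE__) 15);
  const unsigned char *lsq_addr = msq_addr + 16;  /* floor (p + VS - 1).  */
  unsigned token = p - msq_addr;                  /* Realignment token.  */
  for (unsigned i = 0; i < 16; i++)
    out[i] = i + token < 16 ? msq_addr[i + token] : lsq_addr[i + token - 16];
}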
4562 tree
4563 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4564 tree *realignment_token,
4565 enum dr_alignment_support alignment_support_scheme,
4566 tree init_addr,
4567 struct loop **at_loop)
4569 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4570 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4571 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4572 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4573 struct loop *loop = NULL;
4574 edge pe = NULL;
4575 tree scalar_dest = gimple_assign_lhs (stmt);
4576 tree vec_dest;
4577 gimple inc;
4578 tree ptr;
4579 tree data_ref;
4580 gimple new_stmt;
4581 basic_block new_bb;
4582 tree msq_init = NULL_TREE;
4583 tree new_temp;
4584 gimple phi_stmt;
4585 tree msq = NULL_TREE;
4586 gimple_seq stmts = NULL;
4587 bool inv_p;
4588 bool compute_in_loop = false;
4589 bool nested_in_vect_loop = false;
4590 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4591 struct loop *loop_for_initial_load = NULL;
4593 if (loop_vinfo)
4595 loop = LOOP_VINFO_LOOP (loop_vinfo);
4596 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4599 gcc_assert (alignment_support_scheme == dr_explicit_realign
4600 || alignment_support_scheme == dr_explicit_realign_optimized);
4602 /* We need to generate three things:
4603 1. the misalignment computation
4604 2. the extra vector load (for the optimized realignment scheme).
4605 3. the phi node for the two vectors from which the realignment is
4606 done (for the optimized realignment scheme). */
4608 /* 1. Determine where to generate the misalignment computation.
4610 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4611 calculation will be generated by this function, outside the loop (in the
4612 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4613 caller, inside the loop.
4615 Background: If the misalignment remains fixed throughout the iterations of
4616 the loop, then both realignment schemes are applicable, and also the
4617 misalignment computation can be done outside LOOP. This is because we are
4618 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4619 are a multiple of VS (the Vector Size), and therefore the misalignment in
4620 different vectorized LOOP iterations is always the same.
4621 The problem arises only if the memory access is in an inner-loop nested
4622 inside LOOP, which is now being vectorized using outer-loop vectorization.
4623 This is the only case when the misalignment of the memory access may not
4624 remain fixed throughout the iterations of the inner-loop (as explained in
4625 detail in vect_supportable_dr_alignment). In this case, not only is the
4626 optimized realignment scheme not applicable, but also the misalignment
4627 computation (and generation of the realignment token that is passed to
4628 REALIGN_LOAD) have to be done inside the loop.
4630 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4631 or not, which in turn determines if the misalignment is computed inside
4632 the inner-loop, or outside LOOP. */
4634 if (init_addr != NULL_TREE || !loop_vinfo)
4636 compute_in_loop = true;
4637 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4641 /* 2. Determine where to generate the extra vector load.
4643 For the optimized realignment scheme, instead of generating two vector
4644 loads in each iteration, we generate a single extra vector load in the
4645 preheader of the loop, and in each iteration reuse the result of the
4646 vector load from the previous iteration. In case the memory access is in
4647 an inner-loop nested inside LOOP, which is now being vectorized using
4648 outer-loop vectorization, we need to determine whether this initial vector
4649 load should be generated at the preheader of the inner-loop, or can be
4650 generated at the preheader of LOOP. If the memory access has no evolution
4651 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4652 to be generated inside LOOP (in the preheader of the inner-loop). */
4654 if (nested_in_vect_loop)
4656 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4657 bool invariant_in_outerloop =
4658 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4659 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4661 else
4662 loop_for_initial_load = loop;
4663 if (at_loop)
4664 *at_loop = loop_for_initial_load;
4666 if (loop_for_initial_load)
4667 pe = loop_preheader_edge (loop_for_initial_load);
4669 /* 3. For the case of the optimized realignment, create the first vector
4670 load at the loop preheader. */
4672 if (alignment_support_scheme == dr_explicit_realign_optimized)
4674 /* Create msq_init = *(floor(p1)) in the loop preheader */
4676 gcc_assert (!compute_in_loop);
4677 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4678 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4679 NULL_TREE, &init_addr, NULL, &inc,
4680 true, &inv_p);
4681 new_temp = copy_ssa_name (ptr, NULL);
4682 new_stmt = gimple_build_assign_with_ops
4683 (BIT_AND_EXPR, new_temp, ptr,
4684 build_int_cst (TREE_TYPE (ptr),
4685 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4686 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4687 gcc_assert (!new_bb);
4688 data_ref
4689 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4690 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4691 new_stmt = gimple_build_assign (vec_dest, data_ref);
4692 new_temp = make_ssa_name (vec_dest, new_stmt);
4693 gimple_assign_set_lhs (new_stmt, new_temp);
4694 if (pe)
4696 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4697 gcc_assert (!new_bb);
4699 else
4700 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4702 msq_init = gimple_assign_lhs (new_stmt);
4705 /* 4. Create realignment token using a target builtin, if available.
4706 It is done either inside the containing loop, or before LOOP (as
4707 determined above). */
4709 if (targetm.vectorize.builtin_mask_for_load)
4711 tree builtin_decl;
4713 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4714 if (!init_addr)
4716 /* Generate the INIT_ADDR computation outside LOOP. */
4717 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4718 NULL_TREE, loop);
4719 if (loop)
4721 pe = loop_preheader_edge (loop);
4722 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4723 gcc_assert (!new_bb);
4725 else
4726 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4729 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4730 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4731 vec_dest =
4732 vect_create_destination_var (scalar_dest,
4733 gimple_call_return_type (new_stmt));
4734 new_temp = make_ssa_name (vec_dest, new_stmt);
4735 gimple_call_set_lhs (new_stmt, new_temp);
4737 if (compute_in_loop)
4738 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4739 else
4741 /* Generate the misalignment computation outside LOOP. */
4742 pe = loop_preheader_edge (loop);
4743 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4744 gcc_assert (!new_bb);
4747 *realignment_token = gimple_call_lhs (new_stmt);
4749 /* The result of the CALL_EXPR to this builtin is determined from
4750 the value of the parameter and no global variables are touched
4751 which makes the builtin a "const" function. Requiring the
4752 builtin to have the "const" attribute makes it unnecessary
4753 to call mark_call_clobbered. */
4754 gcc_assert (TREE_READONLY (builtin_decl));
4757 if (alignment_support_scheme == dr_explicit_realign)
4758 return msq;
4760 gcc_assert (!compute_in_loop);
4761 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4764 /* 5. Create msq = phi <msq_init, lsq> in loop */
4766 pe = loop_preheader_edge (containing_loop);
4767 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4768 msq = make_ssa_name (vec_dest, NULL);
4769 phi_stmt = create_phi_node (msq, containing_loop->header);
4770 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4772 return msq;
4776 /* Function vect_grouped_load_supported.
4778 Returns TRUE if even and odd permutations are supported,
4779 and FALSE otherwise. */
4781 bool
4782 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4784 enum machine_mode mode = TYPE_MODE (vectype);
4786 /* vect_permute_load_chain requires the group size to be a power of two. */
4787 if (exact_log2 (count) == -1)
4789 if (dump_enabled_p ())
4790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4791 "the size of the group of accesses"
4792 " is not a power of 2\n");
4793 return false;
4796 /* Check that the permutation is supported. */
4797 if (VECTOR_MODE_P (mode))
4799 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4800 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4802 for (i = 0; i < nelt; i++)
4803 sel[i] = i * 2;
4804 if (can_vec_perm_p (mode, false, sel))
4806 for (i = 0; i < nelt; i++)
4807 sel[i] = i * 2 + 1;
4808 if (can_vec_perm_p (mode, false, sel))
4809 return true;
4813 if (dump_enabled_p ())
4814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4815 "extract even/odd not supported by target\n");
4816 return false;
4819 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4820 type VECTYPE. */
4822 bool
4823 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4825 return vect_lanes_optab_supported_p ("vec_load_lanes",
4826 vec_load_lanes_optab,
4827 vectype, count);
4830 /* Function vect_permute_load_chain.
4832 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4833 a power of 2, generate extract_even/odd stmts to reorder the input data
4834 correctly. Return the final references for loads in RESULT_CHAIN.
4836 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4837 The input is 4 vectors each containing 8 elements. We assign a number to each
4838 element, the input sequence is:
4840 1st vec: 0 1 2 3 4 5 6 7
4841 2nd vec: 8 9 10 11 12 13 14 15
4842 3rd vec: 16 17 18 19 20 21 22 23
4843 4th vec: 24 25 26 27 28 29 30 31
4845 The output sequence should be:
4847 1st vec: 0 4 8 12 16 20 24 28
4848 2nd vec: 1 5 9 13 17 21 25 29
4849 3rd vec: 2 6 10 14 18 22 26 30
4850 4th vec: 3 7 11 15 19 23 27 31
4852 i.e., the first output vector should contain the first elements of each
4853 interleaving group, etc.
4855 We use extract_even/odd instructions to create such output. The input of
4856 each extract_even/odd operation is two vectors
4857 1st vec 2nd vec
4858 0 1 2 3 4 5 6 7
4860 and the output is the vector of extracted even/odd elements. The output of
4861 extract_even will be: 0 2 4 6
4862 and of extract_odd: 1 3 5 7
4865 The permutation is done in log LENGTH stages. In each stage extract_even
4866 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4867 their order. In our example,
4869 E1: extract_even (1st vec, 2nd vec)
4870 E2: extract_odd (1st vec, 2nd vec)
4871 E3: extract_even (3rd vec, 4th vec)
4872 E4: extract_odd (3rd vec, 4th vec)
4874 The output for the first stage will be:
4876 E1: 0 2 4 6 8 10 12 14
4877 E2: 1 3 5 7 9 11 13 15
4878 E3: 16 18 20 22 24 26 28 30
4879 E4: 17 19 21 23 25 27 29 31
4881 In order to proceed and create the correct sequence for the next stage (or
4882 for the correct output, if the second stage is the last one, as in our
4883 example), we first put the output of the extract_even operation and then
4884 the output of extract_odd into RESULT_CHAIN (which is then copied to DR_CHAIN).
4885 The input for the second stage is:
4887 1st vec (E1): 0 2 4 6 8 10 12 14
4888 2nd vec (E3): 16 18 20 22 24 26 28 30
4889 3rd vec (E2): 1 3 5 7 9 11 13 15
4890 4th vec (E4): 17 19 21 23 25 27 29 31
4892 The output of the second stage:
4894 E1: 0 4 8 12 16 20 24 28
4895 E2: 2 6 10 14 18 22 26 30
4896 E3: 1 5 9 13 17 21 25 29
4897 E4: 3 7 11 15 19 23 27 31
4899 And RESULT_CHAIN after reordering:
4901 1st vec (E1): 0 4 8 12 16 20 24 28
4902 2nd vec (E3): 1 5 9 13 17 21 25 29
4903 3rd vec (E2): 2 6 10 14 18 22 26 30
4904 4th vec (E4): 3 7 11 15 19 23 27 31. */
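/* A minimal illustrative sketch (hypothetical helper, scalar code): one
   extract_even/extract_odd step on two 4-element vectors, i.e. the
   permutations VEC_PERM_EXPR <a, b, {0, 2, 4, 6}> and
   VEC_PERM_EXPR <a, b, {1, 3, 5, 7}> whose masks the function below
   builds for general nelt.  */

static void
example_extract_even_odd (const int a[4], const int b[4],
                          int even[4], int odd[4])
{
  /* even = VEC_PERM_EXPR <a, b, {0, 2, 4, 6}>.  */
  even[0] = a[0]; even[1] = a[2]; even[2] = b[0]; even[3] = b[2];
  /* odd = VEC_PERM_EXPR <a, b, {1, 3, 5, 7}>.  */
  odd[0] = a[1]; odd[1] = a[3]; odd[2] = b[1]; odd[3] = b[3];
}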
4906 static void
4907 vect_permute_load_chain (vec<tree> dr_chain,
4908 unsigned int length,
4909 gimple stmt,
4910 gimple_stmt_iterator *gsi,
4911 vec<tree> *result_chain)
4913 tree data_ref, first_vect, second_vect;
4914 tree perm_mask_even, perm_mask_odd;
4915 gimple perm_stmt;
4916 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4917 unsigned int i, j, log_length = exact_log2 (length);
4918 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4919 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4921 result_chain->quick_grow (length);
4922 memcpy (result_chain->address (), dr_chain.address (),
4923 length * sizeof (tree));
4925 for (i = 0; i < nelt; ++i)
4926 sel[i] = i * 2;
4927 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4928 gcc_assert (perm_mask_even != NULL);
4930 for (i = 0; i < nelt; ++i)
4931 sel[i] = i * 2 + 1;
4932 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4933 gcc_assert (perm_mask_odd != NULL);
4935 for (i = 0; i < log_length; i++)
4937 for (j = 0; j < length; j += 2)
4939 first_vect = dr_chain[j];
4940 second_vect = dr_chain[j+1];
4942 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4943 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4944 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4945 first_vect, second_vect,
4946 perm_mask_even);
4947 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4948 (*result_chain)[j/2] = data_ref;
4950 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4951 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4952 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4953 first_vect, second_vect,
4954 perm_mask_odd);
4955 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4956 (*result_chain)[j/2+length/2] = data_ref;
4958 memcpy (dr_chain.address (), result_chain->address (),
4959 length * sizeof (tree));
4964 /* Function vect_transform_grouped_load.
4966 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4967 to perform their permutation and ascribe the resulting vectorized statements to
4968 the scalar statements.
4971 void
4972 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4973 gimple_stmt_iterator *gsi)
4975 vec<tree> result_chain = vNULL;
4977 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4978 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
4979 permuted vectors that are ready for vector computation. */
4980 result_chain.create (size);
4981 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4982 vect_record_grouped_load_vectors (stmt, result_chain);
4983 result_chain.release ();
4986 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4987 generated as part of the vectorization of STMT. Assign the statement
4988 for each vector to the associated scalar statement. */
4990 void
4991 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4993 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4994 gimple next_stmt, new_stmt;
4995 unsigned int i, gap_count;
4996 tree tmp_data_ref;
4998 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4999 Since we scan the chain starting from its first node, their order
5000 corresponds to the order of data-refs in RESULT_CHAIN. */
5001 next_stmt = first_stmt;
5002 gap_count = 1;
5003 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5005 if (!next_stmt)
5006 break;
5008 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5009 code elimination pass later. No need to check for the first stmt in
5010 the group, since it always exists.
5011 GROUP_GAP is the number of steps in elements from the previous
5012 access (if there is no gap GROUP_GAP is 1). We skip loads that
5013 correspond to the gaps. */
5014 if (next_stmt != first_stmt
5015 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5017 gap_count++;
5018 continue;
5021 while (next_stmt)
5023 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5024 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5025 copies, and we put the new vector statement in the first available
5026 RELATED_STMT. */
5027 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5028 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5029 else
5031 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5033 gimple prev_stmt =
5034 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5035 gimple rel_stmt =
5036 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5037 while (rel_stmt)
5039 prev_stmt = rel_stmt;
5040 rel_stmt =
5041 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5044 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5045 new_stmt;
5049 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5050 gap_count = 1;
5051 /* If NEXT_STMT accesses the same DR as the previous statement,
5052 put the same TMP_DATA_REF as its vectorized statement; otherwise
5053 get the next data-ref from RESULT_CHAIN. */
5054 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5055 break;
5060 /* Function vect_can_force_dr_alignment_p.
5062 Returns whether the alignment of a DECL can be forced to an
5063 ALIGNMENT-bit boundary. */
5065 bool
5066 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5068 if (TREE_CODE (decl) != VAR_DECL)
5069 return false;
5071 /* We cannot change alignment of common or external symbols as another
5072 translation unit may contain a definition with lower alignment.
5073 The rules of common symbol linking mean that the definition
5074 will override the common symbol. The same is true for constant
5075 pool entries which may be shared and are not properly merged
5076 by LTO. */
5077 if (DECL_EXTERNAL (decl)
5078 || DECL_COMMON (decl)
5079 || DECL_IN_CONSTANT_POOL (decl))
5080 return false;
5082 if (TREE_ASM_WRITTEN (decl))
5083 return false;
5085 /* Do not override the alignment as specified by the ABI when the used
5086 attribute is set. */
5087 if (DECL_PRESERVE_P (decl))
5088 return false;
5090 /* Do not override explicit alignment set by the user when an explicit
5091 section name is also used. This is a common idiom used by many
5092 software projects. */
5093 if (DECL_SECTION_NAME (decl) != NULL_TREE
5094 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5095 return false;
5097 if (TREE_STATIC (decl))
5098 return (alignment <= MAX_OFILE_ALIGNMENT);
5099 else
5100 return (alignment <= MAX_STACK_ALIGNMENT);
5104 /* Return whether the data reference DR is supported with respect to its
5105 alignment.
5106 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5107 if it is aligned, i.e., check if it is possible to vectorize it with different
5108 alignment. */
5110 enum dr_alignment_support
5111 vect_supportable_dr_alignment (struct data_reference *dr,
5112 bool check_aligned_accesses)
5114 gimple stmt = DR_STMT (dr);
5115 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5116 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5117 enum machine_mode mode = TYPE_MODE (vectype);
5118 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5119 struct loop *vect_loop = NULL;
5120 bool nested_in_vect_loop = false;
5122 if (aligned_access_p (dr) && !check_aligned_accesses)
5123 return dr_aligned;
5125 /* For now assume all conditional loads/stores support unaligned
5126 access without any special code. */
5127 if (is_gimple_call (stmt)
5128 && gimple_call_internal_p (stmt)
5129 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5130 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5131 return dr_unaligned_supported;
5133 if (loop_vinfo)
5135 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5136 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5139 /* Possibly unaligned access. */
5141 /* We can choose between using the implicit realignment scheme (generating
5142 a misaligned_move stmt) and the explicit realignment scheme (generating
5143 aligned loads with a REALIGN_LOAD). There are two variants to the
5144 explicit realignment scheme: optimized, and unoptimized.
5145 We can optimize the realignment only if the step between consecutive
5146 vector loads is equal to the vector size. Since the vector memory
5147 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5148 is guaranteed that the misalignment amount remains the same throughout the
5149 execution of the vectorized loop. Therefore, we can create the
5150 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5151 at the loop preheader.
5153 However, in the case of outer-loop vectorization, when vectorizing a
5154 memory access in the inner-loop nested within the LOOP that is now being
5155 vectorized, while it is guaranteed that the misalignment of the
5156 vectorized memory access will remain the same in different outer-loop
5157 iterations, it is *not* guaranteed that it will remain the same throughout
5158 the execution of the inner-loop. This is because the inner-loop advances
5159 with the original scalar step (and not in steps of VS). If the inner-loop
5160 step happens to be a multiple of VS, then the misalignment remains fixed
5161 and we can use the optimized realignment scheme. For example:
5163 for (i=0; i<N; i++)
5164 for (j=0; j<M; j++)
5165 s += a[i+j];
5167 When vectorizing the i-loop in the above example, the step between
5168 consecutive vector loads is 1, and so the misalignment does not remain
5169 fixed across the execution of the inner-loop, and the realignment cannot
5170 be optimized (as illustrated in the following pseudo vectorized loop):
5172 for (i=0; i<N; i+=4)
5173 for (j=0; j<M; j++){
5174 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5175 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5176 // (assuming that we start from an aligned address).
5179 We therefore have to use the unoptimized realignment scheme:
5181 for (i=0; i<N; i+=4)
5182 for (j=k; j<M; j+=4)
5183 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5184 // that the misalignment of the initial address is
5185 // 0).
5187 The loop can then be vectorized as follows:
5189 for (k=0; k<4; k++){
5190 rt = get_realignment_token (&vp[k]);
5191 for (i=0; i<N; i+=4){
5192 v1 = vp[i+k];
5193 for (j=k; j<M; j+=4){
5194 v2 = vp[i+j+VS-1];
5195 va = REALIGN_LOAD <v1,v2,rt>;
5196 vs += va;
5197 v1 = v2;
5200 } */
5202 if (DR_IS_READ (dr))
5204 bool is_packed = false;
5205 tree type = (TREE_TYPE (DR_REF (dr)));
5207 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5208 && (!targetm.vectorize.builtin_mask_for_load
5209 || targetm.vectorize.builtin_mask_for_load ()))
5211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5212 if ((nested_in_vect_loop
5213 && (TREE_INT_CST_LOW (DR_STEP (dr))
5214 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5215 || !loop_vinfo)
5216 return dr_explicit_realign;
5217 else
5218 return dr_explicit_realign_optimized;
5220 if (!known_alignment_for_access_p (dr))
5221 is_packed = not_size_aligned (DR_REF (dr));
5223 if ((TYPE_USER_ALIGN (type) && !is_packed)
5224 || targetm.vectorize.
5225 support_vector_misalignment (mode, type,
5226 DR_MISALIGNMENT (dr), is_packed))
5227 /* Can't software pipeline the loads, but can at least do them. */
5228 return dr_unaligned_supported;
5230 else
5232 bool is_packed = false;
5233 tree type = (TREE_TYPE (DR_REF (dr)));
5235 if (!known_alignment_for_access_p (dr))
5236 is_packed = not_size_aligned (DR_REF (dr));
5238 if ((TYPE_USER_ALIGN (type) && !is_packed)
5239 || targetm.vectorize.
5240 support_vector_misalignment (mode, type,
5241 DR_MISALIGNMENT (dr), is_packed))
5242 return dr_unaligned_supported;
5245 /* Unsupported. */
5246 return dr_unaligned_unsupported;