1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
50 #include "dumpfile.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
58 #include "expr.h"
59 #include "optabs.h"
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
64 static bool
65 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
66 tree vectype, unsigned HOST_WIDE_INT count)
68 enum machine_mode mode, array_mode;
69 bool limit_p;
71 mode = TYPE_MODE (vectype);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
74 MODE_INT, limit_p);
76 if (array_mode == BLKmode)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
81 GET_MODE_NAME (mode), count);
82 return false;
85 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
89 "cannot use %s<%s><%s>\n", name,
90 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
91 return false;
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE, vect_location,
96 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
97 GET_MODE_NAME (mode));
99 return true;
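/* Illustrative example, not part of the original sources: a loop that a
   target providing the vec_load_lanes optab (e.g. AArch64 LD2) could
   vectorize by loading both interleaved streams with one instruction.
   The concrete instruction name is an assumption here.

     void
     deinterleave (int *restrict a, int *restrict x, int *restrict y, int n)
     {
       for (int i = 0; i < n; i++)
         {
           x[i] = a[2 * i];
           y[i] = a[2 * i + 1];
         }
     }
*/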
103 /* Return the smallest scalar part of STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118 types. */
120 tree
121 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt);
125 HOST_WIDE_INT lhs, rhs;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
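/* Worked example for the function above (illustrative, not original code):

     short *a;
     int *b;
     for (i = 0; i < n; i++)
       b[i] = (int) a[i];

   The statement's result type is int (4 bytes) but the promoted rhs type is
   short (2 bytes), so *lhs_size_unit = 4, *rhs_size_unit = 2 and the
   returned scalar type is short, which is what determines the VF.  */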
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE, vect_location,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
165 dump_printf (MSG_NOTE, " and ");
166 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
167 dump_printf (MSG_NOTE, "\n");
170 if (optimize_loop_nest_for_size_p (loop))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
174 "versioning not supported when optimizing"
175 " for size.\n");
176 return false;
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
180 if (loop->inner)
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
184 "versioning not yet supported for outer-loops.\n");
185 return false;
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
189 step. */
190 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
195 "versioning not yet supported for non-constant "
196 "step\n");
197 return false;
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return true;
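/* Illustration (hypothetical code, not emitted by this function): a DDR
   recorded here leads to a run-time overlap check guarding the vectorized
   loop, conceptually of the form

     if (&a[n] <= &b[0] || &b[n] <= &a[0])
       ... vectorized loop ...
     else
       ... scalar loop ...

   for two data-refs a[i] and b[i] with constant step and n iterations.  */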
205 /* Function vect_analyze_data_ref_dependence.
207 Return TRUE if there (might) exist a dependence between a memory-reference
208 DRA and a memory-reference DRB. Return FALSE when the dependence can
209 instead be checked at run time by versioning for alias. Adjust *MAX_VF
210 according to the data dependence. */
212 static bool
213 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
214 loop_vec_info loop_vinfo, int *max_vf)
216 unsigned int i;
217 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
218 struct data_reference *dra = DDR_A (ddr);
219 struct data_reference *drb = DDR_B (ddr);
220 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
221 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
222 lambda_vector dist_v;
223 unsigned int loop_depth;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
228 gcc_unreachable ();
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
232 return false;
234 if (dra == drb
235 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
236 return false;
238 /* Unknown data dependence. */
239 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
241 /* If user asserted safelen consecutive iterations can be
242 executed concurrently, assume independence. */
243 if (loop->safelen >= 2)
245 if (loop->safelen < *max_vf)
246 *max_vf = loop->safelen;
247 return false;
250 if (STMT_VINFO_GATHER_P (stmtinfo_a)
251 || STMT_VINFO_GATHER_P (stmtinfo_b))
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
256 "versioning for alias not supported for: "
257 "can't determine dependence between ");
258 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
259 DR_REF (dra));
260 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
261 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
262 DR_REF (drb));
263 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
265 return true;
268 if (dump_enabled_p ())
270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
271 "versioning for alias required: "
272 "can't determine dependence between ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (dra));
275 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
276 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
277 DR_REF (drb));
278 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
281 /* Add to list of ddrs that need to be tested at run-time. */
282 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
285 /* Known data dependence. */
286 if (DDR_NUM_DIST_VECTS (ddr) == 0)
288 /* If user asserted safelen consecutive iterations can be
289 executed concurrently, assume independence. */
290 if (loop->safelen >= 2)
292 if (loop->safelen < *max_vf)
293 *max_vf = loop->safelen;
294 return false;
297 if (STMT_VINFO_GATHER_P (stmtinfo_a)
298 || STMT_VINFO_GATHER_P (stmtinfo_b))
300 if (dump_enabled_p ())
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
303 "versioning for alias not supported for: "
304 "bad dist vector for ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (dra));
307 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (drb));
310 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 return true;
315 if (dump_enabled_p ())
317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
318 "versioning for alias required: "
319 "bad dist vector for ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 /* Add to list of ddrs that need to be tested at run-time. */
326 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
329 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
330 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
332 int dist = dist_v[loop_depth];
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location,
336 "dependence distance = %d.\n", dist);
338 if (dist == 0)
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE, vect_location,
343 "dependence distance == 0 between ");
344 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
345 dump_printf (MSG_NOTE, " and ");
346 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
347 dump_printf (MSG_NOTE, "\n");
350 /* When we perform grouped accesses and perform implicit CSE
351 by detecting equal accesses and doing disambiguation with
352 runtime alias tests like for
353 .. = a[i];
354 .. = a[i+1];
355 a[i] = ..;
356 a[i+1] = ..;
357 *p = ..;
358 .. = a[i];
359 .. = a[i+1];
360 where we will end up loading { a[i], a[i+1] } once, make
361 sure that inserting group loads before the first load and
362 stores after the last store will do the right thing. */
363 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
364 && GROUP_SAME_DR_STMT (stmtinfo_a))
365 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
366 && GROUP_SAME_DR_STMT (stmtinfo_b)))
368 gimple earlier_stmt;
369 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
370 if (DR_IS_WRITE
371 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
375 "READ_WRITE dependence in interleaving."
376 "\n");
377 return true;
381 continue;
384 if (dist > 0 && DDR_REVERSED_P (ddr))
386 /* If DDR_REVERSED_P the order of the data-refs in DDR was
387 reversed (to make distance vector positive), and the actual
388 distance is negative. */
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "dependence distance negative.\n");
392 continue;
395 if (abs (dist) >= 2
396 && abs (dist) < *max_vf)
398 /* The dependence distance requires reduction of the maximal
399 vectorization factor. */
400 *max_vf = abs (dist);
401 if (dump_enabled_p ())
402 dump_printf_loc (MSG_NOTE, vect_location,
403 "adjusting maximal vectorization factor to %i\n",
404 *max_vf);
407 if (abs (dist) >= *max_vf)
409 /* Dependence distance does not create dependence, as far as
410 vectorization is concerned, in this case. */
411 if (dump_enabled_p ())
412 dump_printf_loc (MSG_NOTE, vect_location,
413 "dependence distance >= VF.\n");
414 continue;
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "not vectorized, possible dependence "
421 "between data-refs ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
423 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
424 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
425 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
428 return true;
431 return false;
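/* Example of the *MAX_VF adjustment performed above (illustrative only):

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   has a dependence distance of 4, so *max_vf is reduced to 4; with a
   vectorization factor of at most 4 the dependence distance is >= VF and
   the loop can still be vectorized.  */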
434 /* Function vect_analyze_data_ref_dependences.
436 Examine all the data references in the loop, and make sure there do not
437 exist any data dependences between them. Set *MAX_VF according to
438 the maximum vectorization factor the data dependences allow. */
440 bool
441 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
443 unsigned int i;
444 struct data_dependence_relation *ddr;
446 if (dump_enabled_p ())
447 dump_printf_loc (MSG_NOTE, vect_location,
448 "=== vect_analyze_data_ref_dependences ===\n");
450 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
451 &LOOP_VINFO_DDRS (loop_vinfo),
452 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
453 return false;
455 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
456 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
457 return false;
459 return true;
463 /* Function vect_slp_analyze_data_ref_dependence.
465 Return TRUE if there (might) exist a dependence between a memory-reference
466 DRA and a memory-reference DRB in the basic block. There is no run-time
467 alias versioning in basic-block vectorization, so any possible dependence
468 prevents vectorization. */
470 static bool
471 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
473 struct data_reference *dra = DDR_A (ddr);
474 struct data_reference *drb = DDR_B (ddr);
476 /* We need to check dependences of statements marked as unvectorizable
477 as well; they can still prohibit vectorization. */
479 /* Independent data accesses. */
480 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
481 return false;
483 if (dra == drb)
484 return false;
486 /* Read-read is OK. */
487 if (DR_IS_READ (dra) && DR_IS_READ (drb))
488 return false;
490 /* If dra and drb are part of the same interleaving chain consider
491 them independent. */
492 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
493 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
494 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
495 return false;
497 /* Unknown data dependence. */
498 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
500 gimple earlier_stmt;
502 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "can't determine dependence between ");
506 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
507 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
508 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
509 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
512 /* We do not vectorize basic blocks with write-write dependencies. */
513 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
514 return true;
516 /* Check that it's not a load-after-store dependence. */
517 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
518 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
519 return true;
521 return false;
524 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location,
527 "determined dependence between ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
529 dump_printf (MSG_NOTE, " and ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
531 dump_printf (MSG_NOTE, "\n");
534 /* Do not vectorize basic blocks with write-write dependences. */
535 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
536 return true;
538 /* Check dependence between DRA and DRB for basic block vectorization.
539 If the accesses share the same base and offset, we can compare their initial
540 constant offsets to decide whether they differ or not. In case of a read-
541 write dependence we check that the load is before the store to ensure that
542 vectorization will not change the order of the accesses. */
544 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
545 gimple earlier_stmt;
547 /* Check that the data-refs have the same base and offset. If not, we can't
548 determine if they are dependent. */
549 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
550 || !dr_equal_offsets_p (dra, drb))
551 return true;
553 /* Check the types. */
554 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
555 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
557 if (type_size_a != type_size_b
558 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
559 TREE_TYPE (DR_REF (drb))))
560 return true;
562 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
563 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
565 /* Two different locations - no dependence. */
566 if (init_a != init_b)
567 return false;
569 /* We have a read-write dependence. Check that the load is before the store.
570 When we vectorize basic blocks, the vector load can only be placed before
571 the corresponding scalar load, and the vector store can only be placed
572 after its corresponding scalar store. So the order of the accesses is preserved in
573 case the load is before the store. */
574 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
575 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
576 return false;
578 return true;
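/* Illustration of the load-after-store check above (not original code):
   in a basic block such as

     a[0] = x;
     y = a[0];

   the earlier statement is the store, so SLP vectorization could hoist the
   vector load above the scalar store; the function therefore reports a
   dependence.  With the load first the order is preserved and the pair is
   considered independent.  */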
582 /* Function vect_slp_analyze_data_ref_dependences.
584 Examine all the data references in the basic-block, and make sure there
585 do not exist any data dependences between them. Return FALSE if a
586 dependence prevents vectorization of the basic block. */
588 bool
589 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
591 struct data_dependence_relation *ddr;
592 unsigned int i;
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_NOTE, vect_location,
596 "=== vect_slp_analyze_data_ref_dependences ===\n");
598 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
599 &BB_VINFO_DDRS (bb_vinfo),
600 vNULL, true))
601 return false;
603 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
604 if (vect_slp_analyze_data_ref_dependence (ddr))
605 return false;
607 return true;
611 /* Function vect_compute_data_ref_alignment
613 Compute the misalignment of the data reference DR.
615 Output:
616 1. If during the misalignment computation it is found that the data reference
617 cannot be vectorized then false is returned.
618 2. DR_MISALIGNMENT (DR) is defined.
620 FOR NOW: No analysis is actually performed. Misalignment is calculated
621 only for trivial cases. TODO. */
623 static bool
624 vect_compute_data_ref_alignment (struct data_reference *dr)
626 gimple stmt = DR_STMT (dr);
627 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
628 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
629 struct loop *loop = NULL;
630 tree ref = DR_REF (dr);
631 tree vectype;
632 tree base, base_addr;
633 bool base_aligned;
634 tree misalign;
635 tree aligned_to, alignment;
637 if (dump_enabled_p ())
638 dump_printf_loc (MSG_NOTE, vect_location,
639 "vect_compute_data_ref_alignment:\n");
641 if (loop_vinfo)
642 loop = LOOP_VINFO_LOOP (loop_vinfo);
644 /* Initialize misalignment to unknown. */
645 SET_DR_MISALIGNMENT (dr, -1);
647 /* Strided loads perform only component accesses, misalignment information
648 is irrelevant for them. */
649 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
650 return true;
652 misalign = DR_INIT (dr);
653 aligned_to = DR_ALIGNED_TO (dr);
654 base_addr = DR_BASE_ADDRESS (dr);
655 vectype = STMT_VINFO_VECTYPE (stmt_info);
657 /* In case the dataref is in an inner-loop of the loop that is being
658 vectorized (LOOP), we use the base and misalignment information
659 relative to the outer-loop (LOOP). This is ok only if the misalignment
660 stays the same throughout the execution of the inner-loop, which is why
661 we have to check that the stride of the dataref in the inner-loop is a
662 multiple of the vector size. */
663 if (loop && nested_in_vect_loop_p (loop, stmt))
665 tree step = DR_STEP (dr);
666 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
668 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "inner step divides the vector-size.\n");
673 misalign = STMT_VINFO_DR_INIT (stmt_info);
674 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
675 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
677 else
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 "inner step doesn't divide the vector-size.\n");
682 misalign = NULL_TREE;
686 /* Similarly, if we're doing basic-block vectorization, we can only use
687 base and misalignment information relative to an innermost loop if the
688 misalignment stays the same throughout the execution of the loop.
689 As above, this is the case if the stride of the dataref is a multiple
690 of the vector size. */
691 if (!loop)
693 tree step = DR_STEP (dr);
694 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
696 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700 "SLP: step doesn't divide the vector-size.\n");
701 misalign = NULL_TREE;
705 base = build_fold_indirect_ref (base_addr);
706 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
708 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
709 || !misalign)
711 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Unknown alignment for access: ");
715 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
718 return true;
721 if ((DECL_P (base)
722 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
723 alignment) >= 0)
724 || (TREE_CODE (base_addr) == SSA_NAME
725 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
726 TREE_TYPE (base_addr)))),
727 alignment) >= 0)
728 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
729 base_aligned = true;
730 else
731 base_aligned = false;
733 if (!base_aligned)
735 /* Do not change the alignment of global variables here if
736 flag_section_anchors is enabled as we already generated
737 RTL for other functions. Most global variables should
738 have been aligned during the IPA increase_alignment pass. */
739 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
740 || (TREE_STATIC (base) && flag_section_anchors))
742 if (dump_enabled_p ())
744 dump_printf_loc (MSG_NOTE, vect_location,
745 "can't force alignment of ref: ");
746 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
747 dump_printf (MSG_NOTE, "\n");
749 return true;
752 /* Force the alignment of the decl.
753 NOTE: This is the only change to the code we make during
754 the analysis phase, before deciding to vectorize the loop. */
755 if (dump_enabled_p ())
757 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
758 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
759 dump_printf (MSG_NOTE, "\n");
762 ((dataref_aux *)dr->aux)->base_decl = base;
763 ((dataref_aux *)dr->aux)->base_misaligned = true;
766 /* If this is a backward running DR then the first access in the larger
767 vectype is actually N-1 elements before the address in the DR.
768 Adjust misalign accordingly. */
769 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
771 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
772 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
773 otherwise we wouldn't be here. */
774 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
775 /* PLUS because DR_STEP was negative. */
776 misalign = size_binop (PLUS_EXPR, misalign, offset);
779 /* Modulo alignment. */
780 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
782 if (!tree_fits_uhwi_p (misalign))
784 /* Negative or overflowed misalignment value. */
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
787 "unexpected misalign value\n");
788 return false;
791 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
793 if (dump_enabled_p ())
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
796 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
797 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
798 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
801 return true;
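/* Worked example for the computation above (assumed numbers): for a V4SI
   access TYPE_ALIGN (vectype) is 128 bits, so ALIGNMENT is 16 bytes.  If
   DR_BASE_ADDRESS is known to be 16-byte aligned and DR_INIT is 20, then

     misalign = 20 mod 16 = 4

   and DR_MISALIGNMENT (dr) is set to 4.  */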
805 /* Function vect_compute_data_refs_alignment
807 Compute the misalignment of data references in the loop.
808 Return FALSE if a data reference is found that cannot be vectorized. */
810 static bool
811 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
812 bb_vec_info bb_vinfo)
814 vec<data_reference_p> datarefs;
815 struct data_reference *dr;
816 unsigned int i;
818 if (loop_vinfo)
819 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
820 else
821 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
823 FOR_EACH_VEC_ELT (datarefs, i, dr)
824 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
825 && !vect_compute_data_ref_alignment (dr))
827 if (bb_vinfo)
829 /* Mark unsupported statement as unvectorizable. */
830 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
831 continue;
833 else
834 return false;
837 return true;
841 /* Function vect_update_misalignment_for_peel
843 DR - the data reference whose misalignment is to be adjusted.
844 DR_PEEL - the data reference whose misalignment is being made
845 zero in the vector loop by the peel.
846 NPEEL - the number of iterations in the peel loop if the misalignment
847 of DR_PEEL is known at compile time. */
849 static void
850 vect_update_misalignment_for_peel (struct data_reference *dr,
851 struct data_reference *dr_peel, int npeel)
853 unsigned int i;
854 vec<dr_p> same_align_drs;
855 struct data_reference *current_dr;
856 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
857 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
858 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
859 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
861 /* For interleaved data accesses the step in the loop must be multiplied by
862 the size of the interleaving group. */
863 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
864 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
865 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
866 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
868 /* It can be assumed that the data refs with the same alignment as dr_peel
869 are aligned in the vector loop. */
870 same_align_drs
871 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
872 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
874 if (current_dr != dr)
875 continue;
876 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
877 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
878 SET_DR_MISALIGNMENT (dr, 0);
879 return;
882 if (known_alignment_for_access_p (dr)
883 && known_alignment_for_access_p (dr_peel))
885 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
886 int misal = DR_MISALIGNMENT (dr);
887 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
888 misal += negative ? -npeel * dr_size : npeel * dr_size;
889 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
890 SET_DR_MISALIGNMENT (dr, misal);
891 return;
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
896 SET_DR_MISALIGNMENT (dr, -1);
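/* Example of the known-alignment update above (illustrative numbers):
   with dr_size = 4, npeel = 3, a positive step and a 128-bit vector type,
   a data-ref that was misaligned by 8 bytes ends up with

     misal = (8 + 3 * 4) & (16 - 1) = 4

   after the peel.  */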
900 /* Function vect_verify_datarefs_alignment
902 Return TRUE if all data references in the loop can be
903 handled with respect to alignment. */
905 bool
906 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
908 vec<data_reference_p> datarefs;
909 struct data_reference *dr;
910 enum dr_alignment_support supportable_dr_alignment;
911 unsigned int i;
913 if (loop_vinfo)
914 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
915 else
916 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
918 FOR_EACH_VEC_ELT (datarefs, i, dr)
920 gimple stmt = DR_STMT (dr);
921 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
923 if (!STMT_VINFO_RELEVANT_P (stmt_info))
924 continue;
926 /* For interleaving, only the alignment of the first access matters.
927 Skip statements marked as not vectorizable. */
928 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
929 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
930 || !STMT_VINFO_VECTORIZABLE (stmt_info))
931 continue;
933 /* Strided loads perform only component accesses, alignment is
934 irrelevant for them. */
935 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
936 continue;
938 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
939 if (!supportable_dr_alignment)
941 if (dump_enabled_p ())
943 if (DR_IS_READ (dr))
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "not vectorized: unsupported unaligned load.");
946 else
947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
948 "not vectorized: unsupported unaligned "
949 "store.");
951 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
952 DR_REF (dr));
953 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
955 return false;
957 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
958 dump_printf_loc (MSG_NOTE, vect_location,
959 "Vectorizing an unaligned access.\n");
961 return true;
964 /* Given a memory reference EXP return whether its alignment is less
965 than its size. */
967 static bool
968 not_size_aligned (tree exp)
970 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
971 return true;
973 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
974 > get_object_alignment (exp));
977 /* Function vector_alignment_reachable_p
979 Return true if vector alignment for DR is reachable by peeling
980 a few loop iterations. Return false otherwise. */
982 static bool
983 vector_alignment_reachable_p (struct data_reference *dr)
985 gimple stmt = DR_STMT (dr);
986 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
987 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
989 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
991 /* For interleaved accesses we peel only if the number of iterations in
992 the prologue loop (VF - misalignment, counted in elements) is a
993 multiple of the interleaving group size. */
994 int elem_size, mis_in_elements;
995 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
997 /* FORNOW: handle only known alignment. */
998 if (!known_alignment_for_access_p (dr))
999 return false;
1001 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1002 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1004 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1005 return false;
1008 /* If misalignment is known at compile time then allow peeling
1009 only if natural alignment is reachable through peeling. */
1010 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1012 HOST_WIDE_INT elmsize =
1013 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1014 if (dump_enabled_p ())
1016 dump_printf_loc (MSG_NOTE, vect_location,
1017 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1018 dump_printf (MSG_NOTE,
1019 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1021 if (DR_MISALIGNMENT (dr) % elmsize)
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "data size does not divide the misalignment.\n");
1026 return false;
1030 if (!known_alignment_for_access_p (dr))
1032 tree type = TREE_TYPE (DR_REF (dr));
1033 bool is_packed = not_size_aligned (DR_REF (dr));
1034 if (dump_enabled_p ())
1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 "Unknown misalignment, is_packed = %d\n",is_packed);
1037 if ((TYPE_USER_ALIGN (type) && !is_packed)
1038 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1039 return true;
1040 else
1041 return false;
1044 return true;
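/* Example for the interleaved case above (illustrative): with a V4SI
   vectype (nelements = 4), elem_size = 4 and DR_MISALIGNMENT = 8 we get
   mis_in_elements = 2, so the prologue would peel 4 - 2 = 2 iterations;
   that is acceptable for GROUP_SIZE = 2 but not for GROUP_SIZE = 4,
   because 2 % 4 != 0.  */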
1048 /* Calculate the cost of the memory access represented by DR. */
1050 static void
1051 vect_get_data_access_cost (struct data_reference *dr,
1052 unsigned int *inside_cost,
1053 unsigned int *outside_cost,
1054 stmt_vector_for_cost *body_cost_vec)
1056 gimple stmt = DR_STMT (dr);
1057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1058 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1059 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1060 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1061 int ncopies = vf / nunits;
1063 if (DR_IS_READ (dr))
1064 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1065 NULL, body_cost_vec, false);
1066 else
1067 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1069 if (dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE, vect_location,
1071 "vect_get_data_access_cost: inside_cost = %d, "
1072 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1076 /* Insert DR into peeling hash table with NPEEL as key. */
1078 static void
1079 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1080 int npeel)
1082 struct _vect_peel_info elem, *slot;
1083 _vect_peel_info **new_slot;
1084 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1086 elem.npeel = npeel;
1087 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1088 if (slot)
1089 slot->count++;
1090 else
1092 slot = XNEW (struct _vect_peel_info);
1093 slot->npeel = npeel;
1094 slot->dr = dr;
1095 slot->count = 1;
1096 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1097 *new_slot = slot;
1100 if (!supportable_dr_alignment
1101 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1102 slot->count += VECT_MAX_COST;
1106 /* Traverse peeling hash table to find the peeling option that aligns the
1107 maximum number of data accesses. */
1109 int
1110 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1111 _vect_peel_extended_info *max)
1113 vect_peel_info elem = *slot;
1115 if (elem->count > max->peel_info.count
1116 || (elem->count == max->peel_info.count
1117 && max->peel_info.npeel > elem->npeel))
1119 max->peel_info.npeel = elem->npeel;
1120 max->peel_info.count = elem->count;
1121 max->peel_info.dr = elem->dr;
1124 return 1;
1128 /* Traverse peeling hash table and calculate the cost of each peeling option.
1129 Find the one with the lowest cost. */
1131 int
1132 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1133 _vect_peel_extended_info *min)
1135 vect_peel_info elem = *slot;
1136 int save_misalignment, dummy;
1137 unsigned int inside_cost = 0, outside_cost = 0, i;
1138 gimple stmt = DR_STMT (elem->dr);
1139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1140 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1141 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1142 struct data_reference *dr;
1143 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1144 int single_iter_cost;
1146 prologue_cost_vec.create (2);
1147 body_cost_vec.create (2);
1148 epilogue_cost_vec.create (2);
1150 FOR_EACH_VEC_ELT (datarefs, i, dr)
1152 stmt = DR_STMT (dr);
1153 stmt_info = vinfo_for_stmt (stmt);
1154 /* For interleaving, only the alignment of the first access
1155 matters. */
1156 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1157 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1158 continue;
1160 save_misalignment = DR_MISALIGNMENT (dr);
1161 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1162 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1163 &body_cost_vec);
1164 SET_DR_MISALIGNMENT (dr, save_misalignment);
1167 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1168 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1169 &dummy, single_iter_cost,
1170 &prologue_cost_vec,
1171 &epilogue_cost_vec);
1173 /* Prologue and epilogue costs are added to the target model later.
1174 These costs depend only on the scalar iteration cost, the
1175 number of peeling iterations finally chosen, and the number of
1176 misaligned statements. So discard the information found here. */
1177 prologue_cost_vec.release ();
1178 epilogue_cost_vec.release ();
1180 if (inside_cost < min->inside_cost
1181 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1183 min->inside_cost = inside_cost;
1184 min->outside_cost = outside_cost;
1185 min->body_cost_vec.release ();
1186 min->body_cost_vec = body_cost_vec;
1187 min->peel_info.dr = elem->dr;
1188 min->peel_info.npeel = elem->npeel;
1190 else
1191 body_cost_vec.release ();
1193 return 1;
1197 /* Choose best peeling option by traversing peeling hash table and either
1198 choosing an option with the lowest cost (if cost model is enabled) or the
1199 option that aligns as many accesses as possible. */
1201 static struct data_reference *
1202 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1203 unsigned int *npeel,
1204 stmt_vector_for_cost *body_cost_vec)
1206 struct _vect_peel_extended_info res;
1208 res.peel_info.dr = NULL;
1209 res.body_cost_vec = stmt_vector_for_cost ();
1211 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1213 res.inside_cost = INT_MAX;
1214 res.outside_cost = INT_MAX;
1215 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1216 .traverse <_vect_peel_extended_info *,
1217 vect_peeling_hash_get_lowest_cost> (&res);
1219 else
1221 res.peel_info.count = 0;
1222 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1223 .traverse <_vect_peel_extended_info *,
1224 vect_peeling_hash_get_most_frequent> (&res);
1227 *npeel = res.peel_info.npeel;
1228 *body_cost_vec = res.body_cost_vec;
1229 return res.peel_info.dr;
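/* Illustration of the two traversal policies above (assumed numbers):
   given hash entries { npeel = 2, count = 3 } and { npeel = 5, count = 1 },
   the frequency-based traversal picks npeel = 2 because it aligns more
   data-refs, while the cost-based traversal recomputes the inside/outside
   costs for each candidate and may pick npeel = 5 if that yields the
   cheaper vector loop body.  */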
1233 /* Function vect_enhance_data_refs_alignment
1235 This pass will use loop versioning and loop peeling in order to enhance
1236 the alignment of data references in the loop.
1238 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1239 original loop is to be vectorized. Any other loops that are created by
1240 the transformations performed in this pass are not supposed to be
1241 vectorized. This restriction will be relaxed.
1243 This pass will require a cost model to guide it whether to apply peeling
1244 or versioning or a combination of the two. For example, the scheme that
1245 Intel uses when given a loop with several memory accesses is as follows:
1246 choose one memory access ('p') whose alignment you want to force by doing
1247 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1248 other accesses are not necessarily aligned, or (2) use loop versioning to
1249 generate one loop in which all accesses are aligned, and another loop in
1250 which only 'p' is necessarily aligned.
1252 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1253 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1254 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1256 Devising a cost model is the most critical aspect of this work. It will
1257 guide us on which access to peel for, whether to use loop versioning, how
1258 many versions to create, etc. The cost model will probably consist of
1259 generic considerations as well as target specific considerations (on
1260 powerpc for example, misaligned stores are more painful than misaligned
1261 loads).
1263 Here are the general steps involved in alignment enhancements:
1265 -- original loop, before alignment analysis:
1266 for (i=0; i<N; i++){
1267 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1268 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1271 -- After vect_compute_data_refs_alignment:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = 3
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 -- Possibility 1: we do loop versioning:
1278 if (p is aligned) {
1279 for (i=0; i<N; i++){ # loop 1A
1280 x = q[i]; # DR_MISALIGNMENT(q) = 3
1281 p[i] = y; # DR_MISALIGNMENT(p) = 0
1284 else {
1285 for (i=0; i<N; i++){ # loop 1B
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1291 -- Possibility 2: we do loop peeling:
1292 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1293 x = q[i];
1294 p[i] = y;
1296 for (i = 3; i < N; i++){ # loop 2A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1301 -- Possibility 3: combination of loop peeling and versioning:
1302 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1303 x = q[i];
1304 p[i] = y;
1306 if (p is aligned) {
1307 for (i = 3; i<N; i++){ # loop 3A
1308 x = q[i]; # DR_MISALIGNMENT(q) = 0
1309 p[i] = y; # DR_MISALIGNMENT(p) = 0
1312 else {
1313 for (i = 3; i<N; i++){ # loop 3B
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1319 These loops are later passed to loop_transform to be vectorized. The
1320 vectorizer will use the alignment information to guide the transformation
1321 (whether to generate regular loads/stores, or with special handling for
1322 misalignment). */
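/* For possibility 1 above the run-time test is a mask check of the pointer
   value; e.g. with 16-byte vectors the guard is conceptually (illustrative
   only)

     if (((uintptr_t) p & 15) == 0)
       ... loop 1A ...
     else
       ... loop 1B ...

   using the mask stored in LOOP_VINFO_PTR_MASK by the versioning code
   below.  */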
1324 bool
1325 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1327 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1328 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1329 enum dr_alignment_support supportable_dr_alignment;
1330 struct data_reference *dr0 = NULL, *first_store = NULL;
1331 struct data_reference *dr;
1332 unsigned int i, j;
1333 bool do_peeling = false;
1334 bool do_versioning = false;
1335 bool stat;
1336 gimple stmt;
1337 stmt_vec_info stmt_info;
1338 unsigned int npeel = 0;
1339 bool all_misalignments_unknown = true;
1340 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1341 unsigned possible_npeel_number = 1;
1342 tree vectype;
1343 unsigned int nelements, mis, same_align_drs_max = 0;
1344 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_NOTE, vect_location,
1348 "=== vect_enhance_data_refs_alignment ===\n");
1350 /* While cost model enhancements are expected in the future, the high level
1351 view of the code at this time is as follows:
1353 A) If there is a misaligned access then see if peeling to align
1354 this access can make all data references satisfy
1355 vect_supportable_dr_alignment. If so, update data structures
1356 as needed and return true.
1358 B) If peeling wasn't possible and there is a data reference with an
1359 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1360 then see if loop versioning checks can be used to make all data
1361 references satisfy vect_supportable_dr_alignment. If so, update
1362 data structures as needed and return true.
1364 C) If neither peeling nor versioning were successful then return false if
1365 any data reference does not satisfy vect_supportable_dr_alignment.
1367 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1369 Note, Possibility 3 above (which is peeling and versioning together) is not
1370 being done at this time. */
1372 /* (1) Peeling to force alignment. */
1374 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1375 Considerations:
1376 + How many accesses will become aligned due to the peeling
1377 - How many accesses will become unaligned due to the peeling,
1378 and the cost of misaligned accesses.
1379 - The cost of peeling (the extra runtime checks, the increase
1380 in code size). */
1382 FOR_EACH_VEC_ELT (datarefs, i, dr)
1384 stmt = DR_STMT (dr);
1385 stmt_info = vinfo_for_stmt (stmt);
1387 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1388 continue;
1390 /* For interleaving, only the alignment of the first access
1391 matters. */
1392 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1393 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1394 continue;
1396 /* For invariant accesses there is nothing to enhance. */
1397 if (integer_zerop (DR_STEP (dr)))
1398 continue;
1400 /* Strided loads perform only component accesses, alignment is
1401 irrelevant for them. */
1402 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1403 continue;
1405 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1406 do_peeling = vector_alignment_reachable_p (dr);
1407 if (do_peeling)
1409 if (known_alignment_for_access_p (dr))
1411 unsigned int npeel_tmp;
1412 bool negative = tree_int_cst_compare (DR_STEP (dr),
1413 size_zero_node) < 0;
1415 /* Save info about DR in the hash table. */
1416 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1417 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1419 vectype = STMT_VINFO_VECTYPE (stmt_info);
1420 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1421 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1422 TREE_TYPE (DR_REF (dr))));
1423 npeel_tmp = (negative
1424 ? (mis - nelements) : (nelements - mis))
1425 & (nelements - 1);
1427 /* For multiple types, it is possible that the bigger type access
1428 will have more than one peeling option. E.g., a loop with two
1429 types: one of size (vector size / 4), and the other one of
1430 size (vector size / 8). The vectorization factor will be 8. If both
1431 accesses are misaligned by 3, the first one needs one scalar
1432 iteration to be aligned, and the second one needs 5. But the
1433 first one will also be aligned by peeling 5 scalar
1434 iterations, and in that case both accesses will be aligned.
1435 Hence, except for the immediate peeling amount, we also want
1436 to try adding a full vector size, as long as we don't exceed
1437 the vectorization factor.
1438 We do this automatically when using the cost model, since we
1439 calculate the cost for every peeling option. */
1440 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1441 possible_npeel_number = vf / nelements;
1443 /* Handle the aligned case. We may decide to align some other
1444 access, making DR unaligned. */
1445 if (DR_MISALIGNMENT (dr) == 0)
1447 npeel_tmp = 0;
1448 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1449 possible_npeel_number++;
1452 for (j = 0; j < possible_npeel_number; j++)
1454 gcc_assert (npeel_tmp <= vf);
1455 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1456 npeel_tmp += nelements;
1459 all_misalignments_unknown = false;
1460 /* Data-ref that was chosen for the case that all the
1461 misalignments are unknown is not relevant anymore, since we
1462 have a data-ref with known alignment. */
1463 dr0 = NULL;
1465 else
1467 /* If we don't know any misalignment values, we prefer
1468 peeling for the data-ref that has the maximum number of data-refs
1469 with the same alignment, unless the target prefers to align
1470 stores over loads. */
1471 if (all_misalignments_unknown)
1473 unsigned same_align_drs
1474 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1475 if (!dr0
1476 || same_align_drs_max < same_align_drs)
1478 same_align_drs_max = same_align_drs;
1479 dr0 = dr;
1481 /* For data-refs with the same number of related
1482 accesses prefer the one where the misalign
1483 computation will be invariant in the outermost loop. */
1484 else if (same_align_drs_max == same_align_drs)
1486 struct loop *ivloop0, *ivloop;
1487 ivloop0 = outermost_invariant_loop_for_expr
1488 (loop, DR_BASE_ADDRESS (dr0));
1489 ivloop = outermost_invariant_loop_for_expr
1490 (loop, DR_BASE_ADDRESS (dr));
1491 if ((ivloop && !ivloop0)
1492 || (ivloop && ivloop0
1493 && flow_loop_nested_p (ivloop, ivloop0)))
1494 dr0 = dr;
1497 if (!first_store && DR_IS_WRITE (dr))
1498 first_store = dr;
1501 /* If there are both known and unknown misaligned accesses in the
1502 loop, we choose peeling amount according to the known
1503 accesses. */
1504 if (!supportable_dr_alignment)
1506 dr0 = dr;
1507 if (!first_store && DR_IS_WRITE (dr))
1508 first_store = dr;
1512 else
1514 if (!aligned_access_p (dr))
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518 "vector alignment may not be reachable\n");
1519 break;
1524 /* Check if we can possibly peel the loop. */
1525 if (!vect_can_advance_ivs_p (loop_vinfo)
1526 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1527 do_peeling = false;
1529 if (do_peeling && all_misalignments_unknown
1530 && vect_supportable_dr_alignment (dr0, false))
1533 /* Check if the target prefers to align stores over loads, i.e., if
1534 misaligned stores are more expensive than misaligned loads (taking
1535 drs with same alignment into account). */
1536 if (first_store && DR_IS_READ (dr0))
1538 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1539 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1540 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1541 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1542 stmt_vector_for_cost dummy;
1543 dummy.create (2);
1545 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1546 &dummy);
1547 vect_get_data_access_cost (first_store, &store_inside_cost,
1548 &store_outside_cost, &dummy);
1550 dummy.release ();
1552 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1553 aligning the load DR0). */
1554 load_inside_penalty = store_inside_cost;
1555 load_outside_penalty = store_outside_cost;
1556 for (i = 0;
1557 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1558 DR_STMT (first_store))).iterate (i, &dr);
1559 i++)
1560 if (DR_IS_READ (dr))
1562 load_inside_penalty += load_inside_cost;
1563 load_outside_penalty += load_outside_cost;
1565 else
1567 load_inside_penalty += store_inside_cost;
1568 load_outside_penalty += store_outside_cost;
1571 /* Calculate the penalty for leaving DR0 unaligned (by
1572 aligning the FIRST_STORE). */
1573 store_inside_penalty = load_inside_cost;
1574 store_outside_penalty = load_outside_cost;
1575 for (i = 0;
1576 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577 DR_STMT (dr0))).iterate (i, &dr);
1578 i++)
1579 if (DR_IS_READ (dr))
1581 store_inside_penalty += load_inside_cost;
1582 store_outside_penalty += load_outside_cost;
1584 else
1586 store_inside_penalty += store_inside_cost;
1587 store_outside_penalty += store_outside_cost;
1590 if (load_inside_penalty > store_inside_penalty
1591 || (load_inside_penalty == store_inside_penalty
1592 && load_outside_penalty > store_outside_penalty))
1593 dr0 = first_store;
1596 /* In case there are only loads with different unknown misalignments, use
1597 peeling only if it may help to align other accesses in the loop. */
1598 if (!first_store
1599 && !STMT_VINFO_SAME_ALIGN_REFS (
1600 vinfo_for_stmt (DR_STMT (dr0))).length ()
1601 && vect_supportable_dr_alignment (dr0, false)
1602 != dr_unaligned_supported)
1603 do_peeling = false;
1606 if (do_peeling && !dr0)
1608 /* Peeling is possible, but there is no data access that is not supported
1609 unless aligned. So we try to choose the best possible peeling. */
1611 /* We should get here only if there are drs with known misalignment. */
1612 gcc_assert (!all_misalignments_unknown);
1614 /* Choose the best peeling from the hash table. */
1615 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1616 &body_cost_vec);
1617 if (!dr0 || !npeel)
1618 do_peeling = false;
1621 if (do_peeling)
1623 stmt = DR_STMT (dr0);
1624 stmt_info = vinfo_for_stmt (stmt);
1625 vectype = STMT_VINFO_VECTYPE (stmt_info);
1626 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1628 if (known_alignment_for_access_p (dr0))
1630 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1631 size_zero_node) < 0;
1632 if (!npeel)
1634 /* Since it's known at compile time, compute the number of
1635 iterations in the peeled loop (the peeling factor) for use in
1636 updating DR_MISALIGNMENT values. The peeling factor is the
1637 vectorization factor minus the misalignment as an element
1638 count. */
1639 mis = DR_MISALIGNMENT (dr0);
1640 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1641 npeel = ((negative ? mis - nelements : nelements - mis)
1642 & (nelements - 1));
1645 /* For interleaved data access every iteration accesses all the
1646 members of the group, therefore we divide the number of iterations
1647 by the group size. */
1648 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1649 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1650 npeel /= GROUP_SIZE (stmt_info);
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_NOTE, vect_location,
1654 "Try peeling by %d\n", npeel);
1657 /* Ensure that all data refs can be vectorized after the peel. */
1658 FOR_EACH_VEC_ELT (datarefs, i, dr)
1660 int save_misalignment;
1662 if (dr == dr0)
1663 continue;
1665 stmt = DR_STMT (dr);
1666 stmt_info = vinfo_for_stmt (stmt);
1667 /* For interleaving, only the alignment of the first access
1668 matters. */
1669 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1670 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1671 continue;
1673 /* Strided loads perform only component accesses, alignment is
1674 irrelevant for them. */
1675 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1676 continue;
1678 save_misalignment = DR_MISALIGNMENT (dr);
1679 vect_update_misalignment_for_peel (dr, dr0, npeel);
1680 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1681 SET_DR_MISALIGNMENT (dr, save_misalignment);
1683 if (!supportable_dr_alignment)
1685 do_peeling = false;
1686 break;
1690 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1692 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1693 if (!stat)
1694 do_peeling = false;
1695 else
1697 body_cost_vec.release ();
1698 return stat;
1702 if (do_peeling)
1704 unsigned max_allowed_peel
1705 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1706 if (max_allowed_peel != (unsigned)-1)
1708 unsigned max_peel = npeel;
1709 if (max_peel == 0)
1711 gimple dr_stmt = DR_STMT (dr0);
1712 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1713 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1714 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1716 if (max_peel > max_allowed_peel)
1718 do_peeling = false;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "Disable peeling, max peels reached: %d\n", max_peel);
1726 if (do_peeling)
1728 stmt_info_for_cost *si;
1729 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1731 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1732 If the misalignment of DR_i is identical to that of dr0 then set
1733 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1734 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1735 by the peeling factor times the element size of DR_i (MOD the
1736 vectorization factor times the size). Otherwise, the
1737 misalignment of DR_i must be set to unknown. */
1738 FOR_EACH_VEC_ELT (datarefs, i, dr)
1739 if (dr != dr0)
1740 vect_update_misalignment_for_peel (dr, dr0, npeel);
1742 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1743 if (npeel)
1744 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1745 else
1746 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1747 = DR_MISALIGNMENT (dr0);
1748 SET_DR_MISALIGNMENT (dr0, 0);
1749 if (dump_enabled_p ())
1751 dump_printf_loc (MSG_NOTE, vect_location,
1752 "Alignment of access forced using peeling.\n");
1753 dump_printf_loc (MSG_NOTE, vect_location,
1754 "Peeling for alignment will be applied.\n");
1756 /* We've delayed passing the inside-loop peeling costs to the
1757 target cost model until we were sure peeling would happen.
1758 Do so now. */
1759 if (body_cost_vec.exists ())
1761 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1763 struct _stmt_vec_info *stmt_info
1764 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1765 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1766 si->misalign, vect_body);
1768 body_cost_vec.release ();
1771 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1772 gcc_assert (stat);
1773 return stat;
1777 body_cost_vec.release ();
1779 /* (2) Versioning to force alignment. */
1781 /* Try versioning if:
1782 1) optimize loop for speed
1783 2) there is at least one unsupported misaligned data ref with an unknown
1784 misalignment, and
1785 3) all misaligned data refs with a known misalignment are supported, and
1786 4) the number of runtime alignment checks is within reason. */
1788 do_versioning =
1789 optimize_loop_nest_for_speed_p (loop)
1790 && (!loop->inner); /* FORNOW */
1792 if (do_versioning)
1794 FOR_EACH_VEC_ELT (datarefs, i, dr)
1796 stmt = DR_STMT (dr);
1797 stmt_info = vinfo_for_stmt (stmt);
1799 /* For interleaving, only the alignment of the first access
1800 matters. */
1801 if (aligned_access_p (dr)
1802 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1803 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1804 continue;
1806 /* Strided loads perform only component accesses; alignment is
1807 irrelevant for them. */
1808 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1809 continue;
1811 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1813 if (!supportable_dr_alignment)
1815 gimple stmt;
1816 int mask;
1817 tree vectype;
1819 if (known_alignment_for_access_p (dr)
1820 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1821 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1823 do_versioning = false;
1824 break;
1827 stmt = DR_STMT (dr);
1828 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1829 gcc_assert (vectype);
1831 /* The rightmost bits of an aligned address must be zeros.
1832 Construct the mask needed for this test. For example,
1833 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1834 mask must be 15 = 0xf. */
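/* Sketch of the runtime test this mask feeds (simplified; the actual
   condition is created later when the loop is versioned): the low bits of
   all potentially misaligned addresses are OR-ed together and tested
   against the mask, e.g. for V4SI
     ((addr_1 | addr_2 | ...) & 0xf) == 0
   selects the aligned (vectorized) version of the loop.  */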
1835 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1837 /* FORNOW: use the same mask to test all potentially unaligned
1838 references in the loop. The vectorizer currently supports
1839 a single vector size, see the reference to
1840 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1841 vectorization factor is computed. */
1842 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1843 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1844 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1845 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1846 DR_STMT (dr));
1850 /* Versioning requires at least one misaligned data reference. */
1851 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1852 do_versioning = false;
1853 else if (!do_versioning)
1854 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1857 if (do_versioning)
1859 vec<gimple> may_misalign_stmts
1860 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1861 gimple stmt;
1863 /* It can now be assumed that the data references in the statements
1864 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1865 of the loop being vectorized. */
1866 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1868 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1869 dr = STMT_VINFO_DATA_REF (stmt_info);
1870 SET_DR_MISALIGNMENT (dr, 0);
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_NOTE, vect_location,
1873 "Alignment of access forced using versioning.\n");
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_NOTE, vect_location,
1878 "Versioning for alignment will be applied.\n");
1880 /* Peeling and versioning can't be done together at this time. */
1881 gcc_assert (! (do_peeling && do_versioning));
1883 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1884 gcc_assert (stat);
1885 return stat;
1888 /* This point is reached if neither peeling nor versioning is being done. */
1889 gcc_assert (! (do_peeling || do_versioning));
1891 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1892 return stat;
1896 /* Function vect_find_same_alignment_drs.
1898 Update group and alignment relations according to the chosen
1899 vectorization factor. */
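/* For example (illustrative values): with a vectorization factor of 4,
   two unit-stride 4-byte accesses with equal DR_STEP and a dependence
   distance of 4 iterations are 4 * 4 = 16 bytes apart, i.e. exactly one
   vector, so the two references necessarily share the same alignment.  */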
1901 static void
1902 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1903 loop_vec_info loop_vinfo)
1905 unsigned int i;
1906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1907 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1908 struct data_reference *dra = DDR_A (ddr);
1909 struct data_reference *drb = DDR_B (ddr);
1910 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1911 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1912 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1913 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1914 lambda_vector dist_v;
1915 unsigned int loop_depth;
1917 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1918 return;
1920 if (dra == drb)
1921 return;
1923 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1924 return;
1926 /* Loop-based vectorization and known data dependence. */
1927 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1928 return;
1930 /* Data-dependence analysis reports a distance vector of zero
1931 for data-references that overlap only in the first iteration
1932 but whose steps differ in sign (see PR45764).
1933 So as a sanity check require equal DR_STEP. */
1934 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1935 return;
1937 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1938 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1940 int dist = dist_v[loop_depth];
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_NOTE, vect_location,
1944 "dependence distance = %d.\n", dist);
1946 /* Same loop iteration. */
1947 if (dist == 0
1948 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1950 /* Two references with distance zero have the same alignment. */
1951 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1952 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1953 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "accesses have the same alignment.\n");
1957 dump_printf (MSG_NOTE,
1958 "dependence distance modulo vf == 0 between ");
1959 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1960 dump_printf (MSG_NOTE, " and ");
1961 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1962 dump_printf (MSG_NOTE, "\n");
1969 /* Function vect_analyze_data_refs_alignment
1971 Analyze the alignment of the data-references in the loop.
1972 Return FALSE if a data reference is found that cannot be vectorized. */
1974 bool
1975 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1976 bb_vec_info bb_vinfo)
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_NOTE, vect_location,
1980 "=== vect_analyze_data_refs_alignment ===\n");
1982 /* Mark groups of data references with the same alignment using
1983 data dependence information. */
1984 if (loop_vinfo)
1986 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1987 struct data_dependence_relation *ddr;
1988 unsigned int i;
1990 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1991 vect_find_same_alignment_drs (ddr, loop_vinfo);
1994 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1996 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1998 "not vectorized: can't calculate alignment "
1999 "for data ref.\n");
2000 return false;
2003 return true;
2007 /* Analyze groups of accesses: check that DR belongs to a group of
2008 accesses of legal size, step, etc. Detect gaps, single element
2009 interleaving, and other special cases. Set grouped access info.
2010 Collect groups of strided stores for further use in SLP analysis. */
2012 static bool
2013 vect_analyze_group_access (struct data_reference *dr)
2015 tree step = DR_STEP (dr);
2016 tree scalar_type = TREE_TYPE (DR_REF (dr));
2017 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2018 gimple stmt = DR_STMT (dr);
2019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2021 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2022 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2023 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2024 bool slp_impossible = false;
2025 struct loop *loop = NULL;
2027 if (loop_vinfo)
2028 loop = LOOP_VINFO_LOOP (loop_vinfo);
2030 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2031 size of the interleaving group (including gaps). */
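/* For example (illustrative values): accesses a[4*i] and a[4*i+2] with
   4-byte elements have a DR_STEP of 16 bytes and a type size of 4, so
   GROUPSIZE = 16 / 4 = 4; the group spans four element slots even though
   only two of them are actually accessed.  */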
2032 groupsize = absu_hwi (dr_step) / type_size;
2034 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2035 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2037 /* Check if this DR is part of an interleaving group, and is a single
2038 element of the group that is accessed in the loop. */
2040 /* Gaps are supported only for loads. STEP must be a multiple of the type
2041 size. The size of the group must be a power of 2. */
2042 if (DR_IS_READ (dr)
2043 && (dr_step % type_size) == 0
2044 && groupsize > 0
2045 && exact_log2 (groupsize) != -1)
2047 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2048 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2049 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Detected single element interleaving ");
2053 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2054 dump_printf (MSG_NOTE, " step ");
2055 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2056 dump_printf (MSG_NOTE, "\n");
2059 if (loop_vinfo)
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "Data access with gaps requires scalar "
2064 "epilogue loop\n");
2065 if (loop->inner)
2067 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2069 "Peeling for outer loop is not"
2070 " supported\n");
2071 return false;
2074 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2077 return true;
2080 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2083 "not consecutive access ");
2084 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2085 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2088 if (bb_vinfo)
2090 /* Mark the statement as unvectorizable. */
2091 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2092 return true;
2095 return false;
2098 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2100 /* First stmt in the interleaving chain. Check the chain. */
2101 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2102 struct data_reference *data_ref = dr;
2103 unsigned int count = 1;
2104 tree prev_init = DR_INIT (data_ref);
2105 gimple prev = stmt;
2106 HOST_WIDE_INT diff, gaps = 0;
2107 unsigned HOST_WIDE_INT count_in_bytes;
2109 while (next)
2111 /* Skip same data-refs. In case two or more stmts share the same
2112 data-ref (supported only for loads), we vectorize only the first
2113 stmt, and the rest get their vectorized loads from the first
2114 one. */
2115 if (!tree_int_cst_compare (DR_INIT (data_ref),
2116 DR_INIT (STMT_VINFO_DATA_REF (
2117 vinfo_for_stmt (next)))))
2119 if (DR_IS_WRITE (data_ref))
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 "Two store stmts share the same dr.\n");
2124 return false;
2127 /* For loads, reuse the same data-ref load. */
2128 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2130 prev = next;
2131 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2132 continue;
2135 prev = next;
2136 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2138 /* All group members have the same STEP by construction. */
2139 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2141 /* Check that the distance between two accesses is equal to the type
2142 size. Otherwise, we have gaps. */
2143 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2144 - TREE_INT_CST_LOW (prev_init)) / type_size;
2145 if (diff != 1)
2147 /* FORNOW: SLP of accesses with gaps is not supported. */
2148 slp_impossible = true;
2149 if (DR_IS_WRITE (data_ref))
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153 "interleaved store with gaps\n");
2154 return false;
2157 gaps += diff - 1;
2160 last_accessed_element += diff;
2162 /* Store the gap from the previous member of the group. If there is no
2163 gap in the access, GROUP_GAP is always 1. */
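/* For example (illustrative values): for loads of a[3*i] and a[3*i+2]
   the init difference is 2 elements, so GROUP_GAP of the second member
   is 2, because one element (a[3*i+1]) is skipped between the accesses.  */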
2164 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2166 prev_init = DR_INIT (data_ref);
2167 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2168 /* Count the number of data-refs in the chain. */
2169 count++;
2172 /* COUNT is the number of accesses found; we multiply it by the size of
2173 the type to get COUNT_IN_BYTES. */
2174 count_in_bytes = type_size * count;
2176 /* Check that the size of the interleaving (including gaps) is not
2177 greater than STEP. */
2178 if (dr_step != 0
2179 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "interleaving size is greater than step for ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2186 DR_REF (dr));
2187 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2189 return false;
2192 /* Check that the size of the interleaving is equal to STEP for stores,
2193 i.e., that there are no gaps. */
2194 if (dr_step != 0
2195 && absu_hwi (dr_step) != count_in_bytes)
2197 if (DR_IS_READ (dr))
2199 slp_impossible = true;
2200 /* There is a gap after the last load in the group. This gap is the
2201 difference between the groupsize and the number of elements.
2202 When there is no gap, this difference should be 0. */
2203 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2205 else
2207 if (dump_enabled_p ())
2208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2209 "interleaved store with gaps\n");
2210 return false;
2214 /* Check that STEP is a multiple of type size. */
2215 if (dr_step != 0
2216 && (dr_step % type_size) != 0)
2218 if (dump_enabled_p ())
2220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2221 "step is not a multiple of type size: step ");
2222 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2223 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2224 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2225 TYPE_SIZE_UNIT (scalar_type));
2226 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2228 return false;
2231 if (groupsize == 0)
2232 groupsize = count;
2234 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "Detected interleaving of size %d\n", (int)groupsize);
2239 /* SLP: create an SLP data structure for every interleaving group of
2240 stores for further analysis in vect_analyze_slp. */
2241 if (DR_IS_WRITE (dr) && !slp_impossible)
2243 if (loop_vinfo)
2244 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2245 if (bb_vinfo)
2246 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2249 /* There is a gap at the end of the group. */
2250 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 "Data access with gaps requires scalar "
2255 "epilogue loop\n");
2256 if (loop->inner)
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2260 "Peeling for outer loop is not supported\n");
2261 return false;
2264 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2268 return true;
2272 /* Analyze the access pattern of the data-reference DR.
2273 In case of non-consecutive accesses call vect_analyze_group_access() to
2274 analyze groups of accesses. */
2276 static bool
2277 vect_analyze_data_ref_access (struct data_reference *dr)
2279 tree step = DR_STEP (dr);
2280 tree scalar_type = TREE_TYPE (DR_REF (dr));
2281 gimple stmt = DR_STMT (dr);
2282 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2283 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2284 struct loop *loop = NULL;
2286 if (loop_vinfo)
2287 loop = LOOP_VINFO_LOOP (loop_vinfo);
2289 if (loop_vinfo && !step)
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2293 "bad data-ref access in loop\n");
2294 return false;
2297 /* Allow invariant loads in loops that are not nested. */
2298 if (loop_vinfo && integer_zerop (step))
2300 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2301 if (nested_in_vect_loop_p (loop, stmt))
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "zero step in inner loop of nest\n");
2306 return false;
2308 return DR_IS_READ (dr);
2311 if (loop && nested_in_vect_loop_p (loop, stmt))
2313 /* Interleaved accesses are not yet supported within outer-loop
2314 vectorization for references in the inner-loop. */
2315 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2317 /* For the rest of the analysis we use the outer-loop step. */
2318 step = STMT_VINFO_DR_STEP (stmt_info);
2319 if (integer_zerop (step))
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "zero step in outer loop.\n");
2324 if (DR_IS_READ (dr))
2325 return true;
2326 else
2327 return false;
2331 /* Consecutive? */
2332 if (TREE_CODE (step) == INTEGER_CST)
2334 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2335 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2336 || (dr_step < 0
2337 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2339 /* Mark that it is not interleaving. */
2340 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2341 return true;
2345 if (loop && nested_in_vect_loop_p (loop, stmt))
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_NOTE, vect_location,
2349 "grouped access in outer loop.\n");
2350 return false;
2353 /* Assume this is a DR handled by the non-constant strided load case. */
2354 if (TREE_CODE (step) != INTEGER_CST)
2355 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2357 /* Not a consecutive access - check if it is part of an interleaving group. */
2358 return vect_analyze_group_access (dr);
2363 /* A helper function used in the comparator function to sort data
2364 references. T1 and T2 are two data references to be compared.
2365 The function returns -1, 0, or 1. */
2367 static int
2368 compare_tree (tree t1, tree t2)
2370 int i, cmp;
2371 enum tree_code code;
2372 char tclass;
2374 if (t1 == t2)
2375 return 0;
2376 if (t1 == NULL)
2377 return -1;
2378 if (t2 == NULL)
2379 return 1;
2382 if (TREE_CODE (t1) != TREE_CODE (t2))
2383 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2385 code = TREE_CODE (t1);
2386 switch (code)
2388 /* For const values, we can just use hash values for comparisons. */
2389 case INTEGER_CST:
2390 case REAL_CST:
2391 case FIXED_CST:
2392 case STRING_CST:
2393 case COMPLEX_CST:
2394 case VECTOR_CST:
2396 hashval_t h1 = iterative_hash_expr (t1, 0);
2397 hashval_t h2 = iterative_hash_expr (t2, 0);
2398 if (h1 != h2)
2399 return h1 < h2 ? -1 : 1;
2400 break;
2403 case SSA_NAME:
2404 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2405 if (cmp != 0)
2406 return cmp;
2408 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2409 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2410 break;
2412 default:
2413 tclass = TREE_CODE_CLASS (code);
2415 /* For declarations, we can compare their UIDs. */
2416 if (tclass == tcc_declaration)
2418 if (DECL_UID (t1) != DECL_UID (t2))
2419 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2420 break;
2423 /* For expressions with operands, compare their operands recursively. */
2424 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2426 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2427 if (cmp != 0)
2428 return cmp;
2432 return 0;
2436 /* Compare two data-references DRA and DRB to group them into chunks
2437 suitable for building interleaving chains. */
2439 static int
2440 dr_group_sort_cmp (const void *dra_, const void *drb_)
2442 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2443 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2444 int cmp;
2446 /* Stabilize sort. */
2447 if (dra == drb)
2448 return 0;
2450 /* Ordering of DRs according to base. */
2451 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2453 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2454 if (cmp != 0)
2455 return cmp;
2458 /* And according to DR_OFFSET. */
2459 if (!dr_equal_offsets_p (dra, drb))
2461 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2462 if (cmp != 0)
2463 return cmp;
2466 /* Put reads before writes. */
2467 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2468 return DR_IS_READ (dra) ? -1 : 1;
2470 /* Then sort by access size. */
2471 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2472 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2474 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2475 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2476 if (cmp != 0)
2477 return cmp;
2480 /* And then by step. */
2481 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2483 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2484 if (cmp != 0)
2485 return cmp;
2488 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2489 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2490 if (cmp == 0)
2491 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2492 return cmp;
2495 /* Function vect_analyze_data_ref_accesses.
2497 Analyze the access pattern of all the data references in the loop.
2499 FORNOW: the only access pattern that is considered vectorizable is a
2500 simple step 1 (consecutive) access.
2502 FORNOW: handle only arrays and pointer accesses. */
2504 bool
2505 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2507 unsigned int i;
2508 vec<data_reference_p> datarefs;
2509 struct data_reference *dr;
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "=== vect_analyze_data_ref_accesses ===\n");
2515 if (loop_vinfo)
2516 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2517 else
2518 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2520 if (datarefs.is_empty ())
2521 return true;
2523 /* Sort the array of datarefs to make building the interleaving chains
2524 linear. */
2525 qsort (datarefs.address (), datarefs.length (),
2526 sizeof (data_reference_p), dr_group_sort_cmp);
2528 /* Build the interleaving chains. */
2529 for (i = 0; i < datarefs.length () - 1;)
2531 data_reference_p dra = datarefs[i];
2532 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2533 stmt_vec_info lastinfo = NULL;
2534 for (i = i + 1; i < datarefs.length (); ++i)
2536 data_reference_p drb = datarefs[i];
2537 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2539 /* ??? Imperfect sorting (non-compatible types, non-modulo
2540 accesses, same accesses) can lead to a group being artificially
2541 split here as we don't just skip over those. If it really
2542 matters we can push those to a worklist and re-iterate
2543 over them. Then we can just skip ahead to the next DR here. */
2545 /* Check that the data-refs have the same first location (except init)
2546 and that they are both either stores or loads (not a load and a store). */
2547 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2548 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2549 DR_BASE_ADDRESS (drb), 0)
2550 || !dr_equal_offsets_p (dra, drb))
2551 break;
2553 /* Check that the data-refs have the same constant size and step. */
2554 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2555 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2556 if (!tree_fits_uhwi_p (sza)
2557 || !tree_fits_uhwi_p (szb)
2558 || !tree_int_cst_equal (sza, szb)
2559 || !tree_fits_shwi_p (DR_STEP (dra))
2560 || !tree_fits_shwi_p (DR_STEP (drb))
2561 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2562 break;
2564 /* Do not place the same access in the interleaving chain twice. */
2565 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2566 break;
2568 /* Check the types are compatible.
2569 ??? We don't distinguish this during sorting. */
2570 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2571 TREE_TYPE (DR_REF (drb))))
2572 break;
2574 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2575 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2576 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2577 gcc_assert (init_a < init_b);
2579 /* If init_b == init_a + the size of the type * k, we have an
2580 interleaving, and DRA is accessed before DRB. */
2581 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2582 if ((init_b - init_a) % type_size_a != 0)
2583 break;
2585 /* Require that the step (if not zero) be greater than the difference
2586 between the data-refs' inits. This splits groups into suitable sizes. */
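/* For example (illustrative values): with a step of 16 bytes and 4-byte
   elements, data-refs with inits 0, 4 and 8 stay in one group (each init
   difference is smaller than the step), whereas a data-ref with init 16
   starts a new group because 16 - 0 >= 16.  */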
2587 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2588 if (step != 0 && step <= (init_b - init_a))
2589 break;
2591 if (dump_enabled_p ())
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "Detected interleaving ");
2595 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2596 dump_printf (MSG_NOTE, " and ");
2597 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2598 dump_printf (MSG_NOTE, "\n");
2601 /* Link the found element into the group list. */
2602 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2604 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2605 lastinfo = stmtinfo_a;
2607 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2608 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2609 lastinfo = stmtinfo_b;
2613 FOR_EACH_VEC_ELT (datarefs, i, dr)
2614 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2615 && !vect_analyze_data_ref_access (dr))
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2619 "not vectorized: complicated access pattern.\n");
2621 if (bb_vinfo)
2623 /* Mark the statement as not vectorizable. */
2624 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2625 continue;
2627 else
2628 return false;
2631 return true;
2635 /* Operator == between two dr_with_seg_len objects.
2637 This equality operator is used to make sure two data refs
2638 are the same so that we will consider combining the
2639 aliasing checks of those two pairs of data-dependent data
2640 refs. */
2642 static bool
2643 operator == (const dr_with_seg_len& d1,
2644 const dr_with_seg_len& d2)
2646 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2647 DR_BASE_ADDRESS (d2.dr), 0)
2648 && compare_tree (d1.offset, d2.offset) == 0
2649 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2652 /* Function comp_dr_with_seg_len_pair.
2654 Comparison function for sorting objects of dr_with_seg_len_pair_t
2655 so that we can combine aliasing checks in one scan. */
2657 static int
2658 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2660 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2661 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2663 const dr_with_seg_len &p11 = p1->first,
2664 &p12 = p1->second,
2665 &p21 = p2->first,
2666 &p22 = p2->second;
2668 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2669 if a and c have the same base address and step, and b and d have the same
2670 base address and step. Therefore, if either a&c or b&d don't have the same
2671 address and step, we don't care about the order of those two pairs after sorting. */
2672 int comp_res;
2674 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2675 DR_BASE_ADDRESS (p21.dr))) != 0)
2676 return comp_res;
2677 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2678 DR_BASE_ADDRESS (p22.dr))) != 0)
2679 return comp_res;
2680 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2681 return comp_res;
2682 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2683 return comp_res;
2684 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2685 return comp_res;
2686 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2687 return comp_res;
2689 return 0;
2692 template <class T> static void
2693 swap (T& a, T& b)
2695 T c (a);
2696 a = b;
2697 b = c;
2700 /* Function vect_vfa_segment_size.
2702 Create an expression that computes the size of the segment
2703 that will be accessed for a data reference. The function takes into
2704 account that realignment loads may access one more vector.
2706 Input:
2707 DR: The data reference.
2708 LENGTH_FACTOR: segment length to consider.
2710 Return an expression whose value is the size of the segment which will be
2711 accessed by DR. */
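/* For example (illustrative values): for a data reference with a DR_STEP
   of 8 bytes and a LENGTH_FACTOR equal to a vectorization factor of 4, the
   segment is 8 * 4 = 32 bytes; if the access uses the optimized explicit
   realignment scheme, one more vector (e.g. 16 bytes) is added on top.  */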
2713 static tree
2714 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2716 tree segment_length;
2718 if (integer_zerop (DR_STEP (dr)))
2719 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2720 else
2721 segment_length = size_binop (MULT_EXPR,
2722 fold_convert (sizetype, DR_STEP (dr)),
2723 fold_convert (sizetype, length_factor));
2725 if (vect_supportable_dr_alignment (dr, false)
2726 == dr_explicit_realign_optimized)
2728 tree vector_size = TYPE_SIZE_UNIT
2729 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2731 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2733 return segment_length;
2736 /* Function vect_prune_runtime_alias_test_list.
2738 Prune a list of ddrs to be tested at run-time by versioning for alias.
2739 Merge several alias checks into one if possible.
2740 Return FALSE if the resulting list of ddrs is longer than allowed by
2741 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2743 bool
2744 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2746 vec<ddr_p> may_alias_ddrs =
2747 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2748 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2749 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2750 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2751 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2753 ddr_p ddr;
2754 unsigned int i;
2755 tree length_factor;
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "=== vect_prune_runtime_alias_test_list ===\n");
2761 if (may_alias_ddrs.is_empty ())
2762 return true;
2764 /* Basically, for each pair of dependent data refs store_ptr_0
2765 and load_ptr_0, we create an expression:
2767 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2768 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2770 for aliasing checks. However, in some cases we can decrease
2771 the number of checks by combining two checks into one. For
2772 example, suppose we have another pair of data refs store_ptr_0
2773 and load_ptr_1, and if the following condition is satisfied:
2775 load_ptr_0 < load_ptr_1 &&
2776 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2778 (this condition means that, in each iteration of the vectorized loop,
2779 the memory accessed through store_ptr_0 cannot lie between the memory
2780 of load_ptr_0 and load_ptr_1.)
2782 we can then use only the following expression to finish the
2783 aliasing checks between store_ptr_0 & load_ptr_0 and
2784 store_ptr_0 & load_ptr_1:
2786 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2787 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2789 Note that we only consider the case in which load_ptr_0 and load_ptr_1
2790 have the same base address. */
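/* A small worked example (illustrative numbers only): let load_ptr_1 be
   load_ptr_0 + 16 with all three segment lengths equal to 16 bytes.  Then
   load_ptr_1 - load_ptr_0 - load_segment_length_0 = 0 < 16, so the two
   checks against store_ptr_0 collapse into a single check that covers the
   32-byte range starting at load_ptr_0.  */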
2792 comp_alias_ddrs.create (may_alias_ddrs.length ());
2794 /* First, we collect all data ref pairs for aliasing checks. */
2795 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2797 struct data_reference *dr_a, *dr_b;
2798 gimple dr_group_first_a, dr_group_first_b;
2799 tree segment_length_a, segment_length_b;
2800 gimple stmt_a, stmt_b;
2802 dr_a = DDR_A (ddr);
2803 stmt_a = DR_STMT (DDR_A (ddr));
2804 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2805 if (dr_group_first_a)
2807 stmt_a = dr_group_first_a;
2808 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2811 dr_b = DDR_B (ddr);
2812 stmt_b = DR_STMT (DDR_B (ddr));
2813 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2814 if (dr_group_first_b)
2816 stmt_b = dr_group_first_b;
2817 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2820 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2821 length_factor = scalar_loop_iters;
2822 else
2823 length_factor = size_int (vect_factor);
2824 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2825 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2827 dr_with_seg_len_pair_t dr_with_seg_len_pair
2828 (dr_with_seg_len (dr_a, segment_length_a),
2829 dr_with_seg_len (dr_b, segment_length_b));
2831 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2832 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2834 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2837 /* Second, we sort the collected data ref pairs so that we can scan
2838 them once to combine all possible aliasing checks. */
2839 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2841 /* Third, we scan the sorted dr pairs and check if we can combine
2842 alias checks of two neighbouring dr pairs. */
2843 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2845 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2846 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2847 *dr_b1 = &comp_alias_ddrs[i-1].second,
2848 *dr_a2 = &comp_alias_ddrs[i].first,
2849 *dr_b2 = &comp_alias_ddrs[i].second;
2851 /* Remove duplicate data ref pairs. */
2852 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2854 if (dump_enabled_p ())
2856 dump_printf_loc (MSG_NOTE, vect_location,
2857 "found equal ranges ");
2858 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2859 DR_REF (dr_a1->dr));
2860 dump_printf (MSG_NOTE, ", ");
2861 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2862 DR_REF (dr_b1->dr));
2863 dump_printf (MSG_NOTE, " and ");
2864 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2865 DR_REF (dr_a2->dr));
2866 dump_printf (MSG_NOTE, ", ");
2867 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2868 DR_REF (dr_b2->dr));
2869 dump_printf (MSG_NOTE, "\n");
2872 comp_alias_ddrs.ordered_remove (i--);
2873 continue;
2876 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2878 /* We consider the case in which DR_B1 and DR_B2 are the same memref,
2879 and DR_A1 and DR_A2 are two consecutive memrefs. */
2880 if (*dr_a1 == *dr_a2)
2882 swap (dr_a1, dr_b1);
2883 swap (dr_a2, dr_b2);
2886 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2887 DR_BASE_ADDRESS (dr_a2->dr),
2889 || !tree_fits_shwi_p (dr_a1->offset)
2890 || !tree_fits_shwi_p (dr_a2->offset))
2891 continue;
2893 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2894 - tree_to_shwi (dr_a1->offset));
2897 /* Now we check if the following condition is satisfied:
2899 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2901 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2902 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
2903 have to make a best estimate. We can get the minimum value
2904 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B;
2905 then either of the following two conditions can guarantee the
2906 one above:
2908 1: DIFF <= MIN_SEG_LEN_B
2909 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2913 HOST_WIDE_INT
2914 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2915 TREE_INT_CST_LOW (dr_b1->seg_len) :
2916 vect_factor;
2918 if (diff <= min_seg_len_b
2919 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2920 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2921 min_seg_len_b))
2923 dr_a1->seg_len = size_binop (PLUS_EXPR,
2924 dr_a2->seg_len, size_int (diff));
2925 comp_alias_ddrs.ordered_remove (i--);
2930 if ((int) comp_alias_ddrs.length () >
2931 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2933 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "disable versioning for alias - max number of "
2937 "generated checks exceeded.\n");
2940 return false;
2943 return true;
2946 /* Check whether a non-affine read in stmt is suitable for gather load
2947 and if so, return a builtin decl for that operation. */
2949 tree
2950 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2951 tree *offp, int *scalep)
2953 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2954 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2955 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2956 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2957 tree offtype = NULL_TREE;
2958 tree decl, base, off;
2959 enum machine_mode pmode;
2960 int punsignedp, pvolatilep;
2962 /* The gather builtins need an address of the form
2963 loop_invariant + vector * {1, 2, 4, 8}
2964 or
2965 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2966 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2967 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2968 multiplications and additions in it. To get a vector, we need
2969 a single SSA_NAME that will be defined in the loop and will
2970 contain everything that is not loop invariant and that can be
2971 vectorized. The following code attempts to find such a preexisting
2972 SSA_NAME OFF and put the loop invariants into a tree BASE
2973 that can be gimplified before the loop. */
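/* For instance (hypothetical source): for a read such as p[idx[i]] with p
   loop invariant, the code below would end up with BASE being p plus any
   constant offset, OFF being the SSA_NAME that holds the (possibly
   sign-extended) idx[i] value, and SCALE being the element size of *p,
   provided it is 1, 2, 4 or 8.  */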
2974 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2975 &pmode, &punsignedp, &pvolatilep, false);
2976 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2978 if (TREE_CODE (base) == MEM_REF)
2980 if (!integer_zerop (TREE_OPERAND (base, 1)))
2982 if (off == NULL_TREE)
2984 double_int moff = mem_ref_offset (base);
2985 off = double_int_to_tree (sizetype, moff);
2987 else
2988 off = size_binop (PLUS_EXPR, off,
2989 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2991 base = TREE_OPERAND (base, 0);
2993 else
2994 base = build_fold_addr_expr (base);
2996 if (off == NULL_TREE)
2997 off = size_zero_node;
2999 /* If base is not loop invariant, then either off is 0, in which case we
3000 start with just the constant offset in the loop invariant BASE and
3001 continue with base as OFF, or we give up.
3002 We could handle that case by gimplifying the addition of base + off
3003 into some SSA_NAME and using that as off, but for now punt. */
3004 if (!expr_invariant_in_loop_p (loop, base))
3006 if (!integer_zerop (off))
3007 return NULL_TREE;
3008 off = base;
3009 base = size_int (pbitpos / BITS_PER_UNIT);
3011 /* Otherwise put base + constant offset into the loop invariant BASE
3012 and continue with OFF. */
3013 else
3015 base = fold_convert (sizetype, base);
3016 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3019 /* OFF at this point may be either an SSA_NAME or some tree expression
3020 from get_inner_reference. Try to peel off loop invariants from it
3021 into BASE as long as possible. */
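/* For example (hypothetical values): if OFF is (sizetype) (start + i) * 4
   with START loop invariant, the MULT_EXPR case below first records
   SCALE = 4, the PLUS_EXPR case then folds START * 4 into BASE, and OFF is
   finally narrowed down to the SSA_NAME of I, which varies in the loop.  */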
3022 STRIP_NOPS (off);
3023 while (offtype == NULL_TREE)
3025 enum tree_code code;
3026 tree op0, op1, add = NULL_TREE;
3028 if (TREE_CODE (off) == SSA_NAME)
3030 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3032 if (expr_invariant_in_loop_p (loop, off))
3033 return NULL_TREE;
3035 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3036 break;
3038 op0 = gimple_assign_rhs1 (def_stmt);
3039 code = gimple_assign_rhs_code (def_stmt);
3040 op1 = gimple_assign_rhs2 (def_stmt);
3042 else
3044 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3045 return NULL_TREE;
3046 code = TREE_CODE (off);
3047 extract_ops_from_tree (off, &code, &op0, &op1);
3049 switch (code)
3051 case POINTER_PLUS_EXPR:
3052 case PLUS_EXPR:
3053 if (expr_invariant_in_loop_p (loop, op0))
3055 add = op0;
3056 off = op1;
3057 do_add:
3058 add = fold_convert (sizetype, add);
3059 if (scale != 1)
3060 add = size_binop (MULT_EXPR, add, size_int (scale));
3061 base = size_binop (PLUS_EXPR, base, add);
3062 continue;
3064 if (expr_invariant_in_loop_p (loop, op1))
3066 add = op1;
3067 off = op0;
3068 goto do_add;
3070 break;
3071 case MINUS_EXPR:
3072 if (expr_invariant_in_loop_p (loop, op1))
3074 add = fold_convert (sizetype, op1);
3075 add = size_binop (MINUS_EXPR, size_zero_node, add);
3076 off = op0;
3077 goto do_add;
3079 break;
3080 case MULT_EXPR:
3081 if (scale == 1 && tree_fits_shwi_p (op1))
3083 scale = tree_to_shwi (op1);
3084 off = op0;
3085 continue;
3087 break;
3088 case SSA_NAME:
3089 off = op0;
3090 continue;
3091 CASE_CONVERT:
3092 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3093 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3094 break;
3095 if (TYPE_PRECISION (TREE_TYPE (op0))
3096 == TYPE_PRECISION (TREE_TYPE (off)))
3098 off = op0;
3099 continue;
3101 if (TYPE_PRECISION (TREE_TYPE (op0))
3102 < TYPE_PRECISION (TREE_TYPE (off)))
3104 off = op0;
3105 offtype = TREE_TYPE (off);
3106 STRIP_NOPS (off);
3107 continue;
3109 break;
3110 default:
3111 break;
3113 break;
3116 /* If at the end OFF still isn't an SSA_NAME or isn't
3117 defined in the loop, punt. */
3118 if (TREE_CODE (off) != SSA_NAME
3119 || expr_invariant_in_loop_p (loop, off))
3120 return NULL_TREE;
3122 if (offtype == NULL_TREE)
3123 offtype = TREE_TYPE (off);
3125 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3126 offtype, scale);
3127 if (decl == NULL_TREE)
3128 return NULL_TREE;
3130 if (basep)
3131 *basep = base;
3132 if (offp)
3133 *offp = off;
3134 if (scalep)
3135 *scalep = scale;
3136 return decl;
3139 /* Function vect_analyze_data_refs.
3141 Find all the data references in the loop or basic block.
3143 The general structure of the analysis of data refs in the vectorizer is as
3144 follows:
3145 1- vect_analyze_data_refs(loop/bb): call
3146 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3147 in the loop/bb and their dependences.
3148 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3149 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3150 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3154 bool
3155 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3156 bb_vec_info bb_vinfo,
3157 int *min_vf)
3159 struct loop *loop = NULL;
3160 basic_block bb = NULL;
3161 unsigned int i;
3162 vec<data_reference_p> datarefs;
3163 struct data_reference *dr;
3164 tree scalar_type;
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "=== vect_analyze_data_refs ===\n");
3170 if (loop_vinfo)
3172 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3174 loop = LOOP_VINFO_LOOP (loop_vinfo);
3175 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3176 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3180 "not vectorized: loop contains function calls"
3181 " or data references that cannot be analyzed\n");
3182 return false;
3185 for (i = 0; i < loop->num_nodes; i++)
3187 gimple_stmt_iterator gsi;
3189 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3191 gimple stmt = gsi_stmt (gsi);
3192 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3194 if (is_gimple_call (stmt) && loop->safelen)
3196 tree fndecl = gimple_call_fndecl (stmt), op;
3197 if (fndecl != NULL_TREE)
3199 struct cgraph_node *node = cgraph_get_node (fndecl);
3200 if (node != NULL && node->simd_clones != NULL)
3202 unsigned int j, n = gimple_call_num_args (stmt);
3203 for (j = 0; j < n; j++)
3205 op = gimple_call_arg (stmt, j);
3206 if (DECL_P (op)
3207 || (REFERENCE_CLASS_P (op)
3208 && get_base_address (op)))
3209 break;
3211 op = gimple_call_lhs (stmt);
3212 /* Ignore #pragma omp declare simd functions
3213 if they don't have data references in the
3214 call stmt itself. */
3215 if (j == n
3216 && !(op
3217 && (DECL_P (op)
3218 || (REFERENCE_CLASS_P (op)
3219 && get_base_address (op)))))
3220 continue;
3224 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3225 if (dump_enabled_p ())
3226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3227 "not vectorized: loop contains function "
3228 "calls or data references that cannot "
3229 "be analyzed\n");
3230 return false;
3235 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3237 else
3239 gimple_stmt_iterator gsi;
3241 bb = BB_VINFO_BB (bb_vinfo);
3242 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3244 gimple stmt = gsi_stmt (gsi);
3245 if (!find_data_references_in_stmt (NULL, stmt,
3246 &BB_VINFO_DATAREFS (bb_vinfo)))
3248 /* Mark the rest of the basic-block as unvectorizable. */
3249 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3251 stmt = gsi_stmt (gsi);
3252 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3254 break;
3258 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3261 /* Go through the data-refs, check that the analysis succeeded. Update
3262 pointer from stmt_vec_info struct to DR and vectype. */
3264 FOR_EACH_VEC_ELT (datarefs, i, dr)
3266 gimple stmt;
3267 stmt_vec_info stmt_info;
3268 tree base, offset, init;
3269 bool gather = false;
3270 bool simd_lane_access = false;
3271 int vf;
3273 again:
3274 if (!dr || !DR_REF (dr))
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "not vectorized: unhandled data-ref\n");
3279 return false;
3282 stmt = DR_STMT (dr);
3283 stmt_info = vinfo_for_stmt (stmt);
3285 /* Discard clobbers from the dataref vector. We will remove
3286 clobber stmts during vectorization. */
3287 if (gimple_clobber_p (stmt))
3289 if (i == datarefs.length () - 1)
3291 datarefs.pop ();
3292 break;
3294 datarefs[i] = datarefs.pop ();
3295 goto again;
3298 /* Check that analysis of the data-ref succeeded. */
3299 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3300 || !DR_STEP (dr))
3302 bool maybe_gather
3303 = DR_IS_READ (dr)
3304 && !TREE_THIS_VOLATILE (DR_REF (dr))
3305 && targetm.vectorize.builtin_gather != NULL;
3306 bool maybe_simd_lane_access
3307 = loop_vinfo && loop->simduid;
3309 /* If the target supports vector gather loads, or if this might be
3310 a SIMD lane access, see if they can be used. */
3311 if (loop_vinfo
3312 && (maybe_gather || maybe_simd_lane_access)
3313 && !nested_in_vect_loop_p (loop, stmt))
3315 struct data_reference *newdr
3316 = create_data_ref (NULL, loop_containing_stmt (stmt),
3317 DR_REF (dr), stmt, true);
3318 gcc_assert (newdr != NULL && DR_REF (newdr));
3319 if (DR_BASE_ADDRESS (newdr)
3320 && DR_OFFSET (newdr)
3321 && DR_INIT (newdr)
3322 && DR_STEP (newdr)
3323 && integer_zerop (DR_STEP (newdr)))
3325 if (maybe_simd_lane_access)
3327 tree off = DR_OFFSET (newdr);
3328 STRIP_NOPS (off);
3329 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3330 && TREE_CODE (off) == MULT_EXPR
3331 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3333 tree step = TREE_OPERAND (off, 1);
3334 off = TREE_OPERAND (off, 0);
3335 STRIP_NOPS (off);
3336 if (CONVERT_EXPR_P (off)
3337 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3338 0)))
3339 < TYPE_PRECISION (TREE_TYPE (off)))
3340 off = TREE_OPERAND (off, 0);
3341 if (TREE_CODE (off) == SSA_NAME)
3343 gimple def = SSA_NAME_DEF_STMT (off);
3344 tree reft = TREE_TYPE (DR_REF (newdr));
3345 if (gimple_call_internal_p (def)
3346 && gimple_call_internal_fn (def)
3347 == IFN_GOMP_SIMD_LANE)
3349 tree arg = gimple_call_arg (def, 0);
3350 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3351 arg = SSA_NAME_VAR (arg);
3352 if (arg == loop->simduid
3353 /* For now. */
3354 && tree_int_cst_equal
3355 (TYPE_SIZE_UNIT (reft),
3356 step))
3358 DR_OFFSET (newdr) = ssize_int (0);
3359 DR_STEP (newdr) = step;
3360 DR_ALIGNED_TO (newdr)
3361 = size_int (BIGGEST_ALIGNMENT);
3362 dr = newdr;
3363 simd_lane_access = true;
3369 if (!simd_lane_access && maybe_gather)
3371 dr = newdr;
3372 gather = true;
3375 if (!gather && !simd_lane_access)
3376 free_data_ref (newdr);
3379 if (!gather && !simd_lane_access)
3381 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "not vectorized: data ref analysis "
3385 "failed ");
3386 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3387 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3390 if (bb_vinfo)
3391 break;
3393 return false;
3397 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3401 "not vectorized: base addr of dr is a "
3402 "constant\n");
3404 if (bb_vinfo)
3405 break;
3407 if (gather || simd_lane_access)
3408 free_data_ref (dr);
3409 return false;
3412 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3417 "not vectorized: volatile type ");
3418 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3419 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3422 if (bb_vinfo)
3423 break;
3425 return false;
3428 if (stmt_can_throw_internal (stmt))
3430 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3433 "not vectorized: statement can throw an "
3434 "exception ");
3435 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3436 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3439 if (bb_vinfo)
3440 break;
3442 if (gather || simd_lane_access)
3443 free_data_ref (dr);
3444 return false;
3447 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3448 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3450 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3453 "not vectorized: statement is bitfield "
3454 "access ");
3455 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3456 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3459 if (bb_vinfo)
3460 break;
3462 if (gather || simd_lane_access)
3463 free_data_ref (dr);
3464 return false;
3467 base = unshare_expr (DR_BASE_ADDRESS (dr));
3468 offset = unshare_expr (DR_OFFSET (dr));
3469 init = unshare_expr (DR_INIT (dr));
3471 if (is_gimple_call (stmt))
3473 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3476 "not vectorized: dr in a call ");
3477 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3481 if (bb_vinfo)
3482 break;
3484 if (gather || simd_lane_access)
3485 free_data_ref (dr);
3486 return false;
3489 /* Update DR field in stmt_vec_info struct. */
3491 /* If the dataref is in an inner-loop of the loop that is considered for
3492 vectorization, we also want to analyze the access relative to
3493 the outer-loop (DR contains information only relative to the
3494 inner-most enclosing loop). We do that by building a reference to the
3495 first location accessed by the inner-loop, and analyze it relative to
3496 the outer-loop. */
3497 if (loop && nested_in_vect_loop_p (loop, stmt))
3499 tree outer_step, outer_base, outer_init;
3500 HOST_WIDE_INT pbitsize, pbitpos;
3501 tree poffset;
3502 enum machine_mode pmode;
3503 int punsignedp, pvolatilep;
3504 affine_iv base_iv, offset_iv;
3505 tree dinit;
3507 /* Build a reference to the first location accessed by the
3508 inner-loop: *(BASE+INIT). (The first location is actually
3509 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3510 tree inner_base = build_fold_indirect_ref
3511 (fold_build_pointer_plus (base, init));
3513 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_NOTE, vect_location,
3516 "analyze in outer-loop: ");
3517 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3518 dump_printf (MSG_NOTE, "\n");
3521 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3522 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3523 gcc_assert (outer_base != NULL_TREE);
3525 if (pbitpos % BITS_PER_UNIT != 0)
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3529 "failed: bit offset alignment.\n");
3530 return false;
3533 outer_base = build_fold_addr_expr (outer_base);
3534 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3535 &base_iv, false))
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3539 "failed: evolution of base is not affine.\n");
3540 return false;
3543 if (offset)
3545 if (poffset)
3546 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3547 poffset);
3548 else
3549 poffset = offset;
3552 if (!poffset)
3554 offset_iv.base = ssize_int (0);
3555 offset_iv.step = ssize_int (0);
3557 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3558 &offset_iv, false))
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3562 "evolution of offset is not affine.\n");
3563 return false;
3566 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3567 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3568 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3569 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3570 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3572 outer_step = size_binop (PLUS_EXPR,
3573 fold_convert (ssizetype, base_iv.step),
3574 fold_convert (ssizetype, offset_iv.step));
3576 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3577 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3578 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3579 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3580 STMT_VINFO_DR_OFFSET (stmt_info) =
3581 fold_convert (ssizetype, offset_iv.base);
3582 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3583 size_int (highest_pow2_factor (offset_iv.base));
3585 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_NOTE, vect_location,
3588 "\touter base_address: ");
3589 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3590 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3591 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3592 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3593 STMT_VINFO_DR_OFFSET (stmt_info));
3594 dump_printf (MSG_NOTE,
3595 "\n\touter constant offset from base address: ");
3596 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3597 STMT_VINFO_DR_INIT (stmt_info));
3598 dump_printf (MSG_NOTE, "\n\touter step: ");
3599 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3600 STMT_VINFO_DR_STEP (stmt_info));
3601 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3602 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3603 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3604 dump_printf (MSG_NOTE, "\n");
3608 if (STMT_VINFO_DATA_REF (stmt_info))
3610 if (dump_enabled_p ())
3612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3613 "not vectorized: more than one data ref "
3614 "in stmt: ");
3615 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3616 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3619 if (bb_vinfo)
3620 break;
3622 if (gather || simd_lane_access)
3623 free_data_ref (dr);
3624 return false;
3627 STMT_VINFO_DATA_REF (stmt_info) = dr;
3628 if (simd_lane_access)
3630 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3631 datarefs[i] = dr;
3634 /* Set vectype for STMT. */
3635 scalar_type = TREE_TYPE (DR_REF (dr));
3636 STMT_VINFO_VECTYPE (stmt_info) =
3637 get_vectype_for_scalar_type (scalar_type);
3638 if (!STMT_VINFO_VECTYPE (stmt_info))
3640 if (dump_enabled_p ())
3642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3643 "not vectorized: no vectype for stmt: ");
3644 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3645 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3647 scalar_type);
3648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3651 if (bb_vinfo)
3652 break;
3654 if (gather || simd_lane_access)
3656 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3657 free_data_ref (dr);
3659 return false;
3661 else
3663 if (dump_enabled_p ())
3665 dump_printf_loc (MSG_NOTE, vect_location,
3666 "got vectype for stmt: ");
3667 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3668 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3669 STMT_VINFO_VECTYPE (stmt_info));
3670 dump_printf (MSG_NOTE, "\n");
3674 /* Adjust the minimal vectorization factor according to the
3675 vector type. */
3676 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3677 if (vf > *min_vf)
3678 *min_vf = vf;
3680 if (gather)
3682 tree off;
3684 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3685 if (gather
3686 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3687 gather = false;
3688 if (!gather)
3690 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3691 free_data_ref (dr);
3692 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3695 "not vectorized: not suitable for gather "
3696 "load ");
3697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3698 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3700 return false;
3703 datarefs[i] = dr;
3704 STMT_VINFO_GATHER_P (stmt_info) = true;
3706 else if (loop_vinfo
3707 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3709 if (nested_in_vect_loop_p (loop, stmt)
3710 || !DR_IS_READ (dr))
3712 if (dump_enabled_p ())
3714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3715 "not vectorized: not suitable for strided "
3716 "load ");
3717 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3718 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3720 return false;
3722 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3726 /* If, when trying to vectorize a basic block, we stopped the analysis at
3727 the first dataref we could not analyze, mark the rest of the datarefs as
3728 not vectorizable and truncate the vector of datarefs. That
3729 avoids wasting time analyzing their dependences. */
3730 if (i != datarefs.length ())
3732 gcc_assert (bb_vinfo != NULL);
3733 for (unsigned j = i; j < datarefs.length (); ++j)
3735 data_reference_p dr = datarefs[j];
3736 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3737 free_data_ref (dr);
3739 datarefs.truncate (i);
3742 return true;
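/* Illustrative sketch only (hypothetical function and variable names):
   two scalar loop shapes that the classification code above tells apart
   when the access has no constant DR_STEP.  */

static int
example_gather_and_strided_loads (const int *a, const int *idx,
                                  int stride, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    /* Address depends on loaded data: a gather candidate, recorded via
       STMT_VINFO_GATHER_P above (if vect_check_gather succeeds).  */
    s += a[idx[i]];
  for (int i = 0; i < n; i++)
    /* DR_STEP is not an INTEGER_CST: handled as a strided load and
       recorded via STMT_VINFO_STRIDE_LOAD_P above.  */
    s += a[i * stride];
  return s;
}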
3746 /* Function vect_get_new_vect_var.
3748 Returns a new temporary variable to be used by the vectorizer. The
3749 current naming scheme prepends the prefix "vect", "stmp" or "vectp"
3750 (depending on the value of VAR_KIND) to NAME if NAME is provided;
3751 otherwise the prefix alone is used as the name. */
3753 tree
3754 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3756 const char *prefix;
3757 tree new_vect_var;
3759 switch (var_kind)
3761 case vect_simple_var:
3762 prefix = "vect";
3763 break;
3764 case vect_scalar_var:
3765 prefix = "stmp";
3766 break;
3767 case vect_pointer_var:
3768 prefix = "vectp";
3769 break;
3770 default:
3771 gcc_unreachable ();
3774 if (name)
3776 char* tmp = concat (prefix, "_", name, NULL);
3777 new_vect_var = create_tmp_reg (type, tmp);
3778 free (tmp);
3780 else
3781 new_vect_var = create_tmp_reg (type, prefix);
3783 return new_vect_var;
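/* Illustrative sketch only (hypothetical helper, not called anywhere):
   how the prefixes above combine with NAME.  PTR_TYPE is assumed to be
   an already-built pointer type.  */

static tree
example_new_vect_var (tree ptr_type)
{
  /* With VAR_KIND == vect_pointer_var and NAME == "a" the temporary's
     name is based on "vectp_a" (create_tmp_reg adds a uniquifying
     suffix); with NAME == NULL it would be based on "vectp" alone.  */
  return vect_get_new_vect_var (ptr_type, vect_pointer_var, "a");
}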
3787 /* Function vect_create_addr_base_for_vector_ref.
3789 Create an expression that computes the address of the first memory location
3790 that will be accessed for a data reference.
3792 Input:
3793 STMT: The statement containing the data reference.
3794 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3795 OFFSET: Optional. If supplied, it is added to the initial address.
3796 LOOP: Specifies relative to which loop-nest the address should be computed.
3797 For example, when the dataref is in an inner-loop nested in an
3798 outer-loop that is now being vectorized, LOOP can be either the
3799 outer-loop, or the inner-loop. The first memory location accessed
3800 by the following dataref ('in' points to short):
3802 for (i=0; i<N; i++)
3803 for (j=0; j<M; j++)
3804 s += in[i+j]
3806 is as follows:
3807 if LOOP=i_loop: &in (relative to i_loop)
3808 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3810 Output:
3811 1. Return an SSA_NAME whose value is the address of the memory location of
3812 the first vector of the data reference.
3813 2. If NEW_STMT_LIST is not NULL_TREE on return, the caller must insert
3814 the statement(s) it contains, which define the returned SSA_NAME.
3816 FORNOW: We are only handling array accesses with step 1. */
3818 tree
3819 vect_create_addr_base_for_vector_ref (gimple stmt,
3820 gimple_seq *new_stmt_list,
3821 tree offset,
3822 struct loop *loop)
3824 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3825 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3826 tree data_ref_base;
3827 const char *base_name;
3828 tree addr_base;
3829 tree dest;
3830 gimple_seq seq = NULL;
3831 tree base_offset;
3832 tree init;
3833 tree vect_ptr_type;
3834 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3837 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3839 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3841 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3843 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3844 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3845 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3847 else
3849 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3850 base_offset = unshare_expr (DR_OFFSET (dr));
3851 init = unshare_expr (DR_INIT (dr));
3854 if (loop_vinfo)
3855 base_name = get_name (data_ref_base);
3856 else
3858 base_offset = ssize_int (0);
3859 init = ssize_int (0);
3860 base_name = get_name (DR_REF (dr));
3863 /* Create base_offset */
3864 base_offset = size_binop (PLUS_EXPR,
3865 fold_convert (sizetype, base_offset),
3866 fold_convert (sizetype, init));
3868 if (offset)
3870 offset = fold_build2 (MULT_EXPR, sizetype,
3871 fold_convert (sizetype, offset), step);
3872 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3873 base_offset, offset);
3876 /* base + base_offset */
3877 if (loop_vinfo)
3878 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3879 else
3881 addr_base = build1 (ADDR_EXPR,
3882 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3883 unshare_expr (DR_REF (dr)));
3886 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3887 addr_base = fold_convert (vect_ptr_type, addr_base);
3888 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3889 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3890 gimple_seq_add_seq (new_stmt_list, seq);
3892 if (DR_PTR_INFO (dr)
3893 && TREE_CODE (addr_base) == SSA_NAME)
3895 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3896 if (offset)
3897 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3900 if (dump_enabled_p ())
3902 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3903 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3904 dump_printf (MSG_NOTE, "\n");
3907 return addr_base;
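/* Illustrative sketch only (hypothetical numbers and names): the address
   arithmetic the function above emits for the loop case, written out in
   scalar C for a dataref on a 'short' array with DR_OFFSET == 0,
   DR_INIT == 6 bytes and OFFSET == 3 elements.  */

static short *
example_addr_base (short *data_ref_base)
{
  long step = sizeof (short);            /* TYPE_SIZE_UNIT (element type) */
  long base_offset = 0 + 6;              /* DR_OFFSET + DR_INIT           */
  base_offset += 3 * step;               /* OFFSET scaled by STEP         */
  /* addr_base = data_ref_base + base_offset (POINTER_PLUS_EXPR).  */
  return (short *) ((char *) data_ref_base + base_offset);
}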
3911 /* Function vect_create_data_ref_ptr.
3913 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3914 location accessed in the loop by STMT, along with the def-use update
3915 chain to appropriately advance the pointer through the loop iterations.
3916 Also set aliasing information for the pointer. This pointer is used by
3917 the callers to this function to create a memory reference expression for
3918 vector load/store access.
3920 Input:
3921 1. STMT: a stmt that references memory. Expected to be of the form
3922 GIMPLE_ASSIGN <name, data-ref> or
3923 GIMPLE_ASSIGN <data-ref, name>.
3924 2. AGGR_TYPE: the type of the reference, which should be either a vector
3925 or an array.
3926 3. AT_LOOP: the loop where the vector memref is to be created.
3927 4. OFFSET (optional): an offset to be added to the initial address accessed
3928 by the data-ref in STMT.
3929 5. BSI: location where the new stmts are to be placed if there is no loop.
3930 6. ONLY_INIT: indicates whether AP is to be updated in the loop, or remain
3931 pointing to the initial address.
3933 Output:
3934 1. Declare a new ptr to vector_type, and have it point to the base of the
3935 data reference (initial address accessed by the data reference).
3936 For example, for a vector of type V8HI, the following code is generated:
3938 v8hi *ap;
3939 ap = (v8hi *)initial_address;
3941 if OFFSET is not supplied:
3942 initial_address = &a[init];
3943 if OFFSET is supplied:
3944 initial_address = &a[init + OFFSET];
3946 Return the initial_address in INITIAL_ADDRESS.
3948 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3949 update the pointer in each iteration of the loop.
3951 Return the increment stmt that updates the pointer in PTR_INCR.
3953 3. Set INV_P to true if the access pattern of the data reference in the
3954 vectorized loop is invariant. Set it to false otherwise.
3956 4. Return the pointer. */
3958 tree
3959 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3960 tree offset, tree *initial_address,
3961 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3962 bool only_init, bool *inv_p)
3964 const char *base_name;
3965 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3967 struct loop *loop = NULL;
3968 bool nested_in_vect_loop = false;
3969 struct loop *containing_loop = NULL;
3970 tree aggr_ptr_type;
3971 tree aggr_ptr;
3972 tree new_temp;
3973 gimple vec_stmt;
3974 gimple_seq new_stmt_list = NULL;
3975 edge pe = NULL;
3976 basic_block new_bb;
3977 tree aggr_ptr_init;
3978 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3979 tree aptr;
3980 gimple_stmt_iterator incr_gsi;
3981 bool insert_after;
3982 tree indx_before_incr, indx_after_incr;
3983 gimple incr;
3984 tree step;
3985 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3987 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3988 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3990 if (loop_vinfo)
3992 loop = LOOP_VINFO_LOOP (loop_vinfo);
3993 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3994 containing_loop = (gimple_bb (stmt))->loop_father;
3995 pe = loop_preheader_edge (loop);
3997 else
3999 gcc_assert (bb_vinfo);
4000 only_init = true;
4001 *ptr_incr = NULL;
4004 /* Check the step (evolution) of the load in LOOP, and record
4005 whether it's invariant. */
4006 if (nested_in_vect_loop)
4007 step = STMT_VINFO_DR_STEP (stmt_info);
4008 else
4009 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4011 if (integer_zerop (step))
4012 *inv_p = true;
4013 else
4014 *inv_p = false;
4016 /* Create an expression for the first address accessed by this load
4017 in LOOP. */
4018 base_name = get_name (DR_BASE_ADDRESS (dr));
4020 if (dump_enabled_p ())
4022 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4023 dump_printf_loc (MSG_NOTE, vect_location,
4024 "create %s-pointer variable to type: ",
4025 get_tree_code_name (TREE_CODE (aggr_type)));
4026 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4027 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4028 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4029 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4030 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4031 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4032 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4033 else
4034 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4035 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4036 dump_printf (MSG_NOTE, "\n");
4039 /* (1) Create the new aggregate-pointer variable.
4040 Vector and array types inherit the alias set of their component
4041 type by default so we need to use a ref-all pointer if the data
4042 reference does not conflict with the created aggregated data
4043 reference because it is not addressable. */
4044 bool need_ref_all = false;
4045 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4046 get_alias_set (DR_REF (dr))))
4047 need_ref_all = true;
4048 /* Likewise for any of the data references in the stmt group. */
4049 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4051 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4054 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4055 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4056 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4057 get_alias_set (DR_REF (sdr))))
4059 need_ref_all = true;
4060 break;
4062 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4064 while (orig_stmt);
4066 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4067 need_ref_all);
4068 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4071 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4072 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4073 def-use update cycles for the pointer: one relative to the outer-loop
4074 (LOOP), which is what steps (3) and (4) below do. The other is relative
4075 to the inner-loop (which is the inner-most loop containing the dataref),
4076 and this is done by step (5) below.
4078 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4079 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4080 redundant. Steps (3),(4) create the following:
4082 vp0 = &base_addr;
4083 LOOP: vp1 = phi(vp0,vp2)
4086 vp2 = vp1 + step
4087 goto LOOP
4089 If there is an inner-loop nested in loop, then step (5) will also be
4090 applied, and an additional update in the inner-loop will be created:
4092 vp0 = &base_addr;
4093 LOOP: vp1 = phi(vp0,vp2)
4095 inner: vp3 = phi(vp1,vp4)
4096 vp4 = vp3 + inner_step
4097 if () goto inner
4099 vp2 = vp1 + step
4100 if () goto LOOP */
4102 /* (2) Calculate the initial address of the aggregate-pointer, and set
4103 the aggregate-pointer to point to it before the loop. */
4105 /* Create: &(base[init_val+offset]) in the loop preheader. */
4107 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4108 offset, loop);
4109 if (new_stmt_list)
4111 if (pe)
4113 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4114 gcc_assert (!new_bb);
4116 else
4117 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4120 *initial_address = new_temp;
4122 /* Create: p = (aggr_type *) initial_base */
4123 if (TREE_CODE (new_temp) != SSA_NAME
4124 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4126 vec_stmt = gimple_build_assign (aggr_ptr,
4127 fold_convert (aggr_ptr_type, new_temp));
4128 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4129 /* Copy the points-to information if it exists. */
4130 if (DR_PTR_INFO (dr))
4131 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4132 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4133 if (pe)
4135 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4136 gcc_assert (!new_bb);
4138 else
4139 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4141 else
4142 aggr_ptr_init = new_temp;
4144 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4145 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4146 inner-loop nested in LOOP (during outer-loop vectorization). */
4148 /* No update in loop is required. */
4149 if (only_init && (!loop_vinfo || at_loop == loop))
4150 aptr = aggr_ptr_init;
4151 else
4153 /* The step of the aggregate pointer is the type size. */
4154 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4155 /* One exception to the above is when the scalar step of the load in
4156 LOOP is zero. In this case the step here is also zero. */
4157 if (*inv_p)
4158 iv_step = size_zero_node;
4159 else if (tree_int_cst_sgn (step) == -1)
4160 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4162 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4164 create_iv (aggr_ptr_init,
4165 fold_convert (aggr_ptr_type, iv_step),
4166 aggr_ptr, loop, &incr_gsi, insert_after,
4167 &indx_before_incr, &indx_after_incr);
4168 incr = gsi_stmt (incr_gsi);
4169 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4171 /* Copy the points-to information if it exists. */
4172 if (DR_PTR_INFO (dr))
4174 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4175 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4177 if (ptr_incr)
4178 *ptr_incr = incr;
4180 aptr = indx_before_incr;
4183 if (!nested_in_vect_loop || only_init)
4184 return aptr;
4187 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4188 nested in LOOP, if it exists. */
4190 gcc_assert (nested_in_vect_loop);
4191 if (!only_init)
4193 standard_iv_increment_position (containing_loop, &incr_gsi,
4194 &insert_after);
4195 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4196 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4197 &indx_after_incr);
4198 incr = gsi_stmt (incr_gsi);
4199 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4201 /* Copy the points-to information if it exists. */
4202 if (DR_PTR_INFO (dr))
4204 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4205 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4207 if (ptr_incr)
4208 *ptr_incr = incr;
4210 return indx_before_incr;
4212 else
4213 gcc_unreachable ();
4217 /* Function bump_vector_ptr
4219 Increment a pointer (to a vector type) by vector-size. If requested,
4220 i.e. if PTR_INCR is given, then also connect the new increment stmt
4221 to the existing def-use update-chain of the pointer, by modifying
4222 the PTR_INCR as illustrated below:
4224 The pointer def-use update-chain before this function:
4225 DATAREF_PTR = phi (p_0, p_2)
4226 ....
4227 PTR_INCR: p_2 = DATAREF_PTR + step
4229 The pointer def-use update-chain after this function:
4230 DATAREF_PTR = phi (p_0, p_2)
4231 ....
4232 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4233 ....
4234 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4236 Input:
4237 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4238 in the loop.
4239 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4240 the loop. The increment amount across iterations is expected
4241 to be vector_size.
4242 BSI - location where the new update stmt is to be placed.
4243 STMT - the original scalar memory-access stmt that is being vectorized.
4244 BUMP - optional. The offset by which to bump the pointer. If not given,
4245 the offset is assumed to be vector_size.
4247 Output: Return NEW_DATAREF_PTR as illustrated above.
4251 tree
4252 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4253 gimple stmt, tree bump)
4255 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4256 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4258 tree update = TYPE_SIZE_UNIT (vectype);
4259 gimple incr_stmt;
4260 ssa_op_iter iter;
4261 use_operand_p use_p;
4262 tree new_dataref_ptr;
4264 if (bump)
4265 update = bump;
4267 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4268 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4269 dataref_ptr, update);
4270 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4272 /* Copy the points-to information if it exists. */
4273 if (DR_PTR_INFO (dr))
4275 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4276 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4279 if (!ptr_incr)
4280 return new_dataref_ptr;
4282 /* Update the vector-pointer's cross-iteration increment. */
4283 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4285 tree use = USE_FROM_PTR (use_p);
4287 if (use == dataref_ptr)
4288 SET_USE (use_p, new_dataref_ptr);
4289 else
4290 gcc_assert (tree_int_cst_compare (use, update) == 0);
4293 return new_dataref_ptr;
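/* Illustrative sketch only (hypothetical names): the scalar analogue of
   the POINTER_PLUS_EXPR built above.  VECTOR_BYTES stands in for
   TYPE_SIZE_UNIT (vectype), the default BUMP.  */

static char *
example_bump_vector_ptr (char *dataref_ptr, long vector_bytes)
{
  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP.  */
  return dataref_ptr + vector_bytes;
}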
4297 /* Function vect_create_destination_var.
4299 Create a new temporary of type VECTYPE. */
4301 tree
4302 vect_create_destination_var (tree scalar_dest, tree vectype)
4304 tree vec_dest;
4305 const char *name;
4306 char *new_name;
4307 tree type;
4308 enum vect_var_kind kind;
4310 kind = vectype ? vect_simple_var : vect_scalar_var;
4311 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4313 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4315 name = get_name (scalar_dest);
4316 if (name)
4317 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4318 else
4319 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4320 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4321 free (new_name);
4323 return vec_dest;
4326 /* Function vect_grouped_store_supported.
4328 Returns TRUE if interleave high and interleave low permutations
4329 are supported, and FALSE otherwise. */
4331 bool
4332 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4334 enum machine_mode mode = TYPE_MODE (vectype);
4336 /* vect_permute_store_chain requires the group size to be a power of two. */
4337 if (exact_log2 (count) == -1)
4339 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4341 "the size of the group of accesses"
4342 " is not a power of 2\n");
4343 return false;
4346 /* Check that the permutation is supported. */
4347 if (VECTOR_MODE_P (mode))
4349 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4350 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4351 for (i = 0; i < nelt / 2; i++)
4353 sel[i * 2] = i;
4354 sel[i * 2 + 1] = i + nelt;
4356 if (can_vec_perm_p (mode, false, sel))
4358 for (i = 0; i < nelt; i++)
4359 sel[i] += nelt / 2;
4360 if (can_vec_perm_p (mode, false, sel))
4361 return true;
4365 if (dump_enabled_p ())
4366 dump_printf (MSG_MISSED_OPTIMIZATION,
4367 "interleave op not supported by target.\n");
4368 return false;
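/* Illustrative sketch only: the two selectors built above, written out
   for a hypothetical 4-element vector mode (nelt == 4).  */

static void
example_interleave_selectors (unsigned char sel_high[4],
                              unsigned char sel_low[4])
{
  /* First loop: sel = { 0, nelt, 1, nelt + 1 } = { 0, 4, 1, 5 }.  */
  sel_high[0] = 0; sel_high[1] = 4; sel_high[2] = 1; sel_high[3] = 5;
  /* After adding nelt / 2 to each entry: { 2, 6, 3, 7 }.  */
  sel_low[0] = 2; sel_low[1] = 6; sel_low[2] = 3; sel_low[3] = 7;
}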
4372 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4373 type VECTYPE. */
4375 bool
4376 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4378 return vect_lanes_optab_supported_p ("vec_store_lanes",
4379 vec_store_lanes_optab,
4380 vectype, count);
4384 /* Function vect_permute_store_chain.
4386 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4387 a power of 2, generate interleave_high/low stmts to reorder the data
4388 correctly for the stores. Return the final references for stores in
4389 RESULT_CHAIN.
4391 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4392 The input is 4 vectors each containing 8 elements. We assign a number to
4393 each element, the input sequence is:
4395 1st vec: 0 1 2 3 4 5 6 7
4396 2nd vec: 8 9 10 11 12 13 14 15
4397 3rd vec: 16 17 18 19 20 21 22 23
4398 4th vec: 24 25 26 27 28 29 30 31
4400 The output sequence should be:
4402 1st vec: 0 8 16 24 1 9 17 25
4403 2nd vec: 2 10 18 26 3 11 19 27
4404 3rd vec: 4 12 20 28 5 13 21 29
4405 4th vec: 6 14 22 30 7 15 23 31
4407 i.e., we interleave the contents of the four vectors in their order.
4409 We use interleave_high/low instructions to create such output. The input of
4410 each interleave_high/low operation is two vectors:
4411 1st vec 2nd vec
4412 0 1 2 3 4 5 6 7
4413 the even elements of the result vector are obtained left-to-right from the
4414 high/low elements of the first vector. The odd elements of the result are
4415 obtained left-to-right from the high/low elements of the second vector.
4416 The output of interleave_high will be: 0 4 1 5
4417 and of interleave_low: 2 6 3 7
4420 The permutation is done in log LENGTH stages. In each stage interleave_high
4421 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4422 where the first argument is taken from the first half of DR_CHAIN and the
4423 second argument from its second half.
4424 In our example,
4426 I1: interleave_high (1st vec, 3rd vec)
4427 I2: interleave_low (1st vec, 3rd vec)
4428 I3: interleave_high (2nd vec, 4th vec)
4429 I4: interleave_low (2nd vec, 4th vec)
4431 The output for the first stage is:
4433 I1: 0 16 1 17 2 18 3 19
4434 I2: 4 20 5 21 6 22 7 23
4435 I3: 8 24 9 25 10 26 11 27
4436 I4: 12 28 13 29 14 30 15 31
4438 The output of the second stage, i.e. the final result is:
4440 I1: 0 8 16 24 1 9 17 25
4441 I2: 2 10 18 26 3 11 19 27
4442 I3: 4 12 20 28 5 13 21 29
4443 I4: 6 14 22 30 7 15 23 31. */
4445 void
4446 vect_permute_store_chain (vec<tree> dr_chain,
4447 unsigned int length,
4448 gimple stmt,
4449 gimple_stmt_iterator *gsi,
4450 vec<tree> *result_chain)
4452 tree vect1, vect2, high, low;
4453 gimple perm_stmt;
4454 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4455 tree perm_mask_low, perm_mask_high;
4456 unsigned int i, n;
4457 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4458 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4460 result_chain->quick_grow (length);
4461 memcpy (result_chain->address (), dr_chain.address (),
4462 length * sizeof (tree));
4464 for (i = 0, n = nelt / 2; i < n; i++)
4466 sel[i * 2] = i;
4467 sel[i * 2 + 1] = i + nelt;
4469 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4470 gcc_assert (perm_mask_high != NULL);
4472 for (i = 0; i < nelt; i++)
4473 sel[i] += nelt / 2;
4474 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4475 gcc_assert (perm_mask_low != NULL);
4477 for (i = 0, n = exact_log2 (length); i < n; i++)
4479 for (j = 0; j < length/2; j++)
4481 vect1 = dr_chain[j];
4482 vect2 = dr_chain[j+length/2];
4484 /* Create interleaving stmt:
4485 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4486 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4487 perm_stmt
4488 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4489 vect1, vect2, perm_mask_high);
4490 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4491 (*result_chain)[2*j] = high;
4493 /* Create interleaving stmt:
4494 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4495 nelt*3/2+1, ...}> */
4496 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4497 perm_stmt
4498 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4499 vect1, vect2, perm_mask_low);
4500 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4501 (*result_chain)[2*j+1] = low;
4503 memcpy (dr_chain.address (), result_chain->address (),
4504 length * sizeof (tree));
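/* Illustrative sketch only (hypothetical helper): a plain-C simulation of
   one interleaving stage, applying the "high" and "low" masks built above
   to two 4-element vectors, following VEC_PERM_EXPR semantics (selector
   entries >= nelt pick from the second input).  */

static void
example_interleave_stage (const int vect1[4], const int vect2[4],
                          int high[4], int low[4])
{
  static const unsigned char mask_high[4] = { 0, 4, 1, 5 };
  static const unsigned char mask_low[4] = { 2, 6, 3, 7 };
  for (int i = 0; i < 4; i++)
    {
      high[i] = mask_high[i] < 4 ? vect1[mask_high[i]]
                                 : vect2[mask_high[i] - 4];
      low[i] = mask_low[i] < 4 ? vect1[mask_low[i]]
                               : vect2[mask_low[i] - 4];
    }
  /* For vect1 = {0,1,2,3} and vect2 = {4,5,6,7} this yields
     high = {0,4,1,5} and low = {2,6,3,7}, matching the interleave_high /
     interleave_low outputs described in the comment above.  */
}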
4508 /* Function vect_setup_realignment
4510 This function is called when vectorizing an unaligned load using
4511 the dr_explicit_realign[_optimized] scheme.
4512 This function generates the following code at the loop prolog:
4514 p = initial_addr;
4515 x msq_init = *(floor(p)); # prolog load
4516 realignment_token = call target_builtin;
4517 loop:
4518 x msq = phi (msq_init, ---)
4520 The stmts marked with x are generated only for the case of
4521 dr_explicit_realign_optimized.
4523 The code above sets up a new (vector) pointer, pointing to the first
4524 location accessed by STMT, and a "floor-aligned" load using that pointer.
4525 It also generates code to compute the "realignment-token" (if the relevant
4526 target hook was defined), and creates a phi-node at the loop-header bb
4527 whose arguments are the result of the prolog-load (created by this
4528 function) and the result of a load that takes place in the loop (to be
4529 created by the caller to this function).
4531 For the case of dr_explicit_realign_optimized:
4532 The caller to this function uses the phi-result (msq) to create the
4533 realignment code inside the loop, and sets up the missing phi argument,
4534 as follows:
4535 loop:
4536 msq = phi (msq_init, lsq)
4537 lsq = *(floor(p')); # load in loop
4538 result = realign_load (msq, lsq, realignment_token);
4540 For the case of dr_explicit_realign:
4541 loop:
4542 msq = *(floor(p)); # load in loop
4543 p' = p + (VS-1);
4544 lsq = *(floor(p')); # load in loop
4545 result = realign_load (msq, lsq, realignment_token);
4547 Input:
4548 STMT - (scalar) load stmt to be vectorized. This load accesses
4549 a memory location that may be unaligned.
4550 BSI - place where new code is to be inserted.
4551 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4552 is used.
4554 Output:
4555 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4556 target hook, if defined.
4557 Return value - the result of the loop-header phi node. */
4559 tree
4560 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4561 tree *realignment_token,
4562 enum dr_alignment_support alignment_support_scheme,
4563 tree init_addr,
4564 struct loop **at_loop)
4566 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4567 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4568 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4569 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4570 struct loop *loop = NULL;
4571 edge pe = NULL;
4572 tree scalar_dest = gimple_assign_lhs (stmt);
4573 tree vec_dest;
4574 gimple inc;
4575 tree ptr;
4576 tree data_ref;
4577 gimple new_stmt;
4578 basic_block new_bb;
4579 tree msq_init = NULL_TREE;
4580 tree new_temp;
4581 gimple phi_stmt;
4582 tree msq = NULL_TREE;
4583 gimple_seq stmts = NULL;
4584 bool inv_p;
4585 bool compute_in_loop = false;
4586 bool nested_in_vect_loop = false;
4587 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4588 struct loop *loop_for_initial_load = NULL;
4590 if (loop_vinfo)
4592 loop = LOOP_VINFO_LOOP (loop_vinfo);
4593 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4596 gcc_assert (alignment_support_scheme == dr_explicit_realign
4597 || alignment_support_scheme == dr_explicit_realign_optimized);
4599 /* We need to generate three things:
4600 1. the misalignment computation
4601 2. the extra vector load (for the optimized realignment scheme).
4602 3. the phi node for the two vectors from which the realignment is
4603 done (for the optimized realignment scheme). */
4605 /* 1. Determine where to generate the misalignment computation.
4607 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4608 calculation will be generated by this function, outside the loop (in the
4609 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4610 caller, inside the loop.
4612 Background: If the misalignment remains fixed throughout the iterations of
4613 the loop, then both realignment schemes are applicable, and also the
4614 misalignment computation can be done outside LOOP. This is because we are
4615 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4616 are a multiple of VS (the Vector Size), and therefore the misalignment in
4617 different vectorized LOOP iterations is always the same.
4618 The problem arises only if the memory access is in an inner-loop nested
4619 inside LOOP, which is now being vectorized using outer-loop vectorization.
4620 This is the only case when the misalignment of the memory access may not
4621 remain fixed throughout the iterations of the inner-loop (as explained in
4622 detail in vect_supportable_dr_alignment). In this case, not only is the
4623 optimized realignment scheme not applicable, but also the misalignment
4624 computation (and generation of the realignment token that is passed to
4625 REALIGN_LOAD) have to be done inside the loop.
4627 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4628 or not, which in turn determines if the misalignment is computed inside
4629 the inner-loop, or outside LOOP. */
4631 if (init_addr != NULL_TREE || !loop_vinfo)
4633 compute_in_loop = true;
4634 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4638 /* 2. Determine where to generate the extra vector load.
4640 For the optimized realignment scheme, instead of generating two vector
4641 loads in each iteration, we generate a single extra vector load in the
4642 preheader of the loop, and in each iteration reuse the result of the
4643 vector load from the previous iteration. In case the memory access is in
4644 an inner-loop nested inside LOOP, which is now being vectorized using
4645 outer-loop vectorization, we need to determine whether this initial vector
4646 load should be generated at the preheader of the inner-loop, or can be
4647 generated at the preheader of LOOP. If the memory access has no evolution
4648 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4649 to be generated inside LOOP (in the preheader of the inner-loop). */
4651 if (nested_in_vect_loop)
4653 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4654 bool invariant_in_outerloop =
4655 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4656 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4658 else
4659 loop_for_initial_load = loop;
4660 if (at_loop)
4661 *at_loop = loop_for_initial_load;
4663 if (loop_for_initial_load)
4664 pe = loop_preheader_edge (loop_for_initial_load);
4666 /* 3. For the case of the optimized realignment, create the first vector
4667 load at the loop preheader. */
4669 if (alignment_support_scheme == dr_explicit_realign_optimized)
4671 /* Create msq_init = *(floor(p1)) in the loop preheader */
4673 gcc_assert (!compute_in_loop);
4674 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4675 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4676 NULL_TREE, &init_addr, NULL, &inc,
4677 true, &inv_p);
4678 new_temp = copy_ssa_name (ptr, NULL);
4679 new_stmt = gimple_build_assign_with_ops
4680 (BIT_AND_EXPR, new_temp, ptr,
4681 build_int_cst (TREE_TYPE (ptr),
4682 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4683 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4684 gcc_assert (!new_bb);
4685 data_ref
4686 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4687 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4688 new_stmt = gimple_build_assign (vec_dest, data_ref);
4689 new_temp = make_ssa_name (vec_dest, new_stmt);
4690 gimple_assign_set_lhs (new_stmt, new_temp);
4691 if (pe)
4693 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4694 gcc_assert (!new_bb);
4696 else
4697 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4699 msq_init = gimple_assign_lhs (new_stmt);
4702 /* 4. Create realignment token using a target builtin, if available.
4703 It is done either inside the containing loop, or before LOOP (as
4704 determined above). */
4706 if (targetm.vectorize.builtin_mask_for_load)
4708 tree builtin_decl;
4710 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4711 if (!init_addr)
4713 /* Generate the INIT_ADDR computation outside LOOP. */
4714 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4715 NULL_TREE, loop);
4716 if (loop)
4718 pe = loop_preheader_edge (loop);
4719 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4720 gcc_assert (!new_bb);
4722 else
4723 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4726 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4727 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4728 vec_dest =
4729 vect_create_destination_var (scalar_dest,
4730 gimple_call_return_type (new_stmt));
4731 new_temp = make_ssa_name (vec_dest, new_stmt);
4732 gimple_call_set_lhs (new_stmt, new_temp);
4734 if (compute_in_loop)
4735 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4736 else
4738 /* Generate the misalignment computation outside LOOP. */
4739 pe = loop_preheader_edge (loop);
4740 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4741 gcc_assert (!new_bb);
4744 *realignment_token = gimple_call_lhs (new_stmt);
4746 /* The result of the CALL_EXPR to this builtin is determined from
4747 the value of the parameter alone, and no global variables are touched,
4748 which makes the builtin a "const" function. Requiring the
4749 builtin to have the "const" attribute makes it unnecessary
4750 to call mark_call_clobbered. */
4751 gcc_assert (TREE_READONLY (builtin_decl));
4754 if (alignment_support_scheme == dr_explicit_realign)
4755 return msq;
4757 gcc_assert (!compute_in_loop);
4758 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4761 /* 5. Create msq = phi <msq_init, lsq> in loop */
4763 pe = loop_preheader_edge (containing_loop);
4764 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4765 msq = make_ssa_name (vec_dest, NULL);
4766 phi_stmt = create_phi_node (msq, containing_loop->header);
4767 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4769 return msq;
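/* Illustrative sketch only (hypothetical helper): a scalar model of the
   dr_explicit_realign scheme set up above, assuming a 16-byte vector.
   REALIGN_LOAD itself is left as a comment since its combination step is
   target-specific.  */

static void
example_explicit_realign (const char *p, char msq[16], char lsq[16])
{
  unsigned long vs = 16;
  const char *floor_p = (const char *) ((unsigned long) p & -vs);
  const char *floor_p2
    = (const char *) (((unsigned long) p + vs - 1) & -vs);
  __builtin_memcpy (msq, floor_p, vs);   /* msq = *(floor (p));          */
  __builtin_memcpy (lsq, floor_p2, vs);  /* lsq = *(floor (p + VS - 1)); */
  /* result = realign_load (msq, lsq, realignment_token), where the token
     encodes the misalignment of P (builtin_mask_for_load).  */
}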
4773 /* Function vect_grouped_load_supported.
4775 Returns TRUE if even and odd permutations are supported,
4776 and FALSE otherwise. */
4778 bool
4779 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4781 enum machine_mode mode = TYPE_MODE (vectype);
4783 /* vect_permute_load_chain requires the group size to be a power of two. */
4784 if (exact_log2 (count) == -1)
4786 if (dump_enabled_p ())
4787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4788 "the size of the group of accesses"
4789 " is not a power of 2\n");
4790 return false;
4793 /* Check that the permutation is supported. */
4794 if (VECTOR_MODE_P (mode))
4796 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4797 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4799 for (i = 0; i < nelt; i++)
4800 sel[i] = i * 2;
4801 if (can_vec_perm_p (mode, false, sel))
4803 for (i = 0; i < nelt; i++)
4804 sel[i] = i * 2 + 1;
4805 if (can_vec_perm_p (mode, false, sel))
4806 return true;
4810 if (dump_enabled_p ())
4811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4812 "extract even/odd not supported by target\n");
4813 return false;
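/* Illustrative sketch only: the extract-even and extract-odd selectors
   built above, for a hypothetical 4-element vector mode.  */

static void
example_even_odd_selectors (unsigned char sel_even[4],
                            unsigned char sel_odd[4])
{
  for (int i = 0; i < 4; i++)
    {
      sel_even[i] = i * 2;      /* { 0, 2, 4, 6 }  */
      sel_odd[i] = i * 2 + 1;   /* { 1, 3, 5, 7 }  */
    }
}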
4816 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4817 type VECTYPE. */
4819 bool
4820 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4822 return vect_lanes_optab_supported_p ("vec_load_lanes",
4823 vec_load_lanes_optab,
4824 vectype, count);
4827 /* Function vect_permute_load_chain.
4829 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4830 a power of 2, generate extract_even/odd stmts to reorder the input data
4831 correctly. Return the final references for loads in RESULT_CHAIN.
4833 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4834 The input is 4 vectors each containing 8 elements. We assign a number to each
4835 element, the input sequence is:
4837 1st vec: 0 1 2 3 4 5 6 7
4838 2nd vec: 8 9 10 11 12 13 14 15
4839 3rd vec: 16 17 18 19 20 21 22 23
4840 4th vec: 24 25 26 27 28 29 30 31
4842 The output sequence should be:
4844 1st vec: 0 4 8 12 16 20 24 28
4845 2nd vec: 1 5 9 13 17 21 25 29
4846 3rd vec: 2 6 10 14 18 22 26 30
4847 4th vec: 3 7 11 15 19 23 27 31
4849 i.e., the first output vector should contain the first elements of each
4850 interleaving group, etc.
4852 We use extract_even/odd instructions to create such output. The input of
4853 each extract_even/odd operation is two vectors
4854 1st vec 2nd vec
4855 0 1 2 3 4 5 6 7
4857 and the output is the vector of extracted even/odd elements. The output of
4858 extract_even will be: 0 2 4 6
4859 and of extract_odd: 1 3 5 7
4862 The permutation is done in log LENGTH stages. In each stage extract_even
4863 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4864 their order. In our example,
4866 E1: extract_even (1st vec, 2nd vec)
4867 E2: extract_odd (1st vec, 2nd vec)
4868 E3: extract_even (3rd vec, 4th vec)
4869 E4: extract_odd (3rd vec, 4th vec)
4871 The output for the first stage will be:
4873 E1: 0 2 4 6 8 10 12 14
4874 E2: 1 3 5 7 9 11 13 15
4875 E3: 16 18 20 22 24 26 28 30
4876 E4: 17 19 21 23 25 27 29 31
4878 In order to proceed and create the correct sequence for the next stage (or
4879 for the correct output, if the second stage is the last one, as in our
4880 example), we first put the output of extract_even operation and then the
4881 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4882 The input for the second stage is:
4884 1st vec (E1): 0 2 4 6 8 10 12 14
4885 2nd vec (E3): 16 18 20 22 24 26 28 30
4886 3rd vec (E2): 1 3 5 7 9 11 13 15
4887 4th vec (E4): 17 19 21 23 25 27 29 31
4889 The output of the second stage:
4891 E1: 0 4 8 12 16 20 24 28
4892 E2: 2 6 10 14 18 22 26 30
4893 E3: 1 5 9 13 17 21 25 29
4894 E4: 3 7 11 15 19 23 27 31
4896 And RESULT_CHAIN after reordering:
4898 1st vec (E1): 0 4 8 12 16 20 24 28
4899 2nd vec (E3): 1 5 9 13 17 21 25 29
4900 3rd vec (E2): 2 6 10 14 18 22 26 30
4901 4th vec (E4): 3 7 11 15 19 23 27 31. */
4903 static void
4904 vect_permute_load_chain (vec<tree> dr_chain,
4905 unsigned int length,
4906 gimple stmt,
4907 gimple_stmt_iterator *gsi,
4908 vec<tree> *result_chain)
4910 tree data_ref, first_vect, second_vect;
4911 tree perm_mask_even, perm_mask_odd;
4912 gimple perm_stmt;
4913 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4914 unsigned int i, j, log_length = exact_log2 (length);
4915 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4916 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4918 result_chain->quick_grow (length);
4919 memcpy (result_chain->address (), dr_chain.address (),
4920 length * sizeof (tree));
4922 for (i = 0; i < nelt; ++i)
4923 sel[i] = i * 2;
4924 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4925 gcc_assert (perm_mask_even != NULL);
4927 for (i = 0; i < nelt; ++i)
4928 sel[i] = i * 2 + 1;
4929 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4930 gcc_assert (perm_mask_odd != NULL);
4932 for (i = 0; i < log_length; i++)
4934 for (j = 0; j < length; j += 2)
4936 first_vect = dr_chain[j];
4937 second_vect = dr_chain[j+1];
4939 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4940 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4941 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4942 first_vect, second_vect,
4943 perm_mask_even);
4944 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4945 (*result_chain)[j/2] = data_ref;
4947 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4948 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4949 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4950 first_vect, second_vect,
4951 perm_mask_odd);
4952 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4953 (*result_chain)[j/2+length/2] = data_ref;
4955 memcpy (dr_chain.address (), result_chain->address (),
4956 length * sizeof (tree));
4961 /* Function vect_transform_grouped_load.
4963 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4964 to perform their permutation and ascribe the resulting vectorized statements
4965 to the scalar statements.
4968 void
4969 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4970 gimple_stmt_iterator *gsi)
4972 vec<tree> result_chain = vNULL;
4974 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4975 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4976 vectors, that are ready for vector computation. */
4977 result_chain.create (size);
4978 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4979 vect_record_grouped_load_vectors (stmt, result_chain);
4980 result_chain.release ();
4983 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4984 generated as part of the vectorization of STMT. Assign the statement
4985 for each vector to the associated scalar statement. */
4987 void
4988 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4990 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4991 gimple next_stmt, new_stmt;
4992 unsigned int i, gap_count;
4993 tree tmp_data_ref;
4995 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4996 Since we scan the chain starting from its first node, their order
4997 corresponds to the order of data-refs in RESULT_CHAIN. */
4998 next_stmt = first_stmt;
4999 gap_count = 1;
5000 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5002 if (!next_stmt)
5003 break;
5005 /* Skip the gaps. Loads created for the gaps will be removed by a later
5006 dead code elimination pass. No need to check for the first stmt in
5007 the group, since it always exists.
5008 GROUP_GAP is the number of steps in elements from the previous
5009 access (if there is no gap GROUP_GAP is 1). We skip loads that
5010 correspond to the gaps. */
5011 if (next_stmt != first_stmt
5012 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5014 gap_count++;
5015 continue;
5018 while (next_stmt)
5020 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5021 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5022 copies, and we put the new vector statement in the first available
5023 RELATED_STMT. */
5024 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5025 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5026 else
5028 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5030 gimple prev_stmt =
5031 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5032 gimple rel_stmt =
5033 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5034 while (rel_stmt)
5036 prev_stmt = rel_stmt;
5037 rel_stmt =
5038 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5041 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5042 new_stmt;
5046 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5047 gap_count = 1;
5048 /* If NEXT_STMT accesses the same DR as the previous statement,
5049 put the same TMP_DATA_REF as its vectorized statement; otherwise
5050 get the next data-ref from RESULT_CHAIN. */
5051 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5052 break;
5057 /* Function vect_can_force_dr_alignment_p.
5059 Returns whether the alignment of a DECL can be forced to be aligned
5060 on an ALIGNMENT-bit boundary. */
5062 bool
5063 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5065 if (TREE_CODE (decl) != VAR_DECL)
5066 return false;
5068 /* We cannot change alignment of common or external symbols as another
5069 translation unit may contain a definition with lower alignment.
5070 The rules of common symbol linking mean that the definition
5071 will override the common symbol. The same is true for constant
5072 pool entries which may be shared and are not properly merged
5073 by LTO. */
5074 if (DECL_EXTERNAL (decl)
5075 || DECL_COMMON (decl)
5076 || DECL_IN_CONSTANT_POOL (decl))
5077 return false;
5079 if (TREE_ASM_WRITTEN (decl))
5080 return false;
5082 /* Do not override the alignment as specified by the ABI when the used
5083 attribute is set. */
5084 if (DECL_PRESERVE_P (decl))
5085 return false;
5087 /* Do not override explicit alignment set by the user when an explicit
5088 section name is also used. This is a common idiom used by many
5089 software projects. */
5090 if (DECL_SECTION_NAME (decl) != NULL_TREE
5091 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5092 return false;
5094 if (TREE_STATIC (decl))
5095 return (alignment <= MAX_OFILE_ALIGNMENT);
5096 else
5097 return (alignment <= MAX_STACK_ALIGNMENT);
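/* Illustrative sketch only (hypothetical declarations, not called
   anywhere): decls the predicate above would accept or reject.  */

static bool
example_force_alignment_candidates (void)
{
  static int local_static_buf[64];   /* TREE_STATIC and not external or
                                        common: alignment may be forced,
                                        up to MAX_OFILE_ALIGNMENT.  */
  extern int shared_extern_buf[64];  /* DECL_EXTERNAL: rejected, since a
                                        definition in another translation
                                        unit may have lower alignment.  */
  return local_static_buf[0] == shared_extern_buf[0];
}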
5101 /* Return whether the data reference DR is supported with respect to its
5102 alignment.
5103 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5104 if it is aligned, i.e., check if it is possible to vectorize it with different
5105 alignment. */
5107 enum dr_alignment_support
5108 vect_supportable_dr_alignment (struct data_reference *dr,
5109 bool check_aligned_accesses)
5111 gimple stmt = DR_STMT (dr);
5112 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5113 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5114 enum machine_mode mode = TYPE_MODE (vectype);
5115 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5116 struct loop *vect_loop = NULL;
5117 bool nested_in_vect_loop = false;
5119 if (aligned_access_p (dr) && !check_aligned_accesses)
5120 return dr_aligned;
5122 if (loop_vinfo)
5124 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5125 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5128 /* Possibly unaligned access. */
5130 /* We can choose between using the implicit realignment scheme (generating
5131 a misaligned_move stmt) and the explicit realignment scheme (generating
5132 aligned loads with a REALIGN_LOAD). There are two variants to the
5133 explicit realignment scheme: optimized, and unoptimized.
5134 We can optimize the realignment only if the step between consecutive
5135 vector loads is equal to the vector size. Since the vector memory
5136 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5137 is guaranteed that the misalignment amount remains the same throughout the
5138 execution of the vectorized loop. Therefore, we can create the
5139 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5140 at the loop preheader.
5142 However, in the case of outer-loop vectorization, when vectorizing a
5143 memory access in the inner-loop nested within the LOOP that is now being
5144 vectorized, while it is guaranteed that the misalignment of the
5145 vectorized memory access will remain the same in different outer-loop
5146 iterations, it is *not* guaranteed that it will remain the same throughout
5147 the execution of the inner-loop. This is because the inner-loop advances
5148 with the original scalar step (and not in steps of VS). If the inner-loop
5149 step happens to be a multiple of VS, then the misalignment remains fixed
5150 and we can use the optimized realignment scheme. For example:
5152 for (i=0; i<N; i++)
5153 for (j=0; j<M; j++)
5154 s += a[i+j];
5156 When vectorizing the i-loop in the above example, the step between
5157 consecutive vector loads is 1, and so the misalignment does not remain
5158 fixed across the execution of the inner-loop, and the realignment cannot
5159 be optimized (as illustrated in the following pseudo vectorized loop):
5161 for (i=0; i<N; i+=4)
5162 for (j=0; j<M; j++){
5163 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5164 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5165 // (assuming that we start from an aligned address).
5168 We therefore have to use the unoptimized realignment scheme:
5170 for (i=0; i<N; i+=4)
5171 for (j=k; j<M; j+=4)
5172 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5173 // that the misalignment of the initial address is
5174 // 0).
5176 The loop can then be vectorized as follows:
5178 for (k=0; k<4; k++){
5179 rt = get_realignment_token (&vp[k]);
5180 for (i=0; i<N; i+=4){
5181 v1 = vp[i+k];
5182 for (j=k; j<M; j+=4){
5183 v2 = vp[i+j+VS-1];
5184 va = REALIGN_LOAD <v1,v2,rt>;
5185 vs += va;
5186 v1 = v2;
5189 } */
5191 if (DR_IS_READ (dr))
5193 bool is_packed = false;
5194 tree type = (TREE_TYPE (DR_REF (dr)));
5196 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5197 && (!targetm.vectorize.builtin_mask_for_load
5198 || targetm.vectorize.builtin_mask_for_load ()))
5200 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5201 if ((nested_in_vect_loop
5202 && (TREE_INT_CST_LOW (DR_STEP (dr))
5203 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5204 || !loop_vinfo)
5205 return dr_explicit_realign;
5206 else
5207 return dr_explicit_realign_optimized;
5209 if (!known_alignment_for_access_p (dr))
5210 is_packed = not_size_aligned (DR_REF (dr));
5212 if ((TYPE_USER_ALIGN (type) && !is_packed)
5213 || targetm.vectorize.
5214 support_vector_misalignment (mode, type,
5215 DR_MISALIGNMENT (dr), is_packed))
5216 /* Can't software pipeline the loads, but can at least do them. */
5217 return dr_unaligned_supported;
5219 else
5221 bool is_packed = false;
5222 tree type = (TREE_TYPE (DR_REF (dr)));
5224 if (!known_alignment_for_access_p (dr))
5225 is_packed = not_size_aligned (DR_REF (dr));
5227 if ((TYPE_USER_ALIGN (type) && !is_packed)
5228 || targetm.vectorize.
5229 support_vector_misalignment (mode, type,
5230 DR_MISALIGNMENT (dr), is_packed))
5231 return dr_unaligned_supported;
5234 /* Unsupported. */
5235 return dr_unaligned_unsupported;