New vectorizer messages; message format change.
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob4c12f49391617bb4d1be2751400c16674fe3b860
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40 /* Need to include rtl.h, expr.h, etc. for optabs. */
41 #include "expr.h"
42 #include "optabs.h"
44 /* Return true if load- or store-lanes optab OPTAB is implemented for
45 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
47 static bool
48 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
49 tree vectype, unsigned HOST_WIDE_INT count)
51 enum machine_mode mode, array_mode;
52 bool limit_p;
54 mode = TYPE_MODE (vectype);
55 limit_p = !targetm.array_mode_supported_p (mode, count);
56 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
57 MODE_INT, limit_p);
59 if (array_mode == BLKmode)
61 if (dump_enabled_p ())
62 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
63 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
64 GET_MODE_NAME (mode), count);
65 return false;
68 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "cannot use %s<%s><%s>", name,
73 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
74 return false;
77 if (dump_enabled_p ())
78 dump_printf_loc (MSG_NOTE, vect_location,
79 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
80 GET_MODE_NAME (mode));
82 return true;
86 /* Return the smallest scalar part of STMT.
87 This is used to determine the vectype of the stmt. We generally set the
88 vectype according to the type of the result (lhs). For stmts whose
89 result-type is different than the type of the arguments (e.g., demotion,
90 promotion), vectype will be reset appropriately (later). Note that we have
91 to visit the smallest datatype in this function, because that determines the
92 VF. If the smallest datatype in the loop is present only as the rhs of a
93 promotion operation - we'd miss it.
94 Such a case, where a variable of this datatype does not appear in the lhs
95 anywhere in the loop, can only occur if it's an invariant: e.g.:
96 'int_x = (int) short_inv', which we'd expect to have been optimized away by
97 invariant motion. However, we cannot rely on invariant motion to always
98 take invariants out of the loop, and so in the case of promotion we also
99 have to check the rhs.
100 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
101 types. */
103 tree
104 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
105 HOST_WIDE_INT *rhs_size_unit)
107 tree scalar_type = gimple_expr_type (stmt);
108 HOST_WIDE_INT lhs, rhs;
110 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
112 if (is_gimple_assign (stmt)
113 && (gimple_assign_cast_p (stmt)
114 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
115 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
116 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
118 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
120 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
121 if (rhs < lhs)
122 scalar_type = rhs_type;
125 *lhs_size_unit = lhs;
126 *rhs_size_unit = rhs;
127 return scalar_type;
131 /* Check if data references pointed by DR_I and DR_J are same or
132 belong to same interleaving group. Return FALSE if drs are
133 different, otherwise return TRUE. */
135 static bool
136 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
138 gimple stmt_i = DR_STMT (dr_i);
139 gimple stmt_j = DR_STMT (dr_j);
141 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
142 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
143 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
144 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
145 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
146 return true;
147 else
148 return false;
151 /* If address ranges represented by DDR_I and DDR_J are equal,
152 return TRUE, otherwise return FALSE. */
154 static bool
155 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
157 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
158 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
159 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
160 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
161 return true;
162 else
163 return false;
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
170 static bool
171 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
176 return false;
178 if (dump_enabled_p ())
180 dump_printf_loc (MSG_NOTE, vect_location,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
183 dump_printf (MSG_NOTE, " and ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
187 if (optimize_loop_nest_for_size_p (loop))
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
191 "versioning not supported when optimizing for size.");
192 return false;
195 /* FORNOW: We don't support versioning with outer-loop vectorization. */
196 if (loop->inner)
198 if (dump_enabled_p ())
199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
200 "versioning not yet supported for outer-loops.");
201 return false;
204 /* FORNOW: We don't support creating runtime alias tests for non-constant
205 step. */
206 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
207 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
209 if (dump_enabled_p ())
210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
211 "versioning not yet supported for non-constant "
212 "step");
213 return false;
216 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
217 return true;
221 /* Function vect_analyze_data_ref_dependence.
223 Return TRUE if there (might) exist a dependence between a memory-reference
224 DRA and a memory-reference DRB. When versioning for alias may check a
225 dependence at run-time, return FALSE. Adjust *MAX_VF according to
226 the data dependence. */
228 static bool
229 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
230 loop_vec_info loop_vinfo, int *max_vf)
232 unsigned int i;
233 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
234 struct data_reference *dra = DDR_A (ddr);
235 struct data_reference *drb = DDR_B (ddr);
236 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
237 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
238 lambda_vector dist_v;
239 unsigned int loop_depth;
241 /* In loop analysis all data references should be vectorizable. */
242 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
243 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
244 gcc_unreachable ();
246 /* Independent data accesses. */
247 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
248 return false;
250 if (dra == drb
251 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
252 return false;
254 /* Unknown data dependence. */
255 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
257 /* If user asserted safelen consecutive iterations can be
258 executed concurrently, assume independence. */
259 if (loop->safelen >= 2)
261 if (loop->safelen < *max_vf)
262 *max_vf = loop->safelen;
263 return false;
266 if (STMT_VINFO_GATHER_P (stmtinfo_a)
267 || STMT_VINFO_GATHER_P (stmtinfo_b))
269 if (dump_enabled_p ())
271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
272 "versioning for alias not supported for: "
273 "can't determine dependence between ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 DR_REF (dra));
276 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 DR_REF (drb));
280 return true;
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
286 "versioning for alias required: "
287 "can't determine dependence between ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (dra));
290 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (drb));
295 /* Add to list of ddrs that need to be tested at run-time. */
296 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
299 /* Known data dependence. */
300 if (DDR_NUM_DIST_VECTS (ddr) == 0)
302 /* If user asserted safelen consecutive iterations can be
303 executed concurrently, assume independence. */
304 if (loop->safelen >= 2)
306 if (loop->safelen < *max_vf)
307 *max_vf = loop->safelen;
308 return false;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a)
312 || STMT_VINFO_GATHER_P (stmtinfo_b))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
344 int dist = dist_v[loop_depth];
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.", dist);
350 if (dist == 0)
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
364 .. = a[i];
365 .. = a[i+1];
366 a[i] = ..;
367 a[i+1] = ..;
368 *p = ..;
369 .. = a[i];
370 .. = a[i+1];
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing. */
374 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 && GROUP_SAME_DR_STMT (stmtinfo_a))
376 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
377 && GROUP_SAME_DR_STMT (stmtinfo_b)))
379 gimple earlier_stmt;
380 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
381 if (DR_IS_WRITE
382 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
384 if (dump_enabled_p ())
385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
386 "READ_WRITE dependence in interleaving.");
387 return true;
391 continue;
394 if (dist > 0 && DDR_REVERSED_P (ddr))
396 /* If DDR_REVERSED_P the order of the data-refs in DDR was
397 reversed (to make distance vector positive), and the actual
398 distance is negative. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
401 "dependence distance negative.");
402 continue;
405 if (abs (dist) >= 2
406 && abs (dist) < *max_vf)
408 /* The dependence distance requires reduction of the maximal
409 vectorization factor. */
410 *max_vf = abs (dist);
411 if (dump_enabled_p ())
412 dump_printf_loc (MSG_NOTE, vect_location,
413 "adjusting maximal vectorization factor to %i",
414 *max_vf);
417 if (abs (dist) >= *max_vf)
419 /* Dependence distance does not create dependence, as far as
420 vectorization is concerned, in this case. */
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_NOTE, vect_location,
423 "dependence distance >= VF.");
424 continue;
427 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
430 "not vectorized, possible dependence "
431 "between data-refs ");
432 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
433 dump_printf (MSG_NOTE, " and ");
434 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
437 return true;
440 return false;
443 /* Function vect_analyze_data_ref_dependences.
445 Examine all the data references in the loop, and make sure there do not
446 exist any data dependences between them. Set *MAX_VF according to
447 the maximum vectorization factor the data dependences allow. */
449 bool
450 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
452 unsigned int i;
453 struct data_dependence_relation *ddr;
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE, vect_location,
457 "=== vect_analyze_data_ref_dependences ===");
459 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
460 &LOOP_VINFO_DDRS (loop_vinfo),
461 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
462 return false;
464 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
465 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
466 return false;
468 return true;
472 /* Function vect_slp_analyze_data_ref_dependence.
474 Return TRUE if there (might) exist a dependence between a memory-reference
475 DRA and a memory-reference DRB. When versioning for alias may check a
476 dependence at run-time, return FALSE. Adjust *MAX_VF according to
477 the data dependence. */
479 static bool
480 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
482 struct data_reference *dra = DDR_A (ddr);
483 struct data_reference *drb = DDR_B (ddr);
485 /* We need to check dependences of statements marked as unvectorizable
486 as well, they still can prohibit vectorization. */
488 /* Independent data accesses. */
489 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
490 return false;
492 if (dra == drb)
493 return false;
495 /* Read-read is OK. */
496 if (DR_IS_READ (dra) && DR_IS_READ (drb))
497 return false;
499 /* If dra and drb are part of the same interleaving chain consider
500 them independent. */
501 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
502 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
503 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
504 return false;
506 /* Unknown data dependence. */
507 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
509 gimple earlier_stmt;
511 if (dump_enabled_p ())
513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
514 "can't determine dependence between ");
515 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
516 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
517 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
520 /* We do not vectorize basic blocks with write-write dependencies. */
521 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
522 return true;
524 /* Check that it's not a load-after-store dependence. */
525 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
526 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
527 return true;
529 return false;
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "determined dependence between ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
537 dump_printf (MSG_NOTE, " and ");
538 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
541 /* Do not vectorize basic blocks with write-write dependences. */
542 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
543 return true;
545 /* Check dependence between DRA and DRB for basic block vectorization.
546 If the accesses share same bases and offsets, we can compare their initial
547 constant offsets to decide whether they differ or not. In case of a read-
548 write dependence we check that the load is before the store to ensure that
549 vectorization will not change the order of the accesses. */
551 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
552 gimple earlier_stmt;
554 /* Check that the data-refs have same bases and offsets. If not, we can't
555 determine if they are dependent. */
556 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
557 || !dr_equal_offsets_p (dra, drb))
558 return true;
560 /* Check the types. */
561 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
562 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
564 if (type_size_a != type_size_b
565 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
566 TREE_TYPE (DR_REF (drb))))
567 return true;
569 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
570 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
572 /* Two different locations - no dependence. */
573 if (init_a != init_b)
574 return false;
576 /* We have a read-write dependence. Check that the load is before the store.
577 When we vectorize basic blocks, vector load can be only before
578 corresponding scalar load, and vector store can be only after its
579 corresponding scalar store. So the order of the acceses is preserved in
580 case the load is before the store. */
581 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
582 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
583 return false;
585 return true;
589 /* Function vect_analyze_data_ref_dependences.
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them. Set *MAX_VF according to
593 the maximum vectorization factor the data dependences allow. */
595 bool
596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
598 struct data_dependence_relation *ddr;
599 unsigned int i;
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "=== vect_slp_analyze_data_ref_dependences ===");
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
606 &BB_VINFO_DDRS (bb_vinfo),
607 vNULL, true))
608 return false;
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
611 if (vect_slp_analyze_data_ref_dependence (ddr))
612 return false;
614 return true;
618 /* Function vect_compute_data_ref_alignment
620 Compute the misalignment of the data reference DR.
622 Output:
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
630 static bool
631 vect_compute_data_ref_alignment (struct data_reference *dr)
633 gimple stmt = DR_STMT (dr);
634 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
636 struct loop *loop = NULL;
637 tree ref = DR_REF (dr);
638 tree vectype;
639 tree base, base_addr;
640 bool base_aligned;
641 tree misalign;
642 tree aligned_to, alignment;
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE, vect_location,
646 "vect_compute_data_ref_alignment:");
648 if (loop_vinfo)
649 loop = LOOP_VINFO_LOOP (loop_vinfo);
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr, -1);
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
657 return true;
659 misalign = DR_INIT (dr);
660 aligned_to = DR_ALIGNED_TO (dr);
661 base_addr = DR_BASE_ADDRESS (dr);
662 vectype = STMT_VINFO_VECTYPE (stmt_info);
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
666 relative to the outer-loop (LOOP). This is ok only if the misalignment
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop evenly
669 divides by the vector size. */
670 if (loop && nested_in_vect_loop_p (loop, stmt))
672 tree step = DR_STEP (dr);
673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
675 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_NOTE, vect_location,
679 "inner step divides the vector-size.");
680 misalign = STMT_VINFO_DR_INIT (stmt_info);
681 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
682 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
684 else
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "inner step doesn't divide the vector-size.");
689 misalign = NULL_TREE;
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref evenly divides
697 by the vector size. */
698 if (!loop)
700 tree step = DR_STEP (dr);
701 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
703 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "SLP: step doesn't divide the vector-size.");
708 misalign = NULL_TREE;
712 base = build_fold_indirect_ref (base_addr);
713 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
715 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
716 || !misalign)
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
724 return true;
727 if ((DECL_P (base)
728 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
729 alignment) >= 0)
730 || (TREE_CODE (base_addr) == SSA_NAME
731 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
732 TREE_TYPE (base_addr)))),
733 alignment) >= 0)
734 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
735 base_aligned = true;
736 else
737 base_aligned = false;
739 if (!base_aligned)
741 /* Do not change the alignment of global variables here if
742 flag_section_anchors is enabled as we already generated
743 RTL for other functions. Most global variables should
744 have been aligned during the IPA increase_alignment pass. */
745 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
746 || (TREE_STATIC (base) && flag_section_anchors))
748 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE, vect_location,
751 "can't force alignment of ref: ");
752 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
754 return true;
757 /* Force the alignment of the decl.
758 NOTE: This is the only change to the code we make during
759 the analysis phase, before deciding to vectorize the loop. */
760 if (dump_enabled_p ())
762 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
763 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
766 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
767 DECL_USER_ALIGN (base) = 1;
770 /* At this point we assume that the base is aligned. */
771 gcc_assert (base_aligned
772 || (TREE_CODE (base) == VAR_DECL
773 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
775 /* If this is a backward running DR then first access in the larger
776 vectype actually is N-1 elements before the address in the DR.
777 Adjust misalign accordingly. */
778 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
780 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
781 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
782 otherwise we wouldn't be here. */
783 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
784 /* PLUS because DR_STEP was negative. */
785 misalign = size_binop (PLUS_EXPR, misalign, offset);
788 /* Modulo alignment. */
789 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
791 if (!host_integerp (misalign, 1))
793 /* Negative or overflowed misalignment value. */
794 if (dump_enabled_p ())
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
796 "unexpected misalign value");
797 return false;
800 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
802 if (dump_enabled_p ())
804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
805 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
806 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
809 return true;
813 /* Function vect_compute_data_refs_alignment
815 Compute the misalignment of data references in the loop.
816 Return FALSE if a data reference is found that cannot be vectorized. */
818 static bool
819 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
820 bb_vec_info bb_vinfo)
822 vec<data_reference_p> datarefs;
823 struct data_reference *dr;
824 unsigned int i;
826 if (loop_vinfo)
827 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
828 else
829 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
831 FOR_EACH_VEC_ELT (datarefs, i, dr)
832 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
833 && !vect_compute_data_ref_alignment (dr))
835 if (bb_vinfo)
837 /* Mark unsupported statement as unvectorizable. */
838 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
839 continue;
841 else
842 return false;
845 return true;
849 /* Function vect_update_misalignment_for_peel
851 DR - the data reference whose misalignment is to be adjusted.
852 DR_PEEL - the data reference whose misalignment is being made
853 zero in the vector loop by the peel.
854 NPEEL - the number of iterations in the peel loop if the misalignment
855 of DR_PEEL is known at compile time. */
857 static void
858 vect_update_misalignment_for_peel (struct data_reference *dr,
859 struct data_reference *dr_peel, int npeel)
861 unsigned int i;
862 vec<dr_p> same_align_drs;
863 struct data_reference *current_dr;
864 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
865 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
866 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
867 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
869 /* For interleaved data accesses the step in the loop must be multiplied by
870 the size of the interleaving group. */
871 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
872 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
873 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
874 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
876 /* It can be assumed that the data refs with the same alignment as dr_peel
877 are aligned in the vector loop. */
878 same_align_drs
879 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
880 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
882 if (current_dr != dr)
883 continue;
884 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
885 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
886 SET_DR_MISALIGNMENT (dr, 0);
887 return;
890 if (known_alignment_for_access_p (dr)
891 && known_alignment_for_access_p (dr_peel))
893 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
894 int misal = DR_MISALIGNMENT (dr);
895 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
896 misal += negative ? -npeel * dr_size : npeel * dr_size;
897 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
898 SET_DR_MISALIGNMENT (dr, misal);
899 return;
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
904 SET_DR_MISALIGNMENT (dr, -1);
908 /* Function vect_verify_datarefs_alignment
910 Return TRUE if all data references in the loop can be
911 handled with respect to alignment. */
913 bool
914 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
916 vec<data_reference_p> datarefs;
917 struct data_reference *dr;
918 enum dr_alignment_support supportable_dr_alignment;
919 unsigned int i;
921 if (loop_vinfo)
922 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
923 else
924 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
926 FOR_EACH_VEC_ELT (datarefs, i, dr)
928 gimple stmt = DR_STMT (dr);
929 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
931 if (!STMT_VINFO_RELEVANT_P (stmt_info))
932 continue;
934 /* For interleaving, only the alignment of the first access matters.
935 Skip statements marked as not vectorizable. */
936 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
937 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
938 || !STMT_VINFO_VECTORIZABLE (stmt_info))
939 continue;
941 /* Strided loads perform only component accesses, alignment is
942 irrelevant for them. */
943 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
944 continue;
946 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
947 if (!supportable_dr_alignment)
949 if (dump_enabled_p ())
951 if (DR_IS_READ (dr))
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "not vectorized: unsupported unaligned load.");
954 else
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 "not vectorized: unsupported unaligned "
957 "store.");
959 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
960 DR_REF (dr));
962 return false;
964 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
965 dump_printf_loc (MSG_NOTE, vect_location,
966 "Vectorizing an unaligned access.");
968 return true;
971 /* Given an memory reference EXP return whether its alignment is less
972 than its size. */
974 static bool
975 not_size_aligned (tree exp)
977 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
978 return true;
980 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
981 > get_object_alignment (exp));
984 /* Function vector_alignment_reachable_p
986 Return true if vector alignment for DR is reachable by peeling
987 a few loop iterations. Return false otherwise. */
989 static bool
990 vector_alignment_reachable_p (struct data_reference *dr)
992 gimple stmt = DR_STMT (dr);
993 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
994 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
998 /* For interleaved access we peel only if number of iterations in
999 the prolog loop ({VF - misalignment}), is a multiple of the
1000 number of the interleaved accesses. */
1001 int elem_size, mis_in_elements;
1002 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1004 /* FORNOW: handle only known alignment. */
1005 if (!known_alignment_for_access_p (dr))
1006 return false;
1008 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1009 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1011 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1012 return false;
1015 /* If misalignment is known at the compile time then allow peeling
1016 only if natural alignment is reachable through peeling. */
1017 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1019 HOST_WIDE_INT elmsize =
1020 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1021 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1025 dump_printf (MSG_NOTE,
1026 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1028 if (DR_MISALIGNMENT (dr) % elmsize)
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "data size does not divide the misalignment.\n");
1033 return false;
1037 if (!known_alignment_for_access_p (dr))
1039 tree type = TREE_TYPE (DR_REF (dr));
1040 bool is_packed = not_size_aligned (DR_REF (dr));
1041 if (dump_enabled_p ())
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043 "Unknown misalignment, is_packed = %d",is_packed);
1044 if ((TYPE_USER_ALIGN (type) && !is_packed)
1045 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1046 return true;
1047 else
1048 return false;
1051 return true;
1055 /* Calculate the cost of the memory access represented by DR. */
1057 static void
1058 vect_get_data_access_cost (struct data_reference *dr,
1059 unsigned int *inside_cost,
1060 unsigned int *outside_cost,
1061 stmt_vector_for_cost *body_cost_vec)
1063 gimple stmt = DR_STMT (dr);
1064 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1065 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1066 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1067 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1068 int ncopies = vf / nunits;
1070 if (DR_IS_READ (dr))
1071 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1072 NULL, body_cost_vec, false);
1073 else
1074 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "vect_get_data_access_cost: inside_cost = %d, "
1079 "outside_cost = %d.", *inside_cost, *outside_cost);
1083 /* Insert DR into peeling hash table with NPEEL as key. */
1085 static void
1086 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1087 int npeel)
1089 struct _vect_peel_info elem, *slot;
1090 _vect_peel_info **new_slot;
1091 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1093 elem.npeel = npeel;
1094 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1095 if (slot)
1096 slot->count++;
1097 else
1099 slot = XNEW (struct _vect_peel_info);
1100 slot->npeel = npeel;
1101 slot->dr = dr;
1102 slot->count = 1;
1103 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1104 *new_slot = slot;
1107 if (!supportable_dr_alignment && !flag_vect_cost_model)
1108 slot->count += VECT_MAX_COST;
1112 /* Traverse peeling hash table to find peeling option that aligns maximum
1113 number of data accesses. */
1116 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1117 _vect_peel_extended_info *max)
1119 vect_peel_info elem = *slot;
1121 if (elem->count > max->peel_info.count
1122 || (elem->count == max->peel_info.count
1123 && max->peel_info.npeel > elem->npeel))
1125 max->peel_info.npeel = elem->npeel;
1126 max->peel_info.count = elem->count;
1127 max->peel_info.dr = elem->dr;
1130 return 1;
1134 /* Traverse peeling hash table and calculate cost for each peeling option.
1135 Find the one with the lowest cost. */
1138 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1139 _vect_peel_extended_info *min)
1141 vect_peel_info elem = *slot;
1142 int save_misalignment, dummy;
1143 unsigned int inside_cost = 0, outside_cost = 0, i;
1144 gimple stmt = DR_STMT (elem->dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1146 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1147 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1148 struct data_reference *dr;
1149 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1150 int single_iter_cost;
1152 prologue_cost_vec.create (2);
1153 body_cost_vec.create (2);
1154 epilogue_cost_vec.create (2);
1156 FOR_EACH_VEC_ELT (datarefs, i, dr)
1158 stmt = DR_STMT (dr);
1159 stmt_info = vinfo_for_stmt (stmt);
1160 /* For interleaving, only the alignment of the first access
1161 matters. */
1162 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1163 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1164 continue;
1166 save_misalignment = DR_MISALIGNMENT (dr);
1167 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1168 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1169 &body_cost_vec);
1170 SET_DR_MISALIGNMENT (dr, save_misalignment);
1173 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1174 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1175 &dummy, single_iter_cost,
1176 &prologue_cost_vec,
1177 &epilogue_cost_vec);
1179 /* Prologue and epilogue costs are added to the target model later.
1180 These costs depend only on the scalar iteration cost, the
1181 number of peeling iterations finally chosen, and the number of
1182 misaligned statements. So discard the information found here. */
1183 prologue_cost_vec.release ();
1184 epilogue_cost_vec.release ();
1186 if (inside_cost < min->inside_cost
1187 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1189 min->inside_cost = inside_cost;
1190 min->outside_cost = outside_cost;
1191 min->body_cost_vec.release ();
1192 min->body_cost_vec = body_cost_vec;
1193 min->peel_info.dr = elem->dr;
1194 min->peel_info.npeel = elem->npeel;
1196 else
1197 body_cost_vec.release ();
1199 return 1;
1203 /* Choose best peeling option by traversing peeling hash table and either
1204 choosing an option with the lowest cost (if cost model is enabled) or the
1205 option that aligns as many accesses as possible. */
1207 static struct data_reference *
1208 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1209 unsigned int *npeel,
1210 stmt_vector_for_cost *body_cost_vec)
1212 struct _vect_peel_extended_info res;
1214 res.peel_info.dr = NULL;
1215 res.body_cost_vec = stmt_vector_for_cost();
1217 if (flag_vect_cost_model)
1219 res.inside_cost = INT_MAX;
1220 res.outside_cost = INT_MAX;
1221 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1222 .traverse <_vect_peel_extended_info *,
1223 vect_peeling_hash_get_lowest_cost> (&res);
1225 else
1227 res.peel_info.count = 0;
1228 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1229 .traverse <_vect_peel_extended_info *,
1230 vect_peeling_hash_get_most_frequent> (&res);
1233 *npeel = res.peel_info.npeel;
1234 *body_cost_vec = res.body_cost_vec;
1235 return res.peel_info.dr;
1239 /* Function vect_enhance_data_refs_alignment
1241 This pass will use loop versioning and loop peeling in order to enhance
1242 the alignment of data references in the loop.
1244 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1245 original loop is to be vectorized. Any other loops that are created by
1246 the transformations performed in this pass - are not supposed to be
1247 vectorized. This restriction will be relaxed.
1249 This pass will require a cost model to guide it whether to apply peeling
1250 or versioning or a combination of the two. For example, the scheme that
1251 intel uses when given a loop with several memory accesses, is as follows:
1252 choose one memory access ('p') which alignment you want to force by doing
1253 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1254 other accesses are not necessarily aligned, or (2) use loop versioning to
1255 generate one loop in which all accesses are aligned, and another loop in
1256 which only 'p' is necessarily aligned.
1258 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1259 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1260 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1262 Devising a cost model is the most critical aspect of this work. It will
1263 guide us on which access to peel for, whether to use loop versioning, how
1264 many versions to create, etc. The cost model will probably consist of
1265 generic considerations as well as target specific considerations (on
1266 powerpc for example, misaligned stores are more painful than misaligned
1267 loads).
1269 Here are the general steps involved in alignment enhancements:
1271 -- original loop, before alignment analysis:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 -- After vect_compute_data_refs_alignment:
1278 for (i=0; i<N; i++){
1279 x = q[i]; # DR_MISALIGNMENT(q) = 3
1280 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 -- Possibility 1: we do loop versioning:
1284 if (p is aligned) {
1285 for (i=0; i<N; i++){ # loop 1A
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = 0
1290 else {
1291 for (i=0; i<N; i++){ # loop 1B
1292 x = q[i]; # DR_MISALIGNMENT(q) = 3
1293 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1297 -- Possibility 2: we do loop peeling:
1298 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1299 x = q[i];
1300 p[i] = y;
1302 for (i = 3; i < N; i++){ # loop 2A
1303 x = q[i]; # DR_MISALIGNMENT(q) = 0
1304 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1307 -- Possibility 3: combination of loop peeling and versioning:
1308 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1309 x = q[i];
1310 p[i] = y;
1312 if (p is aligned) {
1313 for (i = 3; i<N; i++){ # loop 3A
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = 0
1318 else {
1319 for (i = 3; i<N; i++){ # loop 3B
1320 x = q[i]; # DR_MISALIGNMENT(q) = 0
1321 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1325 These loops are later passed to loop_transform to be vectorized. The
1326 vectorizer will use the alignment information to guide the transformation
1327 (whether to generate regular loads/stores, or with special handling for
1328 misalignment). */
1330 bool
1331 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1333 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1334 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1335 enum dr_alignment_support supportable_dr_alignment;
1336 struct data_reference *dr0 = NULL, *first_store = NULL;
1337 struct data_reference *dr;
1338 unsigned int i, j;
1339 bool do_peeling = false;
1340 bool do_versioning = false;
1341 bool stat;
1342 gimple stmt;
1343 stmt_vec_info stmt_info;
1344 unsigned int npeel = 0;
1345 bool all_misalignments_unknown = true;
1346 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1347 unsigned possible_npeel_number = 1;
1348 tree vectype;
1349 unsigned int nelements, mis, same_align_drs_max = 0;
1350 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1352 if (dump_enabled_p ())
1353 dump_printf_loc (MSG_NOTE, vect_location,
1354 "=== vect_enhance_data_refs_alignment ===");
1356 /* While cost model enhancements are expected in the future, the high level
1357 view of the code at this time is as follows:
1359 A) If there is a misaligned access then see if peeling to align
1360 this access can make all data references satisfy
1361 vect_supportable_dr_alignment. If so, update data structures
1362 as needed and return true.
1364 B) If peeling wasn't possible and there is a data reference with an
1365 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1366 then see if loop versioning checks can be used to make all data
1367 references satisfy vect_supportable_dr_alignment. If so, update
1368 data structures as needed and return true.
1370 C) If neither peeling nor versioning were successful then return false if
1371 any data reference does not satisfy vect_supportable_dr_alignment.
1373 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1375 Note, Possibility 3 above (which is peeling and versioning together) is not
1376 being done at this time. */
1378 /* (1) Peeling to force alignment. */
1380 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1381 Considerations:
1382 + How many accesses will become aligned due to the peeling
1383 - How many accesses will become unaligned due to the peeling,
1384 and the cost of misaligned accesses.
1385 - The cost of peeling (the extra runtime checks, the increase
1386 in code size). */
1388 FOR_EACH_VEC_ELT (datarefs, i, dr)
1390 stmt = DR_STMT (dr);
1391 stmt_info = vinfo_for_stmt (stmt);
1393 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1394 continue;
1396 /* For interleaving, only the alignment of the first access
1397 matters. */
1398 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1399 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1400 continue;
1402 /* For invariant accesses there is nothing to enhance. */
1403 if (integer_zerop (DR_STEP (dr)))
1404 continue;
1406 /* Strided loads perform only component accesses, alignment is
1407 irrelevant for them. */
1408 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1409 continue;
1411 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1412 do_peeling = vector_alignment_reachable_p (dr);
1413 if (do_peeling)
1415 if (known_alignment_for_access_p (dr))
1417 unsigned int npeel_tmp;
1418 bool negative = tree_int_cst_compare (DR_STEP (dr),
1419 size_zero_node) < 0;
1421 /* Save info about DR in the hash table. */
1422 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1423 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1425 vectype = STMT_VINFO_VECTYPE (stmt_info);
1426 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1427 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1428 TREE_TYPE (DR_REF (dr))));
1429 npeel_tmp = (negative
1430 ? (mis - nelements) : (nelements - mis))
1431 & (nelements - 1);
1433 /* For multiple types, it is possible that the bigger type access
1434 will have more than one peeling option. E.g., a loop with two
1435 types: one of size (vector size / 4), and the other one of
1436 size (vector size / 8). Vectorization factor will 8. If both
1437 access are misaligned by 3, the first one needs one scalar
1438 iteration to be aligned, and the second one needs 5. But the
1439 the first one will be aligned also by peeling 5 scalar
1440 iterations, and in that case both accesses will be aligned.
1441 Hence, except for the immediate peeling amount, we also want
1442 to try to add full vector size, while we don't exceed
1443 vectorization factor.
1444 We do this automtically for cost model, since we calculate cost
1445 for every peeling option. */
1446 if (!flag_vect_cost_model)
1447 possible_npeel_number = vf /nelements;
1449 /* Handle the aligned case. We may decide to align some other
1450 access, making DR unaligned. */
1451 if (DR_MISALIGNMENT (dr) == 0)
1453 npeel_tmp = 0;
1454 if (!flag_vect_cost_model)
1455 possible_npeel_number++;
1458 for (j = 0; j < possible_npeel_number; j++)
1460 gcc_assert (npeel_tmp <= vf);
1461 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1462 npeel_tmp += nelements;
1465 all_misalignments_unknown = false;
1466 /* Data-ref that was chosen for the case that all the
1467 misalignments are unknown is not relevant anymore, since we
1468 have a data-ref with known alignment. */
1469 dr0 = NULL;
1471 else
1473 /* If we don't know any misalignment values, we prefer
1474 peeling for data-ref that has the maximum number of data-refs
1475 with the same alignment, unless the target prefers to align
1476 stores over load. */
1477 if (all_misalignments_unknown)
1479 unsigned same_align_drs
1480 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1481 if (!dr0
1482 || same_align_drs_max < same_align_drs)
1484 same_align_drs_max = same_align_drs;
1485 dr0 = dr;
1487 /* For data-refs with the same number of related
1488 accesses prefer the one where the misalign
1489 computation will be invariant in the outermost loop. */
1490 else if (same_align_drs_max == same_align_drs)
1492 struct loop *ivloop0, *ivloop;
1493 ivloop0 = outermost_invariant_loop_for_expr
1494 (loop, DR_BASE_ADDRESS (dr0));
1495 ivloop = outermost_invariant_loop_for_expr
1496 (loop, DR_BASE_ADDRESS (dr));
1497 if ((ivloop && !ivloop0)
1498 || (ivloop && ivloop0
1499 && flow_loop_nested_p (ivloop, ivloop0)))
1500 dr0 = dr;
1503 if (!first_store && DR_IS_WRITE (dr))
1504 first_store = dr;
1507 /* If there are both known and unknown misaligned accesses in the
1508 loop, we choose peeling amount according to the known
1509 accesses. */
1510 if (!supportable_dr_alignment)
1512 dr0 = dr;
1513 if (!first_store && DR_IS_WRITE (dr))
1514 first_store = dr;
1518 else
1520 if (!aligned_access_p (dr))
1522 if (dump_enabled_p ())
1523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524 "vector alignment may not be reachable");
1525 break;
1530 /* Check if we can possibly peel the loop. */
1531 if (!vect_can_advance_ivs_p (loop_vinfo)
1532 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1533 do_peeling = false;
1535 if (do_peeling && all_misalignments_unknown
1536 && vect_supportable_dr_alignment (dr0, false))
1539 /* Check if the target requires to prefer stores over loads, i.e., if
1540 misaligned stores are more expensive than misaligned loads (taking
1541 drs with same alignment into account). */
1542 if (first_store && DR_IS_READ (dr0))
1544 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1545 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1546 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1547 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1548 stmt_vector_for_cost dummy;
1549 dummy.create (2);
1551 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1552 &dummy);
1553 vect_get_data_access_cost (first_store, &store_inside_cost,
1554 &store_outside_cost, &dummy);
1556 dummy.release ();
1558 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1559 aligning the load DR0). */
1560 load_inside_penalty = store_inside_cost;
1561 load_outside_penalty = store_outside_cost;
1562 for (i = 0;
1563 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1564 DR_STMT (first_store))).iterate (i, &dr);
1565 i++)
1566 if (DR_IS_READ (dr))
1568 load_inside_penalty += load_inside_cost;
1569 load_outside_penalty += load_outside_cost;
1571 else
1573 load_inside_penalty += store_inside_cost;
1574 load_outside_penalty += store_outside_cost;
1577 /* Calculate the penalty for leaving DR0 unaligned (by
1578 aligning the FIRST_STORE). */
1579 store_inside_penalty = load_inside_cost;
1580 store_outside_penalty = load_outside_cost;
1581 for (i = 0;
1582 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1583 DR_STMT (dr0))).iterate (i, &dr);
1584 i++)
1585 if (DR_IS_READ (dr))
1587 store_inside_penalty += load_inside_cost;
1588 store_outside_penalty += load_outside_cost;
1590 else
1592 store_inside_penalty += store_inside_cost;
1593 store_outside_penalty += store_outside_cost;
1596 if (load_inside_penalty > store_inside_penalty
1597 || (load_inside_penalty == store_inside_penalty
1598 && load_outside_penalty > store_outside_penalty))
1599 dr0 = first_store;
1602 /* In case there are only loads with different unknown misalignments, use
1603 peeling only if it may help to align other accesses in the loop. */
1604 if (!first_store
1605 && !STMT_VINFO_SAME_ALIGN_REFS (
1606 vinfo_for_stmt (DR_STMT (dr0))).length ()
1607 && vect_supportable_dr_alignment (dr0, false)
1608 != dr_unaligned_supported)
1609 do_peeling = false;
1612 if (do_peeling && !dr0)
1614 /* Peeling is possible, but there is no data access that is not supported
1615 unless aligned. So we try to choose the best possible peeling. */
1617 /* We should get here only if there are drs with known misalignment. */
1618 gcc_assert (!all_misalignments_unknown);
1620 /* Choose the best peeling from the hash table. */
1621 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1622 &body_cost_vec);
1623 if (!dr0 || !npeel)
1624 do_peeling = false;
1627 if (do_peeling)
1629 stmt = DR_STMT (dr0);
1630 stmt_info = vinfo_for_stmt (stmt);
1631 vectype = STMT_VINFO_VECTYPE (stmt_info);
1632 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1634 if (known_alignment_for_access_p (dr0))
1636 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1637 size_zero_node) < 0;
1638 if (!npeel)
1640 /* Since it's known at compile time, compute the number of
1641 iterations in the peeled loop (the peeling factor) for use in
1642 updating DR_MISALIGNMENT values. The peeling factor is the
1643 vectorization factor minus the misalignment as an element
1644 count. */
1645 mis = DR_MISALIGNMENT (dr0);
1646 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1647 npeel = ((negative ? mis - nelements : nelements - mis)
1648 & (nelements - 1));
1651 /* For interleaved data access every iteration accesses all the
1652 members of the group, therefore we divide the number of iterations
1653 by the group size. */
1654 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1655 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1656 npeel /= GROUP_SIZE (stmt_info);
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_NOTE, vect_location,
1660 "Try peeling by %d", npeel);
1663 /* Ensure that all data refs can be vectorized after the peel. */
1664 FOR_EACH_VEC_ELT (datarefs, i, dr)
1666 int save_misalignment;
1668 if (dr == dr0)
1669 continue;
1671 stmt = DR_STMT (dr);
1672 stmt_info = vinfo_for_stmt (stmt);
1673 /* For interleaving, only the alignment of the first access
1674 matters. */
1675 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1676 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1677 continue;
1679 /* Strided loads perform only component accesses, alignment is
1680 irrelevant for them. */
1681 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1682 continue;
1684 save_misalignment = DR_MISALIGNMENT (dr);
1685 vect_update_misalignment_for_peel (dr, dr0, npeel);
1686 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1687 SET_DR_MISALIGNMENT (dr, save_misalignment);
1689 if (!supportable_dr_alignment)
1691 do_peeling = false;
1692 break;
1696 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1698 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1699 if (!stat)
1700 do_peeling = false;
1701 else
1703 body_cost_vec.release ();
1704 return stat;
1708 if (do_peeling)
1710 stmt_info_for_cost *si;
1711 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1713 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1714 If the misalignment of DR_i is identical to that of dr0 then set
1715 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1716 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1717 by the peeling factor times the element size of DR_i (MOD the
1718 vectorization factor times the size). Otherwise, the
1719 misalignment of DR_i must be set to unknown. */
1720 FOR_EACH_VEC_ELT (datarefs, i, dr)
1721 if (dr != dr0)
1722 vect_update_misalignment_for_peel (dr, dr0, npeel);
1724 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1725 if (npeel)
1726 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1727 else
1728 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1729 SET_DR_MISALIGNMENT (dr0, 0);
1730 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "Alignment of access forced using peeling.");
1734 dump_printf_loc (MSG_NOTE, vect_location,
1735 "Peeling for alignment will be applied.");
1737 /* We've delayed passing the inside-loop peeling costs to the
1738 target cost model until we were sure peeling would happen.
1739 Do so now. */
1740 if (body_cost_vec.exists ())
1742 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1744 struct _stmt_vec_info *stmt_info
1745 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1746 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1747 si->misalign, vect_body);
1749 body_cost_vec.release ();
1752 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1753 gcc_assert (stat);
1754 return stat;
1758 body_cost_vec.release ();
1760 /* (2) Versioning to force alignment. */
1762 /* Try versioning if:
1763 1) flag_tree_vect_loop_version is TRUE
1764 2) optimize loop for speed
1765 3) there is at least one unsupported misaligned data ref with an unknown
1766 misalignment, and
1767 4) all misaligned data refs with a known misalignment are supported, and
1768 5) the number of runtime alignment checks is within reason. */
1770 do_versioning =
1771 flag_tree_vect_loop_version
1772 && optimize_loop_nest_for_speed_p (loop)
1773 && (!loop->inner); /* FORNOW */
1775 if (do_versioning)
1777 FOR_EACH_VEC_ELT (datarefs, i, dr)
1779 stmt = DR_STMT (dr);
1780 stmt_info = vinfo_for_stmt (stmt);
1782 /* For interleaving, only the alignment of the first access
1783 matters. */
1784 if (aligned_access_p (dr)
1785 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1786 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1787 continue;
1789 /* Strided loads perform only component accesses, alignment is
1790 irrelevant for them. */
1791 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1792 continue;
1794 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1796 if (!supportable_dr_alignment)
1798 gimple stmt;
1799 int mask;
1800 tree vectype;
1802 if (known_alignment_for_access_p (dr)
1803 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1804 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1806 do_versioning = false;
1807 break;
1810 stmt = DR_STMT (dr);
1811 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1812 gcc_assert (vectype);
1814 /* The rightmost bits of an aligned address must be zeros.
1815 Construct the mask needed for this test. For example,
1816 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1817 mask must be 15 = 0xf. */
1818 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1820 /* FORNOW: use the same mask to test all potentially unaligned
1821 references in the loop. The vectorizer currently supports
1822 a single vector size, see the reference to
1823 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1824 vectorization factor is computed. */
1825 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1826 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1827 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1828 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1829 DR_STMT (dr));
1833 /* Versioning requires at least one misaligned data reference. */
1834 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1835 do_versioning = false;
1836 else if (!do_versioning)
1837 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1840 if (do_versioning)
1842 vec<gimple> may_misalign_stmts
1843 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1844 gimple stmt;
1846 /* It can now be assumed that the data references in the statements
1847 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1848 of the loop being vectorized. */
1849 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1851 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1852 dr = STMT_VINFO_DATA_REF (stmt_info);
1853 SET_DR_MISALIGNMENT (dr, 0);
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE, vect_location,
1856 "Alignment of access forced using versioning.");
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Versioning for alignment will be applied.");
1863 /* Peeling and versioning can't be done together at this time. */
1864 gcc_assert (! (do_peeling && do_versioning));
1866 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1867 gcc_assert (stat);
1868 return stat;
1871 /* This point is reached if neither peeling nor versioning is being done. */
1872 gcc_assert (! (do_peeling || do_versioning));
1874 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1875 return stat;
1879 /* Function vect_find_same_alignment_drs.
1881 Update group and alignment relations according to the chosen
1882 vectorization factor. */
1884 static void
1885 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1886 loop_vec_info loop_vinfo)
1888 unsigned int i;
1889 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1890 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1891 struct data_reference *dra = DDR_A (ddr);
1892 struct data_reference *drb = DDR_B (ddr);
1893 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1894 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1895 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1896 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1897 lambda_vector dist_v;
1898 unsigned int loop_depth;
1900 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1901 return;
1903 if (dra == drb)
1904 return;
1906 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1907 return;
1909 /* Loop-based vectorization and known data dependence. */
1910 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1911 return;
1913 /* Data-dependence analysis reports a distance vector of zero
1914 for data-references that overlap only in the first iteration
1915 but have different sign step (see PR45764).
1916 So as a sanity check require equal DR_STEP. */
1917 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1918 return;
1920 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1921 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1923 int dist = dist_v[loop_depth];
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_NOTE, vect_location,
1927 "dependence distance = %d.", dist);
1929 /* Same loop iteration. */
1930 if (dist == 0
1931 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1933 /* Two references with distance zero have the same alignment. */
1934 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1935 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1936 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "accesses have the same alignment.");
1940 dump_printf (MSG_NOTE,
1941 "dependence distance modulo vf == 0 between ");
1942 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1943 dump_printf (MSG_NOTE, " and ");
1944 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1951 /* Function vect_analyze_data_refs_alignment
1953 Analyze the alignment of the data-references in the loop.
1954 Return FALSE if a data reference is found that cannot be vectorized. */
1956 bool
1957 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1958 bb_vec_info bb_vinfo)
1960 if (dump_enabled_p ())
1961 dump_printf_loc (MSG_NOTE, vect_location,
1962 "=== vect_analyze_data_refs_alignment ===");
1964 /* Mark groups of data references with same alignment using
1965 data dependence information. */
1966 if (loop_vinfo)
1968 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1969 struct data_dependence_relation *ddr;
1970 unsigned int i;
1972 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1973 vect_find_same_alignment_drs (ddr, loop_vinfo);
1976 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980 "not vectorized: can't calculate alignment "
1981 "for data ref.");
1982 return false;
1985 return true;
1989 /* Analyze groups of accesses: check that DR belongs to a group of
1990 accesses of legal size, step, etc. Detect gaps, single element
1991 interleaving, and other special cases. Set grouped access info.
1992 Collect groups of strided stores for further use in SLP analysis. */
1994 static bool
1995 vect_analyze_group_access (struct data_reference *dr)
1997 tree step = DR_STEP (dr);
1998 tree scalar_type = TREE_TYPE (DR_REF (dr));
1999 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2000 gimple stmt = DR_STMT (dr);
2001 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2003 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2004 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2005 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2006 bool slp_impossible = false;
2007 struct loop *loop = NULL;
2009 if (loop_vinfo)
2010 loop = LOOP_VINFO_LOOP (loop_vinfo);
2012 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2013 size of the interleaving group (including gaps). */
2014 groupsize = absu_hwi (dr_step) / type_size;
2016 /* Not consecutive access is possible only if it is a part of interleaving. */
2017 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2019 /* Check if it this DR is a part of interleaving, and is a single
2020 element of the group that is accessed in the loop. */
2022 /* Gaps are supported only for loads. STEP must be a multiple of the type
2023 size. The size of the group must be a power of 2. */
2024 if (DR_IS_READ (dr)
2025 && (dr_step % type_size) == 0
2026 && groupsize > 0
2027 && exact_log2 (groupsize) != -1)
2029 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2030 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2031 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "Detected single element interleaving ");
2035 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2036 dump_printf (MSG_NOTE, " step ");
2037 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2040 if (loop_vinfo)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "Data access with gaps requires scalar "
2045 "epilogue loop");
2046 if (loop->inner)
2048 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2050 "Peeling for outer loop is not"
2051 " supported");
2052 return false;
2055 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2058 return true;
2061 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2064 "not consecutive access ");
2065 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2068 if (bb_vinfo)
2070 /* Mark the statement as unvectorizable. */
2071 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2072 return true;
2075 return false;
2078 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2080 /* First stmt in the interleaving chain. Check the chain. */
2081 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2082 struct data_reference *data_ref = dr;
2083 unsigned int count = 1;
2084 tree prev_init = DR_INIT (data_ref);
2085 gimple prev = stmt;
2086 HOST_WIDE_INT diff, gaps = 0;
2087 unsigned HOST_WIDE_INT count_in_bytes;
2089 while (next)
2091 /* Skip same data-refs. In case that two or more stmts share
2092 data-ref (supported only for loads), we vectorize only the first
2093 stmt, and the rest get their vectorized loads from the first
2094 one. */
2095 if (!tree_int_cst_compare (DR_INIT (data_ref),
2096 DR_INIT (STMT_VINFO_DATA_REF (
2097 vinfo_for_stmt (next)))))
2099 if (DR_IS_WRITE (data_ref))
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "Two store stmts share the same dr.");
2104 return false;
2107 /* For load use the same data-ref load. */
2108 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2110 prev = next;
2111 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2112 continue;
2115 prev = next;
2116 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2118 /* All group members have the same STEP by construction. */
2119 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2121 /* Check that the distance between two accesses is equal to the type
2122 size. Otherwise, we have gaps. */
2123 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2124 - TREE_INT_CST_LOW (prev_init)) / type_size;
2125 if (diff != 1)
2127 /* FORNOW: SLP of accesses with gaps is not supported. */
2128 slp_impossible = true;
2129 if (DR_IS_WRITE (data_ref))
2131 if (dump_enabled_p ())
2132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133 "interleaved store with gaps");
2134 return false;
2137 gaps += diff - 1;
2140 last_accessed_element += diff;
2142 /* Store the gap from the previous member of the group. If there is no
2143 gap in the access, GROUP_GAP is always 1. */
2144 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2146 prev_init = DR_INIT (data_ref);
2147 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2148 /* Count the number of data-refs in the chain. */
2149 count++;
2152 /* COUNT is the number of accesses found, we multiply it by the size of
2153 the type to get COUNT_IN_BYTES. */
2154 count_in_bytes = type_size * count;
2156 /* Check that the size of the interleaving (including gaps) is not
2157 greater than STEP. */
2158 if (dr_step != 0
2159 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2161 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2164 "interleaving size is greater than step for ");
2165 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2167 return false;
2170 /* Check that the size of the interleaving is equal to STEP for stores,
2171 i.e., that there are no gaps. */
2172 if (dr_step != 0
2173 && absu_hwi (dr_step) != count_in_bytes)
2175 if (DR_IS_READ (dr))
2177 slp_impossible = true;
2178 /* There is a gap after the last load in the group. This gap is a
2179 difference between the groupsize and the number of elements.
2180 When there is no gap, this difference should be 0. */
2181 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2183 else
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2187 "interleaved store with gaps");
2188 return false;
2192 /* Check that STEP is a multiple of type size. */
2193 if (dr_step != 0
2194 && (dr_step % type_size) != 0)
2196 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199 "step is not a multiple of type size: step ");
2200 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2201 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2202 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2203 TYPE_SIZE_UNIT (scalar_type));
2205 return false;
2208 if (groupsize == 0)
2209 groupsize = count;
2211 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "Detected interleaving of size %d", (int)groupsize);
2216 /* SLP: create an SLP data structure for every interleaving group of
2217 stores for further analysis in vect_analyse_slp. */
2218 if (DR_IS_WRITE (dr) && !slp_impossible)
2220 if (loop_vinfo)
2221 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2222 if (bb_vinfo)
2223 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2226 /* There is a gap in the end of the group. */
2227 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "Data access with gaps requires scalar "
2232 "epilogue loop");
2233 if (loop->inner)
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2237 "Peeling for outer loop is not supported");
2238 return false;
2241 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2245 return true;
2249 /* Analyze the access pattern of the data-reference DR.
2250 In case of non-consecutive accesses call vect_analyze_group_access() to
2251 analyze groups of accesses. */
2253 static bool
2254 vect_analyze_data_ref_access (struct data_reference *dr)
2256 tree step = DR_STEP (dr);
2257 tree scalar_type = TREE_TYPE (DR_REF (dr));
2258 gimple stmt = DR_STMT (dr);
2259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2260 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2261 struct loop *loop = NULL;
2263 if (loop_vinfo)
2264 loop = LOOP_VINFO_LOOP (loop_vinfo);
2266 if (loop_vinfo && !step)
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2270 "bad data-ref access in loop");
2271 return false;
2274 /* Allow invariant loads in loops. */
2275 if (loop_vinfo && integer_zerop (step))
2277 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2278 return DR_IS_READ (dr);
2281 if (loop && nested_in_vect_loop_p (loop, stmt))
2283 /* Interleaved accesses are not yet supported within outer-loop
2284 vectorization for references in the inner-loop. */
2285 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2287 /* For the rest of the analysis we use the outer-loop step. */
2288 step = STMT_VINFO_DR_STEP (stmt_info);
2289 if (integer_zerop (step))
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_NOTE, vect_location,
2293 "zero step in outer loop.");
2294 if (DR_IS_READ (dr))
2295 return true;
2296 else
2297 return false;
2301 /* Consecutive? */
2302 if (TREE_CODE (step) == INTEGER_CST)
2304 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2305 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2306 || (dr_step < 0
2307 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2309 /* Mark that it is not interleaving. */
2310 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2311 return true;
2315 if (loop && nested_in_vect_loop_p (loop, stmt))
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_NOTE, vect_location,
2319 "grouped access in outer loop.");
2320 return false;
2323 /* Assume this is a DR handled by non-constant strided load case. */
2324 if (TREE_CODE (step) != INTEGER_CST)
2325 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2327 /* Not consecutive access - check if it's a part of interleaving group. */
2328 return vect_analyze_group_access (dr);
2333 /* A helper function used in the comparator function to sort data
2334 references. T1 and T2 are two data references to be compared.
2335 The function returns -1, 0, or 1. */
2337 static int
2338 compare_tree (tree t1, tree t2)
2340 int i, cmp;
2341 enum tree_code code;
2342 char tclass;
2344 if (t1 == t2)
2345 return 0;
2346 if (t1 == NULL)
2347 return -1;
2348 if (t2 == NULL)
2349 return 1;
2352 if (TREE_CODE (t1) != TREE_CODE (t2))
2353 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2355 code = TREE_CODE (t1);
2356 switch (code)
2358 /* For const values, we can just use hash values for comparisons. */
2359 case INTEGER_CST:
2360 case REAL_CST:
2361 case FIXED_CST:
2362 case STRING_CST:
2363 case COMPLEX_CST:
2364 case VECTOR_CST:
2366 hashval_t h1 = iterative_hash_expr (t1, 0);
2367 hashval_t h2 = iterative_hash_expr (t2, 0);
2368 if (h1 != h2)
2369 return h1 < h2 ? -1 : 1;
2370 break;
2373 case SSA_NAME:
2374 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2375 if (cmp != 0)
2376 return cmp;
2378 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2379 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2380 break;
2382 default:
2383 tclass = TREE_CODE_CLASS (code);
2385 /* For var-decl, we could compare their UIDs. */
2386 if (tclass == tcc_declaration)
2388 if (DECL_UID (t1) != DECL_UID (t2))
2389 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2390 break;
2393 /* For expressions with operands, compare their operands recursively. */
2394 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2396 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2397 if (cmp != 0)
2398 return cmp;
2402 return 0;
2406 /* Compare two data-references DRA and DRB to group them into chunks
2407 suitable for grouping. */
2409 static int
2410 dr_group_sort_cmp (const void *dra_, const void *drb_)
2412 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2413 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2414 int cmp;
2416 /* Stabilize sort. */
2417 if (dra == drb)
2418 return 0;
2420 /* Ordering of DRs according to base. */
2421 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2423 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2424 if (cmp != 0)
2425 return cmp;
2428 /* And according to DR_OFFSET. */
2429 if (!dr_equal_offsets_p (dra, drb))
2431 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2432 if (cmp != 0)
2433 return cmp;
2436 /* Put reads before writes. */
2437 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2438 return DR_IS_READ (dra) ? -1 : 1;
2440 /* Then sort after access size. */
2441 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2442 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2444 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2445 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2446 if (cmp != 0)
2447 return cmp;
2450 /* And after step. */
2451 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2453 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2454 if (cmp != 0)
2455 return cmp;
2458 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2459 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2460 if (cmp == 0)
2461 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2462 return cmp;
2465 /* Function vect_analyze_data_ref_accesses.
2467 Analyze the access pattern of all the data references in the loop.
2469 FORNOW: the only access pattern that is considered vectorizable is a
2470 simple step 1 (consecutive) access.
2472 FORNOW: handle only arrays and pointer accesses. */
2474 bool
2475 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2477 unsigned int i;
2478 vec<data_reference_p> datarefs;
2479 struct data_reference *dr;
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "=== vect_analyze_data_ref_accesses ===");
2485 if (loop_vinfo)
2486 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2487 else
2488 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2490 if (datarefs.is_empty ())
2491 return true;
2493 /* Sort the array of datarefs to make building the interleaving chains
2494 linear. */
2495 qsort (datarefs.address(), datarefs.length (),
2496 sizeof (data_reference_p), dr_group_sort_cmp);
2498 /* Build the interleaving chains. */
2499 for (i = 0; i < datarefs.length () - 1;)
2501 data_reference_p dra = datarefs[i];
2502 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2503 stmt_vec_info lastinfo = NULL;
2504 for (i = i + 1; i < datarefs.length (); ++i)
2506 data_reference_p drb = datarefs[i];
2507 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2509 /* ??? Imperfect sorting (non-compatible types, non-modulo
2510 accesses, same accesses) can lead to a group to be artificially
2511 split here as we don't just skip over those. If it really
2512 matters we can push those to a worklist and re-iterate
2513 over them. The we can just skip ahead to the next DR here. */
2515 /* Check that the data-refs have same first location (except init)
2516 and they are both either store or load (not load and store). */
2517 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2518 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2519 DR_BASE_ADDRESS (drb), 0)
2520 || !dr_equal_offsets_p (dra, drb))
2521 break;
2523 /* Check that the data-refs have the same constant size and step. */
2524 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2525 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2526 if (!host_integerp (sza, 1)
2527 || !host_integerp (szb, 1)
2528 || !tree_int_cst_equal (sza, szb)
2529 || !host_integerp (DR_STEP (dra), 0)
2530 || !host_integerp (DR_STEP (drb), 0)
2531 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2532 break;
2534 /* Do not place the same access in the interleaving chain twice. */
2535 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2536 break;
2538 /* Check the types are compatible.
2539 ??? We don't distinguish this during sorting. */
2540 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2541 TREE_TYPE (DR_REF (drb))))
2542 break;
2544 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2545 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2546 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2547 gcc_assert (init_a < init_b);
2549 /* If init_b == init_a + the size of the type * k, we have an
2550 interleaving, and DRA is accessed before DRB. */
2551 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2552 if ((init_b - init_a) % type_size_a != 0)
2553 break;
2555 /* The step (if not zero) is greater than the difference between
2556 data-refs' inits. This splits groups into suitable sizes. */
2557 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2558 if (step != 0 && step <= (init_b - init_a))
2559 break;
2561 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_NOTE, vect_location,
2564 "Detected interleaving ");
2565 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2566 dump_printf (MSG_NOTE, " and ");
2567 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2570 /* Link the found element into the group list. */
2571 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2573 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2574 lastinfo = stmtinfo_a;
2576 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2577 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2578 lastinfo = stmtinfo_b;
2582 FOR_EACH_VEC_ELT (datarefs, i, dr)
2583 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2584 && !vect_analyze_data_ref_access (dr))
2586 if (dump_enabled_p ())
2587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2588 "not vectorized: complicated access pattern.");
2590 if (bb_vinfo)
2592 /* Mark the statement as not vectorizable. */
2593 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2594 continue;
2596 else
2597 return false;
2600 return true;
2603 /* Function vect_prune_runtime_alias_test_list.
2605 Prune a list of ddrs to be tested at run-time by versioning for alias.
2606 Return FALSE if resulting list of ddrs is longer then allowed by
2607 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2609 bool
2610 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2612 vec<ddr_p> ddrs =
2613 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2614 unsigned i, j;
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "=== vect_prune_runtime_alias_test_list ===");
2620 for (i = 0; i < ddrs.length (); )
2622 bool found;
2623 ddr_p ddr_i;
2625 ddr_i = ddrs[i];
2626 found = false;
2628 for (j = 0; j < i; j++)
2630 ddr_p ddr_j = ddrs[j];
2632 if (vect_vfa_range_equal (ddr_i, ddr_j))
2634 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_NOTE, vect_location,
2637 "found equal ranges ");
2638 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2639 dump_printf (MSG_NOTE, ", ");
2640 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2641 dump_printf (MSG_NOTE, " and ");
2642 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2643 dump_printf (MSG_NOTE, ", ");
2644 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2646 found = true;
2647 break;
2651 if (found)
2653 ddrs.ordered_remove (i);
2654 continue;
2656 i++;
2659 if (ddrs.length () >
2660 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2662 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2665 "disable versioning for alias - max number of "
2666 "generated checks exceeded.");
2669 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2671 return false;
2674 return true;
2677 /* Check whether a non-affine read in stmt is suitable for gather load
2678 and if so, return a builtin decl for that operation. */
2680 tree
2681 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2682 tree *offp, int *scalep)
2684 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2685 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2686 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2687 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2688 tree offtype = NULL_TREE;
2689 tree decl, base, off;
2690 enum machine_mode pmode;
2691 int punsignedp, pvolatilep;
2693 /* The gather builtins need address of the form
2694 loop_invariant + vector * {1, 2, 4, 8}
2696 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2697 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2698 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2699 multiplications and additions in it. To get a vector, we need
2700 a single SSA_NAME that will be defined in the loop and will
2701 contain everything that is not loop invariant and that can be
2702 vectorized. The following code attempts to find such a preexistng
2703 SSA_NAME OFF and put the loop invariants into a tree BASE
2704 that can be gimplified before the loop. */
2705 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2706 &pmode, &punsignedp, &pvolatilep, false);
2707 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2709 if (TREE_CODE (base) == MEM_REF)
2711 if (!integer_zerop (TREE_OPERAND (base, 1)))
2713 if (off == NULL_TREE)
2715 double_int moff = mem_ref_offset (base);
2716 off = double_int_to_tree (sizetype, moff);
2718 else
2719 off = size_binop (PLUS_EXPR, off,
2720 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2722 base = TREE_OPERAND (base, 0);
2724 else
2725 base = build_fold_addr_expr (base);
2727 if (off == NULL_TREE)
2728 off = size_zero_node;
2730 /* If base is not loop invariant, either off is 0, then we start with just
2731 the constant offset in the loop invariant BASE and continue with base
2732 as OFF, otherwise give up.
2733 We could handle that case by gimplifying the addition of base + off
2734 into some SSA_NAME and use that as off, but for now punt. */
2735 if (!expr_invariant_in_loop_p (loop, base))
2737 if (!integer_zerop (off))
2738 return NULL_TREE;
2739 off = base;
2740 base = size_int (pbitpos / BITS_PER_UNIT);
2742 /* Otherwise put base + constant offset into the loop invariant BASE
2743 and continue with OFF. */
2744 else
2746 base = fold_convert (sizetype, base);
2747 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2750 /* OFF at this point may be either a SSA_NAME or some tree expression
2751 from get_inner_reference. Try to peel off loop invariants from it
2752 into BASE as long as possible. */
2753 STRIP_NOPS (off);
2754 while (offtype == NULL_TREE)
2756 enum tree_code code;
2757 tree op0, op1, add = NULL_TREE;
2759 if (TREE_CODE (off) == SSA_NAME)
2761 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2763 if (expr_invariant_in_loop_p (loop, off))
2764 return NULL_TREE;
2766 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2767 break;
2769 op0 = gimple_assign_rhs1 (def_stmt);
2770 code = gimple_assign_rhs_code (def_stmt);
2771 op1 = gimple_assign_rhs2 (def_stmt);
2773 else
2775 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2776 return NULL_TREE;
2777 code = TREE_CODE (off);
2778 extract_ops_from_tree (off, &code, &op0, &op1);
2780 switch (code)
2782 case POINTER_PLUS_EXPR:
2783 case PLUS_EXPR:
2784 if (expr_invariant_in_loop_p (loop, op0))
2786 add = op0;
2787 off = op1;
2788 do_add:
2789 add = fold_convert (sizetype, add);
2790 if (scale != 1)
2791 add = size_binop (MULT_EXPR, add, size_int (scale));
2792 base = size_binop (PLUS_EXPR, base, add);
2793 continue;
2795 if (expr_invariant_in_loop_p (loop, op1))
2797 add = op1;
2798 off = op0;
2799 goto do_add;
2801 break;
2802 case MINUS_EXPR:
2803 if (expr_invariant_in_loop_p (loop, op1))
2805 add = fold_convert (sizetype, op1);
2806 add = size_binop (MINUS_EXPR, size_zero_node, add);
2807 off = op0;
2808 goto do_add;
2810 break;
2811 case MULT_EXPR:
2812 if (scale == 1 && host_integerp (op1, 0))
2814 scale = tree_low_cst (op1, 0);
2815 off = op0;
2816 continue;
2818 break;
2819 case SSA_NAME:
2820 off = op0;
2821 continue;
2822 CASE_CONVERT:
2823 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2824 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2825 break;
2826 if (TYPE_PRECISION (TREE_TYPE (op0))
2827 == TYPE_PRECISION (TREE_TYPE (off)))
2829 off = op0;
2830 continue;
2832 if (TYPE_PRECISION (TREE_TYPE (op0))
2833 < TYPE_PRECISION (TREE_TYPE (off)))
2835 off = op0;
2836 offtype = TREE_TYPE (off);
2837 STRIP_NOPS (off);
2838 continue;
2840 break;
2841 default:
2842 break;
2844 break;
2847 /* If at the end OFF still isn't a SSA_NAME or isn't
2848 defined in the loop, punt. */
2849 if (TREE_CODE (off) != SSA_NAME
2850 || expr_invariant_in_loop_p (loop, off))
2851 return NULL_TREE;
2853 if (offtype == NULL_TREE)
2854 offtype = TREE_TYPE (off);
2856 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2857 offtype, scale);
2858 if (decl == NULL_TREE)
2859 return NULL_TREE;
2861 if (basep)
2862 *basep = base;
2863 if (offp)
2864 *offp = off;
2865 if (scalep)
2866 *scalep = scale;
2867 return decl;
2870 /* Function vect_analyze_data_refs.
2872 Find all the data references in the loop or basic block.
2874 The general structure of the analysis of data refs in the vectorizer is as
2875 follows:
2876 1- vect_analyze_data_refs(loop/bb): call
2877 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2878 in the loop/bb and their dependences.
2879 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2880 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2881 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2885 bool
2886 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2887 bb_vec_info bb_vinfo,
2888 int *min_vf)
2890 struct loop *loop = NULL;
2891 basic_block bb = NULL;
2892 unsigned int i;
2893 vec<data_reference_p> datarefs;
2894 struct data_reference *dr;
2895 tree scalar_type;
2897 if (dump_enabled_p ())
2898 dump_printf_loc (MSG_NOTE, vect_location,
2899 "=== vect_analyze_data_refs ===\n");
2901 if (loop_vinfo)
2903 loop = LOOP_VINFO_LOOP (loop_vinfo);
2904 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2905 || find_data_references_in_loop
2906 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "not vectorized: loop contains function calls"
2911 " or data references that cannot be analyzed");
2912 return false;
2915 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2917 else
2919 gimple_stmt_iterator gsi;
2921 bb = BB_VINFO_BB (bb_vinfo);
2922 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2924 gimple stmt = gsi_stmt (gsi);
2925 if (!find_data_references_in_stmt (NULL, stmt,
2926 &BB_VINFO_DATAREFS (bb_vinfo)))
2928 /* Mark the rest of the basic-block as unvectorizable. */
2929 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2931 stmt = gsi_stmt (gsi);
2932 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2934 break;
2938 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2941 /* Go through the data-refs, check that the analysis succeeded. Update
2942 pointer from stmt_vec_info struct to DR and vectype. */
2944 FOR_EACH_VEC_ELT (datarefs, i, dr)
2946 gimple stmt;
2947 stmt_vec_info stmt_info;
2948 tree base, offset, init;
2949 bool gather = false;
2950 bool simd_lane_access = false;
2951 int vf;
2953 again:
2954 if (!dr || !DR_REF (dr))
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2958 "not vectorized: unhandled data-ref ");
2959 return false;
2962 stmt = DR_STMT (dr);
2963 stmt_info = vinfo_for_stmt (stmt);
2965 /* Discard clobbers from the dataref vector. We will remove
2966 clobber stmts during vectorization. */
2967 if (gimple_clobber_p (stmt))
2969 if (i == datarefs.length () - 1)
2971 datarefs.pop ();
2972 break;
2974 datarefs[i] = datarefs.pop ();
2975 goto again;
2978 /* Check that analysis of the data-ref succeeded. */
2979 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2980 || !DR_STEP (dr))
2982 bool maybe_gather
2983 = DR_IS_READ (dr)
2984 && !TREE_THIS_VOLATILE (DR_REF (dr))
2985 && targetm.vectorize.builtin_gather != NULL;
2986 bool maybe_simd_lane_access
2987 = loop_vinfo && loop->simduid;
2989 /* If target supports vector gather loads, or if this might be
2990 a SIMD lane access, see if they can't be used. */
2991 if (loop_vinfo
2992 && (maybe_gather || maybe_simd_lane_access)
2993 && !nested_in_vect_loop_p (loop, stmt))
2995 struct data_reference *newdr
2996 = create_data_ref (NULL, loop_containing_stmt (stmt),
2997 DR_REF (dr), stmt, true);
2998 gcc_assert (newdr != NULL && DR_REF (newdr));
2999 if (DR_BASE_ADDRESS (newdr)
3000 && DR_OFFSET (newdr)
3001 && DR_INIT (newdr)
3002 && DR_STEP (newdr)
3003 && integer_zerop (DR_STEP (newdr)))
3005 if (maybe_simd_lane_access)
3007 tree off = DR_OFFSET (newdr);
3008 STRIP_NOPS (off);
3009 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3010 && TREE_CODE (off) == MULT_EXPR
3011 && host_integerp (TREE_OPERAND (off, 1), 1))
3013 tree step = TREE_OPERAND (off, 1);
3014 off = TREE_OPERAND (off, 0);
3015 STRIP_NOPS (off);
3016 if (CONVERT_EXPR_P (off)
3017 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3018 0)))
3019 < TYPE_PRECISION (TREE_TYPE (off)))
3020 off = TREE_OPERAND (off, 0);
3021 if (TREE_CODE (off) == SSA_NAME)
3023 gimple def = SSA_NAME_DEF_STMT (off);
3024 tree reft = TREE_TYPE (DR_REF (newdr));
3025 if (gimple_call_internal_p (def)
3026 && gimple_call_internal_fn (def)
3027 == IFN_GOMP_SIMD_LANE)
3029 tree arg = gimple_call_arg (def, 0);
3030 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3031 arg = SSA_NAME_VAR (arg);
3032 if (arg == loop->simduid
3033 /* For now. */
3034 && tree_int_cst_equal
3035 (TYPE_SIZE_UNIT (reft),
3036 step))
3038 DR_OFFSET (newdr) = ssize_int (0);
3039 DR_STEP (newdr) = step;
3040 dr = newdr;
3041 simd_lane_access = true;
3047 if (!simd_lane_access && maybe_gather)
3049 dr = newdr;
3050 gather = true;
3053 if (!gather && !simd_lane_access)
3054 free_data_ref (newdr);
3057 if (!gather && !simd_lane_access)
3059 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062 "not vectorized: data ref analysis "
3063 "failed ");
3064 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3067 if (bb_vinfo)
3068 break;
3070 return false;
3074 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3076 if (dump_enabled_p ())
3077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3078 "not vectorized: base addr of dr is a "
3079 "constant");
3081 if (bb_vinfo)
3082 break;
3084 if (gather || simd_lane_access)
3085 free_data_ref (dr);
3086 return false;
3089 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3091 if (dump_enabled_p ())
3093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3094 "not vectorized: volatile type ");
3095 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3098 if (bb_vinfo)
3099 break;
3101 return false;
3104 if (stmt_can_throw_internal (stmt))
3106 if (dump_enabled_p ())
3108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3109 "not vectorized: statement can throw an "
3110 "exception ");
3111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3114 if (bb_vinfo)
3115 break;
3117 if (gather || simd_lane_access)
3118 free_data_ref (dr);
3119 return false;
3122 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3123 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3125 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3128 "not vectorized: statement is bitfield "
3129 "access ");
3130 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3133 if (bb_vinfo)
3134 break;
3136 if (gather || simd_lane_access)
3137 free_data_ref (dr);
3138 return false;
3141 base = unshare_expr (DR_BASE_ADDRESS (dr));
3142 offset = unshare_expr (DR_OFFSET (dr));
3143 init = unshare_expr (DR_INIT (dr));
3145 if (is_gimple_call (stmt))
3147 if (dump_enabled_p ())
3149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3150 "not vectorized: dr in a call ");
3151 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3154 if (bb_vinfo)
3155 break;
3157 if (gather || simd_lane_access)
3158 free_data_ref (dr);
3159 return false;
3162 /* Update DR field in stmt_vec_info struct. */
3164 /* If the dataref is in an inner-loop of the loop that is considered for
3165 for vectorization, we also want to analyze the access relative to
3166 the outer-loop (DR contains information only relative to the
3167 inner-most enclosing loop). We do that by building a reference to the
3168 first location accessed by the inner-loop, and analyze it relative to
3169 the outer-loop. */
3170 if (loop && nested_in_vect_loop_p (loop, stmt))
3172 tree outer_step, outer_base, outer_init;
3173 HOST_WIDE_INT pbitsize, pbitpos;
3174 tree poffset;
3175 enum machine_mode pmode;
3176 int punsignedp, pvolatilep;
3177 affine_iv base_iv, offset_iv;
3178 tree dinit;
3180 /* Build a reference to the first location accessed by the
3181 inner-loop: *(BASE+INIT). (The first location is actually
3182 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3183 tree inner_base = build_fold_indirect_ref
3184 (fold_build_pointer_plus (base, init));
3186 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location,
3189 "analyze in outer-loop: ");
3190 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3193 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3194 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3195 gcc_assert (outer_base != NULL_TREE);
3197 if (pbitpos % BITS_PER_UNIT != 0)
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201 "failed: bit offset alignment.\n");
3202 return false;
3205 outer_base = build_fold_addr_expr (outer_base);
3206 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3207 &base_iv, false))
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3211 "failed: evolution of base is not affine.\n");
3212 return false;
3215 if (offset)
3217 if (poffset)
3218 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3219 poffset);
3220 else
3221 poffset = offset;
3224 if (!poffset)
3226 offset_iv.base = ssize_int (0);
3227 offset_iv.step = ssize_int (0);
3229 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3230 &offset_iv, false))
3232 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3234 "evolution of offset is not affine.\n");
3235 return false;
3238 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3239 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3240 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3241 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3242 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3244 outer_step = size_binop (PLUS_EXPR,
3245 fold_convert (ssizetype, base_iv.step),
3246 fold_convert (ssizetype, offset_iv.step));
3248 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3249 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3250 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3251 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3252 STMT_VINFO_DR_OFFSET (stmt_info) =
3253 fold_convert (ssizetype, offset_iv.base);
3254 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3255 size_int (highest_pow2_factor (offset_iv.base));
3257 if (dump_enabled_p ())
3259 dump_printf_loc (MSG_NOTE, vect_location,
3260 "\touter base_address: ");
3261 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3262 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3263 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3264 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3265 STMT_VINFO_DR_OFFSET (stmt_info));
3266 dump_printf (MSG_NOTE,
3267 "\n\touter constant offset from base address: ");
3268 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3269 STMT_VINFO_DR_INIT (stmt_info));
3270 dump_printf (MSG_NOTE, "\n\touter step: ");
3271 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3272 STMT_VINFO_DR_STEP (stmt_info));
3273 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3274 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3275 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3279 if (STMT_VINFO_DATA_REF (stmt_info))
3281 if (dump_enabled_p ())
3283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3284 "not vectorized: more than one data ref "
3285 "in stmt: ");
3286 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3289 if (bb_vinfo)
3290 break;
3292 if (gather || simd_lane_access)
3293 free_data_ref (dr);
3294 return false;
3297 STMT_VINFO_DATA_REF (stmt_info) = dr;
3298 if (simd_lane_access)
3300 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3301 datarefs[i] = dr;
3304 /* Set vectype for STMT. */
3305 scalar_type = TREE_TYPE (DR_REF (dr));
3306 STMT_VINFO_VECTYPE (stmt_info) =
3307 get_vectype_for_scalar_type (scalar_type);
3308 if (!STMT_VINFO_VECTYPE (stmt_info))
3310 if (dump_enabled_p ())
3312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3313 "not vectorized: no vectype for stmt: ");
3314 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3315 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3317 scalar_type);
3320 if (bb_vinfo)
3321 break;
3323 if (gather || simd_lane_access)
3325 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3326 free_data_ref (dr);
3328 return false;
3330 else
3332 if (dump_enabled_p ())
3334 dump_printf_loc (MSG_NOTE, vect_location,
3335 "got vectype for stmt: ");
3336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3337 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3338 STMT_VINFO_VECTYPE (stmt_info));
3342 /* Adjust the minimal vectorization factor according to the
3343 vector type. */
3344 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3345 if (vf > *min_vf)
3346 *min_vf = vf;
3348 if (gather)
3350 tree off;
3352 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3353 if (gather
3354 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3355 gather = false;
3356 if (!gather)
3358 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3359 free_data_ref (dr);
3360 if (dump_enabled_p ())
3362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3363 "not vectorized: not suitable for gather "
3364 "load ");
3365 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3367 return false;
3370 datarefs[i] = dr;
3371 STMT_VINFO_GATHER_P (stmt_info) = true;
3373 else if (loop_vinfo
3374 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3376 if (nested_in_vect_loop_p (loop, stmt)
3377 || !DR_IS_READ (dr))
3379 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3382 "not vectorized: not suitable for strided "
3383 "load ");
3384 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3386 return false;
3388 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3392 /* If we stopped analysis at the first dataref we could not analyze
3393 when trying to vectorize a basic-block mark the rest of the datarefs
3394 as not vectorizable and truncate the vector of datarefs. That
3395 avoids spending useless time in analyzing their dependence. */
3396 if (i != datarefs.length ())
3398 gcc_assert (bb_vinfo != NULL);
3399 for (unsigned j = i; j < datarefs.length (); ++j)
3401 data_reference_p dr = datarefs[j];
3402 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3403 free_data_ref (dr);
3405 datarefs.truncate (i);
3408 return true;
3412 /* Function vect_get_new_vect_var.
3414 Returns a name for a new variable. The current naming scheme appends the
3415 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3416 the name of vectorizer generated variables, and appends that to NAME if
3417 provided. */
3419 tree
3420 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3422 const char *prefix;
3423 tree new_vect_var;
3425 switch (var_kind)
3427 case vect_simple_var:
3428 prefix = "vect";
3429 break;
3430 case vect_scalar_var:
3431 prefix = "stmp";
3432 break;
3433 case vect_pointer_var:
3434 prefix = "vectp";
3435 break;
3436 default:
3437 gcc_unreachable ();
3440 if (name)
3442 char* tmp = concat (prefix, "_", name, NULL);
3443 new_vect_var = create_tmp_reg (type, tmp);
3444 free (tmp);
3446 else
3447 new_vect_var = create_tmp_reg (type, prefix);
3449 return new_vect_var;
3453 /* Function vect_create_addr_base_for_vector_ref.
3455 Create an expression that computes the address of the first memory location
3456 that will be accessed for a data reference.
3458 Input:
3459 STMT: The statement containing the data reference.
3460 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3461 OFFSET: Optional. If supplied, it is be added to the initial address.
3462 LOOP: Specify relative to which loop-nest should the address be computed.
3463 For example, when the dataref is in an inner-loop nested in an
3464 outer-loop that is now being vectorized, LOOP can be either the
3465 outer-loop, or the inner-loop. The first memory location accessed
3466 by the following dataref ('in' points to short):
3468 for (i=0; i<N; i++)
3469 for (j=0; j<M; j++)
3470 s += in[i+j]
3472 is as follows:
3473 if LOOP=i_loop: &in (relative to i_loop)
3474 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3476 Output:
3477 1. Return an SSA_NAME whose value is the address of the memory location of
3478 the first vector of the data reference.
3479 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3480 these statement(s) which define the returned SSA_NAME.
3482 FORNOW: We are only handling array accesses with step 1. */
3484 tree
3485 vect_create_addr_base_for_vector_ref (gimple stmt,
3486 gimple_seq *new_stmt_list,
3487 tree offset,
3488 struct loop *loop)
3490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3491 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3492 tree data_ref_base;
3493 const char *base_name;
3494 tree addr_base;
3495 tree dest;
3496 gimple_seq seq = NULL;
3497 tree base_offset;
3498 tree init;
3499 tree vect_ptr_type;
3500 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3501 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3503 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3505 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3507 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3509 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3510 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3511 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3513 else
3515 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3516 base_offset = unshare_expr (DR_OFFSET (dr));
3517 init = unshare_expr (DR_INIT (dr));
3520 if (loop_vinfo)
3521 base_name = get_name (data_ref_base);
3522 else
3524 base_offset = ssize_int (0);
3525 init = ssize_int (0);
3526 base_name = get_name (DR_REF (dr));
3529 /* Create base_offset */
3530 base_offset = size_binop (PLUS_EXPR,
3531 fold_convert (sizetype, base_offset),
3532 fold_convert (sizetype, init));
3534 if (offset)
3536 offset = fold_build2 (MULT_EXPR, sizetype,
3537 fold_convert (sizetype, offset), step);
3538 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3539 base_offset, offset);
3542 /* base + base_offset */
3543 if (loop_vinfo)
3544 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3545 else
3547 addr_base = build1 (ADDR_EXPR,
3548 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3549 unshare_expr (DR_REF (dr)));
3552 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3553 addr_base = fold_convert (vect_ptr_type, addr_base);
3554 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3555 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3556 gimple_seq_add_seq (new_stmt_list, seq);
3558 if (DR_PTR_INFO (dr)
3559 && TREE_CODE (addr_base) == SSA_NAME)
3561 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3562 if (offset)
3563 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3566 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3569 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3572 return addr_base;
3576 /* Function vect_create_data_ref_ptr.
3578 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3579 location accessed in the loop by STMT, along with the def-use update
3580 chain to appropriately advance the pointer through the loop iterations.
3581 Also set aliasing information for the pointer. This pointer is used by
3582 the callers to this function to create a memory reference expression for
3583 vector load/store access.
3585 Input:
3586 1. STMT: a stmt that references memory. Expected to be of the form
3587 GIMPLE_ASSIGN <name, data-ref> or
3588 GIMPLE_ASSIGN <data-ref, name>.
3589 2. AGGR_TYPE: the type of the reference, which should be either a vector
3590 or an array.
3591 3. AT_LOOP: the loop where the vector memref is to be created.
3592 4. OFFSET (optional): an offset to be added to the initial address accessed
3593 by the data-ref in STMT.
3594 5. BSI: location where the new stmts are to be placed if there is no loop
3595 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3596 pointing to the initial address.
3598 Output:
3599 1. Declare a new ptr to vector_type, and have it point to the base of the
3600 data reference (initial addressed accessed by the data reference).
3601 For example, for vector of type V8HI, the following code is generated:
3603 v8hi *ap;
3604 ap = (v8hi *)initial_address;
3606 if OFFSET is not supplied:
3607 initial_address = &a[init];
3608 if OFFSET is supplied:
3609 initial_address = &a[init + OFFSET];
3611 Return the initial_address in INITIAL_ADDRESS.
3613 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3614 update the pointer in each iteration of the loop.
3616 Return the increment stmt that updates the pointer in PTR_INCR.
3618 3. Set INV_P to true if the access pattern of the data reference in the
3619 vectorized loop is invariant. Set it to false otherwise.
3621 4. Return the pointer. */
3623 tree
3624 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3625 tree offset, tree *initial_address,
3626 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3627 bool only_init, bool *inv_p)
3629 const char *base_name;
3630 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3631 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3632 struct loop *loop = NULL;
3633 bool nested_in_vect_loop = false;
3634 struct loop *containing_loop = NULL;
3635 tree aggr_ptr_type;
3636 tree aggr_ptr;
3637 tree new_temp;
3638 gimple vec_stmt;
3639 gimple_seq new_stmt_list = NULL;
3640 edge pe = NULL;
3641 basic_block new_bb;
3642 tree aggr_ptr_init;
3643 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3644 tree aptr;
3645 gimple_stmt_iterator incr_gsi;
3646 bool insert_after;
3647 tree indx_before_incr, indx_after_incr;
3648 gimple incr;
3649 tree step;
3650 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3652 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3653 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3655 if (loop_vinfo)
3657 loop = LOOP_VINFO_LOOP (loop_vinfo);
3658 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3659 containing_loop = (gimple_bb (stmt))->loop_father;
3660 pe = loop_preheader_edge (loop);
3662 else
3664 gcc_assert (bb_vinfo);
3665 only_init = true;
3666 *ptr_incr = NULL;
3669 /* Check the step (evolution) of the load in LOOP, and record
3670 whether it's invariant. */
3671 if (nested_in_vect_loop)
3672 step = STMT_VINFO_DR_STEP (stmt_info);
3673 else
3674 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3676 if (integer_zerop (step))
3677 *inv_p = true;
3678 else
3679 *inv_p = false;
3681 /* Create an expression for the first address accessed by this load
3682 in LOOP. */
3683 base_name = get_name (DR_BASE_ADDRESS (dr));
3685 if (dump_enabled_p ())
3687 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 "create %s-pointer variable to type: ",
3690 tree_code_name[(int) TREE_CODE (aggr_type)]);
3691 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3692 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3693 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3694 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3695 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3696 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3697 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3698 else
3699 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3700 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3703 /* (1) Create the new aggregate-pointer variable.
3704 Vector and array types inherit the alias set of their component
3705 type by default so we need to use a ref-all pointer if the data
3706 reference does not conflict with the created aggregated data
3707 reference because it is not addressable. */
3708 bool need_ref_all = false;
3709 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3710 get_alias_set (DR_REF (dr))))
3711 need_ref_all = true;
3712 /* Likewise for any of the data references in the stmt group. */
3713 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3715 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3718 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3719 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3720 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3721 get_alias_set (DR_REF (sdr))))
3723 need_ref_all = true;
3724 break;
3726 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3728 while (orig_stmt);
3730 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3731 need_ref_all);
3732 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3735 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3736 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3737 def-use update cycles for the pointer: one relative to the outer-loop
3738 (LOOP), which is what steps (3) and (4) below do. The other is relative
3739 to the inner-loop (which is the inner-most loop containing the dataref),
3740 and this is done be step (5) below.
3742 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3743 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3744 redundant. Steps (3),(4) create the following:
3746 vp0 = &base_addr;
3747 LOOP: vp1 = phi(vp0,vp2)
3750 vp2 = vp1 + step
3751 goto LOOP
3753 If there is an inner-loop nested in loop, then step (5) will also be
3754 applied, and an additional update in the inner-loop will be created:
3756 vp0 = &base_addr;
3757 LOOP: vp1 = phi(vp0,vp2)
3759 inner: vp3 = phi(vp1,vp4)
3760 vp4 = vp3 + inner_step
3761 if () goto inner
3763 vp2 = vp1 + step
3764 if () goto LOOP */
3766 /* (2) Calculate the initial address of the aggregate-pointer, and set
3767 the aggregate-pointer to point to it before the loop. */
3769 /* Create: (&(base[init_val+offset]) in the loop preheader. */
3771 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3772 offset, loop);
3773 if (new_stmt_list)
3775 if (pe)
3777 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3778 gcc_assert (!new_bb);
3780 else
3781 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3784 *initial_address = new_temp;
3786 /* Create: p = (aggr_type *) initial_base */
3787 if (TREE_CODE (new_temp) != SSA_NAME
3788 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3790 vec_stmt = gimple_build_assign (aggr_ptr,
3791 fold_convert (aggr_ptr_type, new_temp));
3792 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3793 /* Copy the points-to information if it exists. */
3794 if (DR_PTR_INFO (dr))
3795 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3796 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3797 if (pe)
3799 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3800 gcc_assert (!new_bb);
3802 else
3803 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3805 else
3806 aggr_ptr_init = new_temp;
3808 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3809 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3810 inner-loop nested in LOOP (during outer-loop vectorization). */
3812 /* No update in loop is required. */
3813 if (only_init && (!loop_vinfo || at_loop == loop))
3814 aptr = aggr_ptr_init;
3815 else
3817 /* The step of the aggregate pointer is the type size. */
3818 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3819 /* One exception to the above is when the scalar step of the load in
3820 LOOP is zero. In this case the step here is also zero. */
3821 if (*inv_p)
3822 iv_step = size_zero_node;
3823 else if (tree_int_cst_sgn (step) == -1)
3824 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3826 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3828 create_iv (aggr_ptr_init,
3829 fold_convert (aggr_ptr_type, iv_step),
3830 aggr_ptr, loop, &incr_gsi, insert_after,
3831 &indx_before_incr, &indx_after_incr);
3832 incr = gsi_stmt (incr_gsi);
3833 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3835 /* Copy the points-to information if it exists. */
3836 if (DR_PTR_INFO (dr))
3838 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3839 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3841 if (ptr_incr)
3842 *ptr_incr = incr;
3844 aptr = indx_before_incr;
3847 if (!nested_in_vect_loop || only_init)
3848 return aptr;
3851 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3852 nested in LOOP, if exists. */
3854 gcc_assert (nested_in_vect_loop);
3855 if (!only_init)
3857 standard_iv_increment_position (containing_loop, &incr_gsi,
3858 &insert_after);
3859 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3860 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3861 &indx_after_incr);
3862 incr = gsi_stmt (incr_gsi);
3863 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3865 /* Copy the points-to information if it exists. */
3866 if (DR_PTR_INFO (dr))
3868 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3869 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3871 if (ptr_incr)
3872 *ptr_incr = incr;
3874 return indx_before_incr;
3876 else
3877 gcc_unreachable ();
3881 /* Function bump_vector_ptr
3883 Increment a pointer (to a vector type) by vector-size. If requested,
3884 i.e. if PTR-INCR is given, then also connect the new increment stmt
3885 to the existing def-use update-chain of the pointer, by modifying
3886 the PTR_INCR as illustrated below:
3888 The pointer def-use update-chain before this function:
3889 DATAREF_PTR = phi (p_0, p_2)
3890 ....
3891 PTR_INCR: p_2 = DATAREF_PTR + step
3893 The pointer def-use update-chain after this function:
3894 DATAREF_PTR = phi (p_0, p_2)
3895 ....
3896 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3897 ....
3898 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3900 Input:
3901 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3902 in the loop.
3903 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3904 the loop. The increment amount across iterations is expected
3905 to be vector_size.
3906 BSI - location where the new update stmt is to be placed.
3907 STMT - the original scalar memory-access stmt that is being vectorized.
3908 BUMP - optional. The offset by which to bump the pointer. If not given,
3909 the offset is assumed to be vector_size.
3911 Output: Return NEW_DATAREF_PTR as illustrated above.
3915 tree
3916 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3917 gimple stmt, tree bump)
3919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3920 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3921 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3922 tree update = TYPE_SIZE_UNIT (vectype);
3923 gimple incr_stmt;
3924 ssa_op_iter iter;
3925 use_operand_p use_p;
3926 tree new_dataref_ptr;
3928 if (bump)
3929 update = bump;
3931 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3932 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3933 dataref_ptr, update);
3934 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3936 /* Copy the points-to information if it exists. */
3937 if (DR_PTR_INFO (dr))
3939 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3940 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3943 if (!ptr_incr)
3944 return new_dataref_ptr;
3946 /* Update the vector-pointer's cross-iteration increment. */
3947 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3949 tree use = USE_FROM_PTR (use_p);
3951 if (use == dataref_ptr)
3952 SET_USE (use_p, new_dataref_ptr);
3953 else
3954 gcc_assert (tree_int_cst_compare (use, update) == 0);
3957 return new_dataref_ptr;
3961 /* Function vect_create_destination_var.
3963 Create a new temporary of type VECTYPE. */
3965 tree
3966 vect_create_destination_var (tree scalar_dest, tree vectype)
3968 tree vec_dest;
3969 const char *name;
3970 char *new_name;
3971 tree type;
3972 enum vect_var_kind kind;
3974 kind = vectype ? vect_simple_var : vect_scalar_var;
3975 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3977 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3979 name = get_name (scalar_dest);
3980 if (name)
3981 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3982 else
3983 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3984 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3985 free (new_name);
3987 return vec_dest;
3990 /* Function vect_grouped_store_supported.
3992 Returns TRUE if interleave high and interleave low permutations
3993 are supported, and FALSE otherwise. */
3995 bool
3996 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3998 enum machine_mode mode = TYPE_MODE (vectype);
4000 /* vect_permute_store_chain requires the group size to be a power of two. */
4001 if (exact_log2 (count) == -1)
4003 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4005 "the size of the group of accesses"
4006 " is not a power of 2");
4007 return false;
4010 /* Check that the permutation is supported. */
4011 if (VECTOR_MODE_P (mode))
4013 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4014 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4015 for (i = 0; i < nelt / 2; i++)
4017 sel[i * 2] = i;
4018 sel[i * 2 + 1] = i + nelt;
4020 if (can_vec_perm_p (mode, false, sel))
4022 for (i = 0; i < nelt; i++)
4023 sel[i] += nelt / 2;
4024 if (can_vec_perm_p (mode, false, sel))
4025 return true;
4029 if (dump_enabled_p ())
4030 dump_printf (MSG_MISSED_OPTIMIZATION,
4031 "interleave op not supported by target.");
4032 return false;
4036 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4037 type VECTYPE. */
4039 bool
4040 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4042 return vect_lanes_optab_supported_p ("vec_store_lanes",
4043 vec_store_lanes_optab,
4044 vectype, count);
4048 /* Function vect_permute_store_chain.
4050 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4051 a power of 2, generate interleave_high/low stmts to reorder the data
4052 correctly for the stores. Return the final references for stores in
4053 RESULT_CHAIN.
4055 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4056 The input is 4 vectors each containing 8 elements. We assign a number to
4057 each element, the input sequence is:
4059 1st vec: 0 1 2 3 4 5 6 7
4060 2nd vec: 8 9 10 11 12 13 14 15
4061 3rd vec: 16 17 18 19 20 21 22 23
4062 4th vec: 24 25 26 27 28 29 30 31
4064 The output sequence should be:
4066 1st vec: 0 8 16 24 1 9 17 25
4067 2nd vec: 2 10 18 26 3 11 19 27
4068 3rd vec: 4 12 20 28 5 13 21 30
4069 4th vec: 6 14 22 30 7 15 23 31
4071 i.e., we interleave the contents of the four vectors in their order.
4073 We use interleave_high/low instructions to create such output. The input of
4074 each interleave_high/low operation is two vectors:
4075 1st vec 2nd vec
4076 0 1 2 3 4 5 6 7
4077 the even elements of the result vector are obtained left-to-right from the
4078 high/low elements of the first vector. The odd elements of the result are
4079 obtained left-to-right from the high/low elements of the second vector.
4080 The output of interleave_high will be: 0 4 1 5
4081 and of interleave_low: 2 6 3 7
4084 The permutation is done in log LENGTH stages. In each stage interleave_high
4085 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4086 where the first argument is taken from the first half of DR_CHAIN and the
4087 second argument from it's second half.
4088 In our example,
4090 I1: interleave_high (1st vec, 3rd vec)
4091 I2: interleave_low (1st vec, 3rd vec)
4092 I3: interleave_high (2nd vec, 4th vec)
4093 I4: interleave_low (2nd vec, 4th vec)
4095 The output for the first stage is:
4097 I1: 0 16 1 17 2 18 3 19
4098 I2: 4 20 5 21 6 22 7 23
4099 I3: 8 24 9 25 10 26 11 27
4100 I4: 12 28 13 29 14 30 15 31
4102 The output of the second stage, i.e. the final result is:
4104 I1: 0 8 16 24 1 9 17 25
4105 I2: 2 10 18 26 3 11 19 27
4106 I3: 4 12 20 28 5 13 21 30
4107 I4: 6 14 22 30 7 15 23 31. */
4109 void
4110 vect_permute_store_chain (vec<tree> dr_chain,
4111 unsigned int length,
4112 gimple stmt,
4113 gimple_stmt_iterator *gsi,
4114 vec<tree> *result_chain)
4116 tree vect1, vect2, high, low;
4117 gimple perm_stmt;
4118 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4119 tree perm_mask_low, perm_mask_high;
4120 unsigned int i, n;
4121 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4122 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4124 result_chain->quick_grow (length);
4125 memcpy (result_chain->address (), dr_chain.address (),
4126 length * sizeof (tree));
4128 for (i = 0, n = nelt / 2; i < n; i++)
4130 sel[i * 2] = i;
4131 sel[i * 2 + 1] = i + nelt;
4133 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4134 gcc_assert (perm_mask_high != NULL);
4136 for (i = 0; i < nelt; i++)
4137 sel[i] += nelt / 2;
4138 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4139 gcc_assert (perm_mask_low != NULL);
4141 for (i = 0, n = exact_log2 (length); i < n; i++)
4143 for (j = 0; j < length/2; j++)
4145 vect1 = dr_chain[j];
4146 vect2 = dr_chain[j+length/2];
4148 /* Create interleaving stmt:
4149 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4150 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4151 perm_stmt
4152 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4153 vect1, vect2, perm_mask_high);
4154 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4155 (*result_chain)[2*j] = high;
4157 /* Create interleaving stmt:
4158 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4159 nelt*3/2+1, ...}> */
4160 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4161 perm_stmt
4162 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4163 vect1, vect2, perm_mask_low);
4164 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4165 (*result_chain)[2*j+1] = low;
4167 memcpy (dr_chain.address (), result_chain->address (),
4168 length * sizeof (tree));
4172 /* Function vect_setup_realignment
4174 This function is called when vectorizing an unaligned load using
4175 the dr_explicit_realign[_optimized] scheme.
4176 This function generates the following code at the loop prolog:
4178 p = initial_addr;
4179 x msq_init = *(floor(p)); # prolog load
4180 realignment_token = call target_builtin;
4181 loop:
4182 x msq = phi (msq_init, ---)
4184 The stmts marked with x are generated only for the case of
4185 dr_explicit_realign_optimized.
4187 The code above sets up a new (vector) pointer, pointing to the first
4188 location accessed by STMT, and a "floor-aligned" load using that pointer.
4189 It also generates code to compute the "realignment-token" (if the relevant
4190 target hook was defined), and creates a phi-node at the loop-header bb
4191 whose arguments are the result of the prolog-load (created by this
4192 function) and the result of a load that takes place in the loop (to be
4193 created by the caller to this function).
4195 For the case of dr_explicit_realign_optimized:
4196 The caller to this function uses the phi-result (msq) to create the
4197 realignment code inside the loop, and sets up the missing phi argument,
4198 as follows:
4199 loop:
4200 msq = phi (msq_init, lsq)
4201 lsq = *(floor(p')); # load in loop
4202 result = realign_load (msq, lsq, realignment_token);
4204 For the case of dr_explicit_realign:
4205 loop:
4206 msq = *(floor(p)); # load in loop
4207 p' = p + (VS-1);
4208 lsq = *(floor(p')); # load in loop
4209 result = realign_load (msq, lsq, realignment_token);
4211 Input:
4212 STMT - (scalar) load stmt to be vectorized. This load accesses
4213 a memory location that may be unaligned.
4214 BSI - place where new code is to be inserted.
4215 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4216 is used.
4218 Output:
4219 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4220 target hook, if defined.
4221 Return value - the result of the loop-header phi node. */
4223 tree
4224 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4225 tree *realignment_token,
4226 enum dr_alignment_support alignment_support_scheme,
4227 tree init_addr,
4228 struct loop **at_loop)
4230 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4231 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4232 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4233 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4234 struct loop *loop = NULL;
4235 edge pe = NULL;
4236 tree scalar_dest = gimple_assign_lhs (stmt);
4237 tree vec_dest;
4238 gimple inc;
4239 tree ptr;
4240 tree data_ref;
4241 gimple new_stmt;
4242 basic_block new_bb;
4243 tree msq_init = NULL_TREE;
4244 tree new_temp;
4245 gimple phi_stmt;
4246 tree msq = NULL_TREE;
4247 gimple_seq stmts = NULL;
4248 bool inv_p;
4249 bool compute_in_loop = false;
4250 bool nested_in_vect_loop = false;
4251 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4252 struct loop *loop_for_initial_load = NULL;
4254 if (loop_vinfo)
4256 loop = LOOP_VINFO_LOOP (loop_vinfo);
4257 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4260 gcc_assert (alignment_support_scheme == dr_explicit_realign
4261 || alignment_support_scheme == dr_explicit_realign_optimized);
4263 /* We need to generate three things:
4264 1. the misalignment computation
4265 2. the extra vector load (for the optimized realignment scheme).
4266 3. the phi node for the two vectors from which the realignment is
4267 done (for the optimized realignment scheme). */
4269 /* 1. Determine where to generate the misalignment computation.
4271 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4272 calculation will be generated by this function, outside the loop (in the
4273 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4274 caller, inside the loop.
4276 Background: If the misalignment remains fixed throughout the iterations of
4277 the loop, then both realignment schemes are applicable, and also the
4278 misalignment computation can be done outside LOOP. This is because we are
4279 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4280 are a multiple of VS (the Vector Size), and therefore the misalignment in
4281 different vectorized LOOP iterations is always the same.
4282 The problem arises only if the memory access is in an inner-loop nested
4283 inside LOOP, which is now being vectorized using outer-loop vectorization.
4284 This is the only case when the misalignment of the memory access may not
4285 remain fixed throughout the iterations of the inner-loop (as explained in
4286 detail in vect_supportable_dr_alignment). In this case, not only is the
4287 optimized realignment scheme not applicable, but also the misalignment
4288 computation (and generation of the realignment token that is passed to
4289 REALIGN_LOAD) have to be done inside the loop.
4291 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4292 or not, which in turn determines if the misalignment is computed inside
4293 the inner-loop, or outside LOOP. */
4295 if (init_addr != NULL_TREE || !loop_vinfo)
4297 compute_in_loop = true;
4298 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4302 /* 2. Determine where to generate the extra vector load.
4304 For the optimized realignment scheme, instead of generating two vector
4305 loads in each iteration, we generate a single extra vector load in the
4306 preheader of the loop, and in each iteration reuse the result of the
4307 vector load from the previous iteration. In case the memory access is in
4308 an inner-loop nested inside LOOP, which is now being vectorized using
4309 outer-loop vectorization, we need to determine whether this initial vector
4310 load should be generated at the preheader of the inner-loop, or can be
4311 generated at the preheader of LOOP. If the memory access has no evolution
4312 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4313 to be generated inside LOOP (in the preheader of the inner-loop). */
4315 if (nested_in_vect_loop)
4317 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4318 bool invariant_in_outerloop =
4319 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4320 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4322 else
4323 loop_for_initial_load = loop;
4324 if (at_loop)
4325 *at_loop = loop_for_initial_load;
4327 if (loop_for_initial_load)
4328 pe = loop_preheader_edge (loop_for_initial_load);
4330 /* 3. For the case of the optimized realignment, create the first vector
4331 load at the loop preheader. */
4333 if (alignment_support_scheme == dr_explicit_realign_optimized)
4335 /* Create msq_init = *(floor(p1)) in the loop preheader */
4337 gcc_assert (!compute_in_loop);
4338 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4339 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4340 NULL_TREE, &init_addr, NULL, &inc,
4341 true, &inv_p);
4342 new_temp = copy_ssa_name (ptr, NULL);
4343 new_stmt = gimple_build_assign_with_ops
4344 (BIT_AND_EXPR, new_temp, ptr,
4345 build_int_cst (TREE_TYPE (ptr),
4346 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4347 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4348 gcc_assert (!new_bb);
4349 data_ref
4350 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4351 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4352 new_stmt = gimple_build_assign (vec_dest, data_ref);
4353 new_temp = make_ssa_name (vec_dest, new_stmt);
4354 gimple_assign_set_lhs (new_stmt, new_temp);
4355 if (pe)
4357 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4358 gcc_assert (!new_bb);
4360 else
4361 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4363 msq_init = gimple_assign_lhs (new_stmt);
4366 /* 4. Create realignment token using a target builtin, if available.
4367 It is done either inside the containing loop, or before LOOP (as
4368 determined above). */
4370 if (targetm.vectorize.builtin_mask_for_load)
4372 tree builtin_decl;
4374 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4375 if (!init_addr)
4377 /* Generate the INIT_ADDR computation outside LOOP. */
4378 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4379 NULL_TREE, loop);
4380 if (loop)
4382 pe = loop_preheader_edge (loop);
4383 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4384 gcc_assert (!new_bb);
4386 else
4387 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4390 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4391 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4392 vec_dest =
4393 vect_create_destination_var (scalar_dest,
4394 gimple_call_return_type (new_stmt));
4395 new_temp = make_ssa_name (vec_dest, new_stmt);
4396 gimple_call_set_lhs (new_stmt, new_temp);
4398 if (compute_in_loop)
4399 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4400 else
4402 /* Generate the misalignment computation outside LOOP. */
4403 pe = loop_preheader_edge (loop);
4404 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4405 gcc_assert (!new_bb);
4408 *realignment_token = gimple_call_lhs (new_stmt);
4410 /* The result of the CALL_EXPR to this builtin is determined from
4411 the value of the parameter and no global variables are touched
4412 which makes the builtin a "const" function. Requiring the
4413 builtin to have the "const" attribute makes it unnecessary
4414 to call mark_call_clobbered. */
4415 gcc_assert (TREE_READONLY (builtin_decl));
4418 if (alignment_support_scheme == dr_explicit_realign)
4419 return msq;
4421 gcc_assert (!compute_in_loop);
4422 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4425 /* 5. Create msq = phi <msq_init, lsq> in loop */
4427 pe = loop_preheader_edge (containing_loop);
4428 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4429 msq = make_ssa_name (vec_dest, NULL);
4430 phi_stmt = create_phi_node (msq, containing_loop->header);
4431 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4433 return msq;
4437 /* Function vect_grouped_load_supported.
4439 Returns TRUE if even and odd permutations are supported,
4440 and FALSE otherwise. */
4442 bool
4443 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4445 enum machine_mode mode = TYPE_MODE (vectype);
4447 /* vect_permute_load_chain requires the group size to be a power of two. */
4448 if (exact_log2 (count) == -1)
4450 if (dump_enabled_p ())
4451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4452 "the size of the group of accesses"
4453 " is not a power of 2");
4454 return false;
4457 /* Check that the permutation is supported. */
4458 if (VECTOR_MODE_P (mode))
4460 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4461 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4463 for (i = 0; i < nelt; i++)
4464 sel[i] = i * 2;
4465 if (can_vec_perm_p (mode, false, sel))
4467 for (i = 0; i < nelt; i++)
4468 sel[i] = i * 2 + 1;
4469 if (can_vec_perm_p (mode, false, sel))
4470 return true;
4474 if (dump_enabled_p ())
4475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4476 "extract even/odd not supported by target");
4477 return false;
4480 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4481 type VECTYPE. */
4483 bool
4484 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4486 return vect_lanes_optab_supported_p ("vec_load_lanes",
4487 vec_load_lanes_optab,
4488 vectype, count);
4491 /* Function vect_permute_load_chain.
4493 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4494 a power of 2, generate extract_even/odd stmts to reorder the input data
4495 correctly. Return the final references for loads in RESULT_CHAIN.
4497 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4498 The input is 4 vectors each containing 8 elements. We assign a number to each
4499 element, the input sequence is:
4501 1st vec: 0 1 2 3 4 5 6 7
4502 2nd vec: 8 9 10 11 12 13 14 15
4503 3rd vec: 16 17 18 19 20 21 22 23
4504 4th vec: 24 25 26 27 28 29 30 31
4506 The output sequence should be:
4508 1st vec: 0 4 8 12 16 20 24 28
4509 2nd vec: 1 5 9 13 17 21 25 29
4510 3rd vec: 2 6 10 14 18 22 26 30
4511 4th vec: 3 7 11 15 19 23 27 31
4513 i.e., the first output vector should contain the first elements of each
4514 interleaving group, etc.
4516 We use extract_even/odd instructions to create such output. The input of
4517 each extract_even/odd operation is two vectors
4518 1st vec 2nd vec
4519 0 1 2 3 4 5 6 7
4521 and the output is the vector of extracted even/odd elements. The output of
4522 extract_even will be: 0 2 4 6
4523 and of extract_odd: 1 3 5 7
4526 The permutation is done in log LENGTH stages. In each stage extract_even
4527 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4528 their order. In our example,
4530 E1: extract_even (1st vec, 2nd vec)
4531 E2: extract_odd (1st vec, 2nd vec)
4532 E3: extract_even (3rd vec, 4th vec)
4533 E4: extract_odd (3rd vec, 4th vec)
4535 The output for the first stage will be:
4537 E1: 0 2 4 6 8 10 12 14
4538 E2: 1 3 5 7 9 11 13 15
4539 E3: 16 18 20 22 24 26 28 30
4540 E4: 17 19 21 23 25 27 29 31
4542 In order to proceed and create the correct sequence for the next stage (or
4543 for the correct output, if the second stage is the last one, as in our
4544 example), we first put the output of extract_even operation and then the
4545 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4546 The input for the second stage is:
4548 1st vec (E1): 0 2 4 6 8 10 12 14
4549 2nd vec (E3): 16 18 20 22 24 26 28 30
4550 3rd vec (E2): 1 3 5 7 9 11 13 15
4551 4th vec (E4): 17 19 21 23 25 27 29 31
4553 The output of the second stage:
4555 E1: 0 4 8 12 16 20 24 28
4556 E2: 2 6 10 14 18 22 26 30
4557 E3: 1 5 9 13 17 21 25 29
4558 E4: 3 7 11 15 19 23 27 31
4560 And RESULT_CHAIN after reordering:
4562 1st vec (E1): 0 4 8 12 16 20 24 28
4563 2nd vec (E3): 1 5 9 13 17 21 25 29
4564 3rd vec (E2): 2 6 10 14 18 22 26 30
4565 4th vec (E4): 3 7 11 15 19 23 27 31. */
4567 static void
4568 vect_permute_load_chain (vec<tree> dr_chain,
4569 unsigned int length,
4570 gimple stmt,
4571 gimple_stmt_iterator *gsi,
4572 vec<tree> *result_chain)
4574 tree data_ref, first_vect, second_vect;
4575 tree perm_mask_even, perm_mask_odd;
4576 gimple perm_stmt;
4577 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4578 unsigned int i, j, log_length = exact_log2 (length);
4579 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4580 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4582 result_chain->quick_grow (length);
4583 memcpy (result_chain->address (), dr_chain.address (),
4584 length * sizeof (tree));
4586 for (i = 0; i < nelt; ++i)
4587 sel[i] = i * 2;
4588 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4589 gcc_assert (perm_mask_even != NULL);
4591 for (i = 0; i < nelt; ++i)
4592 sel[i] = i * 2 + 1;
4593 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4594 gcc_assert (perm_mask_odd != NULL);
4596 for (i = 0; i < log_length; i++)
4598 for (j = 0; j < length; j += 2)
4600 first_vect = dr_chain[j];
4601 second_vect = dr_chain[j+1];
4603 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4604 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4605 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4606 first_vect, second_vect,
4607 perm_mask_even);
4608 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4609 (*result_chain)[j/2] = data_ref;
4611 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4613 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4614 first_vect, second_vect,
4615 perm_mask_odd);
4616 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4617 (*result_chain)[j/2+length/2] = data_ref;
4619 memcpy (dr_chain.address (), result_chain->address (),
4620 length * sizeof (tree));
4625 /* Function vect_transform_grouped_load.
4627 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4628 to perform their permutation and ascribe the result vectorized statements to
4629 the scalar statements.
4632 void
4633 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4634 gimple_stmt_iterator *gsi)
4636 vec<tree> result_chain = vNULL;
4638 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4639 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4640 vectors, that are ready for vector computation. */
4641 result_chain.create (size);
4642 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4643 vect_record_grouped_load_vectors (stmt, result_chain);
4644 result_chain.release ();
4647 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4648 generated as part of the vectorization of STMT. Assign the statement
4649 for each vector to the associated scalar statement. */
4651 void
4652 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4654 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4655 gimple next_stmt, new_stmt;
4656 unsigned int i, gap_count;
4657 tree tmp_data_ref;
4659 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4660 Since we scan the chain starting from it's first node, their order
4661 corresponds the order of data-refs in RESULT_CHAIN. */
4662 next_stmt = first_stmt;
4663 gap_count = 1;
4664 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4666 if (!next_stmt)
4667 break;
4669 /* Skip the gaps. Loads created for the gaps will be removed by dead
4670 code elimination pass later. No need to check for the first stmt in
4671 the group, since it always exists.
4672 GROUP_GAP is the number of steps in elements from the previous
4673 access (if there is no gap GROUP_GAP is 1). We skip loads that
4674 correspond to the gaps. */
4675 if (next_stmt != first_stmt
4676 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4678 gap_count++;
4679 continue;
4682 while (next_stmt)
4684 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4685 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4686 copies, and we put the new vector statement in the first available
4687 RELATED_STMT. */
4688 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4689 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4690 else
4692 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4694 gimple prev_stmt =
4695 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4696 gimple rel_stmt =
4697 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4698 while (rel_stmt)
4700 prev_stmt = rel_stmt;
4701 rel_stmt =
4702 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4705 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4706 new_stmt;
4710 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4711 gap_count = 1;
4712 /* If NEXT_STMT accesses the same DR as the previous statement,
4713 put the same TMP_DATA_REF as its vectorized statement; otherwise
4714 get the next data-ref from RESULT_CHAIN. */
4715 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4716 break;
4721 /* Function vect_force_dr_alignment_p.
4723 Returns whether the alignment of a DECL can be forced to be aligned
4724 on ALIGNMENT bit boundary. */
4726 bool
4727 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4729 if (TREE_CODE (decl) != VAR_DECL)
4730 return false;
4732 /* We cannot change alignment of common or external symbols as another
4733 translation unit may contain a definition with lower alignment.
4734 The rules of common symbol linking mean that the definition
4735 will override the common symbol. The same is true for constant
4736 pool entries which may be shared and are not properly merged
4737 by LTO. */
4738 if (DECL_EXTERNAL (decl)
4739 || DECL_COMMON (decl)
4740 || DECL_IN_CONSTANT_POOL (decl))
4741 return false;
4743 if (TREE_ASM_WRITTEN (decl))
4744 return false;
4746 /* Do not override the alignment as specified by the ABI when the used
4747 attribute is set. */
4748 if (DECL_PRESERVE_P (decl))
4749 return false;
4751 /* Do not override explicit alignment set by the user when an explicit
4752 section name is also used. This is a common idiom used by many
4753 software projects. */
4754 if (DECL_SECTION_NAME (decl) != NULL_TREE
4755 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4756 return false;
4758 if (TREE_STATIC (decl))
4759 return (alignment <= MAX_OFILE_ALIGNMENT);
4760 else
4761 return (alignment <= MAX_STACK_ALIGNMENT);
4765 /* Return whether the data reference DR is supported with respect to its
4766 alignment.
4767 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4768 it is aligned, i.e., check if it is possible to vectorize it with different
4769 alignment. */
4771 enum dr_alignment_support
4772 vect_supportable_dr_alignment (struct data_reference *dr,
4773 bool check_aligned_accesses)
4775 gimple stmt = DR_STMT (dr);
4776 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4777 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4778 enum machine_mode mode = TYPE_MODE (vectype);
4779 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4780 struct loop *vect_loop = NULL;
4781 bool nested_in_vect_loop = false;
4783 if (aligned_access_p (dr) && !check_aligned_accesses)
4784 return dr_aligned;
4786 if (loop_vinfo)
4788 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4789 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4792 /* Possibly unaligned access. */
4794 /* We can choose between using the implicit realignment scheme (generating
4795 a misaligned_move stmt) and the explicit realignment scheme (generating
4796 aligned loads with a REALIGN_LOAD). There are two variants to the
4797 explicit realignment scheme: optimized, and unoptimized.
4798 We can optimize the realignment only if the step between consecutive
4799 vector loads is equal to the vector size. Since the vector memory
4800 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4801 is guaranteed that the misalignment amount remains the same throughout the
4802 execution of the vectorized loop. Therefore, we can create the
4803 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4804 at the loop preheader.
4806 However, in the case of outer-loop vectorization, when vectorizing a
4807 memory access in the inner-loop nested within the LOOP that is now being
4808 vectorized, while it is guaranteed that the misalignment of the
4809 vectorized memory access will remain the same in different outer-loop
4810 iterations, it is *not* guaranteed that is will remain the same throughout
4811 the execution of the inner-loop. This is because the inner-loop advances
4812 with the original scalar step (and not in steps of VS). If the inner-loop
4813 step happens to be a multiple of VS, then the misalignment remains fixed
4814 and we can use the optimized realignment scheme. For example:
4816 for (i=0; i<N; i++)
4817 for (j=0; j<M; j++)
4818 s += a[i+j];
4820 When vectorizing the i-loop in the above example, the step between
4821 consecutive vector loads is 1, and so the misalignment does not remain
4822 fixed across the execution of the inner-loop, and the realignment cannot
4823 be optimized (as illustrated in the following pseudo vectorized loop):
4825 for (i=0; i<N; i+=4)
4826 for (j=0; j<M; j++){
4827 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4828 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4829 // (assuming that we start from an aligned address).
4832 We therefore have to use the unoptimized realignment scheme:
4834 for (i=0; i<N; i+=4)
4835 for (j=k; j<M; j+=4)
4836 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4837 // that the misalignment of the initial address is
4838 // 0).
4840 The loop can then be vectorized as follows:
4842 for (k=0; k<4; k++){
4843 rt = get_realignment_token (&vp[k]);
4844 for (i=0; i<N; i+=4){
4845 v1 = vp[i+k];
4846 for (j=k; j<M; j+=4){
4847 v2 = vp[i+j+VS-1];
4848 va = REALIGN_LOAD <v1,v2,rt>;
4849 vs += va;
4850 v1 = v2;
4853 } */
4855 if (DR_IS_READ (dr))
4857 bool is_packed = false;
4858 tree type = (TREE_TYPE (DR_REF (dr)));
4860 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4861 && (!targetm.vectorize.builtin_mask_for_load
4862 || targetm.vectorize.builtin_mask_for_load ()))
4864 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4865 if ((nested_in_vect_loop
4866 && (TREE_INT_CST_LOW (DR_STEP (dr))
4867 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4868 || !loop_vinfo)
4869 return dr_explicit_realign;
4870 else
4871 return dr_explicit_realign_optimized;
4873 if (!known_alignment_for_access_p (dr))
4874 is_packed = not_size_aligned (DR_REF (dr));
4876 if ((TYPE_USER_ALIGN (type) && !is_packed)
4877 || targetm.vectorize.
4878 support_vector_misalignment (mode, type,
4879 DR_MISALIGNMENT (dr), is_packed))
4880 /* Can't software pipeline the loads, but can at least do them. */
4881 return dr_unaligned_supported;
4883 else
4885 bool is_packed = false;
4886 tree type = (TREE_TYPE (DR_REF (dr)));
4888 if (!known_alignment_for_access_p (dr))
4889 is_packed = not_size_aligned (DR_REF (dr));
4891 if ((TYPE_USER_ALIGN (type) && !is_packed)
4892 || targetm.vectorize.
4893 support_vector_misalignment (mode, type,
4894 DR_MISALIGNMENT (dr), is_packed))
4895 return dr_unaligned_supported;
4898 /* Unsupported. */
4899 return dr_unaligned_unsupported;