fix for PR bootstrap/58951
[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "gimple.h"
34 #include "gimple-ssa.h"
35 #include "tree-phinodes.h"
36 #include "ssa-iterators.h"
37 #include "tree-ssanames.h"
38 #include "tree-ssa-loop-ivopts.h"
39 #include "tree-ssa-loop-manip.h"
40 #include "tree-ssa-loop.h"
41 #include "dumpfile.h"
42 #include "cfgloop.h"
43 #include "tree-chrec.h"
44 #include "tree-scalar-evolution.h"
45 #include "tree-vectorizer.h"
46 #include "diagnostic-core.h"
47 /* Need to include rtl.h, expr.h, etc. for optabs. */
48 #include "expr.h"
49 #include "optabs.h"
51 /* Return true if load- or store-lanes optab OPTAB is implemented for
52 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
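/* For example, on a hypothetical target: for VECTYPE V4SI (a 128-bit mode)
   and COUNT == 2 the code below asks for a 256-bit integer mode to hold the
   whole array of vectors.  If the target provides no such mode, array_mode
   is BLKmode and we fail; otherwise OPTAB must additionally provide a
   handler for the <array_mode, vector mode> pair.  */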
54 static bool
55 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
56 tree vectype, unsigned HOST_WIDE_INT count)
58 enum machine_mode mode, array_mode;
59 bool limit_p;
61 mode = TYPE_MODE (vectype);
62 limit_p = !targetm.array_mode_supported_p (mode, count);
63 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
64 MODE_INT, limit_p);
66 if (array_mode == BLKmode)
68 if (dump_enabled_p ())
69 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
70 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
71 GET_MODE_NAME (mode), count);
72 return false;
75 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
77 if (dump_enabled_p ())
78 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
79 "cannot use %s<%s><%s>\n", name,
80 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
81 return false;
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_NOTE, vect_location,
86 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
87 GET_MODE_NAME (mode));
89 return true;
93 /* Return the smallest scalar part of STMT.
94 This is used to determine the vectype of the stmt. We generally set the
95 vectype according to the type of the result (lhs). For stmts whose
96 result-type is different than the type of the arguments (e.g., demotion,
97 promotion), vectype will be reset appropriately (later). Note that we have
98 to visit the smallest datatype in this function, because that determines the
99 VF. If the smallest datatype in the loop is present only as the rhs of a
100 promotion operation - we'd miss it.
101 Such a case, where a variable of this datatype does not appear in the lhs
102 anywhere in the loop, can only occur if it's an invariant: e.g.:
103 'int_x = (int) short_inv', which we'd expect to have been optimized away by
104 invariant motion. However, we cannot rely on invariant motion to always
105 take invariants out of the loop, and so in the case of promotion we also
106 have to check the rhs.
107 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
108 types. */
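/* For instance, for a widening statement 'int_x = (int) short_y' the lhs
   size is 4 and the rhs size is 2, so the short type is returned and
   *LHS_SIZE_UNIT and *RHS_SIZE_UNIT are set to 4 and 2 respectively.  */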
110 tree
111 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
112 HOST_WIDE_INT *rhs_size_unit)
114 tree scalar_type = gimple_expr_type (stmt);
115 HOST_WIDE_INT lhs, rhs;
117 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
119 if (is_gimple_assign (stmt)
120 && (gimple_assign_cast_p (stmt)
121 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
122 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
123 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
125 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
127 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
128 if (rhs < lhs)
129 scalar_type = rhs_type;
132 *lhs_size_unit = lhs;
133 *rhs_size_unit = rhs;
134 return scalar_type;
138 /* Check if the data references pointed to by DR_I and DR_J are the same or
139 belong to the same interleaving group. Return FALSE if the drs are
140 different, otherwise return TRUE. */
142 static bool
143 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
145 gimple stmt_i = DR_STMT (dr_i);
146 gimple stmt_j = DR_STMT (dr_j);
148 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
149 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
150 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
151 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
152 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
153 return true;
154 else
155 return false;
158 /* If address ranges represented by DDR_I and DDR_J are equal,
159 return TRUE, otherwise return FALSE. */
161 static bool
162 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
164 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
165 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
166 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
167 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
168 return true;
169 else
170 return false;
173 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
174 tested at run-time. Return TRUE if DDR was successfully inserted.
175 Return false if versioning is not supported. */
177 static bool
178 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
180 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
182 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
183 return false;
185 if (dump_enabled_p ())
187 dump_printf_loc (MSG_NOTE, vect_location,
188 "mark for run-time aliasing test between ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
190 dump_printf (MSG_NOTE, " and ");
191 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
192 dump_printf (MSG_NOTE, "\n");
195 if (optimize_loop_nest_for_size_p (loop))
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
199 "versioning not supported when optimizing"
200 " for size.\n");
201 return false;
204 /* FORNOW: We don't support versioning with outer-loop vectorization. */
205 if (loop->inner)
207 if (dump_enabled_p ())
208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
209 "versioning not yet supported for outer-loops.\n");
210 return false;
213 /* FORNOW: We don't support creating runtime alias tests for non-constant
214 step. */
215 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
216 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
220 "versioning not yet supported for non-constant "
221 "step\n");
222 return false;
225 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
226 return true;
230 /* Function vect_analyze_data_ref_dependence.
232 Return TRUE if there (might) exist a dependence between a memory-reference
233 DRA and a memory-reference DRB. When versioning for alias can check the
234 dependence at run-time instead, return FALSE. Adjust *MAX_VF according to
235 the data dependence. */
237 static bool
238 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
239 loop_vec_info loop_vinfo, int *max_vf)
241 unsigned int i;
242 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
243 struct data_reference *dra = DDR_A (ddr);
244 struct data_reference *drb = DDR_B (ddr);
245 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
246 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
247 lambda_vector dist_v;
248 unsigned int loop_depth;
250 /* In loop analysis all data references should be vectorizable. */
251 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
252 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
253 gcc_unreachable ();
255 /* Independent data accesses. */
256 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
257 return false;
259 if (dra == drb
260 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
261 return false;
263 /* Unknown data dependence. */
264 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
266 /* If user asserted safelen consecutive iterations can be
267 executed concurrently, assume independence. */
268 if (loop->safelen >= 2)
270 if (loop->safelen < *max_vf)
271 *max_vf = loop->safelen;
272 return false;
275 if (STMT_VINFO_GATHER_P (stmtinfo_a)
276 || STMT_VINFO_GATHER_P (stmtinfo_b))
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "versioning for alias not supported for: "
282 "can't determine dependence between ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (dra));
285 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (drb));
288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
290 return true;
293 if (dump_enabled_p ())
295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
296 "versioning for alias required: "
297 "can't determine dependence between ");
298 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
299 DR_REF (dra));
300 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
301 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
302 DR_REF (drb));
303 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
306 /* Add to list of ddrs that need to be tested at run-time. */
307 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
310 /* Known data dependence. */
311 if (DDR_NUM_DIST_VECTS (ddr) == 0)
313 /* If user asserted safelen consecutive iterations can be
314 executed concurrently, assume independence. */
315 if (loop->safelen >= 2)
317 if (loop->safelen < *max_vf)
318 *max_vf = loop->safelen;
319 return false;
322 if (STMT_VINFO_GATHER_P (stmtinfo_a)
323 || STMT_VINFO_GATHER_P (stmtinfo_b))
325 if (dump_enabled_p ())
327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
328 "versioning for alias not supported for: "
329 "bad dist vector for ");
330 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
331 DR_REF (dra));
332 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
334 DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
337 return true;
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
343 "versioning for alias required: "
344 "bad dist vector for ");
345 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
348 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
350 /* Add to list of ddrs that need to be tested at run-time. */
351 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
354 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
355 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
357 int dist = dist_v[loop_depth];
359 if (dump_enabled_p ())
360 dump_printf_loc (MSG_NOTE, vect_location,
361 "dependence distance = %d.\n", dist);
363 if (dist == 0)
365 if (dump_enabled_p ())
367 dump_printf_loc (MSG_NOTE, vect_location,
368 "dependence distance == 0 between ");
369 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
370 dump_printf (MSG_NOTE, " and ");
371 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
372 dump_printf (MSG_NOTE, "\n");
375 /* When we perform grouped accesses and perform implicit CSE
376 by detecting equal accesses and doing disambiguation with
377 runtime alias tests like for
378 .. = a[i];
379 .. = a[i+1];
380 a[i] = ..;
381 a[i+1] = ..;
382 *p = ..;
383 .. = a[i];
384 .. = a[i+1];
385 where we will end up loading { a[i], a[i+1] } once, make
386 sure that inserting group loads before the first load and
387 stores after the last store will do the right thing. */
388 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
389 && GROUP_SAME_DR_STMT (stmtinfo_a))
390 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
391 && GROUP_SAME_DR_STMT (stmtinfo_b)))
393 gimple earlier_stmt;
394 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
395 if (DR_IS_WRITE
396 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "READ_WRITE dependence in interleaving."
401 "\n");
402 return true;
406 continue;
409 if (dist > 0 && DDR_REVERSED_P (ddr))
411 /* If DDR_REVERSED_P the order of the data-refs in DDR was
412 reversed (to make distance vector positive), and the actual
413 distance is negative. */
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "dependence distance negative.\n");
417 continue;
420 if (abs (dist) >= 2
421 && abs (dist) < *max_vf)
423 /* The dependence distance requires reduction of the maximal
424 vectorization factor. */
425 *max_vf = abs (dist);
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "adjusting maximal vectorization factor to %i\n",
429 *max_vf);
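/* For example, with hypothetical numbers: an initial *max_vf of 8 and a
   dependence distance of 3 clamp *max_vf to 3 here; the check below then
   treats the distance as harmless, since vectors of at most 3 elements
   never span it.  */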
432 if (abs (dist) >= *max_vf)
434 /* Dependence distance does not create dependence, as far as
435 vectorization is concerned, in this case. */
436 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location,
438 "dependence distance >= VF.\n");
439 continue;
442 if (dump_enabled_p ())
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
445 "not vectorized, possible dependence "
446 "between data-refs ");
447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
448 dump_printf (MSG_NOTE, " and ");
449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
450 dump_printf (MSG_NOTE, "\n");
453 return true;
456 return false;
459 /* Function vect_analyze_data_ref_dependences.
461 Examine all the data references in the loop, and make sure there do not
462 exist any data dependences between them. Set *MAX_VF according to
463 the maximum vectorization factor the data dependences allow. */
465 bool
466 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
468 unsigned int i;
469 struct data_dependence_relation *ddr;
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_NOTE, vect_location,
473 "=== vect_analyze_data_ref_dependences ===\n");
475 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
476 &LOOP_VINFO_DDRS (loop_vinfo),
477 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
478 return false;
480 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
481 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
482 return false;
484 return true;
488 /* Function vect_slp_analyze_data_ref_dependence.
490 Return TRUE if there (might) exist a dependence between a memory-reference
491 DRA and a memory-reference DRB that prohibits vectorization of the
492 basic block. Return FALSE otherwise. */
495 static bool
496 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
498 struct data_reference *dra = DDR_A (ddr);
499 struct data_reference *drb = DDR_B (ddr);
501 /* We need to check dependences of statements marked as unvectorizable
502 as well; they can still prohibit vectorization. */
504 /* Independent data accesses. */
505 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
506 return false;
508 if (dra == drb)
509 return false;
511 /* Read-read is OK. */
512 if (DR_IS_READ (dra) && DR_IS_READ (drb))
513 return false;
515 /* If dra and drb are part of the same interleaving chain consider
516 them independent. */
517 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
518 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
519 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
520 return false;
522 /* Unknown data dependence. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
525 gimple earlier_stmt;
527 if (dump_enabled_p ())
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "can't determine dependence between ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
532 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
533 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
534 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
537 /* We do not vectorize basic blocks with write-write dependencies. */
538 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
539 return true;
541 /* Check that it's not a load-after-store dependence. */
542 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
543 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
544 return true;
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location,
552 "determined dependence between ");
553 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
554 dump_printf (MSG_NOTE, " and ");
555 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
556 dump_printf (MSG_NOTE, "\n");
559 /* Do not vectorize basic blocks with write-write dependences. */
560 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
561 return true;
563 /* Check dependence between DRA and DRB for basic block vectorization.
564 If the accesses share the same bases and offsets, we can compare their initial
565 constant offsets to decide whether they differ or not. In case of a read-
566 write dependence we check that the load is before the store to ensure that
567 vectorization will not change the order of the accesses. */
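/* For example (hypothetical accesses): a load from a[2] followed by a store
   to a[2], with 4-byte elements, gives equal DR_INIT values of 8, so
   init_a == init_b below; the earlier statement is the read, so the
   dependence is harmless.  Had the store come first, we would give up,
   because vectorization could move the load across it.  */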
569 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
570 gimple earlier_stmt;
572 /* Check that the data-refs have the same bases and offsets. If not, we
573 can't determine if they are dependent. */
574 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
575 || !dr_equal_offsets_p (dra, drb))
576 return true;
578 /* Check the types. */
579 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
580 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
582 if (type_size_a != type_size_b
583 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
584 TREE_TYPE (DR_REF (drb))))
585 return true;
587 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
588 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
590 /* Two different locations - no dependence. */
591 if (init_a != init_b)
592 return false;
594 /* We have a read-write dependence. Check that the load is before the store.
595 When we vectorize basic blocks, a vector load can only be placed before the
596 corresponding scalar load, and a vector store can only be placed after its
597 corresponding scalar store. So the order of the accesses is preserved if
598 the load is before the store. */
599 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
600 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
601 return false;
603 return true;
607 /* Function vect_slp_analyze_data_ref_dependences.
609 Examine all the data references in the basic-block, and make sure there
610 do not exist any data dependences between them. */
613 bool
614 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
616 struct data_dependence_relation *ddr;
617 unsigned int i;
619 if (dump_enabled_p ())
620 dump_printf_loc (MSG_NOTE, vect_location,
621 "=== vect_slp_analyze_data_ref_dependences ===\n");
623 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
624 &BB_VINFO_DDRS (bb_vinfo),
625 vNULL, true))
626 return false;
628 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
629 if (vect_slp_analyze_data_ref_dependence (ddr))
630 return false;
632 return true;
636 /* Function vect_compute_data_ref_alignment
638 Compute the misalignment of the data reference DR.
640 Output:
641 1. If during the misalignment computation it is found that the data reference
642 cannot be vectorized then false is returned.
643 2. DR_MISALIGNMENT (DR) is defined.
645 FOR NOW: No analysis is actually performed. Misalignment is calculated
646 only for trivial cases. TODO. */
648 static bool
649 vect_compute_data_ref_alignment (struct data_reference *dr)
651 gimple stmt = DR_STMT (dr);
652 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
653 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
654 struct loop *loop = NULL;
655 tree ref = DR_REF (dr);
656 tree vectype;
657 tree base, base_addr;
658 bool base_aligned;
659 tree misalign;
660 tree aligned_to, alignment;
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_NOTE, vect_location,
664 "vect_compute_data_ref_alignment:\n");
666 if (loop_vinfo)
667 loop = LOOP_VINFO_LOOP (loop_vinfo);
669 /* Initialize misalignment to unknown. */
670 SET_DR_MISALIGNMENT (dr, -1);
672 /* Strided loads perform only component accesses, misalignment information
673 is irrelevant for them. */
674 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
675 return true;
677 misalign = DR_INIT (dr);
678 aligned_to = DR_ALIGNED_TO (dr);
679 base_addr = DR_BASE_ADDRESS (dr);
680 vectype = STMT_VINFO_VECTYPE (stmt_info);
682 /* In case the dataref is in an inner-loop of the loop that is being
683 vectorized (LOOP), we use the base and misalignment information
684 relative to the outer-loop (LOOP). This is ok only if the misalignment
685 stays the same throughout the execution of the inner-loop, which is why
686 we have to check that the stride of the dataref in the inner-loop is a
687 multiple of the vector size. */
688 if (loop && nested_in_vect_loop_p (loop, stmt))
690 tree step = DR_STEP (dr);
691 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
693 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
695 if (dump_enabled_p ())
696 dump_printf_loc (MSG_NOTE, vect_location,
697 "inner step divides the vector-size.\n");
698 misalign = STMT_VINFO_DR_INIT (stmt_info);
699 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
700 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
702 else
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
706 "inner step doesn't divide the vector-size.\n");
707 misalign = NULL_TREE;
711 /* Similarly, if we're doing basic-block vectorization, we can only use
712 base and misalignment information relative to an innermost loop if the
713 misalignment stays the same throughout the execution of the loop.
714 As above, this is the case if the stride of the dataref is a multiple of
715 the vector size. */
716 if (!loop)
718 tree step = DR_STEP (dr);
719 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
721 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
723 if (dump_enabled_p ())
724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
725 "SLP: step doesn't divide the vector-size.\n");
726 misalign = NULL_TREE;
730 base = build_fold_indirect_ref (base_addr);
731 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
733 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
734 || !misalign)
736 if (dump_enabled_p ())
738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
739 "Unknown alignment for access: ");
740 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
741 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
743 return true;
746 if ((DECL_P (base)
747 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
748 alignment) >= 0)
749 || (TREE_CODE (base_addr) == SSA_NAME
750 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
751 TREE_TYPE (base_addr)))),
752 alignment) >= 0)
753 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
754 base_aligned = true;
755 else
756 base_aligned = false;
758 if (!base_aligned)
760 /* Do not change the alignment of global variables here if
761 flag_section_anchors is enabled as we already generated
762 RTL for other functions. Most global variables should
763 have been aligned during the IPA increase_alignment pass. */
764 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
765 || (TREE_STATIC (base) && flag_section_anchors))
767 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "can't force alignment of ref: ");
771 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
772 dump_printf (MSG_NOTE, "\n");
774 return true;
777 /* Force the alignment of the decl.
778 NOTE: This is the only change to the code we make during
779 the analysis phase, before deciding to vectorize the loop. */
780 if (dump_enabled_p ())
782 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
783 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
784 dump_printf (MSG_NOTE, "\n");
787 ((dataref_aux *)dr->aux)->base_decl = base;
788 ((dataref_aux *)dr->aux)->base_misaligned = true;
791 /* If this is a backward running DR then first access in the larger
792 vectype actually is N-1 elements before the address in the DR.
793 Adjust misalign accordingly. */
794 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
796 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
797 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
798 otherwise we wouldn't be here. */
799 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
800 /* PLUS because DR_STEP was negative. */
801 misalign = size_binop (PLUS_EXPR, misalign, offset);
804 /* Modulo alignment. */
805 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
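/* Worked example with hypothetical values: for a V4SI access running
   backwards with DR_STEP == -4 and a 16-byte-aligned DR_INIT, the offset
   above is (4 - 1) * -4 = -12, so the first vector load starts 12 bytes
   before the DR address and the misalignment becomes (0 + -12) mod 16 = 4
   bytes.  */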
807 if (!host_integerp (misalign, 1))
809 /* Negative or overflowed misalignment value. */
810 if (dump_enabled_p ())
811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
812 "unexpected misalign value\n");
813 return false;
816 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
818 if (dump_enabled_p ())
820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
821 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
822 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
823 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
826 return true;
830 /* Function vect_compute_data_refs_alignment
832 Compute the misalignment of data references in the loop.
833 Return FALSE if a data reference is found that cannot be vectorized. */
835 static bool
836 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
837 bb_vec_info bb_vinfo)
839 vec<data_reference_p> datarefs;
840 struct data_reference *dr;
841 unsigned int i;
843 if (loop_vinfo)
844 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
845 else
846 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
848 FOR_EACH_VEC_ELT (datarefs, i, dr)
849 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
850 && !vect_compute_data_ref_alignment (dr))
852 if (bb_vinfo)
854 /* Mark unsupported statement as unvectorizable. */
855 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
856 continue;
858 else
859 return false;
862 return true;
866 /* Function vect_update_misalignment_for_peel
868 DR - the data reference whose misalignment is to be adjusted.
869 DR_PEEL - the data reference whose misalignment is being made
870 zero in the vector loop by the peel.
871 NPEEL - the number of iterations in the peel loop if the misalignment
872 of DR_PEEL is known at compile time. */
874 static void
875 vect_update_misalignment_for_peel (struct data_reference *dr,
876 struct data_reference *dr_peel, int npeel)
878 unsigned int i;
879 vec<dr_p> same_align_drs;
880 struct data_reference *current_dr;
881 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
882 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
883 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
884 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
886 /* For interleaved data accesses the step in the loop must be multiplied by
887 the size of the interleaving group. */
888 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
889 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
890 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
891 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
893 /* It can be assumed that the data refs with the same alignment as dr_peel
894 are aligned in the vector loop. */
895 same_align_drs
896 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
897 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
899 if (current_dr != dr)
900 continue;
901 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
902 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
903 SET_DR_MISALIGNMENT (dr, 0);
904 return;
907 if (known_alignment_for_access_p (dr)
908 && known_alignment_for_access_p (dr_peel))
910 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
911 int misal = DR_MISALIGNMENT (dr);
912 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
913 misal += negative ? -npeel * dr_size : npeel * dr_size;
914 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
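/* For instance, with hypothetical values: a forward int access with
   dr_size == 4, DR_MISALIGNMENT == 8 and npeel == 2, against a vectype
   with 16-byte alignment, gives misal = 8 + 2 * 4 = 16, masked to 0, so
   the access becomes aligned in the vector loop.  */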
915 SET_DR_MISALIGNMENT (dr, misal);
916 return;
919 if (dump_enabled_p ())
920 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
921 SET_DR_MISALIGNMENT (dr, -1);
925 /* Function vect_verify_datarefs_alignment
927 Return TRUE if all data references in the loop can be
928 handled with respect to alignment. */
930 bool
931 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
933 vec<data_reference_p> datarefs;
934 struct data_reference *dr;
935 enum dr_alignment_support supportable_dr_alignment;
936 unsigned int i;
938 if (loop_vinfo)
939 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
940 else
941 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
943 FOR_EACH_VEC_ELT (datarefs, i, dr)
945 gimple stmt = DR_STMT (dr);
946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
948 if (!STMT_VINFO_RELEVANT_P (stmt_info))
949 continue;
951 /* For interleaving, only the alignment of the first access matters.
952 Skip statements marked as not vectorizable. */
953 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
954 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
955 || !STMT_VINFO_VECTORIZABLE (stmt_info))
956 continue;
958 /* Strided loads perform only component accesses, alignment is
959 irrelevant for them. */
960 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
961 continue;
963 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
964 if (!supportable_dr_alignment)
966 if (dump_enabled_p ())
968 if (DR_IS_READ (dr))
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "not vectorized: unsupported unaligned load.");
971 else
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
973 "not vectorized: unsupported unaligned "
974 "store.");
976 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
977 DR_REF (dr));
978 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
980 return false;
982 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location,
984 "Vectorizing an unaligned access.\n");
986 return true;
989 /* Given a memory reference EXP, return whether its alignment is less
990 than its size. */
992 static bool
993 not_size_aligned (tree exp)
995 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
996 return true;
998 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
999 > get_object_alignment (exp));
1002 /* Function vector_alignment_reachable_p
1004 Return true if vector alignment for DR is reachable by peeling
1005 a few loop iterations. Return false otherwise. */
1007 static bool
1008 vector_alignment_reachable_p (struct data_reference *dr)
1010 gimple stmt = DR_STMT (dr);
1011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1012 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1014 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1016 /* For interleaved accesses we peel only if the number of iterations in
1017 the prolog loop (VF - misalignment) is a multiple of the
1018 number of interleaved accesses. */
1019 int elem_size, mis_in_elements;
1020 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1022 /* FORNOW: handle only known alignment. */
1023 if (!known_alignment_for_access_p (dr))
1024 return false;
1026 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1027 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
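/* Example with hypothetical values: with nelements == 4 and a misalignment
   of one 4-byte element (mis_in_elements == 1), a group of size 2 makes the
   check below fail, because the 3 prologue iterations needed for alignment
   are not a multiple of the group size.  */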
1029 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1030 return false;
1033 /* If the misalignment is known at compile time then allow peeling
1034 only if natural alignment is reachable through peeling. */
1035 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1037 HOST_WIDE_INT elmsize =
1038 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_NOTE, vect_location,
1042 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1043 dump_printf (MSG_NOTE,
1044 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1046 if (DR_MISALIGNMENT (dr) % elmsize)
1048 if (dump_enabled_p ())
1049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1050 "data size does not divide the misalignment.\n");
1051 return false;
1055 if (!known_alignment_for_access_p (dr))
1057 tree type = TREE_TYPE (DR_REF (dr));
1058 bool is_packed = not_size_aligned (DR_REF (dr));
1059 if (dump_enabled_p ())
1060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 "Unknown misalignment, is_packed = %d\n",is_packed);
1062 if ((TYPE_USER_ALIGN (type) && !is_packed)
1063 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1064 return true;
1065 else
1066 return false;
1069 return true;
1073 /* Calculate the cost of the memory access represented by DR. */
1075 static void
1076 vect_get_data_access_cost (struct data_reference *dr,
1077 unsigned int *inside_cost,
1078 unsigned int *outside_cost,
1079 stmt_vector_for_cost *body_cost_vec)
1081 gimple stmt = DR_STMT (dr);
1082 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1083 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1084 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1085 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1086 int ncopies = vf / nunits;
1088 if (DR_IS_READ (dr))
1089 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1090 NULL, body_cost_vec, false);
1091 else
1092 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location,
1096 "vect_get_data_access_cost: inside_cost = %d, "
1097 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1101 /* Insert DR into peeling hash table with NPEEL as key. */
1103 static void
1104 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1105 int npeel)
1107 struct _vect_peel_info elem, *slot;
1108 _vect_peel_info **new_slot;
1109 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1111 elem.npeel = npeel;
1112 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1113 if (slot)
1114 slot->count++;
1115 else
1117 slot = XNEW (struct _vect_peel_info);
1118 slot->npeel = npeel;
1119 slot->dr = dr;
1120 slot->count = 1;
1121 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1122 *new_slot = slot;
1125 if (!supportable_dr_alignment && unlimited_cost_model ())
1126 slot->count += VECT_MAX_COST;
1130 /* Traverse peeling hash table to find peeling option that aligns maximum
1131 number of data accesses. */
1134 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1135 _vect_peel_extended_info *max)
1137 vect_peel_info elem = *slot;
1139 if (elem->count > max->peel_info.count
1140 || (elem->count == max->peel_info.count
1141 && max->peel_info.npeel > elem->npeel))
1143 max->peel_info.npeel = elem->npeel;
1144 max->peel_info.count = elem->count;
1145 max->peel_info.dr = elem->dr;
1148 return 1;
1152 /* Traverse peeling hash table and calculate cost for each peeling option.
1153 Find the one with the lowest cost. */
1156 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1157 _vect_peel_extended_info *min)
1159 vect_peel_info elem = *slot;
1160 int save_misalignment, dummy;
1161 unsigned int inside_cost = 0, outside_cost = 0, i;
1162 gimple stmt = DR_STMT (elem->dr);
1163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1165 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1166 struct data_reference *dr;
1167 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1168 int single_iter_cost;
1170 prologue_cost_vec.create (2);
1171 body_cost_vec.create (2);
1172 epilogue_cost_vec.create (2);
1174 FOR_EACH_VEC_ELT (datarefs, i, dr)
1176 stmt = DR_STMT (dr);
1177 stmt_info = vinfo_for_stmt (stmt);
1178 /* For interleaving, only the alignment of the first access
1179 matters. */
1180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1182 continue;
1184 save_misalignment = DR_MISALIGNMENT (dr);
1185 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1186 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1187 &body_cost_vec);
1188 SET_DR_MISALIGNMENT (dr, save_misalignment);
1191 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1192 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1193 &dummy, single_iter_cost,
1194 &prologue_cost_vec,
1195 &epilogue_cost_vec);
1197 /* Prologue and epilogue costs are added to the target model later.
1198 These costs depend only on the scalar iteration cost, the
1199 number of peeling iterations finally chosen, and the number of
1200 misaligned statements. So discard the information found here. */
1201 prologue_cost_vec.release ();
1202 epilogue_cost_vec.release ();
1204 if (inside_cost < min->inside_cost
1205 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1207 min->inside_cost = inside_cost;
1208 min->outside_cost = outside_cost;
1209 min->body_cost_vec.release ();
1210 min->body_cost_vec = body_cost_vec;
1211 min->peel_info.dr = elem->dr;
1212 min->peel_info.npeel = elem->npeel;
1214 else
1215 body_cost_vec.release ();
1217 return 1;
1221 /* Choose best peeling option by traversing peeling hash table and either
1222 choosing an option with the lowest cost (if cost model is enabled) or the
1223 option that aligns as many accesses as possible. */
1225 static struct data_reference *
1226 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1227 unsigned int *npeel,
1228 stmt_vector_for_cost *body_cost_vec)
1230 struct _vect_peel_extended_info res;
1232 res.peel_info.dr = NULL;
1233 res.body_cost_vec = stmt_vector_for_cost ();
1235 if (!unlimited_cost_model ())
1237 res.inside_cost = INT_MAX;
1238 res.outside_cost = INT_MAX;
1239 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1240 .traverse <_vect_peel_extended_info *,
1241 vect_peeling_hash_get_lowest_cost> (&res);
1243 else
1245 res.peel_info.count = 0;
1246 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1247 .traverse <_vect_peel_extended_info *,
1248 vect_peeling_hash_get_most_frequent> (&res);
1251 *npeel = res.peel_info.npeel;
1252 *body_cost_vec = res.body_cost_vec;
1253 return res.peel_info.dr;
1257 /* Function vect_enhance_data_refs_alignment
1259 This pass will use loop versioning and loop peeling in order to enhance
1260 the alignment of data references in the loop.
1262 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1263 original loop is to be vectorized. Any other loops that are created by
1264 the transformations performed in this pass are not supposed to be
1265 vectorized. This restriction will be relaxed.
1267 This pass will require a cost model to guide it whether to apply peeling
1268 or versioning or a combination of the two. For example, the scheme that
1269 Intel uses when given a loop with several memory accesses is as follows:
1270 choose one memory access ('p') whose alignment you want to force by doing
1271 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1272 other accesses are not necessarily aligned, or (2) use loop versioning to
1273 generate one loop in which all accesses are aligned, and another loop in
1274 which only 'p' is necessarily aligned.
1276 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1277 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1278 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1280 Devising a cost model is the most critical aspect of this work. It will
1281 guide us on which access to peel for, whether to use loop versioning, how
1282 many versions to create, etc. The cost model will probably consist of
1283 generic considerations as well as target specific considerations (on
1284 powerpc for example, misaligned stores are more painful than misaligned
1285 loads).
1287 Here are the general steps involved in alignment enhancements:
1289 -- original loop, before alignment analysis:
1290 for (i=0; i<N; i++){
1291 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1292 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1295 -- After vect_compute_data_refs_alignment:
1296 for (i=0; i<N; i++){
1297 x = q[i]; # DR_MISALIGNMENT(q) = 3
1298 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1301 -- Possibility 1: we do loop versioning:
1302 if (p is aligned) {
1303 for (i=0; i<N; i++){ # loop 1A
1304 x = q[i]; # DR_MISALIGNMENT(q) = 3
1305 p[i] = y; # DR_MISALIGNMENT(p) = 0
1308 else {
1309 for (i=0; i<N; i++){ # loop 1B
1310 x = q[i]; # DR_MISALIGNMENT(q) = 3
1311 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1315 -- Possibility 2: we do loop peeling:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1320 for (i = 3; i < N; i++){ # loop 2A
1321 x = q[i]; # DR_MISALIGNMENT(q) = 0
1322 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1325 -- Possibility 3: combination of loop peeling and versioning:
1326 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1327 x = q[i];
1328 p[i] = y;
1330 if (p is aligned) {
1331 for (i = 3; i<N; i++){ # loop 3A
1332 x = q[i]; # DR_MISALIGNMENT(q) = 0
1333 p[i] = y; # DR_MISALIGNMENT(p) = 0
1336 else {
1337 for (i = 3; i<N; i++){ # loop 3B
1338 x = q[i]; # DR_MISALIGNMENT(q) = 0
1339 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1343 These loops are later passed to loop_transform to be vectorized. The
1344 vectorizer will use the alignment information to guide the transformation
1345 (whether to generate regular loads/stores, or with special handling for
1346 misalignment). */
1348 bool
1349 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1351 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1353 enum dr_alignment_support supportable_dr_alignment;
1354 struct data_reference *dr0 = NULL, *first_store = NULL;
1355 struct data_reference *dr;
1356 unsigned int i, j;
1357 bool do_peeling = false;
1358 bool do_versioning = false;
1359 bool stat;
1360 gimple stmt;
1361 stmt_vec_info stmt_info;
1362 unsigned int npeel = 0;
1363 bool all_misalignments_unknown = true;
1364 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1365 unsigned possible_npeel_number = 1;
1366 tree vectype;
1367 unsigned int nelements, mis, same_align_drs_max = 0;
1368 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_NOTE, vect_location,
1372 "=== vect_enhance_data_refs_alignment ===\n");
1374 /* While cost model enhancements are expected in the future, the high level
1375 view of the code at this time is as follows:
1377 A) If there is a misaligned access then see if peeling to align
1378 this access can make all data references satisfy
1379 vect_supportable_dr_alignment. If so, update data structures
1380 as needed and return true.
1382 B) If peeling wasn't possible and there is a data reference with an
1383 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1384 then see if loop versioning checks can be used to make all data
1385 references satisfy vect_supportable_dr_alignment. If so, update
1386 data structures as needed and return true.
1388 C) If neither peeling nor versioning were successful then return false if
1389 any data reference does not satisfy vect_supportable_dr_alignment.
1391 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1393 Note, Possibility 3 above (which is peeling and versioning together) is not
1394 being done at this time. */
1396 /* (1) Peeling to force alignment. */
1398 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1399 Considerations:
1400 + How many accesses will become aligned due to the peeling
1401 - How many accesses will become unaligned due to the peeling,
1402 and the cost of misaligned accesses.
1403 - The cost of peeling (the extra runtime checks, the increase
1404 in code size). */
1406 FOR_EACH_VEC_ELT (datarefs, i, dr)
1408 stmt = DR_STMT (dr);
1409 stmt_info = vinfo_for_stmt (stmt);
1411 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412 continue;
1414 /* For interleaving, only the alignment of the first access
1415 matters. */
1416 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418 continue;
1420 /* For invariant accesses there is nothing to enhance. */
1421 if (integer_zerop (DR_STEP (dr)))
1422 continue;
1424 /* Strided loads perform only component accesses, alignment is
1425 irrelevant for them. */
1426 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1427 continue;
1429 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1430 do_peeling = vector_alignment_reachable_p (dr);
1431 if (do_peeling)
1433 if (known_alignment_for_access_p (dr))
1435 unsigned int npeel_tmp;
1436 bool negative = tree_int_cst_compare (DR_STEP (dr),
1437 size_zero_node) < 0;
1439 /* Save info about DR in the hash table. */
1440 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1441 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1443 vectype = STMT_VINFO_VECTYPE (stmt_info);
1444 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1445 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1446 TREE_TYPE (DR_REF (dr))));
1447 npeel_tmp = (negative
1448 ? (mis - nelements) : (nelements - mis))
1449 & (nelements - 1);
1451 /* For multiple types, it is possible that the bigger type access
1452 will have more than one peeling option. E.g., a loop with two
1453 types: one of size (vector size / 4), and the other one of
1454 size (vector size / 8). Vectorization factor will be 8. If both
1455 accesses are misaligned by 3, the first one needs one scalar
1456 iteration to be aligned, and the second one needs 5. But the
1457 first one will also be aligned by peeling 5 scalar
1458 iterations, and in that case both accesses will be aligned.
1459 Hence, except for the immediate peeling amount, we also want
1460 to try to add full vector size, while we don't exceed
1461 vectorization factor.
1462 We do this automatically for the cost model, since we calculate the cost
1463 for every peeling option. */
1464 if (unlimited_cost_model ())
1465 possible_npeel_number = vf / nelements;
1467 /* Handle the aligned case. We may decide to align some other
1468 access, making DR unaligned. */
1469 if (DR_MISALIGNMENT (dr) == 0)
1471 npeel_tmp = 0;
1472 if (unlimited_cost_model ())
1473 possible_npeel_number++;
1476 for (j = 0; j < possible_npeel_number; j++)
1478 gcc_assert (npeel_tmp <= vf);
1479 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1480 npeel_tmp += nelements;
1483 all_misalignments_unknown = false;
1484 /* Data-ref that was chosen for the case that all the
1485 misalignments are unknown is not relevant anymore, since we
1486 have a data-ref with known alignment. */
1487 dr0 = NULL;
1489 else
1491 /* If we don't know any misalignment values, we prefer
1492 peeling for data-ref that has the maximum number of data-refs
1493 with the same alignment, unless the target prefers to align
1494 stores over loads. */
1495 if (all_misalignments_unknown)
1497 unsigned same_align_drs
1498 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1499 if (!dr0
1500 || same_align_drs_max < same_align_drs)
1502 same_align_drs_max = same_align_drs;
1503 dr0 = dr;
1505 /* For data-refs with the same number of related
1506 accesses prefer the one where the misalign
1507 computation will be invariant in the outermost loop. */
1508 else if (same_align_drs_max == same_align_drs)
1510 struct loop *ivloop0, *ivloop;
1511 ivloop0 = outermost_invariant_loop_for_expr
1512 (loop, DR_BASE_ADDRESS (dr0));
1513 ivloop = outermost_invariant_loop_for_expr
1514 (loop, DR_BASE_ADDRESS (dr));
1515 if ((ivloop && !ivloop0)
1516 || (ivloop && ivloop0
1517 && flow_loop_nested_p (ivloop, ivloop0)))
1518 dr0 = dr;
1521 if (!first_store && DR_IS_WRITE (dr))
1522 first_store = dr;
1525 /* If there are both known and unknown misaligned accesses in the
1526 loop, we choose peeling amount according to the known
1527 accesses. */
1528 if (!supportable_dr_alignment)
1530 dr0 = dr;
1531 if (!first_store && DR_IS_WRITE (dr))
1532 first_store = dr;
1536 else
1538 if (!aligned_access_p (dr))
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1542 "vector alignment may not be reachable\n");
1543 break;
1548 /* Check if we can possibly peel the loop. */
1549 if (!vect_can_advance_ivs_p (loop_vinfo)
1550 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1551 do_peeling = false;
1553 if (do_peeling && all_misalignments_unknown
1554 && vect_supportable_dr_alignment (dr0, false))
1557 /* Check if the target requires us to prefer stores over loads, i.e., if
1558 misaligned stores are more expensive than misaligned loads (taking
1559 drs with same alignment into account). */
1560 if (first_store && DR_IS_READ (dr0))
1562 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1563 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1564 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1565 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1566 stmt_vector_for_cost dummy;
1567 dummy.create (2);
1569 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1570 &dummy);
1571 vect_get_data_access_cost (first_store, &store_inside_cost,
1572 &store_outside_cost, &dummy);
1574 dummy.release ();
1576 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1577 aligning the load DR0). */
1578 load_inside_penalty = store_inside_cost;
1579 load_outside_penalty = store_outside_cost;
1580 for (i = 0;
1581 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1582 DR_STMT (first_store))).iterate (i, &dr);
1583 i++)
1584 if (DR_IS_READ (dr))
1586 load_inside_penalty += load_inside_cost;
1587 load_outside_penalty += load_outside_cost;
1589 else
1591 load_inside_penalty += store_inside_cost;
1592 load_outside_penalty += store_outside_cost;
1595 /* Calculate the penalty for leaving DR0 unaligned (by
1596 aligning the FIRST_STORE). */
1597 store_inside_penalty = load_inside_cost;
1598 store_outside_penalty = load_outside_cost;
1599 for (i = 0;
1600 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1601 DR_STMT (dr0))).iterate (i, &dr);
1602 i++)
1603 if (DR_IS_READ (dr))
1605 store_inside_penalty += load_inside_cost;
1606 store_outside_penalty += load_outside_cost;
1608 else
1610 store_inside_penalty += store_inside_cost;
1611 store_outside_penalty += store_outside_cost;
1614 if (load_inside_penalty > store_inside_penalty
1615 || (load_inside_penalty == store_inside_penalty
1616 && load_outside_penalty > store_outside_penalty))
1617 dr0 = first_store;
1620 /* In case there are only loads with different unknown misalignments, use
1621 peeling only if it may help to align other accesses in the loop. */
1622 if (!first_store
1623 && !STMT_VINFO_SAME_ALIGN_REFS (
1624 vinfo_for_stmt (DR_STMT (dr0))).length ()
1625 && vect_supportable_dr_alignment (dr0, false)
1626 != dr_unaligned_supported)
1627 do_peeling = false;
1630 if (do_peeling && !dr0)
1632 /* Peeling is possible, but no data access strictly requires it (all are
1633 supported even when unaligned). So we try to choose the best possible peeling. */
1635 /* We should get here only if there are drs with known misalignment. */
1636 gcc_assert (!all_misalignments_unknown);
1638 /* Choose the best peeling from the hash table. */
1639 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1640 &body_cost_vec);
1641 if (!dr0 || !npeel)
1642 do_peeling = false;
1645 if (do_peeling)
1647 stmt = DR_STMT (dr0);
1648 stmt_info = vinfo_for_stmt (stmt);
1649 vectype = STMT_VINFO_VECTYPE (stmt_info);
1650 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1652 if (known_alignment_for_access_p (dr0))
1654 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1655 size_zero_node) < 0;
1656 if (!npeel)
1658 /* Since it's known at compile time, compute the number of
1659 iterations in the peeled loop (the peeling factor) for use in
1660 updating DR_MISALIGNMENT values. The peeling factor is the
1661 vectorization factor minus the misalignment as an element
1662 count. */
1663 mis = DR_MISALIGNMENT (dr0);
1664 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1665 npeel = ((negative ? mis - nelements : nelements - mis)
1666 & (nelements - 1));
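/* Worked example with hypothetical values: a V4SI access over 4-byte
   elements with DR_MISALIGNMENT (dr0) == 8 gives mis = 8 / 4 = 2, so
   npeel = (4 - 2) & 3 = 2 scalar iterations are peeled and dr0 becomes
   vector-aligned in the remaining loop.  */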
1669 /* For interleaved data access every iteration accesses all the
1670 members of the group, therefore we divide the number of iterations
1671 by the group size. */
1672 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1673 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1674 npeel /= GROUP_SIZE (stmt_info);
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_NOTE, vect_location,
1678 "Try peeling by %d\n", npeel);
1681 /* Ensure that all data refs can be vectorized after the peel. */
1682 FOR_EACH_VEC_ELT (datarefs, i, dr)
1684 int save_misalignment;
1686 if (dr == dr0)
1687 continue;
1689 stmt = DR_STMT (dr);
1690 stmt_info = vinfo_for_stmt (stmt);
1691 /* For interleaving, only the alignment of the first access
1692 matters. */
1693 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1694 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1695 continue;
1697 /* Strided loads perform only component accesses, alignment is
1698 irrelevant for them. */
1699 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1700 continue;
1702 save_misalignment = DR_MISALIGNMENT (dr);
1703 vect_update_misalignment_for_peel (dr, dr0, npeel);
1704 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1705 SET_DR_MISALIGNMENT (dr, save_misalignment);
1707 if (!supportable_dr_alignment)
1709 do_peeling = false;
1710 break;
1714 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1716 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1717 if (!stat)
1718 do_peeling = false;
1719 else
1721 body_cost_vec.release ();
1722 return stat;
1726 if (do_peeling)
1728 unsigned max_allowed_peel
1729 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1730 if (max_allowed_peel != (unsigned)-1)
1732 unsigned max_peel = npeel;
1733 if (max_peel == 0)
1735 gimple dr_stmt = DR_STMT (dr0);
1736 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1737 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1738 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1740 if (max_peel > max_allowed_peel)
1742 do_peeling = false;
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_NOTE, vect_location,
1745 "Disable peeling, max peels reached: %d\n", max_peel);
1750 if (do_peeling)
1752 stmt_info_for_cost *si;
1753 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1755 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1756 If the misalignment of DR_i is identical to that of dr0 then set
1757 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1758 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1759 by the peeling factor times the element size of DR_i (MOD the
1760 vectorization factor times the size). Otherwise, the
1761 misalignment of DR_i must be set to unknown. */
1762 FOR_EACH_VEC_ELT (datarefs, i, dr)
1763 if (dr != dr0)
1764 vect_update_misalignment_for_peel (dr, dr0, npeel);
1766 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1767 if (npeel)
1768 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1769 else
1770 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1771 SET_DR_MISALIGNMENT (dr0, 0);
1772 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "Alignment of access forced using peeling.\n");
1776 dump_printf_loc (MSG_NOTE, vect_location,
1777 "Peeling for alignment will be applied.\n");
1779 /* We've delayed passing the inside-loop peeling costs to the
1780 target cost model until we were sure peeling would happen.
1781 Do so now. */
1782 if (body_cost_vec.exists ())
1784 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1786 struct _stmt_vec_info *stmt_info
1787 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1788 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1789 si->misalign, vect_body);
1791 body_cost_vec.release ();
1794 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1795 gcc_assert (stat);
1796 return stat;
1800 body_cost_vec.release ();
1802 /* (2) Versioning to force alignment. */
1804 /* Try versioning if:
1805 1) optimize loop for speed
1806 2) there is at least one unsupported misaligned data ref with an unknown
1807 misalignment, and
1808 3) all misaligned data refs with a known misalignment are supported, and
1809 4) the number of runtime alignment checks is within reason. */
1811 do_versioning =
1812 optimize_loop_nest_for_speed_p (loop)
1813 && (!loop->inner); /* FORNOW */
1815 if (do_versioning)
1817 FOR_EACH_VEC_ELT (datarefs, i, dr)
1819 stmt = DR_STMT (dr);
1820 stmt_info = vinfo_for_stmt (stmt);
1822 /* For interleaving, only the alignment of the first access
1823 matters. */
1824 if (aligned_access_p (dr)
1825 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1826 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1827 continue;
1829 /* Strided loads perform only component accesses, alignment is
1830 irrelevant for them. */
1831 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1832 continue;
1834 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1836 if (!supportable_dr_alignment)
1838 gimple stmt;
1839 int mask;
1840 tree vectype;
1842 if (known_alignment_for_access_p (dr)
1843 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1844 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1846 do_versioning = false;
1847 break;
1850 stmt = DR_STMT (dr);
1851 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1852 gcc_assert (vectype);
1854 /* The rightmost bits of an aligned address must be zeros.
1855 Construct the mask needed for this test. For example,
1856 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1857 mask must be 15 = 0xf. */
1858 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1860 /* FORNOW: use the same mask to test all potentially unaligned
1861 references in the loop. The vectorizer currently supports
1862 a single vector size, see the reference to
1863 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1864 vectorization factor is computed. */
1865 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1866 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1867 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1868 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1869 DR_STMT (dr));
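          /* Illustrative note: the recorded mask and statements are later
             used to build the runtime versioning condition, conceptually
             something like
               ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
             i.e. every potentially misaligned address must have its low
             bits clear for the vectorized loop version to be taken.  */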
1873 /* Versioning requires at least one misaligned data reference. */
1874 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1875 do_versioning = false;
1876 else if (!do_versioning)
1877 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1880 if (do_versioning)
1882 vec<gimple> may_misalign_stmts
1883 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1884 gimple stmt;
1886 /* It can now be assumed that the data references in the statements
1887 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1888 of the loop being vectorized. */
1889 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1891 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1892 dr = STMT_VINFO_DATA_REF (stmt_info);
1893 SET_DR_MISALIGNMENT (dr, 0);
1894 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_NOTE, vect_location,
1896 "Alignment of access forced using versioning.\n");
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE, vect_location,
1901 "Versioning for alignment will be applied.\n");
1903 /* Peeling and versioning can't be done together at this time. */
1904 gcc_assert (! (do_peeling && do_versioning));
1906 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1907 gcc_assert (stat);
1908 return stat;
1911 /* This point is reached if neither peeling nor versioning is being done. */
1912 gcc_assert (! (do_peeling || do_versioning));
1914 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1915 return stat;
1919 /* Function vect_find_same_alignment_drs.
1921 Update group and alignment relations according to the chosen
1922 vectorization factor. */
1924 static void
1925 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1926 loop_vec_info loop_vinfo)
1928 unsigned int i;
1929 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1930 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1931 struct data_reference *dra = DDR_A (ddr);
1932 struct data_reference *drb = DDR_B (ddr);
1933 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1934 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1935 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1936 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1937 lambda_vector dist_v;
1938 unsigned int loop_depth;
1940 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1941 return;
1943 if (dra == drb)
1944 return;
1946 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1947 return;
1949 /* Loop-based vectorization and known data dependence. */
1950 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1951 return;
1953 /* Data-dependence analysis reports a distance vector of zero
1954 for data-references that overlap only in the first iteration
 1955      but whose steps have different signs (see PR45764).
1956 So as a sanity check require equal DR_STEP. */
1957 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1958 return;
1960 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1961 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1963 int dist = dist_v[loop_depth];
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_NOTE, vect_location,
1967 "dependence distance = %d.\n", dist);
1969 /* Same loop iteration. */
1970 if (dist == 0
1971 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1973 /* Two references with distance zero have the same alignment. */
1974 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1975 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1976 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location,
1979 "accesses have the same alignment.\n");
1980 dump_printf (MSG_NOTE,
1981 "dependence distance modulo vf == 0 between ");
1982 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1983 dump_printf (MSG_NOTE, " and ");
1984 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1985 dump_printf (MSG_NOTE, "\n");
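      /* Illustrative example (hypothetical values): with a vectorization
         factor of 4 and two 4-byte references whose dependence distance is
         4 (or 0) and whose DR_STEPs are equal, dist % vf == 0 holds, so
         each reference is recorded in the other's STMT_VINFO_SAME_ALIGN_REFS
         and the two accesses are known to share the same misalignment.  */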
1992 /* Function vect_analyze_data_refs_alignment
1994 Analyze the alignment of the data-references in the loop.
1995 Return FALSE if a data reference is found that cannot be vectorized. */
1997 bool
1998 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1999 bb_vec_info bb_vinfo)
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_NOTE, vect_location,
2003 "=== vect_analyze_data_refs_alignment ===\n");
2005 /* Mark groups of data references with same alignment using
2006 data dependence information. */
2007 if (loop_vinfo)
2009 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2010 struct data_dependence_relation *ddr;
2011 unsigned int i;
2013 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2014 vect_find_same_alignment_drs (ddr, loop_vinfo);
2017 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021 "not vectorized: can't calculate alignment "
2022 "for data ref.\n");
2023 return false;
2026 return true;
2030 /* Analyze groups of accesses: check that DR belongs to a group of
2031 accesses of legal size, step, etc. Detect gaps, single element
2032 interleaving, and other special cases. Set grouped access info.
2033 Collect groups of strided stores for further use in SLP analysis. */
2035 static bool
2036 vect_analyze_group_access (struct data_reference *dr)
2038 tree step = DR_STEP (dr);
2039 tree scalar_type = TREE_TYPE (DR_REF (dr));
2040 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2041 gimple stmt = DR_STMT (dr);
2042 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2043 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2044 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2045 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2046 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2047 bool slp_impossible = false;
2048 struct loop *loop = NULL;
2050 if (loop_vinfo)
2051 loop = LOOP_VINFO_LOOP (loop_vinfo);
2053 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2054 size of the interleaving group (including gaps). */
2055 groupsize = absu_hwi (dr_step) / type_size;
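  /* Illustrative example (hypothetical values): for 4-byte elements and a
     DR_STEP of 32 bytes, GROUPSIZE is 32 / 4 = 8, i.e. the interleaving
     group spans 8 scalar elements per iteration, gaps included.  */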
 2057   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2058 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
 2060       /* Check whether this DR is a part of an interleaving group, and is a single
 2061          element of the group that is accessed in the loop.  */
2063 /* Gaps are supported only for loads. STEP must be a multiple of the type
2064 size. The size of the group must be a power of 2. */
2065 if (DR_IS_READ (dr)
2066 && (dr_step % type_size) == 0
2067 && groupsize > 0
2068 && exact_log2 (groupsize) != -1)
2070 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2071 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2072 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "Detected single element interleaving ");
2076 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2077 dump_printf (MSG_NOTE, " step ");
2078 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2079 dump_printf (MSG_NOTE, "\n");
2082 if (loop_vinfo)
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location,
2086 "Data access with gaps requires scalar "
2087 "epilogue loop\n");
2088 if (loop->inner)
2090 if (dump_enabled_p ())
2091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2092 "Peeling for outer loop is not"
2093 " supported\n");
2094 return false;
2097 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2100 return true;
2103 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2106 "not consecutive access ");
2107 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2108 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2111 if (bb_vinfo)
2113 /* Mark the statement as unvectorizable. */
2114 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2115 return true;
2118 return false;
2121 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2123 /* First stmt in the interleaving chain. Check the chain. */
2124 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2125 struct data_reference *data_ref = dr;
2126 unsigned int count = 1;
2127 tree prev_init = DR_INIT (data_ref);
2128 gimple prev = stmt;
2129 HOST_WIDE_INT diff, gaps = 0;
2130 unsigned HOST_WIDE_INT count_in_bytes;
2132 while (next)
 2134           /* Skip identical data-refs.  If two or more stmts share a
2135 data-ref (supported only for loads), we vectorize only the first
2136 stmt, and the rest get their vectorized loads from the first
2137 one. */
2138 if (!tree_int_cst_compare (DR_INIT (data_ref),
2139 DR_INIT (STMT_VINFO_DATA_REF (
2140 vinfo_for_stmt (next)))))
2142 if (DR_IS_WRITE (data_ref))
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "Two store stmts share the same dr.\n");
2147 return false;
2150 /* For load use the same data-ref load. */
2151 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2153 prev = next;
2154 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2155 continue;
2158 prev = next;
2159 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2161 /* All group members have the same STEP by construction. */
2162 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2164 /* Check that the distance between two accesses is equal to the type
2165 size. Otherwise, we have gaps. */
2166 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2167 - TREE_INT_CST_LOW (prev_init)) / type_size;
2168 if (diff != 1)
2170 /* FORNOW: SLP of accesses with gaps is not supported. */
2171 slp_impossible = true;
2172 if (DR_IS_WRITE (data_ref))
2174 if (dump_enabled_p ())
2175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2176 "interleaved store with gaps\n");
2177 return false;
2180 gaps += diff - 1;
2183 last_accessed_element += diff;
2185 /* Store the gap from the previous member of the group. If there is no
2186 gap in the access, GROUP_GAP is always 1. */
2187 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2189 prev_init = DR_INIT (data_ref);
2190 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2191 /* Count the number of data-refs in the chain. */
2192 count++;
 2195       /* COUNT is the number of accesses found; we multiply it by the size of
2196 the type to get COUNT_IN_BYTES. */
2197 count_in_bytes = type_size * count;
2199 /* Check that the size of the interleaving (including gaps) is not
2200 greater than STEP. */
2201 if (dr_step != 0
2202 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2204 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 "interleaving size is greater than step for ");
2208 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2209 DR_REF (dr));
2210 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2212 return false;
2215 /* Check that the size of the interleaving is equal to STEP for stores,
2216 i.e., that there are no gaps. */
2217 if (dr_step != 0
2218 && absu_hwi (dr_step) != count_in_bytes)
2220 if (DR_IS_READ (dr))
2222 slp_impossible = true;
2223 /* There is a gap after the last load in the group. This gap is a
2224 difference between the groupsize and the number of elements.
2225 When there is no gap, this difference should be 0. */
2226 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2228 else
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "interleaved store with gaps\n");
2233 return false;
2237 /* Check that STEP is a multiple of type size. */
2238 if (dr_step != 0
2239 && (dr_step % type_size) != 0)
2241 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244 "step is not a multiple of type size: step ");
2245 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2246 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2247 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2248 TYPE_SIZE_UNIT (scalar_type));
2249 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2251 return false;
2254 if (groupsize == 0)
2255 groupsize = count;
2257 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2258 if (dump_enabled_p ())
2259 dump_printf_loc (MSG_NOTE, vect_location,
2260 "Detected interleaving of size %d\n", (int)groupsize);
2262 /* SLP: create an SLP data structure for every interleaving group of
 2263      stores for further analysis in vect_analyze_slp.  */
2264 if (DR_IS_WRITE (dr) && !slp_impossible)
2266 if (loop_vinfo)
2267 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2268 if (bb_vinfo)
2269 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2272 /* There is a gap in the end of the group. */
2273 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "Data access with gaps requires scalar "
2278 "epilogue loop\n");
2279 if (loop->inner)
2281 if (dump_enabled_p ())
2282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2283 "Peeling for outer loop is not supported\n");
2284 return false;
2287 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2291 return true;
2295 /* Analyze the access pattern of the data-reference DR.
2296 In case of non-consecutive accesses call vect_analyze_group_access() to
2297 analyze groups of accesses. */
2299 static bool
2300 vect_analyze_data_ref_access (struct data_reference *dr)
2302 tree step = DR_STEP (dr);
2303 tree scalar_type = TREE_TYPE (DR_REF (dr));
2304 gimple stmt = DR_STMT (dr);
2305 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2306 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2307 struct loop *loop = NULL;
2309 if (loop_vinfo)
2310 loop = LOOP_VINFO_LOOP (loop_vinfo);
2312 if (loop_vinfo && !step)
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "bad data-ref access in loop\n");
2317 return false;
 2320   /* Allow invariant loads in non-nested loops.  */
2321 if (loop_vinfo && integer_zerop (step))
2323 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2324 if (nested_in_vect_loop_p (loop, stmt))
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_NOTE, vect_location,
2328 "zero step in inner loop of nest\n");
2329 return false;
2331 return DR_IS_READ (dr);
2334 if (loop && nested_in_vect_loop_p (loop, stmt))
2336 /* Interleaved accesses are not yet supported within outer-loop
2337 vectorization for references in the inner-loop. */
2338 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2340 /* For the rest of the analysis we use the outer-loop step. */
2341 step = STMT_VINFO_DR_STEP (stmt_info);
2342 if (integer_zerop (step))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "zero step in outer loop.\n");
2347 if (DR_IS_READ (dr))
2348 return true;
2349 else
2350 return false;
2354 /* Consecutive? */
2355 if (TREE_CODE (step) == INTEGER_CST)
2357 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2358 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2359 || (dr_step < 0
2360 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2362 /* Mark that it is not interleaving. */
2363 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2364 return true;
2368 if (loop && nested_in_vect_loop_p (loop, stmt))
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "grouped access in outer loop.\n");
2373 return false;
 2376   /* Assume this is a DR handled by the non-constant strided load case.  */
2377 if (TREE_CODE (step) != INTEGER_CST)
2378 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2380 /* Not consecutive access - check if it's a part of interleaving group. */
2381 return vect_analyze_group_access (dr);
2386 /* A helper function used in the comparator function to sort data
 2387    references.  T1 and T2 are the two trees being compared.
2388 The function returns -1, 0, or 1. */
2390 static int
2391 compare_tree (tree t1, tree t2)
2393 int i, cmp;
2394 enum tree_code code;
2395 char tclass;
2397 if (t1 == t2)
2398 return 0;
2399 if (t1 == NULL)
2400 return -1;
2401 if (t2 == NULL)
2402 return 1;
2405 if (TREE_CODE (t1) != TREE_CODE (t2))
2406 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2408 code = TREE_CODE (t1);
2409 switch (code)
2411 /* For const values, we can just use hash values for comparisons. */
2412 case INTEGER_CST:
2413 case REAL_CST:
2414 case FIXED_CST:
2415 case STRING_CST:
2416 case COMPLEX_CST:
2417 case VECTOR_CST:
2419 hashval_t h1 = iterative_hash_expr (t1, 0);
2420 hashval_t h2 = iterative_hash_expr (t2, 0);
2421 if (h1 != h2)
2422 return h1 < h2 ? -1 : 1;
2423 break;
2426 case SSA_NAME:
2427 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2428 if (cmp != 0)
2429 return cmp;
2431 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2432 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2433 break;
2435 default:
2436 tclass = TREE_CODE_CLASS (code);
2438 /* For var-decl, we could compare their UIDs. */
2439 if (tclass == tcc_declaration)
2441 if (DECL_UID (t1) != DECL_UID (t2))
2442 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2443 break;
2446 /* For expressions with operands, compare their operands recursively. */
2447 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2449 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2450 if (cmp != 0)
2451 return cmp;
2455 return 0;
 2459 /* Compare two data-references DRA and DRB to sort them into chunks
2460 suitable for grouping. */
2462 static int
2463 dr_group_sort_cmp (const void *dra_, const void *drb_)
2465 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2466 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2467 int cmp;
2469 /* Stabilize sort. */
2470 if (dra == drb)
2471 return 0;
2473 /* Ordering of DRs according to base. */
2474 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2476 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2477 if (cmp != 0)
2478 return cmp;
2481 /* And according to DR_OFFSET. */
2482 if (!dr_equal_offsets_p (dra, drb))
2484 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2485 if (cmp != 0)
2486 return cmp;
2489 /* Put reads before writes. */
2490 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2491 return DR_IS_READ (dra) ? -1 : 1;
2493 /* Then sort after access size. */
2494 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2495 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2497 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2498 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2499 if (cmp != 0)
2500 return cmp;
2503 /* And after step. */
2504 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2506 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2507 if (cmp != 0)
2508 return cmp;
2511 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2512 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2513 if (cmp == 0)
2514 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2515 return cmp;
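 /* Illustrative note: the comparator only needs to bring candidate group
    members next to each other.  For 4-byte accesses a[i] and a[i+1] in the
    same loop, base address, offset, size and step all compare equal, so
    the tie is broken by DR_INIT (0 vs. 4) and the two refs end up adjacent,
    which lets vect_analyze_data_ref_accesses build the interleaving chain
    with a single linear scan over the sorted vector.  */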
2518 /* Function vect_analyze_data_ref_accesses.
2520 Analyze the access pattern of all the data references in the loop.
2522 FORNOW: the only access pattern that is considered vectorizable is a
2523 simple step 1 (consecutive) access.
2525 FORNOW: handle only arrays and pointer accesses. */
2527 bool
2528 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2530 unsigned int i;
2531 vec<data_reference_p> datarefs;
2532 struct data_reference *dr;
2534 if (dump_enabled_p ())
2535 dump_printf_loc (MSG_NOTE, vect_location,
2536 "=== vect_analyze_data_ref_accesses ===\n");
2538 if (loop_vinfo)
2539 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2540 else
2541 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2543 if (datarefs.is_empty ())
2544 return true;
2546 /* Sort the array of datarefs to make building the interleaving chains
2547 linear. */
2548 qsort (datarefs.address (), datarefs.length (),
2549 sizeof (data_reference_p), dr_group_sort_cmp);
2551 /* Build the interleaving chains. */
2552 for (i = 0; i < datarefs.length () - 1;)
2554 data_reference_p dra = datarefs[i];
2555 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2556 stmt_vec_info lastinfo = NULL;
2557 for (i = i + 1; i < datarefs.length (); ++i)
2559 data_reference_p drb = datarefs[i];
2560 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2562 /* ??? Imperfect sorting (non-compatible types, non-modulo
2563 accesses, same accesses) can lead to a group to be artificially
2564 split here as we don't just skip over those. If it really
2565 matters we can push those to a worklist and re-iterate
 2566          over them.  Then we can just skip ahead to the next DR here.  */
 2568       /* Check that the data-refs have the same first location (except init)
2569 and they are both either store or load (not load and store). */
2570 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2571 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2572 DR_BASE_ADDRESS (drb), 0)
2573 || !dr_equal_offsets_p (dra, drb))
2574 break;
2576 /* Check that the data-refs have the same constant size and step. */
2577 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2578 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2579 if (!host_integerp (sza, 1)
2580 || !host_integerp (szb, 1)
2581 || !tree_int_cst_equal (sza, szb)
2582 || !host_integerp (DR_STEP (dra), 0)
2583 || !host_integerp (DR_STEP (drb), 0)
2584 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2585 break;
2587 /* Do not place the same access in the interleaving chain twice. */
2588 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2589 break;
2591 /* Check the types are compatible.
2592 ??? We don't distinguish this during sorting. */
2593 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2594 TREE_TYPE (DR_REF (drb))))
2595 break;
2597 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2598 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2599 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2600 gcc_assert (init_a < init_b);
2602 /* If init_b == init_a + the size of the type * k, we have an
2603 interleaving, and DRA is accessed before DRB. */
2604 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2605 if ((init_b - init_a) % type_size_a != 0)
2606 break;
 2608       /* The step (if not zero) must be greater than the difference between the
 2609          data-refs' inits, otherwise we stop here; this splits groups into suitable sizes.  */
2610 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2611 if (step != 0 && step <= (init_b - init_a))
2612 break;
2614 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_NOTE, vect_location,
2617 "Detected interleaving ");
2618 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2619 dump_printf (MSG_NOTE, " and ");
2620 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2621 dump_printf (MSG_NOTE, "\n");
2624 /* Link the found element into the group list. */
2625 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2627 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2628 lastinfo = stmtinfo_a;
2630 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2631 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2632 lastinfo = stmtinfo_b;
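          /* Illustrative example (hypothetical values): for 4-byte stores
             a[2*i] and a[2*i+1] both DR_STEPs are 8 and the DR_INITs are
             0 and 4, so (init_b - init_a) % type_size == 0 and the step
             exceeds init_b - init_a; the two refs are therefore chained
             into one group with a[2*i] as GROUP_FIRST_ELEMENT and
             a[2*i+1] as its GROUP_NEXT_ELEMENT.  */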
2636 FOR_EACH_VEC_ELT (datarefs, i, dr)
2637 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2638 && !vect_analyze_data_ref_access (dr))
2640 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2642 "not vectorized: complicated access pattern.\n");
2644 if (bb_vinfo)
2646 /* Mark the statement as not vectorizable. */
2647 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2648 continue;
2650 else
2651 return false;
2654 return true;
2657 /* Function vect_prune_runtime_alias_test_list.
2659 Prune a list of ddrs to be tested at run-time by versioning for alias.
 2660    Return FALSE if the resulting list of ddrs is longer than allowed by
2661 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2663 bool
2664 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2666 vec<ddr_p> ddrs =
2667 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2668 unsigned i, j;
2670 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_NOTE, vect_location,
2672 "=== vect_prune_runtime_alias_test_list ===\n");
2674 for (i = 0; i < ddrs.length (); )
2676 bool found;
2677 ddr_p ddr_i;
2679 ddr_i = ddrs[i];
2680 found = false;
2682 for (j = 0; j < i; j++)
2684 ddr_p ddr_j = ddrs[j];
2686 if (vect_vfa_range_equal (ddr_i, ddr_j))
2688 if (dump_enabled_p ())
2690 dump_printf_loc (MSG_NOTE, vect_location,
2691 "found equal ranges ");
2692 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2693 DR_REF (DDR_A (ddr_i)));
2694 dump_printf (MSG_NOTE, ", ");
2695 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2696 DR_REF (DDR_B (ddr_i)));
2697 dump_printf (MSG_NOTE, " and ");
2698 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2699 DR_REF (DDR_A (ddr_j)));
2700 dump_printf (MSG_NOTE, ", ");
2701 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2702 DR_REF (DDR_B (ddr_j)));
2703 dump_printf (MSG_NOTE, "\n");
2705 found = true;
2706 break;
2710 if (found)
2712 ddrs.ordered_remove (i);
2713 continue;
2715 i++;
2718 if (ddrs.length () >
2719 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2721 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2724 "disable versioning for alias - max number of "
2725 "generated checks exceeded.\n");
2728 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2730 return false;
2733 return true;
2736 /* Check whether a non-affine read in stmt is suitable for gather load
2737 and if so, return a builtin decl for that operation. */
2739 tree
2740 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2741 tree *offp, int *scalep)
2743 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2744 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2746 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2747 tree offtype = NULL_TREE;
2748 tree decl, base, off;
2749 enum machine_mode pmode;
2750 int punsignedp, pvolatilep;
2752 /* The gather builtins need address of the form
2753 loop_invariant + vector * {1, 2, 4, 8}
2755 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2756 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2757 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2758 multiplications and additions in it. To get a vector, we need
2759 a single SSA_NAME that will be defined in the loop and will
2760 contain everything that is not loop invariant and that can be
 2761      vectorized.  The following code attempts to find such a preexisting
2762 SSA_NAME OFF and put the loop invariants into a tree BASE
2763 that can be gimplified before the loop. */
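  /* Illustrative example (hypothetical names): for a read such as
     p[idx[i]] inside the loop, where p is loop invariant and idx[i] is
     loaded each iteration, the decomposition below would ideally end up
     with BASE = p (plus any constant byte offset), OFF = the SSA_NAME
     holding idx[i] (possibly after a sign extension), and SCALE =
     sizeof (*p), which is the shape the gather builtins expect.  */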
2764 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2765 &pmode, &punsignedp, &pvolatilep, false);
2766 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2768 if (TREE_CODE (base) == MEM_REF)
2770 if (!integer_zerop (TREE_OPERAND (base, 1)))
2772 if (off == NULL_TREE)
2774 double_int moff = mem_ref_offset (base);
2775 off = double_int_to_tree (sizetype, moff);
2777 else
2778 off = size_binop (PLUS_EXPR, off,
2779 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2781 base = TREE_OPERAND (base, 0);
2783 else
2784 base = build_fold_addr_expr (base);
2786 if (off == NULL_TREE)
2787 off = size_zero_node;
 2789   /* If base is not loop invariant, then if off is 0 we start with just
 2790      the constant offset in the loop invariant BASE and continue with base
 2791      as OFF; otherwise give up.
2792 We could handle that case by gimplifying the addition of base + off
2793 into some SSA_NAME and use that as off, but for now punt. */
2794 if (!expr_invariant_in_loop_p (loop, base))
2796 if (!integer_zerop (off))
2797 return NULL_TREE;
2798 off = base;
2799 base = size_int (pbitpos / BITS_PER_UNIT);
2801 /* Otherwise put base + constant offset into the loop invariant BASE
2802 and continue with OFF. */
2803 else
2805 base = fold_convert (sizetype, base);
2806 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2809 /* OFF at this point may be either a SSA_NAME or some tree expression
2810 from get_inner_reference. Try to peel off loop invariants from it
2811 into BASE as long as possible. */
2812 STRIP_NOPS (off);
2813 while (offtype == NULL_TREE)
2815 enum tree_code code;
2816 tree op0, op1, add = NULL_TREE;
2818 if (TREE_CODE (off) == SSA_NAME)
2820 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2822 if (expr_invariant_in_loop_p (loop, off))
2823 return NULL_TREE;
2825 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2826 break;
2828 op0 = gimple_assign_rhs1 (def_stmt);
2829 code = gimple_assign_rhs_code (def_stmt);
2830 op1 = gimple_assign_rhs2 (def_stmt);
2832 else
2834 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2835 return NULL_TREE;
2836 code = TREE_CODE (off);
2837 extract_ops_from_tree (off, &code, &op0, &op1);
2839 switch (code)
2841 case POINTER_PLUS_EXPR:
2842 case PLUS_EXPR:
2843 if (expr_invariant_in_loop_p (loop, op0))
2845 add = op0;
2846 off = op1;
2847 do_add:
2848 add = fold_convert (sizetype, add);
2849 if (scale != 1)
2850 add = size_binop (MULT_EXPR, add, size_int (scale));
2851 base = size_binop (PLUS_EXPR, base, add);
2852 continue;
2854 if (expr_invariant_in_loop_p (loop, op1))
2856 add = op1;
2857 off = op0;
2858 goto do_add;
2860 break;
2861 case MINUS_EXPR:
2862 if (expr_invariant_in_loop_p (loop, op1))
2864 add = fold_convert (sizetype, op1);
2865 add = size_binop (MINUS_EXPR, size_zero_node, add);
2866 off = op0;
2867 goto do_add;
2869 break;
2870 case MULT_EXPR:
2871 if (scale == 1 && host_integerp (op1, 0))
2873 scale = tree_low_cst (op1, 0);
2874 off = op0;
2875 continue;
2877 break;
2878 case SSA_NAME:
2879 off = op0;
2880 continue;
2881 CASE_CONVERT:
2882 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2883 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2884 break;
2885 if (TYPE_PRECISION (TREE_TYPE (op0))
2886 == TYPE_PRECISION (TREE_TYPE (off)))
2888 off = op0;
2889 continue;
2891 if (TYPE_PRECISION (TREE_TYPE (op0))
2892 < TYPE_PRECISION (TREE_TYPE (off)))
2894 off = op0;
2895 offtype = TREE_TYPE (off);
2896 STRIP_NOPS (off);
2897 continue;
2899 break;
2900 default:
2901 break;
2903 break;
2906 /* If at the end OFF still isn't a SSA_NAME or isn't
2907 defined in the loop, punt. */
2908 if (TREE_CODE (off) != SSA_NAME
2909 || expr_invariant_in_loop_p (loop, off))
2910 return NULL_TREE;
2912 if (offtype == NULL_TREE)
2913 offtype = TREE_TYPE (off);
2915 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2916 offtype, scale);
2917 if (decl == NULL_TREE)
2918 return NULL_TREE;
2920 if (basep)
2921 *basep = base;
2922 if (offp)
2923 *offp = off;
2924 if (scalep)
2925 *scalep = scale;
2926 return decl;
2929 /* Function vect_analyze_data_refs.
2931 Find all the data references in the loop or basic block.
2933 The general structure of the analysis of data refs in the vectorizer is as
2934 follows:
2935 1- vect_analyze_data_refs(loop/bb): call
2936 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2937 in the loop/bb and their dependences.
2938 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2939 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2940 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2944 bool
2945 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2946 bb_vec_info bb_vinfo,
2947 int *min_vf)
2949 struct loop *loop = NULL;
2950 basic_block bb = NULL;
2951 unsigned int i;
2952 vec<data_reference_p> datarefs;
2953 struct data_reference *dr;
2954 tree scalar_type;
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_NOTE, vect_location,
2958 "=== vect_analyze_data_refs ===\n");
2960 if (loop_vinfo)
2962 loop = LOOP_VINFO_LOOP (loop_vinfo);
2963 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2964 || find_data_references_in_loop
2965 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2967 if (dump_enabled_p ())
2968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2969 "not vectorized: loop contains function calls"
2970 " or data references that cannot be analyzed\n");
2971 return false;
2974 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2976 else
2978 gimple_stmt_iterator gsi;
2980 bb = BB_VINFO_BB (bb_vinfo);
2981 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2983 gimple stmt = gsi_stmt (gsi);
2984 if (!find_data_references_in_stmt (NULL, stmt,
2985 &BB_VINFO_DATAREFS (bb_vinfo)))
2987 /* Mark the rest of the basic-block as unvectorizable. */
2988 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2990 stmt = gsi_stmt (gsi);
2991 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2993 break;
2997 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3000 /* Go through the data-refs, check that the analysis succeeded. Update
3001 pointer from stmt_vec_info struct to DR and vectype. */
3003 FOR_EACH_VEC_ELT (datarefs, i, dr)
3005 gimple stmt;
3006 stmt_vec_info stmt_info;
3007 tree base, offset, init;
3008 bool gather = false;
3009 bool simd_lane_access = false;
3010 int vf;
3012 again:
3013 if (!dr || !DR_REF (dr))
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "not vectorized: unhandled data-ref\n");
3018 return false;
3021 stmt = DR_STMT (dr);
3022 stmt_info = vinfo_for_stmt (stmt);
3024 /* Discard clobbers from the dataref vector. We will remove
3025 clobber stmts during vectorization. */
3026 if (gimple_clobber_p (stmt))
3028 if (i == datarefs.length () - 1)
3030 datarefs.pop ();
3031 break;
3033 datarefs[i] = datarefs.pop ();
3034 goto again;
3037 /* Check that analysis of the data-ref succeeded. */
3038 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3039 || !DR_STEP (dr))
3041 bool maybe_gather
3042 = DR_IS_READ (dr)
3043 && !TREE_THIS_VOLATILE (DR_REF (dr))
3044 && targetm.vectorize.builtin_gather != NULL;
3045 bool maybe_simd_lane_access
3046 = loop_vinfo && loop->simduid;
3048 /* If target supports vector gather loads, or if this might be
 3049              a SIMD lane access, see whether they can be used.  */
3050 if (loop_vinfo
3051 && (maybe_gather || maybe_simd_lane_access)
3052 && !nested_in_vect_loop_p (loop, stmt))
3054 struct data_reference *newdr
3055 = create_data_ref (NULL, loop_containing_stmt (stmt),
3056 DR_REF (dr), stmt, true);
3057 gcc_assert (newdr != NULL && DR_REF (newdr));
3058 if (DR_BASE_ADDRESS (newdr)
3059 && DR_OFFSET (newdr)
3060 && DR_INIT (newdr)
3061 && DR_STEP (newdr)
3062 && integer_zerop (DR_STEP (newdr)))
3064 if (maybe_simd_lane_access)
3066 tree off = DR_OFFSET (newdr);
3067 STRIP_NOPS (off);
3068 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3069 && TREE_CODE (off) == MULT_EXPR
3070 && host_integerp (TREE_OPERAND (off, 1), 1))
3072 tree step = TREE_OPERAND (off, 1);
3073 off = TREE_OPERAND (off, 0);
3074 STRIP_NOPS (off);
3075 if (CONVERT_EXPR_P (off)
3076 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3077 0)))
3078 < TYPE_PRECISION (TREE_TYPE (off)))
3079 off = TREE_OPERAND (off, 0);
3080 if (TREE_CODE (off) == SSA_NAME)
3082 gimple def = SSA_NAME_DEF_STMT (off);
3083 tree reft = TREE_TYPE (DR_REF (newdr));
3084 if (gimple_call_internal_p (def)
3085 && gimple_call_internal_fn (def)
3086 == IFN_GOMP_SIMD_LANE)
3088 tree arg = gimple_call_arg (def, 0);
3089 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3090 arg = SSA_NAME_VAR (arg);
3091 if (arg == loop->simduid
3092 /* For now. */
3093 && tree_int_cst_equal
3094 (TYPE_SIZE_UNIT (reft),
3095 step))
3097 DR_OFFSET (newdr) = ssize_int (0);
3098 DR_STEP (newdr) = step;
3099 DR_ALIGNED_TO (newdr)
3100 = size_int (BIGGEST_ALIGNMENT);
3101 dr = newdr;
3102 simd_lane_access = true;
3108 if (!simd_lane_access && maybe_gather)
3110 dr = newdr;
3111 gather = true;
3114 if (!gather && !simd_lane_access)
3115 free_data_ref (newdr);
3118 if (!gather && !simd_lane_access)
3120 if (dump_enabled_p ())
3122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3123 "not vectorized: data ref analysis "
3124 "failed ");
3125 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3126 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3129 if (bb_vinfo)
3130 break;
3132 return false;
3136 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3138 if (dump_enabled_p ())
3139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 "not vectorized: base addr of dr is a "
3141 "constant\n");
3143 if (bb_vinfo)
3144 break;
3146 if (gather || simd_lane_access)
3147 free_data_ref (dr);
3148 return false;
3151 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3153 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 "not vectorized: volatile type ");
3157 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3158 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3161 if (bb_vinfo)
3162 break;
3164 return false;
3167 if (stmt_can_throw_internal (stmt))
3169 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3172 "not vectorized: statement can throw an "
3173 "exception ");
3174 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3175 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3178 if (bb_vinfo)
3179 break;
3181 if (gather || simd_lane_access)
3182 free_data_ref (dr);
3183 return false;
3186 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3187 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3189 if (dump_enabled_p ())
3191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3192 "not vectorized: statement is bitfield "
3193 "access ");
3194 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3195 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3198 if (bb_vinfo)
3199 break;
3201 if (gather || simd_lane_access)
3202 free_data_ref (dr);
3203 return false;
3206 base = unshare_expr (DR_BASE_ADDRESS (dr));
3207 offset = unshare_expr (DR_OFFSET (dr));
3208 init = unshare_expr (DR_INIT (dr));
3210 if (is_gimple_call (stmt))
3212 if (dump_enabled_p ())
3214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3215 "not vectorized: dr in a call ");
3216 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3217 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3220 if (bb_vinfo)
3221 break;
3223 if (gather || simd_lane_access)
3224 free_data_ref (dr);
3225 return false;
3228 /* Update DR field in stmt_vec_info struct. */
 3230       /* If the dataref is in an inner-loop of the loop that is considered
 3231          for vectorization, we also want to analyze the access relative to
3232 the outer-loop (DR contains information only relative to the
3233 inner-most enclosing loop). We do that by building a reference to the
3234 first location accessed by the inner-loop, and analyze it relative to
3235 the outer-loop. */
3236 if (loop && nested_in_vect_loop_p (loop, stmt))
3238 tree outer_step, outer_base, outer_init;
3239 HOST_WIDE_INT pbitsize, pbitpos;
3240 tree poffset;
3241 enum machine_mode pmode;
3242 int punsignedp, pvolatilep;
3243 affine_iv base_iv, offset_iv;
3244 tree dinit;
3246 /* Build a reference to the first location accessed by the
3247 inner-loop: *(BASE+INIT). (The first location is actually
3248 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3249 tree inner_base = build_fold_indirect_ref
3250 (fold_build_pointer_plus (base, init));
3252 if (dump_enabled_p ())
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "analyze in outer-loop: ");
3256 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3257 dump_printf (MSG_NOTE, "\n");
3260 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3261 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3262 gcc_assert (outer_base != NULL_TREE);
3264 if (pbitpos % BITS_PER_UNIT != 0)
3266 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3268 "failed: bit offset alignment.\n");
3269 return false;
3272 outer_base = build_fold_addr_expr (outer_base);
3273 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3274 &base_iv, false))
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "failed: evolution of base is not affine.\n");
3279 return false;
3282 if (offset)
3284 if (poffset)
3285 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3286 poffset);
3287 else
3288 poffset = offset;
3291 if (!poffset)
3293 offset_iv.base = ssize_int (0);
3294 offset_iv.step = ssize_int (0);
3296 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3297 &offset_iv, false))
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "evolution of offset is not affine.\n");
3302 return false;
3305 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3306 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3307 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3308 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3309 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3311 outer_step = size_binop (PLUS_EXPR,
3312 fold_convert (ssizetype, base_iv.step),
3313 fold_convert (ssizetype, offset_iv.step));
3315 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3316 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3317 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3318 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3319 STMT_VINFO_DR_OFFSET (stmt_info) =
3320 fold_convert (ssizetype, offset_iv.base);
3321 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3322 size_int (highest_pow2_factor (offset_iv.base));
3324 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_NOTE, vect_location,
3327 "\touter base_address: ");
3328 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3329 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3330 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3331 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3332 STMT_VINFO_DR_OFFSET (stmt_info));
3333 dump_printf (MSG_NOTE,
3334 "\n\touter constant offset from base address: ");
3335 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3336 STMT_VINFO_DR_INIT (stmt_info));
3337 dump_printf (MSG_NOTE, "\n\touter step: ");
3338 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3339 STMT_VINFO_DR_STEP (stmt_info));
3340 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3341 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3342 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3343 dump_printf (MSG_NOTE, "\n");
3347 if (STMT_VINFO_DATA_REF (stmt_info))
3349 if (dump_enabled_p ())
3351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3352 "not vectorized: more than one data ref "
3353 "in stmt: ");
3354 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3358 if (bb_vinfo)
3359 break;
3361 if (gather || simd_lane_access)
3362 free_data_ref (dr);
3363 return false;
3366 STMT_VINFO_DATA_REF (stmt_info) = dr;
3367 if (simd_lane_access)
3369 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3370 datarefs[i] = dr;
3373 /* Set vectype for STMT. */
3374 scalar_type = TREE_TYPE (DR_REF (dr));
3375 STMT_VINFO_VECTYPE (stmt_info) =
3376 get_vectype_for_scalar_type (scalar_type);
3377 if (!STMT_VINFO_VECTYPE (stmt_info))
3379 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3382 "not vectorized: no vectype for stmt: ");
3383 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3384 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3385 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3386 scalar_type);
3387 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3390 if (bb_vinfo)
3391 break;
3393 if (gather || simd_lane_access)
3395 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3396 free_data_ref (dr);
3398 return false;
3400 else
3402 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_NOTE, vect_location,
3405 "got vectype for stmt: ");
3406 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3407 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3408 STMT_VINFO_VECTYPE (stmt_info));
3409 dump_printf (MSG_NOTE, "\n");
3413 /* Adjust the minimal vectorization factor according to the
3414 vector type. */
3415 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3416 if (vf > *min_vf)
3417 *min_vf = vf;
3419 if (gather)
3421 tree off;
3423 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3424 if (gather
3425 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3426 gather = false;
3427 if (!gather)
3429 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3430 free_data_ref (dr);
3431 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3434 "not vectorized: not suitable for gather "
3435 "load ");
3436 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3437 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3439 return false;
3442 datarefs[i] = dr;
3443 STMT_VINFO_GATHER_P (stmt_info) = true;
3445 else if (loop_vinfo
3446 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3448 if (nested_in_vect_loop_p (loop, stmt)
3449 || !DR_IS_READ (dr))
3451 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3454 "not vectorized: not suitable for strided "
3455 "load ");
3456 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3457 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3459 return false;
3461 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
 3465   /* If we stopped the analysis at the first dataref we could not analyze
 3466      when trying to vectorize a basic-block, mark the rest of the datarefs
 3467      as not vectorizable and truncate the vector of datarefs.  That
 3468      avoids spending useless time in analyzing their dependences.  */
3469 if (i != datarefs.length ())
3471 gcc_assert (bb_vinfo != NULL);
3472 for (unsigned j = i; j < datarefs.length (); ++j)
3474 data_reference_p dr = datarefs[j];
3475 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3476 free_data_ref (dr);
3478 datarefs.truncate (i);
3481 return true;
3485 /* Function vect_get_new_vect_var.
3487 Returns a name for a new variable. The current naming scheme appends the
3488 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3489 the name of vectorizer generated variables, and appends that to NAME if
3490 provided. */
3492 tree
3493 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3495 const char *prefix;
3496 tree new_vect_var;
3498 switch (var_kind)
3500 case vect_simple_var:
3501 prefix = "vect";
3502 break;
3503 case vect_scalar_var:
3504 prefix = "stmp";
3505 break;
3506 case vect_pointer_var:
3507 prefix = "vectp";
3508 break;
3509 default:
3510 gcc_unreachable ();
3513 if (name)
3515 char* tmp = concat (prefix, "_", name, NULL);
3516 new_vect_var = create_tmp_reg (type, tmp);
3517 free (tmp);
3519 else
3520 new_vect_var = create_tmp_reg (type, prefix);
3522 return new_vect_var;
3526 /* Function vect_create_addr_base_for_vector_ref.
3528 Create an expression that computes the address of the first memory location
3529 that will be accessed for a data reference.
3531 Input:
3532 STMT: The statement containing the data reference.
3533 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 3534    OFFSET: Optional.  If supplied, it is added to the initial address.
3535 LOOP: Specify relative to which loop-nest should the address be computed.
3536 For example, when the dataref is in an inner-loop nested in an
3537 outer-loop that is now being vectorized, LOOP can be either the
3538 outer-loop, or the inner-loop. The first memory location accessed
3539 by the following dataref ('in' points to short):
3541 for (i=0; i<N; i++)
3542 for (j=0; j<M; j++)
3543 s += in[i+j]
3545 is as follows:
3546 if LOOP=i_loop: &in (relative to i_loop)
3547 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3549 Output:
3550 1. Return an SSA_NAME whose value is the address of the memory location of
3551 the first vector of the data reference.
3552 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3553 these statement(s) which define the returned SSA_NAME.
3555 FORNOW: We are only handling array accesses with step 1. */
3557 tree
3558 vect_create_addr_base_for_vector_ref (gimple stmt,
3559 gimple_seq *new_stmt_list,
3560 tree offset,
3561 struct loop *loop)
3563 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3564 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3565 tree data_ref_base;
3566 const char *base_name;
3567 tree addr_base;
3568 tree dest;
3569 gimple_seq seq = NULL;
3570 tree base_offset;
3571 tree init;
3572 tree vect_ptr_type;
3573 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3574 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3576 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3578 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3580 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3582 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3583 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3584 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3586 else
3588 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3589 base_offset = unshare_expr (DR_OFFSET (dr));
3590 init = unshare_expr (DR_INIT (dr));
3593 if (loop_vinfo)
3594 base_name = get_name (data_ref_base);
3595 else
3597 base_offset = ssize_int (0);
3598 init = ssize_int (0);
3599 base_name = get_name (DR_REF (dr));
3602 /* Create base_offset */
3603 base_offset = size_binop (PLUS_EXPR,
3604 fold_convert (sizetype, base_offset),
3605 fold_convert (sizetype, init));
3607 if (offset)
3609 offset = fold_build2 (MULT_EXPR, sizetype,
3610 fold_convert (sizetype, offset), step);
3611 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3612 base_offset, offset);
3615 /* base + base_offset */
3616 if (loop_vinfo)
3617 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3618 else
3620 addr_base = build1 (ADDR_EXPR,
3621 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3622 unshare_expr (DR_REF (dr)));
3625 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3626 addr_base = fold_convert (vect_ptr_type, addr_base);
3627 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3628 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3629 gimple_seq_add_seq (new_stmt_list, seq);
3631 if (DR_PTR_INFO (dr)
3632 && TREE_CODE (addr_base) == SSA_NAME)
3634 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3635 if (offset)
3636 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3639 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3642 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3643 dump_printf (MSG_NOTE, "\n");
3646 return addr_base;
3650 /* Function vect_create_data_ref_ptr.
3652 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3653 location accessed in the loop by STMT, along with the def-use update
3654 chain to appropriately advance the pointer through the loop iterations.
3655 Also set aliasing information for the pointer. This pointer is used by
3656 the callers to this function to create a memory reference expression for
3657 vector load/store access.
3659 Input:
3660 1. STMT: a stmt that references memory. Expected to be of the form
3661 GIMPLE_ASSIGN <name, data-ref> or
3662 GIMPLE_ASSIGN <data-ref, name>.
3663 2. AGGR_TYPE: the type of the reference, which should be either a vector
3664 or an array.
3665 3. AT_LOOP: the loop where the vector memref is to be created.
3666 4. OFFSET (optional): an offset to be added to the initial address accessed
3667 by the data-ref in STMT.
3668 5. BSI: location where the new stmts are to be placed if there is no loop
3669 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3670 pointing to the initial address.
3672 Output:
3673 1. Declare a new ptr to vector_type, and have it point to the base of the
 3674       data reference (initial address accessed by the data reference).
3675 For example, for vector of type V8HI, the following code is generated:
3677 v8hi *ap;
3678 ap = (v8hi *)initial_address;
3680 if OFFSET is not supplied:
3681 initial_address = &a[init];
3682 if OFFSET is supplied:
3683 initial_address = &a[init + OFFSET];
3685 Return the initial_address in INITIAL_ADDRESS.
3687 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3688 update the pointer in each iteration of the loop.
3690 Return the increment stmt that updates the pointer in PTR_INCR.
3692 3. Set INV_P to true if the access pattern of the data reference in the
3693 vectorized loop is invariant. Set it to false otherwise.
3695 4. Return the pointer. */
3697 tree
3698 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3699 tree offset, tree *initial_address,
3700 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3701 bool only_init, bool *inv_p)
3703 const char *base_name;
3704 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3705 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3706 struct loop *loop = NULL;
3707 bool nested_in_vect_loop = false;
3708 struct loop *containing_loop = NULL;
3709 tree aggr_ptr_type;
3710 tree aggr_ptr;
3711 tree new_temp;
3712 gimple vec_stmt;
3713 gimple_seq new_stmt_list = NULL;
3714 edge pe = NULL;
3715 basic_block new_bb;
3716 tree aggr_ptr_init;
3717 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3718 tree aptr;
3719 gimple_stmt_iterator incr_gsi;
3720 bool insert_after;
3721 tree indx_before_incr, indx_after_incr;
3722 gimple incr;
3723 tree step;
3724 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3726 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3727 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3729 if (loop_vinfo)
3731 loop = LOOP_VINFO_LOOP (loop_vinfo);
3732 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3733 containing_loop = (gimple_bb (stmt))->loop_father;
3734 pe = loop_preheader_edge (loop);
3736 else
3738 gcc_assert (bb_vinfo);
3739 only_init = true;
3740 *ptr_incr = NULL;
3743 /* Check the step (evolution) of the load in LOOP, and record
3744 whether it's invariant. */
3745 if (nested_in_vect_loop)
3746 step = STMT_VINFO_DR_STEP (stmt_info);
3747 else
3748 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3750 if (integer_zerop (step))
3751 *inv_p = true;
3752 else
3753 *inv_p = false;
3755 /* Create an expression for the first address accessed by this load
3756 in LOOP. */
3757 base_name = get_name (DR_BASE_ADDRESS (dr));
3759 if (dump_enabled_p ())
3761 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3762 dump_printf_loc (MSG_NOTE, vect_location,
3763 "create %s-pointer variable to type: ",
3764 get_tree_code_name (TREE_CODE (aggr_type)));
3765 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3766 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3767 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3768 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3769 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3770 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3771 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3772 else
3773 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3774 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3775 dump_printf (MSG_NOTE, "\n");
3778 /* (1) Create the new aggregate-pointer variable.
3779 Vector and array types inherit the alias set of their component
3780 type by default, so we need to use a ref-all pointer if the data
3781 reference does not conflict with the created aggregate data
3782 reference because it is not addressable. */
3783 bool need_ref_all = false;
3784 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3785 get_alias_set (DR_REF (dr))))
3786 need_ref_all = true;
3787 /* Likewise for any of the data references in the stmt group. */
3788 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3790 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3793 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3794 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3795 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3796 get_alias_set (DR_REF (sdr))))
3798 need_ref_all = true;
3799 break;
3801 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3803 while (orig_stmt);
3805 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3806 need_ref_all);
3807 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3810 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3811 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3812 def-use update cycles for the pointer: one relative to the outer-loop
3813 (LOOP), which is what steps (2) and (3) below do. The other is relative
3814 to the inner-loop (which is the inner-most loop containing the dataref),
3815 and this is done by step (4) below.
3817 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3818 inner-most loop, and so steps (2),(3) work the same, and step (4) is
3819 redundant. Steps (2),(3) create the following:
3821 vp0 = &base_addr;
3822 LOOP: vp1 = phi(vp0,vp2)
3825 vp2 = vp1 + step
3826 goto LOOP
3828 If there is an inner-loop nested in LOOP, then step (4) will also be
3829 applied, and an additional update in the inner-loop will be created:
3831 vp0 = &base_addr;
3832 LOOP: vp1 = phi(vp0,vp2)
3834 inner: vp3 = phi(vp1,vp4)
3835 vp4 = vp3 + inner_step
3836 if () goto inner
3838 vp2 = vp1 + step
3839 if () goto LOOP */
3841 /* (2) Calculate the initial address of the aggregate-pointer, and set
3842 the aggregate-pointer to point to it before the loop. */
3844 /* Create: &(base[init_val+offset]) in the loop preheader. */
3846 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3847 offset, loop);
3848 if (new_stmt_list)
3850 if (pe)
3852 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3853 gcc_assert (!new_bb);
3855 else
3856 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3859 *initial_address = new_temp;
3861 /* Create: p = (aggr_type *) initial_base */
3862 if (TREE_CODE (new_temp) != SSA_NAME
3863 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3865 vec_stmt = gimple_build_assign (aggr_ptr,
3866 fold_convert (aggr_ptr_type, new_temp));
3867 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3868 /* Copy the points-to information if it exists. */
3869 if (DR_PTR_INFO (dr))
3870 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3871 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3872 if (pe)
3874 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3875 gcc_assert (!new_bb);
3877 else
3878 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3880 else
3881 aggr_ptr_init = new_temp;
3883 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3884 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3885 inner-loop nested in LOOP (during outer-loop vectorization). */
3887 /* No update in loop is required. */
3888 if (only_init && (!loop_vinfo || at_loop == loop))
3889 aptr = aggr_ptr_init;
3890 else
3892 /* The step of the aggregate pointer is the type size. */
3893 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3894 /* One exception to the above is when the scalar step of the load in
3895 LOOP is zero. In this case the step here is also zero. */
3896 if (*inv_p)
3897 iv_step = size_zero_node;
3898 else if (tree_int_cst_sgn (step) == -1)
3899 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3901 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3903 create_iv (aggr_ptr_init,
3904 fold_convert (aggr_ptr_type, iv_step),
3905 aggr_ptr, loop, &incr_gsi, insert_after,
3906 &indx_before_incr, &indx_after_incr);
3907 incr = gsi_stmt (incr_gsi);
3908 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3910 /* Copy the points-to information if it exists. */
3911 if (DR_PTR_INFO (dr))
3913 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3914 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3916 if (ptr_incr)
3917 *ptr_incr = incr;
3919 aptr = indx_before_incr;
3922 if (!nested_in_vect_loop || only_init)
3923 return aptr;
3926 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3927 nested in LOOP, if it exists. */
3929 gcc_assert (nested_in_vect_loop);
3930 if (!only_init)
3932 standard_iv_increment_position (containing_loop, &incr_gsi,
3933 &insert_after);
3934 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3935 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3936 &indx_after_incr);
3937 incr = gsi_stmt (incr_gsi);
3938 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3940 /* Copy the points-to information if it exists. */
3941 if (DR_PTR_INFO (dr))
3943 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3944 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3946 if (ptr_incr)
3947 *ptr_incr = incr;
3949 return indx_before_incr;
3951 else
3952 gcc_unreachable ();
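/* Illustrative sketch only (plain C, no GCC internals): a scalar model of
   the pointer def-use cycle that steps (2) and (3) above generate for an
   inner-most loop.  The element type, the 8-element "vector" and the
   iteration count are assumptions made up for the example.  */

typedef short model_v8hi[8];

static void
model_data_ref_ptr_cycle (short *initial_address, long n_vector_iters)
{
  /* Step (2): vp0 = (v8hi *) initial_address, emitted in the preheader.  */
  model_v8hi *vp = (model_v8hi *) initial_address;
  long i;

  for (i = 0; i < n_vector_iters; i++)
    {
      /* The vectorized accesses created by the callers go through vp here
         (modeled as a trivial read-modify-write of the first element).  */
      (*vp)[0] = (*vp)[0];

      /* Step (3): vp2 = vp1 + TYPE_SIZE_UNIT (aggr_type); the loop-header
         phi feeds vp2 back in as vp1 on the next iteration.  */
      vp = vp + 1;
    }
}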
3956 /* Function bump_vector_ptr
3958 Increment a pointer (to a vector type) by vector-size. If requested,
3959 i.e. if PTR_INCR is given, then also connect the new increment stmt
3960 to the existing def-use update-chain of the pointer, by modifying
3961 the PTR_INCR as illustrated below:
3963 The pointer def-use update-chain before this function:
3964 DATAREF_PTR = phi (p_0, p_2)
3965 ....
3966 PTR_INCR: p_2 = DATAREF_PTR + step
3968 The pointer def-use update-chain after this function:
3969 DATAREF_PTR = phi (p_0, p_2)
3970 ....
3971 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3972 ....
3973 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3975 Input:
3976 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3977 in the loop.
3978 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3979 the loop. The increment amount across iterations is expected
3980 to be vector_size.
3981 BSI - location where the new update stmt is to be placed.
3982 STMT - the original scalar memory-access stmt that is being vectorized.
3983 BUMP - optional. The offset by which to bump the pointer. If not given,
3984 the offset is assumed to be vector_size.
3986 Output: Return NEW_DATAREF_PTR as illustrated above.
3990 tree
3991 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3992 gimple stmt, tree bump)
3994 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3995 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3996 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3997 tree update = TYPE_SIZE_UNIT (vectype);
3998 gimple incr_stmt;
3999 ssa_op_iter iter;
4000 use_operand_p use_p;
4001 tree new_dataref_ptr;
4003 if (bump)
4004 update = bump;
4006 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4007 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4008 dataref_ptr, update);
4009 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4011 /* Copy the points-to information if it exists. */
4012 if (DR_PTR_INFO (dr))
4014 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4015 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4018 if (!ptr_incr)
4019 return new_dataref_ptr;
4021 /* Update the vector-pointer's cross-iteration increment. */
4022 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4024 tree use = USE_FROM_PTR (use_p);
4026 if (use == dataref_ptr)
4027 SET_USE (use_p, new_dataref_ptr);
4028 else
4029 gcc_assert (tree_int_cst_compare (use, update) == 0);
4032 return new_dataref_ptr;
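/* Illustrative sketch only (plain C, no GCC internals): a scalar model of
   the def-use chain documented above after bump_vector_ptr has run.  BUMP
   and STEP are both normally the vector size; the starting pointer and the
   loop bound are assumptions made up for the example.  */

static void
model_bumped_ptr_chain (const char *p_0, long bump, long step, long n)
{
  const char *dataref_ptr = p_0;   /* DATAREF_PTR = phi (p_0, p_2)  */
  long i;

  for (i = 0; i < n; i++)
    {
      /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP; the second vector access of
         the iteration goes through this pointer.  */
      const char *new_dataref_ptr = dataref_ptr + bump;

      /* PTR_INCR now reads: p_2 = NEW_DATAREF_PTR + step, so the next
         iteration starts past both accesses.  */
      dataref_ptr = new_dataref_ptr + step;
    }
}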
4036 /* Function vect_create_destination_var.
4038 Create a new temporary of type VECTYPE. */
4040 tree
4041 vect_create_destination_var (tree scalar_dest, tree vectype)
4043 tree vec_dest;
4044 const char *name;
4045 char *new_name;
4046 tree type;
4047 enum vect_var_kind kind;
4049 kind = vectype ? vect_simple_var : vect_scalar_var;
4050 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4052 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4054 name = get_name (scalar_dest);
4055 if (name)
4056 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4057 else
4058 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4059 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4060 free (new_name);
4062 return vec_dest;
4065 /* Function vect_grouped_store_supported.
4067 Returns TRUE if interleave high and interleave low permutations
4068 are supported, and FALSE otherwise. */
4070 bool
4071 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4073 enum machine_mode mode = TYPE_MODE (vectype);
4075 /* vect_permute_store_chain requires the group size to be a power of two. */
4076 if (exact_log2 (count) == -1)
4078 if (dump_enabled_p ())
4079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4080 "the size of the group of accesses"
4081 " is not a power of 2\n");
4082 return false;
4085 /* Check that the permutation is supported. */
4086 if (VECTOR_MODE_P (mode))
4088 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4089 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4090 for (i = 0; i < nelt / 2; i++)
4092 sel[i * 2] = i;
4093 sel[i * 2 + 1] = i + nelt;
4095 if (can_vec_perm_p (mode, false, sel))
4097 for (i = 0; i < nelt; i++)
4098 sel[i] += nelt / 2;
4099 if (can_vec_perm_p (mode, false, sel))
4100 return true;
4104 if (dump_enabled_p ())
4105 dump_printf (MSG_MISSED_OPTIMIZATION,
4106 "interleave op not supported by target.\n");
4107 return false;
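/* Illustrative sketch only: for an assumed 8-element vector mode, these are
   the two masks whose support the loops above query via can_vec_perm_p
   (indices refer to the 16-element concatenation of the two input vectors).  */
static const unsigned char model_interleave_high_sel[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
static const unsigned char model_interleave_low_sel[8]  = { 4, 12, 5, 13, 6, 14, 7, 15 };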
4111 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4112 type VECTYPE. */
4114 bool
4115 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4117 return vect_lanes_optab_supported_p ("vec_store_lanes",
4118 vec_store_lanes_optab,
4119 vectype, count);
4123 /* Function vect_permute_store_chain.
4125 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4126 a power of 2, generate interleave_high/low stmts to reorder the data
4127 correctly for the stores. Return the final references for stores in
4128 RESULT_CHAIN.
4130 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4131 The input is 4 vectors each containing 8 elements. We assign a number to
4132 each element, the input sequence is:
4134 1st vec: 0 1 2 3 4 5 6 7
4135 2nd vec: 8 9 10 11 12 13 14 15
4136 3rd vec: 16 17 18 19 20 21 22 23
4137 4th vec: 24 25 26 27 28 29 30 31
4139 The output sequence should be:
4141 1st vec: 0 8 16 24 1 9 17 25
4142 2nd vec: 2 10 18 26 3 11 19 27
4143 3rd vec: 4 12 20 28 5 13 21 29
4144 4th vec: 6 14 22 30 7 15 23 31
4146 i.e., we interleave the contents of the four vectors in their order.
4148 We use interleave_high/low instructions to create such output. The input of
4149 each interleave_high/low operation is two vectors:
4150 1st vec 2nd vec
4151 0 1 2 3 4 5 6 7
4152 the even elements of the result vector are obtained left-to-right from the
4153 high/low elements of the first vector. The odd elements of the result are
4154 obtained left-to-right from the high/low elements of the second vector.
4155 The output of interleave_high will be: 0 4 1 5
4156 and of interleave_low: 2 6 3 7
4159 The permutation is done in log LENGTH stages. In each stage interleave_high
4160 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4161 where the first argument is taken from the first half of DR_CHAIN and the
4162 second argument from its second half.
4163 In our example,
4165 I1: interleave_high (1st vec, 3rd vec)
4166 I2: interleave_low (1st vec, 3rd vec)
4167 I3: interleave_high (2nd vec, 4th vec)
4168 I4: interleave_low (2nd vec, 4th vec)
4170 The output for the first stage is:
4172 I1: 0 16 1 17 2 18 3 19
4173 I2: 4 20 5 21 6 22 7 23
4174 I3: 8 24 9 25 10 26 11 27
4175 I4: 12 28 13 29 14 30 15 31
4177 The output of the second stage, i.e. the final result is:
4179 I1: 0 8 16 24 1 9 17 25
4180 I2: 2 10 18 26 3 11 19 27
4181 I3: 4 12 20 28 5 13 21 29
4182 I4: 6 14 22 30 7 15 23 31. */
4184 void
4185 vect_permute_store_chain (vec<tree> dr_chain,
4186 unsigned int length,
4187 gimple stmt,
4188 gimple_stmt_iterator *gsi,
4189 vec<tree> *result_chain)
4191 tree vect1, vect2, high, low;
4192 gimple perm_stmt;
4193 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4194 tree perm_mask_low, perm_mask_high;
4195 unsigned int i, n;
4196 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4197 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4199 result_chain->quick_grow (length);
4200 memcpy (result_chain->address (), dr_chain.address (),
4201 length * sizeof (tree));
4203 for (i = 0, n = nelt / 2; i < n; i++)
4205 sel[i * 2] = i;
4206 sel[i * 2 + 1] = i + nelt;
4208 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4209 gcc_assert (perm_mask_high != NULL);
4211 for (i = 0; i < nelt; i++)
4212 sel[i] += nelt / 2;
4213 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4214 gcc_assert (perm_mask_low != NULL);
4216 for (i = 0, n = exact_log2 (length); i < n; i++)
4218 for (j = 0; j < length/2; j++)
4220 vect1 = dr_chain[j];
4221 vect2 = dr_chain[j+length/2];
4223 /* Create interleaving stmt:
4224 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4225 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4226 perm_stmt
4227 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4228 vect1, vect2, perm_mask_high);
4229 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4230 (*result_chain)[2*j] = high;
4232 /* Create interleaving stmt:
4233 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4234 nelt*3/2+1, ...}> */
4235 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4236 perm_stmt
4237 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4238 vect1, vect2, perm_mask_low);
4239 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4240 (*result_chain)[2*j+1] = low;
4242 memcpy (dr_chain.address (), result_chain->address (),
4243 length * sizeof (tree));
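/* Illustrative sketch only (plain C, no GCC internals): the interleaving
   scheme described above, applied to plain integer arrays standing in for
   the vectors in DR_CHAIN.  MODEL_NELT and the bound of 8 vectors are
   assumptions made up for the example; with LENGTH 4 the result matches
   the worked example in the comment above.  */

#define MODEL_NELT 8

static void
model_interleave_high_low (const int *v1, const int *v2, int *high, int *low)
{
  int i;
  for (i = 0; i < MODEL_NELT / 2; i++)
    {
      /* perm_mask_high = {0, nelt, 1, nelt+1, ...}  */
      high[i * 2] = v1[i];
      high[i * 2 + 1] = v2[i];
      /* perm_mask_low = {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, ...}  */
      low[i * 2] = v1[i + MODEL_NELT / 2];
      low[i * 2 + 1] = v2[i + MODEL_NELT / 2];
    }
}

static void
model_permute_store_chain (int chain[][MODEL_NELT], int length)
{
  int result[8][MODEL_NELT];
  int stage, j, k;

  /* log2 (LENGTH) stages, exactly as in the loop above.  */
  for (stage = 1; stage < length; stage *= 2)
    {
      for (j = 0; j < length / 2; j++)
        model_interleave_high_low (chain[j], chain[j + length / 2],
                                   result[2 * j], result[2 * j + 1]);
      for (k = 0; k < length; k++)
        for (j = 0; j < MODEL_NELT; j++)
          chain[k][j] = result[k][j];
    }
}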
4247 /* Function vect_setup_realignment
4249 This function is called when vectorizing an unaligned load using
4250 the dr_explicit_realign[_optimized] scheme.
4251 This function generates the following code at the loop prolog:
4253 p = initial_addr;
4254 x msq_init = *(floor(p)); # prolog load
4255 realignment_token = call target_builtin;
4256 loop:
4257 x msq = phi (msq_init, ---)
4259 The stmts marked with x are generated only for the case of
4260 dr_explicit_realign_optimized.
4262 The code above sets up a new (vector) pointer, pointing to the first
4263 location accessed by STMT, and a "floor-aligned" load using that pointer.
4264 It also generates code to compute the "realignment-token" (if the relevant
4265 target hook was defined), and creates a phi-node at the loop-header bb
4266 whose arguments are the result of the prolog-load (created by this
4267 function) and the result of a load that takes place in the loop (to be
4268 created by the caller to this function).
4270 For the case of dr_explicit_realign_optimized:
4271 The caller to this function uses the phi-result (msq) to create the
4272 realignment code inside the loop, and sets up the missing phi argument,
4273 as follows:
4274 loop:
4275 msq = phi (msq_init, lsq)
4276 lsq = *(floor(p')); # load in loop
4277 result = realign_load (msq, lsq, realignment_token);
4279 For the case of dr_explicit_realign:
4280 loop:
4281 msq = *(floor(p)); # load in loop
4282 p' = p + (VS-1);
4283 lsq = *(floor(p')); # load in loop
4284 result = realign_load (msq, lsq, realignment_token);
4286 Input:
4287 STMT - (scalar) load stmt to be vectorized. This load accesses
4288 a memory location that may be unaligned.
4289 BSI - place where new code is to be inserted.
4290 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4291 is used.
4293 Output:
4294 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4295 target hook, if defined.
4296 Return value - the result of the loop-header phi node. */
4298 tree
4299 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4300 tree *realignment_token,
4301 enum dr_alignment_support alignment_support_scheme,
4302 tree init_addr,
4303 struct loop **at_loop)
4305 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4306 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4307 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4308 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4309 struct loop *loop = NULL;
4310 edge pe = NULL;
4311 tree scalar_dest = gimple_assign_lhs (stmt);
4312 tree vec_dest;
4313 gimple inc;
4314 tree ptr;
4315 tree data_ref;
4316 gimple new_stmt;
4317 basic_block new_bb;
4318 tree msq_init = NULL_TREE;
4319 tree new_temp;
4320 gimple phi_stmt;
4321 tree msq = NULL_TREE;
4322 gimple_seq stmts = NULL;
4323 bool inv_p;
4324 bool compute_in_loop = false;
4325 bool nested_in_vect_loop = false;
4326 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4327 struct loop *loop_for_initial_load = NULL;
4329 if (loop_vinfo)
4331 loop = LOOP_VINFO_LOOP (loop_vinfo);
4332 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4335 gcc_assert (alignment_support_scheme == dr_explicit_realign
4336 || alignment_support_scheme == dr_explicit_realign_optimized);
4338 /* We need to generate three things:
4339 1. the misalignment computation
4340 2. the extra vector load (for the optimized realignment scheme).
4341 3. the phi node for the two vectors from which the realignment is
4342 done (for the optimized realignment scheme). */
4344 /* 1. Determine where to generate the misalignment computation.
4346 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4347 calculation will be generated by this function, outside the loop (in the
4348 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4349 caller, inside the loop.
4351 Background: If the misalignment remains fixed throughout the iterations of
4352 the loop, then both realignment schemes are applicable, and also the
4353 misalignment computation can be done outside LOOP. This is because we are
4354 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4355 are a multiple of VS (the Vector Size), and therefore the misalignment in
4356 different vectorized LOOP iterations is always the same.
4357 The problem arises only if the memory access is in an inner-loop nested
4358 inside LOOP, which is now being vectorized using outer-loop vectorization.
4359 This is the only case when the misalignment of the memory access may not
4360 remain fixed throughout the iterations of the inner-loop (as explained in
4361 detail in vect_supportable_dr_alignment). In this case, not only is the
4362 optimized realignment scheme not applicable, but also the misalignment
4363 computation (and generation of the realignment token that is passed to
4364 REALIGN_LOAD) has to be done inside the loop.
4366 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4367 or not, which in turn determines if the misalignment is computed inside
4368 the inner-loop, or outside LOOP. */
4370 if (init_addr != NULL_TREE || !loop_vinfo)
4372 compute_in_loop = true;
4373 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4377 /* 2. Determine where to generate the extra vector load.
4379 For the optimized realignment scheme, instead of generating two vector
4380 loads in each iteration, we generate a single extra vector load in the
4381 preheader of the loop, and in each iteration reuse the result of the
4382 vector load from the previous iteration. In case the memory access is in
4383 an inner-loop nested inside LOOP, which is now being vectorized using
4384 outer-loop vectorization, we need to determine whether this initial vector
4385 load should be generated at the preheader of the inner-loop, or can be
4386 generated at the preheader of LOOP. If the memory access has no evolution
4387 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4388 to be generated inside LOOP (in the preheader of the inner-loop). */
4390 if (nested_in_vect_loop)
4392 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4393 bool invariant_in_outerloop =
4394 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4395 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4397 else
4398 loop_for_initial_load = loop;
4399 if (at_loop)
4400 *at_loop = loop_for_initial_load;
4402 if (loop_for_initial_load)
4403 pe = loop_preheader_edge (loop_for_initial_load);
4405 /* 3. For the case of the optimized realignment, create the first vector
4406 load at the loop preheader. */
4408 if (alignment_support_scheme == dr_explicit_realign_optimized)
4410 /* Create msq_init = *(floor(p1)) in the loop preheader */
4412 gcc_assert (!compute_in_loop);
4413 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4414 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4415 NULL_TREE, &init_addr, NULL, &inc,
4416 true, &inv_p);
4417 new_temp = copy_ssa_name (ptr, NULL);
4418 new_stmt = gimple_build_assign_with_ops
4419 (BIT_AND_EXPR, new_temp, ptr,
4420 build_int_cst (TREE_TYPE (ptr),
4421 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4422 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4423 gcc_assert (!new_bb);
4424 data_ref
4425 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4426 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4427 new_stmt = gimple_build_assign (vec_dest, data_ref);
4428 new_temp = make_ssa_name (vec_dest, new_stmt);
4429 gimple_assign_set_lhs (new_stmt, new_temp);
4430 if (pe)
4432 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4433 gcc_assert (!new_bb);
4435 else
4436 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4438 msq_init = gimple_assign_lhs (new_stmt);
4441 /* 4. Create realignment token using a target builtin, if available.
4442 It is done either inside the containing loop, or before LOOP (as
4443 determined above). */
4445 if (targetm.vectorize.builtin_mask_for_load)
4447 tree builtin_decl;
4449 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4450 if (!init_addr)
4452 /* Generate the INIT_ADDR computation outside LOOP. */
4453 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4454 NULL_TREE, loop);
4455 if (loop)
4457 pe = loop_preheader_edge (loop);
4458 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4459 gcc_assert (!new_bb);
4461 else
4462 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4465 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4466 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4467 vec_dest =
4468 vect_create_destination_var (scalar_dest,
4469 gimple_call_return_type (new_stmt));
4470 new_temp = make_ssa_name (vec_dest, new_stmt);
4471 gimple_call_set_lhs (new_stmt, new_temp);
4473 if (compute_in_loop)
4474 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4475 else
4477 /* Generate the misalignment computation outside LOOP. */
4478 pe = loop_preheader_edge (loop);
4479 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4480 gcc_assert (!new_bb);
4483 *realignment_token = gimple_call_lhs (new_stmt);
4485 /* The result of the CALL_EXPR to this builtin is determined from
4486 the value of the parameter and no global variables are touched,
4487 which makes the builtin a "const" function. Requiring the
4488 builtin to have the "const" attribute makes it unnecessary
4489 to call mark_call_clobbered. */
4490 gcc_assert (TREE_READONLY (builtin_decl));
4493 if (alignment_support_scheme == dr_explicit_realign)
4494 return msq;
4496 gcc_assert (!compute_in_loop);
4497 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4500 /* 5. Create msq = phi <msq_init, lsq> in loop */
4502 pe = loop_preheader_edge (containing_loop);
4503 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4504 msq = make_ssa_name (vec_dest, NULL);
4505 phi_stmt = create_phi_node (msq, containing_loop->header);
4506 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4508 return msq;
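/* Illustrative sketch only (plain C, no GCC internals): a scalar model of
   the dr_explicit_realign scheme whose prolog is set up above, using
   16-byte "vectors" of chars.  REALIGN_LOAD is modeled as an explicit byte
   selection; MODEL_VS and the char element type are assumptions made up
   for the example.  The caller must make one extra aligned vector's worth
   of bytes readable past the end, just as the real scheme relies on.  */

#define MODEL_VS 16

static void
model_realign_load (const unsigned char *msq, const unsigned char *lsq,
                    unsigned int misalign, unsigned char *result)
{
  unsigned int i;
  /* Combine the tail of the first aligned load with the head of the
     second one, as selected by the realignment token.  */
  for (i = 0; i < MODEL_VS; i++)
    result[i] = (i + misalign < MODEL_VS
                 ? msq[i + misalign] : lsq[i + misalign - MODEL_VS]);
}

static void
model_explicit_realign (const unsigned char *aligned_base,
                        unsigned int misalign, unsigned char *out, long n)
{
  long i;
  /* The scalar pointer being vectorized is aligned_base + misalign.  */
  for (i = 0; i < n; i++)
    {
      const unsigned char *msq = aligned_base + i * MODEL_VS;  /* floor (p)   */
      const unsigned char *lsq = msq + MODEL_VS;               /* floor (p')  */
      model_realign_load (msq, lsq, misalign, out + i * MODEL_VS);
      /* In dr_explicit_realign_optimized, the next iteration's msq is simply
         this iteration's lsq, so only one load per iteration (plus the
         prolog load created above) is needed.  */
    }
}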
4512 /* Function vect_grouped_load_supported.
4514 Returns TRUE if even and odd permutations are supported,
4515 and FALSE otherwise. */
4517 bool
4518 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4520 enum machine_mode mode = TYPE_MODE (vectype);
4522 /* vect_permute_load_chain requires the group size to be a power of two. */
4523 if (exact_log2 (count) == -1)
4525 if (dump_enabled_p ())
4526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4527 "the size of the group of accesses"
4528 " is not a power of 2\n");
4529 return false;
4532 /* Check that the permutation is supported. */
4533 if (VECTOR_MODE_P (mode))
4535 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4536 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4538 for (i = 0; i < nelt; i++)
4539 sel[i] = i * 2;
4540 if (can_vec_perm_p (mode, false, sel))
4542 for (i = 0; i < nelt; i++)
4543 sel[i] = i * 2 + 1;
4544 if (can_vec_perm_p (mode, false, sel))
4545 return true;
4549 if (dump_enabled_p ())
4550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4551 "extract even/odd not supported by target\n");
4552 return false;
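/* Illustrative sketch only: for an assumed 8-element vector mode, the two
   masks whose support the loops above query via can_vec_perm_p (indices
   refer to the 16-element concatenation of the two input vectors).  */
static const unsigned char model_extract_even_sel[8] = { 0, 2, 4, 6, 8, 10, 12, 14 };
static const unsigned char model_extract_odd_sel[8]  = { 1, 3, 5, 7, 9, 11, 13, 15 };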
4555 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4556 type VECTYPE. */
4558 bool
4559 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4561 return vect_lanes_optab_supported_p ("vec_load_lanes",
4562 vec_load_lanes_optab,
4563 vectype, count);
4566 /* Function vect_permute_load_chain.
4568 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4569 a power of 2, generate extract_even/odd stmts to reorder the input data
4570 correctly. Return the final references for loads in RESULT_CHAIN.
4572 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4573 The input is 4 vectors each containing 8 elements. We assign a number to each
4574 element, the input sequence is:
4576 1st vec: 0 1 2 3 4 5 6 7
4577 2nd vec: 8 9 10 11 12 13 14 15
4578 3rd vec: 16 17 18 19 20 21 22 23
4579 4th vec: 24 25 26 27 28 29 30 31
4581 The output sequence should be:
4583 1st vec: 0 4 8 12 16 20 24 28
4584 2nd vec: 1 5 9 13 17 21 25 29
4585 3rd vec: 2 6 10 14 18 22 26 30
4586 4th vec: 3 7 11 15 19 23 27 31
4588 i.e., the first output vector should contain the first elements of each
4589 interleaving group, etc.
4591 We use extract_even/odd instructions to create such output. The input of
4592 each extract_even/odd operation is two vectors
4593 1st vec 2nd vec
4594 0 1 2 3 4 5 6 7
4596 and the output is the vector of extracted even/odd elements. The output of
4597 extract_even will be: 0 2 4 6
4598 and of extract_odd: 1 3 5 7
4601 The permutation is done in log LENGTH stages. In each stage extract_even
4602 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4603 their order. In our example,
4605 E1: extract_even (1st vec, 2nd vec)
4606 E2: extract_odd (1st vec, 2nd vec)
4607 E3: extract_even (3rd vec, 4th vec)
4608 E4: extract_odd (3rd vec, 4th vec)
4610 The output for the first stage will be:
4612 E1: 0 2 4 6 8 10 12 14
4613 E2: 1 3 5 7 9 11 13 15
4614 E3: 16 18 20 22 24 26 28 30
4615 E4: 17 19 21 23 25 27 29 31
4617 In order to proceed and create the correct sequence for the next stage (or
4618 for the correct output, if the second stage is the last one, as in our
4619 example), we first put the output of the extract_even operation and then the
4620 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4621 The input for the second stage is:
4623 1st vec (E1): 0 2 4 6 8 10 12 14
4624 2nd vec (E3): 16 18 20 22 24 26 28 30
4625 3rd vec (E2): 1 3 5 7 9 11 13 15
4626 4th vec (E4): 17 19 21 23 25 27 29 31
4628 The output of the second stage:
4630 E1: 0 4 8 12 16 20 24 28
4631 E2: 2 6 10 14 18 22 26 30
4632 E3: 1 5 9 13 17 21 25 29
4633 E4: 3 7 11 15 19 23 27 31
4635 And RESULT_CHAIN after reordering:
4637 1st vec (E1): 0 4 8 12 16 20 24 28
4638 2nd vec (E3): 1 5 9 13 17 21 25 29
4639 3rd vec (E2): 2 6 10 14 18 22 26 30
4640 4th vec (E4): 3 7 11 15 19 23 27 31. */
4642 static void
4643 vect_permute_load_chain (vec<tree> dr_chain,
4644 unsigned int length,
4645 gimple stmt,
4646 gimple_stmt_iterator *gsi,
4647 vec<tree> *result_chain)
4649 tree data_ref, first_vect, second_vect;
4650 tree perm_mask_even, perm_mask_odd;
4651 gimple perm_stmt;
4652 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4653 unsigned int i, j, log_length = exact_log2 (length);
4654 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4655 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4657 result_chain->quick_grow (length);
4658 memcpy (result_chain->address (), dr_chain.address (),
4659 length * sizeof (tree));
4661 for (i = 0; i < nelt; ++i)
4662 sel[i] = i * 2;
4663 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4664 gcc_assert (perm_mask_even != NULL);
4666 for (i = 0; i < nelt; ++i)
4667 sel[i] = i * 2 + 1;
4668 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4669 gcc_assert (perm_mask_odd != NULL);
4671 for (i = 0; i < log_length; i++)
4673 for (j = 0; j < length; j += 2)
4675 first_vect = dr_chain[j];
4676 second_vect = dr_chain[j+1];
4678 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4679 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4680 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4681 first_vect, second_vect,
4682 perm_mask_even);
4683 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4684 (*result_chain)[j/2] = data_ref;
4686 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4687 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4688 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4689 first_vect, second_vect,
4690 perm_mask_odd);
4691 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4692 (*result_chain)[j/2+length/2] = data_ref;
4694 memcpy (dr_chain.address (), result_chain->address (),
4695 length * sizeof (tree));
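/* Illustrative sketch only (plain C, no GCC internals): the extract
   even/odd scheme described above, applied to plain integer arrays standing
   in for the vectors in DR_CHAIN.  MODEL_NELT and the bound of 8 vectors
   are assumptions made up for the example; with LENGTH 4 the result matches
   the worked example in the comment above.  */

#define MODEL_NELT 8

static void
model_extract_even_odd (const int *v1, const int *v2, int *even, int *odd)
{
  int i;
  for (i = 0; i < MODEL_NELT / 2; i++)
    {
      /* perm_mask_even = {0, 2, 4, ...}, perm_mask_odd = {1, 3, 5, ...},
         indexing the concatenation of V1 and V2.  */
      even[i] = v1[2 * i];
      even[i + MODEL_NELT / 2] = v2[2 * i];
      odd[i] = v1[2 * i + 1];
      odd[i + MODEL_NELT / 2] = v2[2 * i + 1];
    }
}

static void
model_permute_load_chain (int chain[][MODEL_NELT], int length)
{
  int result[8][MODEL_NELT];
  int stage, j, k;

  /* log2 (LENGTH) stages; note where the even and odd halves land.  */
  for (stage = 1; stage < length; stage *= 2)
    {
      for (j = 0; j < length; j += 2)
        model_extract_even_odd (chain[j], chain[j + 1],
                                result[j / 2], result[j / 2 + length / 2]);
      for (k = 0; k < length; k++)
        for (j = 0; j < MODEL_NELT; j++)
          chain[k][j] = result[k][j];
    }
}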
4700 /* Function vect_transform_grouped_load.
4702 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4703 to perform their permutation and ascribe the resulting vectorized statements to
4704 the scalar statements.
4707 void
4708 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4709 gimple_stmt_iterator *gsi)
4711 vec<tree> result_chain = vNULL;
4713 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4714 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
4715 vectors that are ready for vector computation. */
4716 result_chain.create (size);
4717 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4718 vect_record_grouped_load_vectors (stmt, result_chain);
4719 result_chain.release ();
4722 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4723 generated as part of the vectorization of STMT. Assign the statement
4724 for each vector to the associated scalar statement. */
4726 void
4727 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4729 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4730 gimple next_stmt, new_stmt;
4731 unsigned int i, gap_count;
4732 tree tmp_data_ref;
4734 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4735 Since we scan the chain starting from its first node, their order
4736 corresponds to the order of data-refs in RESULT_CHAIN. */
4737 next_stmt = first_stmt;
4738 gap_count = 1;
4739 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4741 if (!next_stmt)
4742 break;
4744 /* Skip the gaps. Loads created for the gaps will be removed by dead
4745 code elimination pass later. No need to check for the first stmt in
4746 the group, since it always exists.
4747 GROUP_GAP is the number of steps in elements from the previous
4748 access (if there is no gap GROUP_GAP is 1). We skip loads that
4749 correspond to the gaps. */
4750 if (next_stmt != first_stmt
4751 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4753 gap_count++;
4754 continue;
4757 while (next_stmt)
4759 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4760 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4761 copies, and we put the new vector statement in the first available
4762 RELATED_STMT. */
4763 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4764 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4765 else
4767 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4769 gimple prev_stmt =
4770 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4771 gimple rel_stmt =
4772 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4773 while (rel_stmt)
4775 prev_stmt = rel_stmt;
4776 rel_stmt =
4777 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4780 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4781 new_stmt;
4785 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4786 gap_count = 1;
4787 /* If NEXT_STMT accesses the same DR as the previous statement,
4788 put the same TMP_DATA_REF as its vectorized statement; otherwise
4789 get the next data-ref from RESULT_CHAIN. */
4790 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4791 break;
4796 /* Function vect_can_force_dr_alignment_p.
4798 Returns whether the alignment of a DECL can be forced to be aligned
4799 on an ALIGNMENT-bit boundary. */
4801 bool
4802 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4804 if (TREE_CODE (decl) != VAR_DECL)
4805 return false;
4807 /* We cannot change alignment of common or external symbols as another
4808 translation unit may contain a definition with lower alignment.
4809 The rules of common symbol linking mean that the definition
4810 will override the common symbol. The same is true for constant
4811 pool entries which may be shared and are not properly merged
4812 by LTO. */
4813 if (DECL_EXTERNAL (decl)
4814 || DECL_COMMON (decl)
4815 || DECL_IN_CONSTANT_POOL (decl))
4816 return false;
4818 if (TREE_ASM_WRITTEN (decl))
4819 return false;
4821 /* Do not override the alignment as specified by the ABI when the used
4822 attribute is set. */
4823 if (DECL_PRESERVE_P (decl))
4824 return false;
4826 /* Do not override explicit alignment set by the user when an explicit
4827 section name is also used. This is a common idiom used by many
4828 software projects. */
4829 if (DECL_SECTION_NAME (decl) != NULL_TREE
4830 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4831 return false;
4833 if (TREE_STATIC (decl))
4834 return (alignment <= MAX_OFILE_ALIGNMENT);
4835 else
4836 return (alignment <= MAX_STACK_ALIGNMENT);
4840 /* Return whether the data reference DR is supported with respect to its
4841 alignment.
4842 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
4843 it is aligned, i.e., check if it is possible to vectorize it with different
4844 alignment. */
4846 enum dr_alignment_support
4847 vect_supportable_dr_alignment (struct data_reference *dr,
4848 bool check_aligned_accesses)
4850 gimple stmt = DR_STMT (dr);
4851 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4852 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4853 enum machine_mode mode = TYPE_MODE (vectype);
4854 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4855 struct loop *vect_loop = NULL;
4856 bool nested_in_vect_loop = false;
4858 if (aligned_access_p (dr) && !check_aligned_accesses)
4859 return dr_aligned;
4861 if (loop_vinfo)
4863 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4864 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4867 /* Possibly unaligned access. */
4869 /* We can choose between using the implicit realignment scheme (generating
4870 a misaligned_move stmt) and the explicit realignment scheme (generating
4871 aligned loads with a REALIGN_LOAD). There are two variants to the
4872 explicit realignment scheme: optimized, and unoptimized.
4873 We can optimize the realignment only if the step between consecutive
4874 vector loads is equal to the vector size. Since the vector memory
4875 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4876 is guaranteed that the misalignment amount remains the same throughout the
4877 execution of the vectorized loop. Therefore, we can create the
4878 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4879 at the loop preheader.
4881 However, in the case of outer-loop vectorization, when vectorizing a
4882 memory access in the inner-loop nested within the LOOP that is now being
4883 vectorized, while it is guaranteed that the misalignment of the
4884 vectorized memory access will remain the same in different outer-loop
4885 iterations, it is *not* guaranteed that it will remain the same throughout
4886 the execution of the inner-loop. This is because the inner-loop advances
4887 with the original scalar step (and not in steps of VS). If the inner-loop
4888 step happens to be a multiple of VS, then the misalignment remains fixed
4889 and we can use the optimized realignment scheme. For example:
4891 for (i=0; i<N; i++)
4892 for (j=0; j<M; j++)
4893 s += a[i+j];
4895 When vectorizing the i-loop in the above example, the step between
4896 consecutive vector loads is 1, and so the misalignment does not remain
4897 fixed across the execution of the inner-loop, and the realignment cannot
4898 be optimized (as illustrated in the following pseudo vectorized loop):
4900 for (i=0; i<N; i+=4)
4901 for (j=0; j<M; j++){
4902 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4903 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4904 // (assuming that we start from an aligned address).
4907 We therefore have to use the unoptimized realignment scheme:
4909 for (i=0; i<N; i+=4)
4910 for (j=k; j<M; j+=4)
4911 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4912 // that the misalignment of the initial address is
4913 // 0).
4915 The loop can then be vectorized as follows:
4917 for (k=0; k<4; k++){
4918 rt = get_realignment_token (&vp[k]);
4919 for (i=0; i<N; i+=4){
4920 v1 = vp[i+k];
4921 for (j=k; j<M; j+=4){
4922 v2 = vp[i+j+VS-1];
4923 va = REALIGN_LOAD <v1,v2,rt>;
4924 vs += va;
4925 v1 = v2;
4928 } */
4930 if (DR_IS_READ (dr))
4932 bool is_packed = false;
4933 tree type = (TREE_TYPE (DR_REF (dr)));
4935 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4936 && (!targetm.vectorize.builtin_mask_for_load
4937 || targetm.vectorize.builtin_mask_for_load ()))
4939 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4940 if ((nested_in_vect_loop
4941 && (TREE_INT_CST_LOW (DR_STEP (dr))
4942 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4943 || !loop_vinfo)
4944 return dr_explicit_realign;
4945 else
4946 return dr_explicit_realign_optimized;
4948 if (!known_alignment_for_access_p (dr))
4949 is_packed = not_size_aligned (DR_REF (dr));
4951 if ((TYPE_USER_ALIGN (type) && !is_packed)
4952 || targetm.vectorize.
4953 support_vector_misalignment (mode, type,
4954 DR_MISALIGNMENT (dr), is_packed))
4955 /* Can't software pipeline the loads, but can at least do them. */
4956 return dr_unaligned_supported;
4958 else
4960 bool is_packed = false;
4961 tree type = (TREE_TYPE (DR_REF (dr)));
4963 if (!known_alignment_for_access_p (dr))
4964 is_packed = not_size_aligned (DR_REF (dr));
4966 if ((TYPE_USER_ALIGN (type) && !is_packed)
4967 || targetm.vectorize.
4968 support_vector_misalignment (mode, type,
4969 DR_MISALIGNMENT (dr), is_packed))
4970 return dr_unaligned_supported;
4973 /* Unsupported. */
4974 return dr_unaligned_unsupported;
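/* Illustrative sketch only: the misalignment (in elements, with an assumed
   vector size of 4 elements) of the access in inner-loop iteration J, as
   discussed in the comment inside vect_supportable_dr_alignment above.
   The optimized realignment scheme is usable only when this value does not
   depend on J, i.e. when the inner-loop step is a multiple of VS.  */

static unsigned int
model_inner_loop_misalignment (unsigned int initial_misalign,
                               unsigned int inner_step_in_elements,
                               unsigned int j)
{
  const unsigned int vs = 4;
  /* With inner_step_in_elements == 1, as in the s += a[i+j] example above,
     this cycles through 0, 1, 2, 3, 0, ... as j increases.  */
  return (initial_misalign + j * inner_step_in_elements) % vs;
}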