PR tree-optimization/37385
[official-gcc/alias-decl.git] / gcc / tree-vect-transform.c
1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "ggc.h"
26 #include "tree.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
50 slp_tree, slp_instance);
51 static tree vect_create_destination_var (tree, tree);
52 static tree vect_create_data_ref_ptr
53 (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
54 static tree vect_create_addr_base_for_vector_ref
55 (gimple, gimple_seq *, tree, struct loop *);
56 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
57 static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
58 static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
59 static void vect_finish_stmt_generation
60 (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
61 static bool vect_is_simple_cond (tree, loop_vec_info);
62 static void vect_create_epilog_for_reduction
63 (tree, gimple, int, enum tree_code, gimple);
64 static tree get_initial_def_for_reduction (gimple, tree, tree *);
66 /* Utility function dealing with loop peeling (not peeling itself). */
67 static void vect_generate_tmps_on_preheader
68 (loop_vec_info, tree *, tree *, tree *);
69 static tree vect_build_loop_niters (loop_vec_info);
70 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
71 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
72 static void vect_update_init_of_dr (struct data_reference *, tree niters);
73 static void vect_update_inits_of_drs (loop_vec_info, tree);
74 static int vect_min_worthwhile_factor (enum tree_code);
77 static int
78 cost_for_stmt (gimple stmt)
80 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
82 switch (STMT_VINFO_TYPE (stmt_info))
84 case load_vec_info_type:
85 return TARG_SCALAR_LOAD_COST;
86 case store_vec_info_type:
87 return TARG_SCALAR_STORE_COST;
88 case op_vec_info_type:
89 case condition_vec_info_type:
90 case assignment_vec_info_type:
91 case reduc_vec_info_type:
92 case induc_vec_info_type:
93 case type_promotion_vec_info_type:
94 case type_demotion_vec_info_type:
95 case type_conversion_vec_info_type:
96 case call_vec_info_type:
97 return TARG_SCALAR_STMT_COST;
98 case undef_vec_info_type:
99 default:
100 gcc_unreachable ();
105 /* Function vect_estimate_min_profitable_iters
107 Return the number of iterations required for the vector version of the
108 loop to be profitable relative to the cost of the scalar version of the
109 loop.
111 TODO: Take profile info into account before making vectorization
112 decisions, if available. */
115 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
117 int i;
118 int min_profitable_iters;
119 int peel_iters_prologue;
120 int peel_iters_epilogue;
121 int vec_inside_cost = 0;
122 int vec_outside_cost = 0;
123 int scalar_single_iter_cost = 0;
124 int scalar_outside_cost = 0;
125 bool runtime_test = false;
126 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
128 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
129 int nbbs = loop->num_nodes;
130 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
131 int peel_guard_costs = 0;
132 int innerloop_iters = 0, factor;
133 VEC (slp_instance, heap) *slp_instances;
134 slp_instance instance;
136 /* Cost model disabled. */
137 if (!flag_vect_cost_model)
139 if (vect_print_dump_info (REPORT_COST))
140 fprintf (vect_dump, "cost model disabled.");
141 return 0;
144 /* If the number of iterations is unknown, or the
145 peeling-for-misalignment amount is unknown, we will have to generate
146 a runtime test to test the loop count against the threshold. */
147 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
148 || (byte_misalign < 0))
149 runtime_test = true;
151 /* Requires loop versioning tests to handle misalignment. */
153 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
155 /* FIXME: Make cost depend on complexity of individual check. */
156 vec_outside_cost +=
157 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
158 if (vect_print_dump_info (REPORT_COST))
159 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
160 "versioning to treat misalignment.\n");
163 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 /* FIXME: Make cost depend on complexity of individual check. */
166 vec_outside_cost +=
167 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
168 if (vect_print_dump_info (REPORT_COST))
169 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
170 "versioning aliasing.\n");
173 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
174 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
176 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
179 /* Count statements in scalar loop. Using this as scalar cost for a single
180 iteration for now.
182 TODO: Add outer loop support.
184 TODO: Consider assigning different costs to different scalar
185 statements. */
187 /* FORNOW. */
188 if (loop->inner)
189 innerloop_iters = 50; /* FIXME */
191 for (i = 0; i < nbbs; i++)
193 gimple_stmt_iterator si;
194 basic_block bb = bbs[i];
196 if (bb->loop_father == loop->inner)
197 factor = innerloop_iters;
198 else
199 factor = 1;
201 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
203 gimple stmt = gsi_stmt (si);
204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
205 /* Skip stmts that are not vectorized inside the loop. */
206 if (!STMT_VINFO_RELEVANT_P (stmt_info)
207 && (!STMT_VINFO_LIVE_P (stmt_info)
208 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
209 continue;
210 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
211 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
212 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
213 some of the "outside" costs are generated inside the outer-loop. */
214 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
218 /* Add additional cost for the peeled instructions in prologue and epilogue
219 loop.
221 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
222 at compile-time - we assume it's vf/2 (the worst would be vf-1).
224 TODO: Build an expression that represents peel_iters for prologue and
225 epilogue to be used in a run-time test. */
227 if (byte_misalign < 0)
229 peel_iters_prologue = vf/2;
230 if (vect_print_dump_info (REPORT_COST))
231 fprintf (vect_dump, "cost model: "
232 "prologue peel iters set to vf/2.");
234 /* If peeling for alignment is unknown, loop bound of main loop becomes
235 unknown. */
236 peel_iters_epilogue = vf/2;
237 if (vect_print_dump_info (REPORT_COST))
238 fprintf (vect_dump, "cost model: "
239 "epilogue peel iters set to vf/2 because "
240 "peeling for alignment is unknown .");
242 /* If peeled iterations are unknown, count a taken branch and a not taken
243 branch per peeled loop. Even if scalar loop iterations are known,
244 vector iterations are not known since peeled prologue iterations are
245 not known. Hence guards remain the same. */
246 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
247 + TARG_COND_NOT_TAKEN_BRANCH_COST);
250 else
252 if (byte_misalign)
254 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
255 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
256 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
257 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
259 peel_iters_prologue = nelements - (byte_misalign / element_size);
261 else
262 peel_iters_prologue = 0;
264 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
266 peel_iters_epilogue = vf/2;
267 if (vect_print_dump_info (REPORT_COST))
268 fprintf (vect_dump, "cost model: "
269 "epilogue peel iters set to vf/2 because "
270 "loop iterations are unknown .");
 272       /* If peeled iterations are known but the number of scalar loop
 273          iterations is unknown, count a taken branch per peeled loop.  */
274 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
277 else
279 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
280 peel_iters_prologue = niters < peel_iters_prologue ?
281 niters : peel_iters_prologue;
282 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
286 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
287 + (peel_iters_epilogue * scalar_single_iter_cost)
288 + peel_guard_costs;
290 /* FORNOW: The scalar outside cost is incremented in one of the
291 following ways:
293 1. The vectorizer checks for alignment and aliasing and generates
294 a condition that allows dynamic vectorization. A cost model
 295      check is ANDed with the versioning condition.  Hence the scalar code
296 path now has the added cost of the versioning check.
298 if (cost > th & versioning_check)
299 jmp to vector code
 301      Hence the run-time scalar cost is incremented by the not-taken branch cost.
303 2. The vectorizer then checks if a prologue is required. If the
304 cost model check was not done before during versioning, it has to
305 be done before the prologue check.
307 if (cost <= th)
308 prologue = scalar_iters
309 if (prologue == 0)
310 jmp to vector code
311 else
312 execute prologue
313 if (prologue == num_iters)
314 go to exit
316 Hence the run-time scalar cost is incremented by a taken branch,
317 plus a not-taken branch, plus a taken branch cost.
319 3. The vectorizer then checks if an epilogue is required. If the
320 cost model check was not done before during prologue check, it
321 has to be done with the epilogue check.
323 if (prologue == 0)
324 jmp to vector code
325 else
326 execute prologue
327 if (prologue == num_iters)
328 go to exit
329 vector code:
330 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
331 jmp to epilogue
333 Hence the run-time scalar cost should be incremented by 2 taken
334 branches.
 336      TODO: The back end may reorder the BBs differently and reverse
337 conditions/branch directions. Change the estimates below to
338 something more reasonable. */
340 if (runtime_test)
342 /* Cost model check occurs at versioning. */
343 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
344 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
345 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
346 else
348 /* Cost model occurs at prologue generation. */
349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
351 + TARG_COND_NOT_TAKEN_BRANCH_COST;
352 /* Cost model check occurs at epilogue generation. */
353 else
354 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
358 /* Add SLP costs. */
359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
360 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
366 /* Calculate number of iterations required to make the vector version
367 profitable, relative to the loop bodies only. The following condition
368 must hold true:
369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
370 where
371 SIC = scalar iteration cost, VIC = vector iteration cost,
372 VOC = vector outside cost, VF = vectorization factor,
373 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
374 SOC = scalar outside cost for run time cost model check. */
376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
378 if (vec_outside_cost <= 0)
379 min_profitable_iters = 1;
380 else
382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
383 - vec_inside_cost * peel_iters_prologue
384 - vec_inside_cost * peel_iters_epilogue)
385 / ((scalar_single_iter_cost * vf)
386 - vec_inside_cost);
388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
389 <= ((vec_inside_cost * min_profitable_iters)
390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
391 min_profitable_iters++;
394 /* vector version will never be profitable. */
395 else
397 if (vect_print_dump_info (REPORT_COST))
398 fprintf (vect_dump, "cost model: vector iteration cost = %d "
399 "is divisible by scalar iteration cost = %d by a factor "
400 "greater than or equal to the vectorization factor = %d .",
401 vec_inside_cost, scalar_single_iter_cost, vf);
402 return -1;
405 if (vect_print_dump_info (REPORT_COST))
407 fprintf (vect_dump, "Cost model analysis: \n");
408 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
409 vec_inside_cost);
410 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
411 vec_outside_cost);
412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
413 scalar_single_iter_cost);
414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
415 fprintf (vect_dump, " prologue iterations: %d\n",
416 peel_iters_prologue);
417 fprintf (vect_dump, " epilogue iterations: %d\n",
418 peel_iters_epilogue);
419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
420 min_profitable_iters);
423 min_profitable_iters =
424 min_profitable_iters < vf ? vf : min_profitable_iters;
426 /* Because the condition we create is:
427 if (niters <= min_profitable_iters)
428 then skip the vectorized loop. */
429 min_profitable_iters--;
431 if (vect_print_dump_info (REPORT_COST))
432 fprintf (vect_dump, " Profitability threshold = %d\n",
433 min_profitable_iters);
435 return min_profitable_iters;
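
/* Illustrative, standalone sketch (not part of this file): it mirrors the
   profitability computation above with made-up example costs, to show how the
   inequality
       SIC * niters + SOC > VIC * ((niters - PL_ITERS - EP_ITERS) / VF) + VOC
   is solved for the minimum iteration count.  The function name and all
   numeric costs below are hypothetical; the real values come from the target
   cost macros and the loop analysis.  */

#include <stdio.h>

static int
example_min_profitable_iters (int sic, int vic, int voc, int soc,
                              int vf, int pl_iters, int ep_iters)
{
  int min_iters;

  if (sic * vf <= vic)
    return -1;                  /* Vector version never profitable.  */

  if (voc <= 0)
    min_iters = 1;
  else
    {
      min_iters = ((voc - soc) * vf
                   - vic * pl_iters
                   - vic * ep_iters) / (sic * vf - vic);

      /* Integer division rounds down; bump by one if the vector version
         is not yet strictly cheaper at MIN_ITERS.  */
      if (sic * vf * min_iters <= vic * min_iters + (voc - soc) * vf)
        min_iters++;
    }

  /* The vectorized loop needs at least VF iterations, and the guard
     generated later is "niters <= threshold", hence the decrement.  */
  if (min_iters < vf)
    min_iters = vf;
  return min_iters - 1;
}

int
main (void)
{
  /* E.g. scalar iteration cost 4, vector iteration cost 6, VF 4.  */
  printf ("%d\n", example_min_profitable_iters (4, 6, 20, 3, 4, 2, 2));
  return 0;
}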
439 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
440 functions. Design better to avoid maintenance issues. */
442 /* Function vect_model_reduction_cost.
444 Models cost for a reduction operation, including the vector ops
445 generated within the strip-mine loop, the initial definition before
446 the loop, and the epilogue code that must be generated. */
448 static bool
449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
450 int ncopies)
452 int outer_cost = 0;
453 enum tree_code code;
454 optab optab;
455 tree vectype;
456 gimple stmt, orig_stmt;
457 tree reduction_op;
458 enum machine_mode mode;
459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
463 /* Cost of reduction op inside loop. */
464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
466 stmt = STMT_VINFO_STMT (stmt_info);
468 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
470 case GIMPLE_SINGLE_RHS:
471 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
472 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
473 break;
474 case GIMPLE_UNARY_RHS:
475 reduction_op = gimple_assign_rhs1 (stmt);
476 break;
477 case GIMPLE_BINARY_RHS:
478 reduction_op = gimple_assign_rhs2 (stmt);
479 break;
480 default:
481 gcc_unreachable ();
484 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
485 if (!vectype)
487 if (vect_print_dump_info (REPORT_COST))
489 fprintf (vect_dump, "unsupported data-type ");
490 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
492 return false;
495 mode = TYPE_MODE (vectype);
496 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
498 if (!orig_stmt)
499 orig_stmt = STMT_VINFO_STMT (stmt_info);
501 code = gimple_assign_rhs_code (orig_stmt);
503 /* Add in cost for initial definition. */
504 outer_cost += TARG_SCALAR_TO_VEC_COST;
506 /* Determine cost of epilogue code.
508 We have a reduction operator that will reduce the vector in one statement.
509 Also requires scalar extract. */
511 if (!nested_in_vect_loop_p (loop, orig_stmt))
513 if (reduc_code < NUM_TREE_CODES)
514 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
515 else
517 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
518 tree bitsize =
519 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
520 int element_bitsize = tree_low_cst (bitsize, 1);
521 int nelements = vec_size_in_bits / element_bitsize;
523 optab = optab_for_tree_code (code, vectype, optab_default);
525 /* We have a whole vector shift available. */
526 if (VECTOR_MODE_P (mode)
527 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
528 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
529 /* Final reduction via vector shifts and the reduction operator. Also
530 requires scalar extract. */
531 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
532 + TARG_VEC_TO_SCALAR_COST);
533 else
534 /* Use extracts and reduction op for final reduction. For N elements,
535 we have N extracts and N-1 reduction ops. */
536 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
540 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
542 if (vect_print_dump_info (REPORT_COST))
543 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
544 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
545 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
547 return true;
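
/* Illustrative, standalone sketch (not part of this file): the two epilogue
   cost estimates used above for the final reduction, with hypothetical unit
   costs (vector statement = 1, vector-to-scalar extract = 1).  The function
   name is made up for the example.  */

#include <stdio.h>

static int
example_reduction_epilogue_cost (int nelements, int have_whole_vector_shift)
{
  if (have_whole_vector_shift)
    {
      /* log2 (nelements) shift/reduce pairs, then one scalar extract.  */
      int log2_n = 0;
      while ((1 << log2_n) < nelements)
        log2_n++;
      return 2 * log2_n * 1 /* vector stmt */ + 1 /* extract */;
    }

  /* N extracts and N-1 scalar reduction ops.  */
  return (nelements + nelements - 1) * 1 /* vector stmt */;
}

int
main (void)
{
  printf ("shift-based: %d, extract-based: %d\n",
          example_reduction_epilogue_cost (8, 1),
          example_reduction_epilogue_cost (8, 0));
  return 0;
}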
551 /* Function vect_model_induction_cost.
553 Models cost for induction operations. */
555 static void
556 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
558 /* loop cost for vec_loop. */
559 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
560 /* prologue cost for vec_init and vec_step. */
561 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
563 if (vect_print_dump_info (REPORT_COST))
564 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
565 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
570 /* Function vect_model_simple_cost.
572 Models cost for simple operations, i.e. those that only emit ncopies of a
573 single op. Right now, this does not account for multiple insns that could
574 be generated for the single vector op. We will handle that shortly. */
576 void
577 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
578 enum vect_def_type *dt, slp_tree slp_node)
580 int i;
581 int inside_cost = 0, outside_cost = 0;
583 /* The SLP costs were already calculated during SLP tree build. */
584 if (PURE_SLP_STMT (stmt_info))
585 return;
587 inside_cost = ncopies * TARG_VEC_STMT_COST;
 589   /* FORNOW: Assuming maximum 2 args per stmt.  */
590 for (i = 0; i < 2; i++)
592 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
593 outside_cost += TARG_SCALAR_TO_VEC_COST;
596 if (vect_print_dump_info (REPORT_COST))
597 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
598 "outside_cost = %d .", inside_cost, outside_cost);
600 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
601 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
602 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
606 /* Function vect_cost_strided_group_size
608 For strided load or store, return the group_size only if it is the first
609 load or store of a group, else return 1. This ensures that group size is
610 only returned once per group. */
612 static int
613 vect_cost_strided_group_size (stmt_vec_info stmt_info)
615 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
617 if (first_stmt == STMT_VINFO_STMT (stmt_info))
618 return DR_GROUP_SIZE (stmt_info);
620 return 1;
624 /* Function vect_model_store_cost
626 Models cost for stores. In the case of strided accesses, one access
627 has the overhead of the strided access attributed to it. */
629 void
630 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
631 enum vect_def_type dt, slp_tree slp_node)
633 int group_size;
634 int inside_cost = 0, outside_cost = 0;
636 /* The SLP costs were already calculated during SLP tree build. */
637 if (PURE_SLP_STMT (stmt_info))
638 return;
640 if (dt == vect_constant_def || dt == vect_invariant_def)
641 outside_cost = TARG_SCALAR_TO_VEC_COST;
643 /* Strided access? */
644 if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
645 group_size = vect_cost_strided_group_size (stmt_info);
646 /* Not a strided access. */
647 else
648 group_size = 1;
650 /* Is this an access in a group of stores, which provide strided access?
651 If so, add in the cost of the permutes. */
652 if (group_size > 1)
654 /* Uses a high and low interleave operation for each needed permute. */
655 inside_cost = ncopies * exact_log2(group_size) * group_size
656 * TARG_VEC_STMT_COST;
658 if (vect_print_dump_info (REPORT_COST))
659 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
660 group_size);
664 /* Costs of the stores. */
665 inside_cost += ncopies * TARG_VEC_STORE_COST;
667 if (vect_print_dump_info (REPORT_COST))
668 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
669 "outside_cost = %d .", inside_cost, outside_cost);
671 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
672 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
673 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
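
/* Illustrative, standalone sketch (not part of this file): the permute cost
   used above for strided groups, ncopies * log2 (group_size) * group_size
   vector statements, evaluated with example values.  TARG_VEC_STMT_COST is
   taken as 1 purely for illustration; the function name is made up.  */

#include <stdio.h>

static int
example_strided_permute_cost (int ncopies, int group_size, int vec_stmt_cost)
{
  int log2_size = 0;

  /* GROUP_SIZE is expected to be a power of two here (exact_log2).  */
  while ((1 << log2_size) < group_size)
    log2_size++;

  /* One high and one low interleave (stores) or even and odd extracts
     (loads) per permute step, for each vector copy.  */
  return ncopies * log2_size * group_size * vec_stmt_cost;
}

int
main (void)
{
  /* E.g. 2 vector copies of an interleaved group of 4 stores.  */
  printf ("%d\n", example_strided_permute_cost (2, 4, 1));
  return 0;
}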
677 /* Function vect_model_load_cost
679 Models cost for loads. In the case of strided accesses, the last access
680 has the overhead of the strided access attributed to it. Since unaligned
681 accesses are supported for loads, we also account for the costs of the
682 access scheme chosen. */
684 void
685 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
688 int group_size;
 689   int alignment_support_scheme;
690 gimple first_stmt;
691 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
692 int inside_cost = 0, outside_cost = 0;
694 /* The SLP costs were already calculated during SLP tree build. */
695 if (PURE_SLP_STMT (stmt_info))
696 return;
698 /* Strided accesses? */
699 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
700 if (first_stmt && !slp_node)
702 group_size = vect_cost_strided_group_size (stmt_info);
703 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
705 /* Not a strided access. */
706 else
708 group_size = 1;
709 first_dr = dr;
 712   alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
714 /* Is this an access in a group of loads providing strided access?
715 If so, add in the cost of the permutes. */
716 if (group_size > 1)
 718       /* Uses even and odd extract operations for each needed permute.  */
719 inside_cost = ncopies * exact_log2(group_size) * group_size
720 * TARG_VEC_STMT_COST;
722 if (vect_print_dump_info (REPORT_COST))
723 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
724 group_size);
728 /* The loads themselves. */
 729   switch (alignment_support_scheme)
731 case dr_aligned:
733 inside_cost += ncopies * TARG_VEC_LOAD_COST;
735 if (vect_print_dump_info (REPORT_COST))
736 fprintf (vect_dump, "vect_model_load_cost: aligned.");
738 break;
740 case dr_unaligned_supported:
742 /* Here, we assign an additional cost for the unaligned load. */
743 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
745 if (vect_print_dump_info (REPORT_COST))
746 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
747 "hardware.");
749 break;
751 case dr_explicit_realign:
753 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
755 /* FIXME: If the misalignment remains fixed across the iterations of
756 the containing loop, the following cost should be added to the
757 outside costs. */
758 if (targetm.vectorize.builtin_mask_for_load)
759 inside_cost += TARG_VEC_STMT_COST;
761 break;
763 case dr_explicit_realign_optimized:
765 if (vect_print_dump_info (REPORT_COST))
766 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
767 "pipelined.");
769 /* Unaligned software pipeline has a load of an address, an initial
770 load, and possibly a mask operation to "prime" the loop. However,
771 if this is an access in a group of loads, which provide strided
772 access, then the above cost should only be considered for one
773 access in the group. Inside the loop, there is a load op
774 and a realignment op. */
776 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
778 outside_cost = 2*TARG_VEC_STMT_COST;
779 if (targetm.vectorize.builtin_mask_for_load)
780 outside_cost += TARG_VEC_STMT_COST;
783 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
785 break;
788 default:
789 gcc_unreachable ();
792 if (vect_print_dump_info (REPORT_COST))
793 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
794 "outside_cost = %d .", inside_cost, outside_cost);
796 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
797 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
798 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
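
/* Illustrative, standalone sketch (not part of this file): the per-scheme
   inside-of-loop load costs chosen by the switch above, with hypothetical
   unit costs (vector load = 1, unaligned vector load = 2, vector stmt = 1).
   Only the inside-of-loop component is modelled; the enum and function name
   are made up for the example.  */

#include <stdio.h>

enum example_alignment_scheme
{
  EXAMPLE_ALIGNED,
  EXAMPLE_UNALIGNED_SUPPORTED,
  EXAMPLE_EXPLICIT_REALIGN,
  EXAMPLE_EXPLICIT_REALIGN_OPTIMIZED
};

static int
example_load_inside_cost (enum example_alignment_scheme scheme, int ncopies)
{
  const int load = 1, unaligned_load = 2, vec_stmt = 1;

  switch (scheme)
    {
    case EXAMPLE_ALIGNED:
      return ncopies * load;
    case EXAMPLE_UNALIGNED_SUPPORTED:
      return ncopies * unaligned_load;
    case EXAMPLE_EXPLICIT_REALIGN:
      /* Two loads plus a realignment statement per copy.  */
      return ncopies * (2 * load + vec_stmt);
    case EXAMPLE_EXPLICIT_REALIGN_OPTIMIZED:
      /* One load and one realignment statement per copy; the priming
         loads live outside the loop and are not counted here.  */
      return ncopies * (load + vec_stmt);
    }
  return 0;
}

int
main (void)
{
  printf ("aligned: %d, explicit realign: %d\n",
          example_load_inside_cost (EXAMPLE_ALIGNED, 2),
          example_load_inside_cost (EXAMPLE_EXPLICIT_REALIGN, 2));
  return 0;
}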
802 /* Function vect_get_new_vect_var.
 804    Returns a name for a new variable.  The current naming scheme prepends the
 805    prefix "vect_", "stmp_", or "vect_p" (depending on the value of VAR_KIND)
 806    to the name of vectorizer-generated variables, using NAME as the base if
 807    provided.  */
809 static tree
810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
812 const char *prefix;
813 tree new_vect_var;
815 switch (var_kind)
817 case vect_simple_var:
818 prefix = "vect_";
819 break;
820 case vect_scalar_var:
821 prefix = "stmp_";
822 break;
823 case vect_pointer_var:
824 prefix = "vect_p";
825 break;
826 default:
827 gcc_unreachable ();
830 if (name)
832 char* tmp = concat (prefix, name, NULL);
833 new_vect_var = create_tmp_var (type, tmp);
834 free (tmp);
836 else
837 new_vect_var = create_tmp_var (type, prefix);
839 /* Mark vector typed variable as a gimple register variable. */
840 if (TREE_CODE (type) == VECTOR_TYPE)
841 DECL_GIMPLE_REG_P (new_vect_var) = true;
843 return new_vect_var;
847 /* Function vect_create_addr_base_for_vector_ref.
849 Create an expression that computes the address of the first memory location
850 that will be accessed for a data reference.
852 Input:
853 STMT: The statement containing the data reference.
854 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 855    OFFSET: Optional.  If supplied, it is added to the initial address.
 856    LOOP:    Specifies the loop-nest relative to which the address should be computed.
857 For example, when the dataref is in an inner-loop nested in an
858 outer-loop that is now being vectorized, LOOP can be either the
859 outer-loop, or the inner-loop. The first memory location accessed
860 by the following dataref ('in' points to short):
862 for (i=0; i<N; i++)
863 for (j=0; j<M; j++)
864 s += in[i+j]
866 is as follows:
867 if LOOP=i_loop: &in (relative to i_loop)
868 if LOOP=j_loop: &in+i*2B (relative to j_loop)
870 Output:
871 1. Return an SSA_NAME whose value is the address of the memory location of
872 the first vector of the data reference.
873 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
 874       the statement(s) that define the returned SSA_NAME.
876 FORNOW: We are only handling array accesses with step 1. */
878 static tree
879 vect_create_addr_base_for_vector_ref (gimple stmt,
880 gimple_seq *new_stmt_list,
881 tree offset,
882 struct loop *loop)
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
886 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
887 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
888 tree base_name;
889 tree data_ref_base_var;
890 tree vec_stmt;
891 tree addr_base, addr_expr;
892 tree dest;
893 gimple_seq seq = NULL;
894 tree base_offset = unshare_expr (DR_OFFSET (dr));
895 tree init = unshare_expr (DR_INIT (dr));
896 tree vect_ptr_type, addr_expr2;
897 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
899 gcc_assert (loop);
900 if (loop != containing_loop)
902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
905 gcc_assert (nested_in_vect_loop_p (loop, stmt));
907 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
908 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
909 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
912 /* Create data_ref_base */
913 base_name = build_fold_indirect_ref (data_ref_base);
914 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
915 add_referenced_var (data_ref_base_var);
916 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
917 data_ref_base_var);
918 gimple_seq_add_seq (new_stmt_list, seq);
920 /* Create base_offset */
921 base_offset = size_binop (PLUS_EXPR, base_offset, init);
922 base_offset = fold_convert (sizetype, base_offset);
923 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
924 add_referenced_var (dest);
925 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
926 gimple_seq_add_seq (new_stmt_list, seq);
928 if (offset)
930 tree tmp = create_tmp_var (sizetype, "offset");
932 add_referenced_var (tmp);
933 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
934 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
935 base_offset, offset);
936 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
937 gimple_seq_add_seq (new_stmt_list, seq);
940 /* base + base_offset */
941 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
942 data_ref_base, base_offset);
944 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
946 /* addr_expr = addr_base */
947 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
948 get_name (base_name));
949 add_referenced_var (addr_expr);
950 vec_stmt = fold_convert (vect_ptr_type, addr_base);
951 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
952 get_name (base_name));
953 add_referenced_var (addr_expr2);
954 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
955 gimple_seq_add_seq (new_stmt_list, seq);
957 if (vect_print_dump_info (REPORT_DETAILS))
959 fprintf (vect_dump, "created ");
960 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
962 return vec_stmt;
966 /* Function vect_create_data_ref_ptr.
 968    Create a new pointer to vector type (vp) that points to the first location
969 accessed in the loop by STMT, along with the def-use update chain to
970 appropriately advance the pointer through the loop iterations. Also set
971 aliasing information for the pointer. This vector pointer is used by the
972 callers to this function to create a memory reference expression for vector
973 load/store access.
975 Input:
976 1. STMT: a stmt that references memory. Expected to be of the form
977 GIMPLE_ASSIGN <name, data-ref> or
978 GIMPLE_ASSIGN <data-ref, name>.
979 2. AT_LOOP: the loop where the vector memref is to be created.
980 3. OFFSET (optional): an offset to be added to the initial address accessed
981 by the data-ref in STMT.
982 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
983 pointing to the initial address.
984 5. TYPE: if not NULL indicates the required type of the data-ref.
986 Output:
987 1. Declare a new ptr to vector_type, and have it point to the base of the
 988       data reference (initial address accessed by the data reference).
989 For example, for vector of type V8HI, the following code is generated:
991 v8hi *vp;
992 vp = (v8hi *)initial_address;
994 if OFFSET is not supplied:
995 initial_address = &a[init];
996 if OFFSET is supplied:
997 initial_address = &a[init + OFFSET];
999 Return the initial_address in INITIAL_ADDRESS.
1001 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
1002 update the pointer in each iteration of the loop.
1004 Return the increment stmt that updates the pointer in PTR_INCR.
1006 3. Set INV_P to true if the access pattern of the data reference in the
1007 vectorized loop is invariant. Set it to false otherwise.
1009 4. Return the pointer. */
1011 static tree
1012 vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
1013 tree offset, tree *initial_address, gimple *ptr_incr,
1014 bool only_init, bool *inv_p, tree type)
1016 tree base_name;
1017 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1018 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1019 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
1021 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
1022 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1023 tree vect_ptr_type;
1024 tree vect_ptr;
1025 tree tag;
1026 tree new_temp;
1027 gimple vec_stmt;
1028 gimple_seq new_stmt_list = NULL;
1029 edge pe;
1030 basic_block new_bb;
1031 tree vect_ptr_init;
1032 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1033 tree vptr;
1034 gimple_stmt_iterator incr_gsi;
1035 bool insert_after;
1036 tree indx_before_incr, indx_after_incr;
1037 gimple incr;
1038 tree step;
1040 /* Check the step (evolution) of the load in LOOP, and record
1041 whether it's invariant. */
1042 if (nested_in_vect_loop)
1043 step = STMT_VINFO_DR_STEP (stmt_info);
1044 else
1045 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1047 if (tree_int_cst_compare (step, size_zero_node) == 0)
1048 *inv_p = true;
1049 else
1050 *inv_p = false;
1052 /* Create an expression for the first address accessed by this load
1053 in LOOP. */
1054 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1056 if (vect_print_dump_info (REPORT_DETAILS))
1058 tree data_ref_base = base_name;
1059 fprintf (vect_dump, "create vector-pointer variable to type: ");
1060 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1061 if (TREE_CODE (data_ref_base) == VAR_DECL)
1062 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1063 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1064 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1065 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1066 fprintf (vect_dump, " vectorizing a record based array ref: ");
1067 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1068 fprintf (vect_dump, " vectorizing a pointer ref: ");
1069 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1072 /** (1) Create the new vector-pointer variable: **/
1073 if (type)
1074 vect_ptr_type = build_pointer_type (type);
1075 else
1076 vect_ptr_type = build_pointer_type (vectype);
1078 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1079 get_name (base_name));
1080 add_referenced_var (vect_ptr);
1082 /** (2) Add aliasing information to the new vector-pointer:
1083 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1085 tag = DR_SYMBOL_TAG (dr);
1086 gcc_assert (tag);
 1088   /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
 1089      tag must be created with tag added to its may-alias list.  */
1090 if (!MTAG_P (tag))
1091 new_type_alias (vect_ptr, tag, DR_REF (dr));
1092 else
1093 set_symbol_mem_tag (vect_ptr, tag);
1095 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1096 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1097 def-use update cycles for the pointer: One relative to the outer-loop
1098 (LOOP), which is what steps (3) and (4) below do. The other is relative
1099 to the inner-loop (which is the inner-most loop containing the dataref),
 1100       and this is done by step (5) below.
1102 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1103 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1104 redundant. Steps (3),(4) create the following:
1106 vp0 = &base_addr;
1107 LOOP: vp1 = phi(vp0,vp2)
1108 ...
1110 vp2 = vp1 + step
1111 goto LOOP
1113 If there is an inner-loop nested in loop, then step (5) will also be
1114 applied, and an additional update in the inner-loop will be created:
1116 vp0 = &base_addr;
1117 LOOP: vp1 = phi(vp0,vp2)
1119 inner: vp3 = phi(vp1,vp4)
1120 vp4 = vp3 + inner_step
1121 if () goto inner
1123 vp2 = vp1 + step
1124 if () goto LOOP */
 1126   /** (3) Calculate the initial address of the vector-pointer, and set
1127 the vector-pointer to point to it before the loop: **/
 1129   /* Create: &(base[init_val+offset]) in the loop preheader.  */
1131 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1132 offset, loop);
1133 pe = loop_preheader_edge (loop);
1134 if (new_stmt_list)
1136 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
1137 gcc_assert (!new_bb);
1140 *initial_address = new_temp;
1142 /* Create: p = (vectype *) initial_base */
1143 vec_stmt = gimple_build_assign (vect_ptr,
1144 fold_convert (vect_ptr_type, new_temp));
1145 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1146 gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
1147 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
1148 gcc_assert (!new_bb);
1151 /** (4) Handle the updating of the vector-pointer inside the loop.
1152 This is needed when ONLY_INIT is false, and also when AT_LOOP
1153 is the inner-loop nested in LOOP (during outer-loop vectorization).
1156 if (only_init && at_loop == loop) /* No update in loop is required. */
1158 /* Copy the points-to information if it exists. */
1159 if (DR_PTR_INFO (dr))
1160 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1161 vptr = vect_ptr_init;
1163 else
1165 /* The step of the vector pointer is the Vector Size. */
1166 tree step = TYPE_SIZE_UNIT (vectype);
1167 /* One exception to the above is when the scalar step of the load in
1168 LOOP is zero. In this case the step here is also zero. */
1169 if (*inv_p)
1170 step = size_zero_node;
1172 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1174 create_iv (vect_ptr_init,
1175 fold_convert (vect_ptr_type, step),
1176 NULL_TREE, loop, &incr_gsi, insert_after,
1177 &indx_before_incr, &indx_after_incr);
1178 incr = gsi_stmt (incr_gsi);
1179 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1181 /* Copy the points-to information if it exists. */
1182 if (DR_PTR_INFO (dr))
1184 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1185 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1187 merge_alias_info (vect_ptr_init, indx_before_incr);
1188 merge_alias_info (vect_ptr_init, indx_after_incr);
1189 if (ptr_incr)
1190 *ptr_incr = incr;
1192 vptr = indx_before_incr;
1195 if (!nested_in_vect_loop || only_init)
1196 return vptr;
1199 /** (5) Handle the updating of the vector-pointer inside the inner-loop
 1200       nested in LOOP, if it exists: **/
1202 gcc_assert (nested_in_vect_loop);
1203 if (!only_init)
1205 standard_iv_increment_position (containing_loop, &incr_gsi,
1206 &insert_after);
1207 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1208 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
1209 &indx_after_incr);
1210 incr = gsi_stmt (incr_gsi);
1211 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1213 /* Copy the points-to information if it exists. */
1214 if (DR_PTR_INFO (dr))
1216 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1217 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1219 merge_alias_info (vect_ptr_init, indx_before_incr);
1220 merge_alias_info (vect_ptr_init, indx_after_incr);
1221 if (ptr_incr)
1222 *ptr_incr = incr;
1224 return indx_before_incr;
1226 else
1227 gcc_unreachable ();
1231 /* Function bump_vector_ptr
1233 Increment a pointer (to a vector type) by vector-size. If requested,
 1234    i.e. if PTR_INCR is given, then also connect the new increment stmt
1235 to the existing def-use update-chain of the pointer, by modifying
1236 the PTR_INCR as illustrated below:
1238 The pointer def-use update-chain before this function:
1239 DATAREF_PTR = phi (p_0, p_2)
1240 ....
1241 PTR_INCR: p_2 = DATAREF_PTR + step
1243 The pointer def-use update-chain after this function:
1244 DATAREF_PTR = phi (p_0, p_2)
1245 ....
1246 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1247 ....
1248 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1250 Input:
1251 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1252 in the loop.
1253 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1254 the loop. The increment amount across iterations is expected
1255 to be vector_size.
 1256    GSI - location where the new update stmt is to be placed.
1257 STMT - the original scalar memory-access stmt that is being vectorized.
1258 BUMP - optional. The offset by which to bump the pointer. If not given,
1259 the offset is assumed to be vector_size.
1261 Output: Return NEW_DATAREF_PTR as illustrated above.
1265 static tree
1266 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
1267 gimple stmt, tree bump)
1269 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1270 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1271 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1272 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1273 tree update = TYPE_SIZE_UNIT (vectype);
1274 gimple incr_stmt;
1275 ssa_op_iter iter;
1276 use_operand_p use_p;
1277 tree new_dataref_ptr;
1279 if (bump)
1280 update = bump;
1282 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
1283 dataref_ptr, update);
1284 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1285 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
1286 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
1288 /* Copy the points-to information if it exists. */
1289 if (DR_PTR_INFO (dr))
1290 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1291 merge_alias_info (new_dataref_ptr, dataref_ptr);
1293 if (!ptr_incr)
1294 return new_dataref_ptr;
1296 /* Update the vector-pointer's cross-iteration increment. */
1297 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1299 tree use = USE_FROM_PTR (use_p);
1301 if (use == dataref_ptr)
1302 SET_USE (use_p, new_dataref_ptr);
1303 else
1304 gcc_assert (tree_int_cst_compare (use, update) == 0);
1307 return new_dataref_ptr;
1311 /* Function vect_create_destination_var.
1313 Create a new temporary of type VECTYPE. */
1315 static tree
1316 vect_create_destination_var (tree scalar_dest, tree vectype)
1318 tree vec_dest;
1319 const char *new_name;
1320 tree type;
1321 enum vect_var_kind kind;
1323 kind = vectype ? vect_simple_var : vect_scalar_var;
1324 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1326 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1328 new_name = get_name (scalar_dest);
1329 if (!new_name)
1330 new_name = "var_";
1331 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1332 add_referenced_var (vec_dest);
1334 return vec_dest;
1338 /* Function vect_init_vector.
1340 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
 1341    the vector elements of VECTOR_VAR.  Place the initialization at GSI if it
1342 is not NULL. Otherwise, place the initialization at the loop preheader.
1343 Return the DEF of INIT_STMT.
1344 It will be used in the vectorization of STMT. */
1346 static tree
1347 vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
1348 gimple_stmt_iterator *gsi)
1350 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1351 tree new_var;
1352 gimple init_stmt;
1353 tree vec_oprnd;
1354 edge pe;
1355 tree new_temp;
1356 basic_block new_bb;
1358 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1359 add_referenced_var (new_var);
1360 init_stmt = gimple_build_assign (new_var, vector_var);
1361 new_temp = make_ssa_name (new_var, init_stmt);
1362 gimple_assign_set_lhs (init_stmt, new_temp);
1364 if (gsi)
1365 vect_finish_stmt_generation (stmt, init_stmt, gsi);
1366 else
1368 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1369 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1371 if (nested_in_vect_loop_p (loop, stmt))
1372 loop = loop->inner;
1373 pe = loop_preheader_edge (loop);
1374 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1375 gcc_assert (!new_bb);
1378 if (vect_print_dump_info (REPORT_DETAILS))
1380 fprintf (vect_dump, "created new init_stmt: ");
1381 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1384 vec_oprnd = gimple_assign_lhs (init_stmt);
1385 return vec_oprnd;
1389 /* For constant and loop invariant defs of SLP_NODE this function returns
1390 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1391 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1392 stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
1394 static void
1395 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1396 unsigned int op_num, unsigned int number_of_vectors)
1398 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1399 gimple stmt = VEC_index (gimple, stmts, 0);
1400 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1401 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1402 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1403 tree vec_cst;
1404 tree t = NULL_TREE;
1405 int j, number_of_places_left_in_vector;
1406 tree vector_type;
1407 tree op, vop;
1408 int group_size = VEC_length (gimple, stmts);
1409 unsigned int vec_num, i;
1410 int number_of_copies = 1;
1411 bool is_store = false;
1412 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1413 bool constant_p;
1415 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1416 is_store = true;
1418 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1419 created vectors. It is greater than 1 if unrolling is performed.
1421 For example, we have two scalar operands, s1 and s2 (e.g., group of
1422 strided accesses of size two), while NUNITS is four (i.e., four scalars
1423 of this type can be packed in a vector). The output vector will contain
1424 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1425 will be 2).
1427 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1428 containing the operands.
1430 For example, NUNITS is four as before, and the group size is 8
1431 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1432 {s5, s6, s7, s8}. */
1434 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1436 number_of_places_left_in_vector = nunits;
1437 constant_p = true;
1438 for (j = 0; j < number_of_copies; j++)
1440 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
1442 if (is_store)
1443 op = gimple_assign_rhs1 (stmt);
1444 else
1445 op = gimple_op (stmt, op_num + 1);
1446 if (!CONSTANT_CLASS_P (op))
1447 constant_p = false;
1449 /* Create 'vect_ = {op0,op1,...,opn}'. */
1450 t = tree_cons (NULL_TREE, op, t);
1452 number_of_places_left_in_vector--;
1454 if (number_of_places_left_in_vector == 0)
1456 number_of_places_left_in_vector = nunits;
1458 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1459 gcc_assert (vector_type);
1460 if (constant_p)
1461 vec_cst = build_vector (vector_type, t);
1462 else
1463 vec_cst = build_constructor_from_list (vector_type, t);
1464 constant_p = true;
1465 VEC_quick_push (tree, voprnds,
1466 vect_init_vector (stmt, vec_cst, vector_type,
1467 NULL));
1468 t = NULL_TREE;
 1473   /* Since the vectors are created in reverse order, reverse them back into
 1474      the original order.  */
1475 vec_num = VEC_length (tree, voprnds);
1476 for (j = vec_num - 1; j >= 0; j--)
1478 vop = VEC_index (tree, voprnds, j);
1479 VEC_quick_push (tree, *vec_oprnds, vop);
1482 VEC_free (tree, heap, voprnds);
1484 /* In case that VF is greater than the unrolling factor needed for the SLP
1485 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1486 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1487 to replicate the vectors. */
1488 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1490 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1491 VEC_quick_push (tree, *vec_oprnds, vop);
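
/* Illustrative, standalone sketch (not part of this file): how the
   NUMBER_OF_COPIES computation above replicates a small group of scalar
   operands across a vector, matching the {s1, s2, s1, s2} example in the
   comment.  least_common_multiple is reimplemented locally just for the
   sketch; the names below are made up.  */

#include <stdio.h>

static int
example_lcm (int a, int b)
{
  int g = a, h = b;
  while (h)
    {
      int t = g % h;
      g = h;
      h = t;
    }
  return a / g * b;
}

int
main (void)
{
  int nunits = 4;           /* Scalars per vector.  */
  int group_size = 2;       /* Scalars in the SLP group: s1, s2.  */
  int number_of_copies = example_lcm (nunits, group_size) / group_size;
  int i;

  printf ("number_of_copies = %d; vector layout:", number_of_copies);
  for (i = 0; i < nunits; i++)
    printf (" s%d", i % group_size + 1);
  printf ("\n");
  return 0;
}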
1496 /* Get vectorized definitions from SLP_NODE that contains corresponding
1497 vectorized def-stmts. */
1499 static void
1500 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1502 tree vec_oprnd;
1503 gimple vec_def_stmt;
1504 unsigned int i;
1506 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1508 for (i = 0;
1509 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1510 i++)
1512 gcc_assert (vec_def_stmt);
1513 vec_oprnd = gimple_get_lhs (vec_def_stmt);
1514 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1519 /* Get vectorized definitions for SLP_NODE.
1520 If the scalar definitions are loop invariants or constants, collect them and
1521 call vect_get_constant_vectors() to create vector stmts.
1522 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1523 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1524 vect_get_slp_vect_defs() to retrieve them.
1525 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
 1526    the right node).  This is used when the second operand must remain scalar.  */
1528 static void
1529 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1530 VEC (tree,heap) **vec_oprnds1)
1532 gimple first_stmt;
1533 enum tree_code code;
1534 int number_of_vects;
1535 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
1537 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1538 /* The number of vector defs is determined by the number of vector statements
1539 in the node from which we get those statements. */
1540 if (SLP_TREE_LEFT (slp_node))
1541 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
1542 else
1544 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1545 /* Number of vector stmts was calculated according to LHS in
1546 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
1547 necessary. See vect_get_smallest_scalar_type() for details. */
1548 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
1549 &rhs_size_unit);
1550 if (rhs_size_unit != lhs_size_unit)
1552 number_of_vects *= rhs_size_unit;
1553 number_of_vects /= lhs_size_unit;
1557 /* Allocate memory for vectorized defs. */
1558 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
1560 /* SLP_NODE corresponds either to a group of stores or to a group of
1561 unary/binary operations. We don't call this function for loads. */
1562 if (SLP_TREE_LEFT (slp_node))
1563 /* The defs are already vectorized. */
1564 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1565 else
1566 /* Build vectors from scalar defs. */
1567 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
1569 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1570 /* Since we don't call this function with loads, this is a group of
1571 stores. */
1572 return;
1574 code = gimple_assign_rhs_code (first_stmt);
1575 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
1576 return;
1578 /* The number of vector defs is determined by the number of vector statements
1579 in the node from which we get those statements. */
1580 if (SLP_TREE_RIGHT (slp_node))
1581 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
1582 else
1583 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1585 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
1587 if (SLP_TREE_RIGHT (slp_node))
1588 /* The defs are already vectorized. */
1589 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1590 else
1591 /* Build vectors from scalar defs. */
1592 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
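
/* Illustrative, standalone sketch (not part of this file): the
   rhs_size_unit / lhs_size_unit adjustment above.  If the group's statements
   narrow, e.g., 4-byte ints (RHS) to 1-byte chars (LHS), four times as many
   RHS vectors are needed as the LHS-based count suggests.  The example
   values are hypothetical.  */

#include <stdio.h>

int
main (void)
{
  int number_of_vects = 1;       /* Count computed from the LHS.  */
  int lhs_size_unit = 1;         /* e.g. char  */
  int rhs_size_unit = 4;         /* e.g. int   */

  if (rhs_size_unit != lhs_size_unit)
    {
      number_of_vects *= rhs_size_unit;
      number_of_vects /= lhs_size_unit;
    }
  printf ("number_of_vects = %d\n", number_of_vects);
  return 0;
}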
1596 /* Function get_initial_def_for_induction
1598 Input:
1599 STMT - a stmt that performs an induction operation in the loop.
1600 IV_PHI - the initial value of the induction variable
1602 Output:
1603 Return a vector variable, initialized with the first VF values of
1604 the induction variable. E.g., for an iv with IV_PHI='X' and
1605 evolution S, for a vector of 4 units, we want to return:
1606 [X, X + S, X + 2*S, X + 3*S]. */
1608 static tree
1609 get_initial_def_for_induction (gimple iv_phi)
1611 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1613 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1614 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
1615 tree vectype;
1616 int nunits;
1617 edge pe = loop_preheader_edge (loop);
1618 struct loop *iv_loop;
1619 basic_block new_bb;
1620 tree vec, vec_init, vec_step, t;
1621 tree access_fn;
1622 tree new_var;
1623 tree new_name;
1624 gimple init_stmt, induction_phi, new_stmt;
1625 tree induc_def, vec_def, vec_dest;
1626 tree init_expr, step_expr;
1627 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1628 int i;
1629 bool ok;
1630 int ncopies;
1631 tree expr;
1632 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1633 bool nested_in_vect_loop = false;
1634 gimple_seq stmts = NULL;
1635 imm_use_iterator imm_iter;
1636 use_operand_p use_p;
1637 gimple exit_phi;
1638 edge latch_e;
1639 tree loop_arg;
1640 gimple_stmt_iterator si;
1641 basic_block bb = gimple_bb (iv_phi);
1643 vectype = get_vectype_for_scalar_type (scalar_type);
1644 gcc_assert (vectype);
1645 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1646 ncopies = vf / nunits;
1648 gcc_assert (phi_info);
1649 gcc_assert (ncopies >= 1);
1651 /* Find the first insertion point in the BB. */
1652 si = gsi_after_labels (bb);
1654 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
1655 step_expr = build_int_cst (scalar_type, 0);
1656 else
1657 step_expr = build_real (scalar_type, dconst0);
1659 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1660 if (nested_in_vect_loop_p (loop, iv_phi))
1662 nested_in_vect_loop = true;
1663 iv_loop = loop->inner;
1665 else
1666 iv_loop = loop;
1667 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
1669 latch_e = loop_latch_edge (iv_loop);
1670 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1672 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1673 gcc_assert (access_fn);
1674 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1675 &init_expr, &step_expr);
1676 gcc_assert (ok);
1677 pe = loop_preheader_edge (iv_loop);
1679 /* Create the vector that holds the initial_value of the induction. */
1680 if (nested_in_vect_loop)
 1682       /* iv_loop is nested in the loop to be vectorized.  init_expr has already
 1683          been created during vectorization of previous stmts; we obtain it from
1684 the STMT_VINFO_VEC_STMT of the defining stmt. */
1685 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1686 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1688 else
1690 /* iv_loop is the loop to be vectorized. Create:
1691 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1692 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1693 add_referenced_var (new_var);
1695 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1696 if (stmts)
1698 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
1699 gcc_assert (!new_bb);
1702 t = NULL_TREE;
1703 t = tree_cons (NULL_TREE, init_expr, t);
1704 for (i = 1; i < nunits; i++)
1706 /* Create: new_name_i = new_name + step_expr */
1707 enum tree_code code = POINTER_TYPE_P (scalar_type)
1708 ? POINTER_PLUS_EXPR : PLUS_EXPR;
1709 init_stmt = gimple_build_assign_with_ops (code, new_var,
1710 new_name, step_expr);
1711 new_name = make_ssa_name (new_var, init_stmt);
1712 gimple_assign_set_lhs (init_stmt, new_name);
1714 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1715 gcc_assert (!new_bb);
1717 if (vect_print_dump_info (REPORT_DETAILS))
1719 fprintf (vect_dump, "created new init_stmt: ");
1720 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1722 t = tree_cons (NULL_TREE, new_name, t);
1724 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1725 vec = build_constructor_from_list (vectype, nreverse (t));
1726 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1730 /* Create the vector that holds the step of the induction. */
1731 if (nested_in_vect_loop)
1732 /* iv_loop is nested in the loop to be vectorized. Generate:
1733 vec_step = [S, S, S, S] */
1734 new_name = step_expr;
1735 else
1737 /* iv_loop is the loop to be vectorized. Generate:
1738 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1739 expr = build_int_cst (scalar_type, vf);
1740 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1743 t = NULL_TREE;
1744 for (i = 0; i < nunits; i++)
1745 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1746 gcc_assert (CONSTANT_CLASS_P (new_name));
1747 vec = build_vector (vectype, t);
1748 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1751 /* Create the following def-use cycle:
1752 loop prolog:
1753 vec_init = ...
1754 vec_step = ...
1755 loop:
1756 vec_iv = PHI <vec_init, vec_loop>
1758 STMT
1760 vec_loop = vec_iv + vec_step; */
1762 /* Create the induction-phi that defines the induction-operand. */
1763 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1764 add_referenced_var (vec_dest);
1765 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1766 set_vinfo_for_stmt (induction_phi,
1767 new_stmt_vec_info (induction_phi, loop_vinfo));
1768 induc_def = PHI_RESULT (induction_phi);
1770 /* Create the iv update inside the loop */
1771 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1772 induc_def, vec_step);
1773 vec_def = make_ssa_name (vec_dest, new_stmt);
1774 gimple_assign_set_lhs (new_stmt, vec_def);
1775 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1776 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
1778 /* Set the arguments of the phi node: */
1779 add_phi_arg (induction_phi, vec_init, pe);
1780 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1783 /* In case the vectorization factor (VF) is bigger than the number
1784 of elements that we can fit in a vectype (nunits), we have to generate
1785 more than one vector stmt - i.e. we need to "unroll" the
1786 vector stmt by a factor VF/nunits. For more details see documentation
1787 in vectorizable_operation. */
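/* Illustrative sketch for the unrolled case, assuming VF = 8, nunits = 4
   (so ncopies = 2) and a scalar IV {X, +, 1}: the induction phi created
   above carries {X, X+1, X+2, X+3} and is advanced by {8, 8, 8, 8} (VF*S)
   each vector iteration; the block below derives the second copy by adding
   {4, 4, 4, 4} (nunits*S), so the two copies together cover X .. X+7.  */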
1789 if (ncopies > 1)
1791 stmt_vec_info prev_stmt_vinfo;
1792 /* FORNOW. This restriction should be relaxed. */
1793 gcc_assert (!nested_in_vect_loop);
1795 /* Create the vector that holds the step of the induction. */
1796 expr = build_int_cst (scalar_type, nunits);
1797 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1798 t = NULL_TREE;
1799 for (i = 0; i < nunits; i++)
1800 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1801 gcc_assert (CONSTANT_CLASS_P (new_name));
1802 vec = build_vector (vectype, t);
1803 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1805 vec_def = induc_def;
1806 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1807 for (i = 1; i < ncopies; i++)
1809 /* vec_i = vec_prev + vec_step */
1810 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1811 vec_def, vec_step);
1812 vec_def = make_ssa_name (vec_dest, new_stmt);
1813 gimple_assign_set_lhs (new_stmt, vec_def);
1815 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1816 set_vinfo_for_stmt (new_stmt,
1817 new_stmt_vec_info (new_stmt, loop_vinfo));
1818 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1819 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1823 if (nested_in_vect_loop)
1825 /* Find the loop-closed exit-phi of the induction, and record
1826 the final vector of induction results: */
1827 exit_phi = NULL;
1828 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1830 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1832 exit_phi = USE_STMT (use_p);
1833 break;
1836 if (exit_phi)
1838 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1839 /* FORNOW. Currently not supporting the case that an inner-loop induction
1840 is not used in the outer-loop (i.e. only outside the outer-loop). */
1841 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1842 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1844 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1845 if (vect_print_dump_info (REPORT_DETAILS))
1847 fprintf (vect_dump, "vector of inductions after inner-loop:");
1848 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1854 if (vect_print_dump_info (REPORT_DETAILS))
1856 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1857 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1858 fprintf (vect_dump, "\n");
1859 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1862 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1863 return induc_def;
1867 /* Function vect_get_vec_def_for_operand.
1869 OP is an operand in STMT. This function returns a (vector) def that will be
1870 used in the vectorized stmt for STMT.
1872 In the case that OP is an SSA_NAME which is defined in the loop, then
1873 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1875 In case OP is an invariant or constant, a new stmt that creates a vector def
1876 needs to be introduced. */
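/* A minimal sketch of the cases handled below, for a hypothetical stmt
   y = x + 5 with nunits == 4:
     - the constant 5 yields the vector def {5, 5, 5, 5} (case 1);
     - a loop-invariant x yields {x, x, x, x} built on the preheader (case 2);
     - an x defined inside the loop yields the lhs of the vector stmt
       recorded in STMT_VINFO_VEC_STMT of its defining stmt (case 3).  */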
1878 static tree
1879 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1881 tree vec_oprnd;
1882 gimple vec_stmt;
1883 gimple def_stmt;
1884 stmt_vec_info def_stmt_info = NULL;
1885 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1886 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1887 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1888 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1889 tree vec_inv;
1890 tree vec_cst;
1891 tree t = NULL_TREE;
1892 tree def;
1893 int i;
1894 enum vect_def_type dt;
1895 bool is_simple_use;
1896 tree vector_type;
1898 if (vect_print_dump_info (REPORT_DETAILS))
1900 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1901 print_generic_expr (vect_dump, op, TDF_SLIM);
1904 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1905 gcc_assert (is_simple_use);
1906 if (vect_print_dump_info (REPORT_DETAILS))
1908 if (def)
1910 fprintf (vect_dump, "def = ");
1911 print_generic_expr (vect_dump, def, TDF_SLIM);
1913 if (def_stmt)
1915 fprintf (vect_dump, " def_stmt = ");
1916 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1920 switch (dt)
1922 /* Case 1: operand is a constant. */
1923 case vect_constant_def:
1925 if (scalar_def)
1926 *scalar_def = op;
1928 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1929 if (vect_print_dump_info (REPORT_DETAILS))
1930 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1932 for (i = nunits - 1; i >= 0; --i)
1934 t = tree_cons (NULL_TREE, op, t);
1936 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1937 gcc_assert (vector_type);
1938 vec_cst = build_vector (vector_type, t);
1940 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1943 /* Case 2: operand is defined outside the loop - loop invariant. */
1944 case vect_invariant_def:
1946 if (scalar_def)
1947 *scalar_def = def;
1949 /* Create 'vec_inv = {inv,inv,..,inv}' */
1950 if (vect_print_dump_info (REPORT_DETAILS))
1951 fprintf (vect_dump, "Create vector_inv.");
1953 for (i = nunits - 1; i >= 0; --i)
1955 t = tree_cons (NULL_TREE, def, t);
1958 /* FIXME: use build_constructor directly. */
1959 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1960 gcc_assert (vector_type);
1961 vec_inv = build_constructor_from_list (vector_type, t);
1962 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1965 /* Case 3: operand is defined inside the loop. */
1966 case vect_loop_def:
1968 if (scalar_def)
1969 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1971 /* Get the def from the vectorized stmt. */
1972 def_stmt_info = vinfo_for_stmt (def_stmt);
1973 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1974 gcc_assert (vec_stmt);
1975 if (gimple_code (vec_stmt) == GIMPLE_PHI)
1976 vec_oprnd = PHI_RESULT (vec_stmt);
1977 else if (is_gimple_call (vec_stmt))
1978 vec_oprnd = gimple_call_lhs (vec_stmt);
1979 else
1980 vec_oprnd = gimple_assign_lhs (vec_stmt);
1981 return vec_oprnd;
1984 /* Case 4: operand is defined by a loop header phi - reduction */
1985 case vect_reduction_def:
1987 struct loop *loop;
1989 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
1990 loop = (gimple_bb (def_stmt))->loop_father;
1992 /* Get the def before the loop */
1993 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1994 return get_initial_def_for_reduction (stmt, op, scalar_def);
1997 /* Case 5: operand is defined by loop-header phi - induction. */
1998 case vect_induction_def:
2000 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2002 /* Get the def from the vectorized stmt. */
2003 def_stmt_info = vinfo_for_stmt (def_stmt);
2004 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2005 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2006 vec_oprnd = PHI_RESULT (vec_stmt);
2007 return vec_oprnd;
2010 default:
2011 gcc_unreachable ();
2016 /* Function vect_get_vec_def_for_stmt_copy
2018 Return a vector-def for an operand. This function is used when the
2019 vectorized stmt to be created (by the caller to this function) is a "copy"
2020 created in case the vectorized result cannot fit in one vector, and several
2021 copies of the vector-stmt are required. In this case the vector-def is
2022 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2023 of the stmt that defines VEC_OPRND.
2024 DT is the type of the vector def VEC_OPRND.
2026 Context:
2027 In case the vectorization factor (VF) is bigger than the number
2028 of elements that can fit in a vectype (nunits), we have to generate
2029 more than one vector stmt to vectorize the scalar stmt. This situation
2030 arises when there are multiple data-types operated upon in the loop; the
2031 smallest data-type determines the VF, and as a result, when vectorizing
2032 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2033 vector stmt (each computing a vector of 'nunits' results, and together
2034 computing 'VF' results in each iteration). This function is called when
2035 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2036 which VF=16 and nunits=4, so the number of copies required is 4):
2038 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2040 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2041 VS1.1: vx.1 = memref1 VS1.2
2042 VS1.2: vx.2 = memref2 VS1.3
2043 VS1.3: vx.3 = memref3
2045 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2046 VSnew.1: vz1 = vx.1 + ... VSnew.2
2047 VSnew.2: vz2 = vx.2 + ... VSnew.3
2048 VSnew.3: vz3 = vx.3 + ...
2050 The vectorization of S1 is explained in vectorizable_load.
2051 The vectorization of S2:
2052 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2053 the function 'vect_get_vec_def_for_operand' is called to
2054 get the relevant vector-def for each operand of S2. For operand x it
2055 returns the vector-def 'vx.0'.
2057 To create the remaining copies of the vector-stmt (VSnew.j), this
2058 function is called to get the relevant vector-def for each operand. It is
2059 obtained from the respective VS1.j stmt, which is recorded in the
2060 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2062 For example, to obtain the vector-def 'vx.1' in order to create the
2063 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2064 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
2065 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2066 and return its def ('vx.1').
2067 Overall, to create the above sequence this function will be called 3 times:
2068 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2069 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2070 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2072 static tree
2073 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2075 gimple vec_stmt_for_operand;
2076 stmt_vec_info def_stmt_info;
2078 /* Do nothing; can reuse same def. */
2079 if (dt == vect_invariant_def || dt == vect_constant_def )
2080 return vec_oprnd;
2082 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2083 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2084 gcc_assert (def_stmt_info);
2085 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2086 gcc_assert (vec_stmt_for_operand);
2087 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2088 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2089 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2090 else
2091 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2092 return vec_oprnd;
2096 /* Get vectorized definitions for the operands to create a copy of an original
2097 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2099 static void
2100 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2101 VEC(tree,heap) **vec_oprnds0,
2102 VEC(tree,heap) **vec_oprnds1)
2104 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2106 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2107 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2109 if (vec_oprnds1 && *vec_oprnds1)
2111 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2112 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2113 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2118 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2120 static void
2121 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2122 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2123 slp_tree slp_node)
2125 if (slp_node)
2126 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2127 else
2129 tree vec_oprnd;
2131 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2132 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2133 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2135 if (op1)
2137 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2138 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2139 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2145 /* Function vect_finish_stmt_generation.
2147 Insert a new stmt. */
2149 static void
2150 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2151 gimple_stmt_iterator *gsi)
2153 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2154 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2156 gcc_assert (stmt == gsi_stmt (*gsi));
2157 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2159 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2161 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2163 if (vect_print_dump_info (REPORT_DETAILS))
2165 fprintf (vect_dump, "add new stmt: ");
2166 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2169 /* Make sure gsi points to the stmt that is being vectorized. */
2170 gcc_assert (stmt == gsi_stmt (*gsi));
2172 gimple_set_location (vec_stmt, gimple_location (stmt));
2176 /* Function get_initial_def_for_reduction
2178 Input:
2179 STMT - a stmt that performs a reduction operation in the loop.
2180 INIT_VAL - the initial value of the reduction variable
2182 Output:
2183 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2184 of the reduction (used for adjusting the epilog - see below).
2185 Return a vector variable, initialized according to the operation that STMT
2186 performs. This vector will be used as the initial value of the
2187 vector of partial results.
2189 Option1 (adjust in epilog): Initialize the vector as follows:
2190 add: [0,0,...,0,0]
2191 mult: [1,1,...,1,1]
2192 min/max: [init_val,init_val,..,init_val,init_val]
2193 bit and/or: [init_val,init_val,..,init_val,init_val]
2194 and when necessary (e.g. add/mult case) let the caller know
2195 that it needs to adjust the result by init_val.
2197 Option2: Initialize the vector as follows:
2198 add: [0,0,...,0,init_val]
2199 mult: [1,1,...,1,init_val]
2200 min/max: [init_val,init_val,...,init_val]
2201 bit and/or: [init_val,init_val,...,init_val]
2202 and no adjustments are needed.
2204 For example, for the following code:
2206 s = init_val;
2207 for (i=0;i<n;i++)
2208 s = s + a[i];
2210 STMT is 's = s + a[i]', and the reduction variable is 's'.
2211 For a vector of 4 units, we want to return either [0,0,0,init_val],
2212 or [0,0,0,0] and let the caller know that it needs to adjust
2213 the result at the end by 'init_val'.
2215 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2216 initialization vector is simpler (same element in all entries).
2217 A cost model should help decide between these two schemes. */
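/* A concrete instance of the 'adjust in epilog' scheme implemented below
   (assuming init_val = 5, a PLUS reduction, 4 units, and a reduction that is
   not nested in an outer loop): the function returns {0, 0, 0, 0} and sets
   *ADJUSTMENT_DEF = 5, so the epilog adds 5 to the reduced scalar result.
   For MIN/MAX it returns {5, 5, 5, 5} and sets *ADJUSTMENT_DEF = NULL_TREE,
   since no adjustment is needed.  */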
2219 static tree
2220 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2222 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2223 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2225 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2226 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2227 enum tree_code code = gimple_assign_rhs_code (stmt);
2228 tree type = TREE_TYPE (init_val);
2229 tree vecdef;
2230 tree def_for_init;
2231 tree init_def;
2232 tree t = NULL_TREE;
2233 int i;
2234 tree vector_type;
2235 bool nested_in_vect_loop = false;
2237 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2238 if (nested_in_vect_loop_p (loop, stmt))
2239 nested_in_vect_loop = true;
2240 else
2241 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2243 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2245 switch (code)
2247 case WIDEN_SUM_EXPR:
2248 case DOT_PROD_EXPR:
2249 case PLUS_EXPR:
2250 if (nested_in_vect_loop)
2251 *adjustment_def = vecdef;
2252 else
2253 *adjustment_def = init_val;
2254 /* Create a vector of zeros for init_def. */
2255 if (SCALAR_FLOAT_TYPE_P (type))
2256 def_for_init = build_real (type, dconst0);
2257 else
2258 def_for_init = build_int_cst (type, 0);
2259 for (i = nunits - 1; i >= 0; --i)
2260 t = tree_cons (NULL_TREE, def_for_init, t);
2261 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2262 gcc_assert (vector_type);
2263 init_def = build_vector (vector_type, t);
2264 break;
2266 case MIN_EXPR:
2267 case MAX_EXPR:
2268 *adjustment_def = NULL_TREE;
2269 init_def = vecdef;
2270 break;
2272 default:
2273 gcc_unreachable ();
2276 return init_def;
2280 /* Function vect_create_epilog_for_reduction
2282 Create code at the loop-epilog to finalize the result of a reduction
2283 computation.
2285 VECT_DEF is a vector of partial results.
2286 REDUC_CODE is the tree-code for the epilog reduction.
2287 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2288 number of elements that we can fit in a vectype (nunits). In this case
2289 we have to generate more than one vector stmt - i.e. we need to "unroll"
2290 the vector stmt by a factor VF/nunits. For more details see documentation
2291 in vectorizable_operation.
2292 STMT is the scalar reduction stmt that is being vectorized.
2293 REDUCTION_PHI is the phi-node that carries the reduction computation.
2295 This function:
2296 1. Creates the reduction def-use cycle: sets the arguments for
2297 REDUCTION_PHI:
2298 The loop-entry argument is the vectorized initial-value of the reduction.
2299 The loop-latch argument is VECT_DEF - the vector of partial sums.
2300 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2301 by applying the operation specified by REDUC_CODE if available, or by
2302 other means (whole-vector shifts or a scalar loop).
2303 The function also creates a new phi node at the loop exit to preserve
2304 loop-closed form, as illustrated below.
2306 The flow at the entry to this function:
2308 loop:
2309 vec_def = phi <null, null> # REDUCTION_PHI
2310 VECT_DEF = vector_stmt # vectorized form of STMT
2311 s_loop = scalar_stmt # (scalar) STMT
2312 loop_exit:
2313 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2314 use <s_out0>
2315 use <s_out0>
2317 The above is transformed by this function into:
2319 loop:
2320 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2321 VECT_DEF = vector_stmt # vectorized form of STMT
2322 s_loop = scalar_stmt # (scalar) STMT
2323 loop_exit:
2324 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2325 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2326 v_out2 = reduce <v_out1>
2327 s_out3 = extract_field <v_out2, 0>
2328 s_out4 = adjust_result <s_out3>
2329 use <s_out4>
2330 use <s_out4>
2333 static void
2334 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2335 int ncopies,
2336 enum tree_code reduc_code,
2337 gimple reduction_phi)
2339 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2340 stmt_vec_info prev_phi_info;
2341 tree vectype;
2342 enum machine_mode mode;
2343 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2344 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2345 basic_block exit_bb;
2346 tree scalar_dest;
2347 tree scalar_type;
2348 gimple new_phi = NULL, phi;
2349 gimple_stmt_iterator exit_gsi;
2350 tree vec_dest;
2351 tree new_temp = NULL_TREE;
2352 tree new_name;
2353 gimple epilog_stmt = NULL;
2354 tree new_scalar_dest, new_dest;
2355 gimple exit_phi;
2356 tree bitsize, bitpos, bytesize;
2357 enum tree_code code = gimple_assign_rhs_code (stmt);
2358 tree adjustment_def;
2359 tree vec_initial_def, def;
2360 tree orig_name;
2361 imm_use_iterator imm_iter;
2362 use_operand_p use_p;
2363 bool extract_scalar_result = false;
2364 tree reduction_op, expr;
2365 gimple orig_stmt;
2366 gimple use_stmt;
2367 bool nested_in_vect_loop = false;
2368 VEC(gimple,heap) *phis = NULL;
2369 enum vect_def_type dt = vect_unknown_def_type;
2370 int j, i;
2372 if (nested_in_vect_loop_p (loop, stmt))
2374 loop = loop->inner;
2375 nested_in_vect_loop = true;
2378 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2380 case GIMPLE_SINGLE_RHS:
2381 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2382 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2383 break;
2384 case GIMPLE_UNARY_RHS:
2385 reduction_op = gimple_assign_rhs1 (stmt);
2386 break;
2387 case GIMPLE_BINARY_RHS:
2388 reduction_op = gimple_assign_rhs2 (stmt);
2389 break;
2390 default:
2391 gcc_unreachable ();
2394 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2395 gcc_assert (vectype);
2396 mode = TYPE_MODE (vectype);
2398 /*** 1. Create the reduction def-use cycle ***/
2400 /* For the case of reduction, vect_get_vec_def_for_operand returns
2401 the scalar def before the loop, that defines the initial value
2402 of the reduction variable. */
2403 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2404 &adjustment_def);
2406 phi = reduction_phi;
2407 def = vect_def;
2408 for (j = 0; j < ncopies; j++)
2410 /* 1.1 set the loop-entry arg of the reduction-phi: */
2411 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2413 /* 1.2 set the loop-latch arg for the reduction-phi: */
2414 if (j > 0)
2415 def = vect_get_vec_def_for_stmt_copy (dt, def);
2416 add_phi_arg (phi, def, loop_latch_edge (loop));
2418 if (vect_print_dump_info (REPORT_DETAILS))
2420 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2421 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2422 fprintf (vect_dump, "\n");
2423 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2426 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2429 /*** 2. Create epilog code
2430 The reduction epilog code operates across the elements of the vector
2431 of partial results computed by the vectorized loop.
2432 The reduction epilog code consists of:
2433 step 1: compute the scalar result in a vector (v_out2)
2434 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2435 step 3: adjust the scalar result (s_out3) if needed.
2437 Step 1 can be accomplished using one of the following three schemes:
2438 (scheme 1) using reduc_code, if available.
2439 (scheme 2) using whole-vector shifts, if available.
2440 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2441 combined.
2443 The overall epilog code looks like this:
2445 s_out0 = phi <s_loop> # original EXIT_PHI
2446 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2447 v_out2 = reduce <v_out1> # step 1
2448 s_out3 = extract_field <v_out2, 0> # step 2
2449 s_out4 = adjust_result <s_out3> # step 3
2451 (step 3 is optional, and steps 1 and 2 may be combined).
2452 Lastly, the uses of s_out0 are replaced by s_out4.
2454 ***/
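/* Schematic example of scheme 2 (whole-vector shifts) for a 4-element
   vector v_out1 = [a, b, c, d]; which lane ends up holding the result
   depends on the target's shift semantics and endianness (see step 2.4):
     va  = [a, b, c, d]
     va' = vec_shift <va, VS/2>      ~ [c, d, _, _]
     va  = va op va'                 ~ [a op c, b op d, _, _]
     va' = vec_shift <va, VS/4>      ~ [b op d, _, _, _]
     va  = va op va'                 ~ [a op b op c op d, _, _, _]
   after which step 2 extracts the single live element.  */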
2456 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2457 v_out1 = phi <v_loop> */
2459 exit_bb = single_exit (loop)->dest;
2460 def = vect_def;
2461 prev_phi_info = NULL;
2462 for (j = 0; j < ncopies; j++)
2464 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2465 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2466 if (j == 0)
2467 new_phi = phi;
2468 else
2470 def = vect_get_vec_def_for_stmt_copy (dt, def);
2471 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2473 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2474 prev_phi_info = vinfo_for_stmt (phi);
2476 exit_gsi = gsi_after_labels (exit_bb);
2478 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2479 (i.e. when reduc_code is not available) and in the final adjustment
2480 code (if needed). Also get the original scalar reduction variable as
2481 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2482 represents a reduction pattern), the tree-code and scalar-def are
2483 taken from the original stmt that the pattern-stmt (STMT) replaces.
2484 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2485 are taken from STMT. */
2487 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2488 if (!orig_stmt)
2490 /* Regular reduction */
2491 orig_stmt = stmt;
2493 else
2495 /* Reduction pattern */
2496 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2497 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2498 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2500 code = gimple_assign_rhs_code (orig_stmt);
2501 scalar_dest = gimple_assign_lhs (orig_stmt);
2502 scalar_type = TREE_TYPE (scalar_dest);
2503 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2504 bitsize = TYPE_SIZE (scalar_type);
2505 bytesize = TYPE_SIZE_UNIT (scalar_type);
2508 /* In case this is a reduction in an inner-loop while vectorizing an outer
2509 loop - we don't need to extract a single scalar result at the end of the
2510 inner-loop. The final vector of partial results will be used in the
2511 vectorized outer-loop, or reduced to a scalar result at the end of the
2512 outer-loop. */
2513 if (nested_in_vect_loop)
2514 goto vect_finalize_reduction;
2516 /* FORNOW */
2517 gcc_assert (ncopies == 1);
2519 /* 2.3 Create the reduction code, using one of the three schemes described
2520 above. */
2522 if (reduc_code < NUM_TREE_CODES)
2524 tree tmp;
2526 /*** Case 1: Create:
2527 v_out2 = reduc_expr <v_out1> */
2529 if (vect_print_dump_info (REPORT_DETAILS))
2530 fprintf (vect_dump, "Reduce using direct vector reduction.");
2532 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2533 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2534 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2535 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2536 gimple_assign_set_lhs (epilog_stmt, new_temp);
2537 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2539 extract_scalar_result = true;
2541 else
2543 enum tree_code shift_code = 0;
2544 bool have_whole_vector_shift = true;
2545 int bit_offset;
2546 int element_bitsize = tree_low_cst (bitsize, 1);
2547 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2548 tree vec_temp;
2550 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2551 shift_code = VEC_RSHIFT_EXPR;
2552 else
2553 have_whole_vector_shift = false;
2555 /* Regardless of whether we have a whole vector shift, if we're
2556 emulating the operation via tree-vect-generic, we don't want
2557 to use it. Only the first round of the reduction is likely
2558 to still be profitable via emulation. */
2559 /* ??? It might be better to emit a reduction tree code here, so that
2560 tree-vect-generic can expand the first round via bit tricks. */
2561 if (!VECTOR_MODE_P (mode))
2562 have_whole_vector_shift = false;
2563 else
2565 optab optab = optab_for_tree_code (code, vectype, optab_default);
2566 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2567 have_whole_vector_shift = false;
2570 if (have_whole_vector_shift)
2572 /*** Case 2: Create:
2573 for (offset = VS/2; offset >= element_size; offset/=2)
2575 Create: va' = vec_shift <va, offset>
2576 Create: va = vop <va, va'>
2577 } */
2579 if (vect_print_dump_info (REPORT_DETAILS))
2580 fprintf (vect_dump, "Reduce using vector shifts");
2582 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2583 new_temp = PHI_RESULT (new_phi);
2585 for (bit_offset = vec_size_in_bits/2;
2586 bit_offset >= element_bitsize;
2587 bit_offset /= 2)
2589 tree bitpos = size_int (bit_offset);
2590 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2591 new_temp, bitpos);
2592 new_name = make_ssa_name (vec_dest, epilog_stmt);
2593 gimple_assign_set_lhs (epilog_stmt, new_name);
2594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2596 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2597 new_name, new_temp);
2598 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2599 gimple_assign_set_lhs (epilog_stmt, new_temp);
2600 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2603 extract_scalar_result = true;
2605 else
2607 tree rhs;
2609 /*** Case 3: Create:
2610 s = extract_field <v_out2, 0>
2611 for (offset = element_size;
2612 offset < vector_size;
2613 offset += element_size)
2615 Create: s' = extract_field <v_out2, offset>
2616 Create: s = op <s, s'>
2617 } */
2619 if (vect_print_dump_info (REPORT_DETAILS))
2620 fprintf (vect_dump, "Reduce using scalar code. ");
2622 vec_temp = PHI_RESULT (new_phi);
2623 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2624 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2625 bitsize_zero_node);
2626 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2627 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2628 gimple_assign_set_lhs (epilog_stmt, new_temp);
2629 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2631 for (bit_offset = element_bitsize;
2632 bit_offset < vec_size_in_bits;
2633 bit_offset += element_bitsize)
2635 tree bitpos = bitsize_int (bit_offset);
2636 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2637 bitpos);
2639 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2640 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2641 gimple_assign_set_lhs (epilog_stmt, new_name);
2642 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2644 epilog_stmt = gimple_build_assign_with_ops (code,
2645 new_scalar_dest,
2646 new_name, new_temp);
2647 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2648 gimple_assign_set_lhs (epilog_stmt, new_temp);
2649 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2652 extract_scalar_result = false;
2656 /* 2.4 Extract the final scalar result. Create:
2657 s_out3 = extract_field <v_out2, bitpos> */
2659 if (extract_scalar_result)
2661 tree rhs;
2663 gcc_assert (!nested_in_vect_loop);
2664 if (vect_print_dump_info (REPORT_DETAILS))
2665 fprintf (vect_dump, "extract scalar result");
2667 if (BYTES_BIG_ENDIAN)
2668 bitpos = size_binop (MULT_EXPR,
2669 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2670 TYPE_SIZE (scalar_type));
2671 else
2672 bitpos = bitsize_zero_node;
2674 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2675 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2676 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2677 gimple_assign_set_lhs (epilog_stmt, new_temp);
2678 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2681 vect_finalize_reduction:
2683 /* 2.5 Adjust the final result by the initial value of the reduction
2684 variable. (When such adjustment is not needed, then
2685 'adjustment_def' is zero). For example, if code is PLUS we create:
2686 new_temp = loop_exit_def + adjustment_def */
2688 if (adjustment_def)
2690 if (nested_in_vect_loop)
2692 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2693 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2694 new_dest = vect_create_destination_var (scalar_dest, vectype);
2696 else
2698 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2699 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2700 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2702 epilog_stmt = gimple_build_assign (new_dest, expr);
2703 new_temp = make_ssa_name (new_dest, epilog_stmt);
2704 gimple_assign_set_lhs (epilog_stmt, new_temp);
2705 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2706 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2710 /* 2.6 Handle the loop-exit phi */
2712 /* Replace uses of s_out0 with uses of s_out3:
2713 Find the loop-closed-use at the loop exit of the original scalar result.
2714 (The reduction result is expected to have two immediate uses - one at the
2715 latch block, and one at the loop exit). */
2716 phis = VEC_alloc (gimple, heap, 10);
2717 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2719 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2721 exit_phi = USE_STMT (use_p);
2722 VEC_quick_push (gimple, phis, exit_phi);
2725 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2726 gcc_assert (!VEC_empty (gimple, phis));
2728 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2730 if (nested_in_vect_loop)
2732 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2734 /* FORNOW. Currently not supporting the case that an inner-loop
2735 reduction is not used in the outer-loop (but only outside the
2736 outer-loop). */
2737 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2738 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2740 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2741 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2742 set_vinfo_for_stmt (epilog_stmt,
2743 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2744 if (adjustment_def)
2745 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2746 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2747 continue;
2750 /* Replace the uses: */
2751 orig_name = PHI_RESULT (exit_phi);
2752 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2753 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2754 SET_USE (use_p, new_temp);
2756 VEC_free (gimple, heap, phis);
2760 /* Function vectorizable_reduction.
2762 Check if STMT performs a reduction operation that can be vectorized.
2763 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2764 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2765 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2767 This function also handles reduction idioms (patterns) that have been
2768 recognized in advance during vect_pattern_recog. In this case, STMT may be
2769 of this form:
2770 X = pattern_expr (arg0, arg1, ..., X)
2771 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2772 sequence that had been detected and replaced by the pattern-stmt (STMT).
2774 In some cases of reduction patterns, the type of the reduction variable X is
2775 different than the type of the other arguments of STMT.
2776 In such cases, the vectype that is used when transforming STMT into a vector
2777 stmt is different than the vectype that is used to determine the
2778 vectorization factor, because it consists of a different number of elements
2779 than the actual number of elements that are being operated upon in parallel.
2781 For example, consider an accumulation of shorts into an int accumulator.
2782 On some targets it's possible to vectorize this pattern operating on 8
2783 shorts at a time (hence, the vectype for purposes of determining the
2784 vectorization factor should be V8HI); on the other hand, the vectype that
2785 is used to create the vector form is actually V4SI (the type of the result).
2787 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2788 indicates what is the actual level of parallelism (V8HI in the example), so
2789 that the right vectorization factor would be derived. This vectype
2790 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2791 be used to create the vectorized stmt. The right vectype for the vectorized
2792 stmt is obtained from the type of the result X:
2793 get_vectype_for_scalar_type (TREE_TYPE (X))
2795 This means that, contrary to "regular" reductions (or "regular" stmts in
2796 general), the following equation:
2797 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2798 does *NOT* necessarily hold for reduction patterns. */
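/* Restating the example above in code form (a hypothetical accumulation of
   shorts into an int accumulator):

     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum = sum + (int) a[i];

   After pattern recognition STMT is of the form
     sum_1 = widen_sum <a[i], sum_0>;
   its STMT_VINFO_VECTYPE is V8HI (eight shorts per vector iteration, which
   determines the vectorization factor), whereas the vectorized stmt itself
   produces a V4SI - i.e. get_vectype_for_scalar_type (TREE_TYPE (sum)).  */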
2800 bool
2801 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2802 gimple *vec_stmt)
2804 tree vec_dest;
2805 tree scalar_dest;
2806 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2807 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2808 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2809 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2810 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2811 enum tree_code code, orig_code, epilog_reduc_code = 0;
2812 enum machine_mode vec_mode;
2813 int op_type;
2814 optab optab, reduc_optab;
2815 tree new_temp = NULL_TREE;
2816 tree def;
2817 gimple def_stmt;
2818 enum vect_def_type dt;
2819 gimple new_phi = NULL;
2820 tree scalar_type;
2821 bool is_simple_use;
2822 gimple orig_stmt;
2823 stmt_vec_info orig_stmt_info;
2824 tree expr = NULL_TREE;
2825 int i;
2826 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2827 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2828 int epilog_copies;
2829 stmt_vec_info prev_stmt_info, prev_phi_info;
2830 gimple first_phi = NULL;
2831 bool single_defuse_cycle = false;
2832 tree reduc_def;
2833 gimple new_stmt = NULL;
2834 int j;
2835 tree ops[3];
2837 if (nested_in_vect_loop_p (loop, stmt))
2838 loop = loop->inner;
2840 gcc_assert (ncopies >= 1);
2842 /* FORNOW: SLP not supported. */
2843 if (STMT_SLP_TYPE (stmt_info))
2844 return false;
2846 /* 1. Is vectorizable reduction? */
2848 /* Not supportable if the reduction variable is used in the loop. */
2849 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2850 return false;
2852 /* Reductions that are not used even in an enclosing outer-loop
2853 are expected to be "live" (used out of the loop). */
2854 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2855 && !STMT_VINFO_LIVE_P (stmt_info))
2856 return false;
2858 /* Make sure it was already recognized as a reduction computation. */
2859 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2860 return false;
2862 /* 2. Has this been recognized as a reduction pattern?
2864 Check if STMT represents a pattern that has been recognized
2865 in earlier analysis stages. For stmts that represent a pattern,
2866 the STMT_VINFO_RELATED_STMT field records the last stmt in
2867 the original sequence that constitutes the pattern. */
2869 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2870 if (orig_stmt)
2872 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2873 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2874 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2875 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2878 /* 3. Check the operands of the operation. The first operands are defined
2879 inside the loop body. The last operand is the reduction variable,
2880 which is defined by the loop-header-phi. */
2882 gcc_assert (is_gimple_assign (stmt));
2884 /* Flatten RHS */
2885 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2887 case GIMPLE_SINGLE_RHS:
2888 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2889 if (op_type == ternary_op)
2891 tree rhs = gimple_assign_rhs1 (stmt);
2892 ops[0] = TREE_OPERAND (rhs, 0);
2893 ops[1] = TREE_OPERAND (rhs, 1);
2894 ops[2] = TREE_OPERAND (rhs, 2);
2895 code = TREE_CODE (rhs);
2897 else
2898 return false;
2899 break;
2901 case GIMPLE_BINARY_RHS:
2902 code = gimple_assign_rhs_code (stmt);
2903 op_type = TREE_CODE_LENGTH (code);
2904 gcc_assert (op_type == binary_op);
2905 ops[0] = gimple_assign_rhs1 (stmt);
2906 ops[1] = gimple_assign_rhs2 (stmt);
2907 break;
2909 case GIMPLE_UNARY_RHS:
2910 return false;
2912 default:
2913 gcc_unreachable ();
2916 scalar_dest = gimple_assign_lhs (stmt);
2917 scalar_type = TREE_TYPE (scalar_dest);
2918 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2919 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2920 return false;
2922 /* All operands but the last are expected to be defined in the loop.
2923 The last operand is the reduction variable. */
2924 for (i = 0; i < op_type-1; i++)
2926 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2927 &def, &dt);
2928 gcc_assert (is_simple_use);
2929 if (dt != vect_loop_def
2930 && dt != vect_invariant_def
2931 && dt != vect_constant_def
2932 && dt != vect_induction_def)
2933 return false;
2936 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2937 gcc_assert (is_simple_use);
2938 gcc_assert (dt == vect_reduction_def);
2939 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2940 if (orig_stmt)
2941 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2942 else
2943 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2945 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2946 return false;
2948 /* 4. Supportable by target? */
2950 /* 4.1. check support for the operation in the loop */
2951 optab = optab_for_tree_code (code, vectype, optab_default);
2952 if (!optab)
2954 if (vect_print_dump_info (REPORT_DETAILS))
2955 fprintf (vect_dump, "no optab.");
2956 return false;
2958 vec_mode = TYPE_MODE (vectype);
2959 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2961 if (vect_print_dump_info (REPORT_DETAILS))
2962 fprintf (vect_dump, "op not supported by target.");
2963 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2964 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2965 < vect_min_worthwhile_factor (code))
2966 return false;
2967 if (vect_print_dump_info (REPORT_DETAILS))
2968 fprintf (vect_dump, "proceeding using word mode.");
2971 /* Worthwhile without SIMD support? */
2972 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2973 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2974 < vect_min_worthwhile_factor (code))
2976 if (vect_print_dump_info (REPORT_DETAILS))
2977 fprintf (vect_dump, "not worthwhile without SIMD support.");
2978 return false;
2981 /* 4.2. Check support for the epilog operation.
2983 If STMT represents a reduction pattern, then the type of the
2984 reduction variable may be different than the type of the rest
2985 of the arguments. For example, consider the case of accumulation
2986 of shorts into an int accumulator; The original code:
2987 S1: int_a = (int) short_a;
2988 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2990 was replaced with:
2991 STMT: int_acc = widen_sum <short_a, int_acc>
2993 This means that:
2994 1. The tree-code that is used to create the vector operation in the
2995 epilog code (that reduces the partial results) is not the
2996 tree-code of STMT, but is rather the tree-code of the original
2997 stmt from the pattern that STMT is replacing. I.e, in the example
2998 above we want to use 'widen_sum' in the loop, but 'plus' in the
2999 epilog.
3000 2. The type (mode) we use to check available target support
3001 for the vector operation to be created in the *epilog*, is
3002 determined by the type of the reduction variable (in the example
3003 above we'd check this: plus_optab[vect_int_mode]).
3004 However the type (mode) we use to check available target support
3005 for the vector operation to be created *inside the loop*, is
3006 determined by the type of the other arguments to STMT (in the
3007 example we'd check this: widen_sum_optab[vect_short_mode]).
3009 This is contrary to "regular" reductions, in which the types of all
3010 the arguments are the same as the type of the reduction variable.
3011 For "regular" reductions we can therefore use the same vector type
3012 (and also the same tree-code) when generating the epilog code and
3013 when generating the code inside the loop. */
3015 if (orig_stmt)
3017 /* This is a reduction pattern: get the vectype from the type of the
3018 reduction variable, and get the tree-code from orig_stmt. */
3019 orig_code = gimple_assign_rhs_code (orig_stmt);
3020 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3021 if (!vectype)
3023 if (vect_print_dump_info (REPORT_DETAILS))
3025 fprintf (vect_dump, "unsupported data-type ");
3026 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3028 return false;
3031 vec_mode = TYPE_MODE (vectype);
3033 else
3035 /* Regular reduction: the same vectype and tree-code as used for
3036 the vector code inside the loop can be used for the epilog code. */
3037 orig_code = code;
3040 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3041 return false;
3042 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3043 if (!reduc_optab)
3045 if (vect_print_dump_info (REPORT_DETAILS))
3046 fprintf (vect_dump, "no optab for reduction.");
3047 epilog_reduc_code = NUM_TREE_CODES;
3049 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3051 if (vect_print_dump_info (REPORT_DETAILS))
3052 fprintf (vect_dump, "reduc op not supported by target.");
3053 epilog_reduc_code = NUM_TREE_CODES;
3056 if (!vec_stmt) /* transformation not required. */
3058 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3059 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3060 return false;
3061 return true;
3064 /** Transform. **/
3066 if (vect_print_dump_info (REPORT_DETAILS))
3067 fprintf (vect_dump, "transform reduction.");
3069 /* Create the destination vector */
3070 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3072 /* In case the vectorization factor (VF) is bigger than the number
3073 of elements that we can fit in a vectype (nunits), we have to generate
3074 more than one vector stmt - i.e. we need to "unroll" the
3075 vector stmt by a factor VF/nunits. For more details see documentation
3076 in vectorizable_operation. */
3078 /* If the reduction is used in an outer loop we need to generate
3079 VF intermediate results, like so (e.g. for ncopies=2):
3080 r0 = phi (init, r0)
3081 r1 = phi (init, r1)
3082 r0 = x0 + r0;
3083 r1 = x1 + r1;
3084 (i.e. we generate VF results in 2 registers).
3085 In this case we have a separate def-use cycle for each copy, and therefore
3086 for each copy we get the vector def for the reduction variable from the
3087 respective phi node created for this copy.
3089 Otherwise (the reduction is unused in the loop nest), we can combine
3090 together intermediate results, like so (e.g. for ncopies=2):
3091 r = phi (init, r)
3092 r = x0 + r;
3093 r = x1 + r;
3094 (i.e. we generate VF/2 results in a single register).
3095 In this case for each copy we get the vector def for the reduction variable
3096 from the vectorized reduction operation generated in the previous iteration.
3099 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3101 single_defuse_cycle = true;
3102 epilog_copies = 1;
3104 else
3105 epilog_copies = ncopies;
3107 prev_stmt_info = NULL;
3108 prev_phi_info = NULL;
3109 for (j = 0; j < ncopies; j++)
3111 if (j == 0 || !single_defuse_cycle)
3113 /* Create the reduction-phi that defines the reduction-operand. */
3114 new_phi = create_phi_node (vec_dest, loop->header);
3115 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3118 /* Handle uses. */
3119 if (j == 0)
3121 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3122 if (op_type == ternary_op)
3124 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3127 /* Get the vector def for the reduction variable from the phi node */
3128 reduc_def = PHI_RESULT (new_phi);
3129 first_phi = new_phi;
3131 else
3133 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3134 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3135 if (op_type == ternary_op)
3136 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3138 if (single_defuse_cycle)
3139 reduc_def = gimple_assign_lhs (new_stmt);
3140 else
3141 reduc_def = PHI_RESULT (new_phi);
3143 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3146 /* Arguments are ready. create the new vector stmt. */
3147 if (op_type == binary_op)
3148 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3149 else
3150 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3151 reduc_def);
3152 new_stmt = gimple_build_assign (vec_dest, expr);
3153 new_temp = make_ssa_name (vec_dest, new_stmt);
3154 gimple_assign_set_lhs (new_stmt, new_temp);
3155 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3157 if (j == 0)
3158 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3159 else
3160 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3161 prev_stmt_info = vinfo_for_stmt (new_stmt);
3162 prev_phi_info = vinfo_for_stmt (new_phi);
3165 /* Finalize the reduction-phi (set its arguments) and create the
3166 epilog reduction code. */
3167 if (!single_defuse_cycle)
3168 new_temp = gimple_assign_lhs (*vec_stmt);
3169 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3170 epilog_reduc_code, first_phi);
3171 return true;
3174 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3175 a function declaration if the target has a vectorized version
3176 of the function, or NULL_TREE if the function cannot be vectorized. */
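/* For example (hypothetical, and only if the target provides such a builtin):
   for a call  b[i] = sqrtf (a[i])  with vectype_out == vectype_in == V4SF,
   this returns the decl of the target's vectorized sqrtf via
   targetm.vectorize.builtin_vectorized_function, or NULL_TREE otherwise.  */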
3178 tree
3179 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3181 tree fndecl = gimple_call_fndecl (call);
3182 enum built_in_function code;
3184 /* We only handle functions that do not read or clobber memory -- i.e.
3185 const or novops ones. */
3186 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3187 return NULL_TREE;
3189 if (!fndecl
3190 || TREE_CODE (fndecl) != FUNCTION_DECL
3191 || !DECL_BUILT_IN (fndecl))
3192 return NULL_TREE;
3194 code = DECL_FUNCTION_CODE (fndecl);
3195 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3196 vectype_in);
3199 /* Function vectorizable_call.
3201 Check if STMT performs a function call that can be vectorized.
3202 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3203 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3204 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3206 bool
3207 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3209 tree vec_dest;
3210 tree scalar_dest;
3211 tree op, type;
3212 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3214 tree vectype_out, vectype_in;
3215 int nunits_in;
3216 int nunits_out;
3217 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3218 tree fndecl, new_temp, def, rhs_type, lhs_type;
3219 gimple def_stmt;
3220 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3221 gimple new_stmt;
3222 int ncopies, j;
3223 VEC(tree, heap) *vargs = NULL;
3224 enum { NARROW, NONE, WIDEN } modifier;
3225 size_t i, nargs;
3227 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3228 return false;
3230 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3231 return false;
3233 /* FORNOW: SLP not supported. */
3234 if (STMT_SLP_TYPE (stmt_info))
3235 return false;
3237 /* Is STMT a vectorizable call? */
3238 if (!is_gimple_call (stmt))
3239 return false;
3241 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3242 return false;
3244 /* Process function arguments. */
3245 rhs_type = NULL_TREE;
3246 nargs = gimple_call_num_args (stmt);
3248 /* Bail out if the function has more than two arguments; we
3249 do not have interesting builtin functions to vectorize with
3250 more than two arguments. Calls with no arguments are not handled either. */
3251 if (nargs == 0 || nargs > 2)
3252 return false;
3254 for (i = 0; i < nargs; i++)
3256 op = gimple_call_arg (stmt, i);
3258 /* We can only handle calls with arguments of the same type. */
3259 if (rhs_type
3260 && rhs_type != TREE_TYPE (op))
3262 if (vect_print_dump_info (REPORT_DETAILS))
3263 fprintf (vect_dump, "argument types differ.");
3264 return false;
3266 rhs_type = TREE_TYPE (op);
3268 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3270 if (vect_print_dump_info (REPORT_DETAILS))
3271 fprintf (vect_dump, "use not simple.");
3272 return false;
3276 vectype_in = get_vectype_for_scalar_type (rhs_type);
3277 if (!vectype_in)
3278 return false;
3279 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3281 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3282 vectype_out = get_vectype_for_scalar_type (lhs_type);
3283 if (!vectype_out)
3284 return false;
3285 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3287 /* FORNOW */
3288 if (nunits_in == nunits_out / 2)
3289 modifier = NARROW;
3290 else if (nunits_out == nunits_in)
3291 modifier = NONE;
3292 else if (nunits_out == nunits_in / 2)
3293 modifier = WIDEN;
3294 else
3295 return false;
3297 /* For now, we only vectorize functions if a target specific builtin
3298 is available. TODO -- in some cases, it might be profitable to
3299 insert the calls for pieces of the vector, in order to be able
3300 to vectorize other operations in the loop. */
3301 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3302 if (fndecl == NULL_TREE)
3304 if (vect_print_dump_info (REPORT_DETAILS))
3305 fprintf (vect_dump, "function is not vectorizable.");
3307 return false;
3310 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3312 if (modifier == NARROW)
3313 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3314 else
3315 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3317 /* Sanity check: make sure that at least one copy of the vectorized stmt
3318 needs to be generated. */
3319 gcc_assert (ncopies >= 1);
3321 if (!vec_stmt) /* transformation not required. */
3323 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3324 if (vect_print_dump_info (REPORT_DETAILS))
3325 fprintf (vect_dump, "=== vectorizable_call ===");
3326 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3327 return true;
3330 /** Transform. **/
3332 if (vect_print_dump_info (REPORT_DETAILS))
3333 fprintf (vect_dump, "transform operation.");
3335 /* Handle def. */
3336 scalar_dest = gimple_call_lhs (stmt);
3337 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3339 prev_stmt_info = NULL;
3340 switch (modifier)
3342 case NONE:
3343 for (j = 0; j < ncopies; ++j)
3345 /* Build argument list for the vectorized call. */
3346 if (j == 0)
3347 vargs = VEC_alloc (tree, heap, nargs);
3348 else
3349 VEC_truncate (tree, vargs, 0);
3351 for (i = 0; i < nargs; i++)
3353 op = gimple_call_arg (stmt, i);
3354 if (j == 0)
3355 vec_oprnd0
3356 = vect_get_vec_def_for_operand (op, stmt, NULL);
3357 else
3358 vec_oprnd0
3359 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3361 VEC_quick_push (tree, vargs, vec_oprnd0);
3364 new_stmt = gimple_build_call_vec (fndecl, vargs);
3365 new_temp = make_ssa_name (vec_dest, new_stmt);
3366 gimple_call_set_lhs (new_stmt, new_temp);
3368 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3370 if (j == 0)
3371 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3372 else
3373 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3375 prev_stmt_info = vinfo_for_stmt (new_stmt);
3378 break;
3380 case NARROW:
3381 for (j = 0; j < ncopies; ++j)
3383 /* Build argument list for the vectorized call. */
3384 if (j == 0)
3385 vargs = VEC_alloc (tree, heap, nargs * 2);
3386 else
3387 VEC_truncate (tree, vargs, 0);
3389 for (i = 0; i < nargs; i++)
3391 op = gimple_call_arg (stmt, i);
3392 if (j == 0)
3394 vec_oprnd0
3395 = vect_get_vec_def_for_operand (op, stmt, NULL);
3396 vec_oprnd1
3397 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3399 else
3401 vec_oprnd0
3402 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3403 vec_oprnd1
3404 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3407 VEC_quick_push (tree, vargs, vec_oprnd0);
3408 VEC_quick_push (tree, vargs, vec_oprnd1);
3411 new_stmt = gimple_build_call_vec (fndecl, vargs);
3412 new_temp = make_ssa_name (vec_dest, new_stmt);
3413 gimple_call_set_lhs (new_stmt, new_temp);
3415 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3417 if (j == 0)
3418 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3419 else
3420 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3422 prev_stmt_info = vinfo_for_stmt (new_stmt);
3425 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3427 break;
3429 case WIDEN:
3430 /* No current target implements this case. */
3431 return false;
3434 VEC_free (tree, heap, vargs);
3436 /* The call in STMT might prevent it from being removed in dce.
3437 We however cannot remove it here, due to the way the ssa name
3438 it defines is mapped to the new definition. So just replace
3439 rhs of the statement with something harmless. */
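      /* E.g., an original scalar statement  x_1 = foo (a_2)  is rewritten
         here to  x_1 = 0  (folded to the type of the lhs), while the
         vectorized calls built above carry the actual computation.  (The
         names x_1, a_2 and foo are illustrative.)  */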
3441 type = TREE_TYPE (scalar_dest);
3442 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3443 fold_convert (type, integer_zero_node));
3444 set_vinfo_for_stmt (new_stmt, stmt_info);
3445 set_vinfo_for_stmt (stmt, NULL);
3446 STMT_VINFO_STMT (stmt_info) = new_stmt;
3447 gsi_replace (gsi, new_stmt, false);
3448 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3450 return true;
3454 /* Function vect_gen_widened_results_half
3456 Create a vector stmt whose code, number of arguments, and result
3457 variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
3458 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3459 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3460 needs to be created (DECL is a function-decl of a target-builtin).
3461 STMT is the original scalar stmt that we are vectorizing. */
3463 static gimple
3464 vect_gen_widened_results_half (enum tree_code code,
3465 tree decl,
3466 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3467 tree vec_dest, gimple_stmt_iterator *gsi,
3468 gimple stmt)
3470 gimple new_stmt;
3471 tree new_temp;
3472 tree sym;
3473 ssa_op_iter iter;
3475 /* Generate half of the widened result: */
3476 if (code == CALL_EXPR)
3478 /* Target specific support */
3479 if (op_type == binary_op)
3480 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3481 else
3482 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3483 new_temp = make_ssa_name (vec_dest, new_stmt);
3484 gimple_call_set_lhs (new_stmt, new_temp);
3486 else
3488 /* Generic support */
3489 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3490 if (op_type != binary_op)
3491 vec_oprnd1 = NULL;
3492 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3493 vec_oprnd1);
3494 new_temp = make_ssa_name (vec_dest, new_stmt);
3495 gimple_assign_set_lhs (new_stmt, new_temp);
3497 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3499 if (code == CALL_EXPR)
3501 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3503 if (TREE_CODE (sym) == SSA_NAME)
3504 sym = SSA_NAME_VAR (sym);
3505 mark_sym_for_renaming (sym);
3509 return new_stmt;
3513 /* Check if STMT performs a conversion operation, that can be vectorized.
3514 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3515 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3516 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3518 bool
3519 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3520 gimple *vec_stmt, slp_tree slp_node)
3522 tree vec_dest;
3523 tree scalar_dest;
3524 tree op0;
3525 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3526 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3527 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3528 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3529 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3530 tree new_temp;
3531 tree def;
3532 gimple def_stmt;
3533 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3534 gimple new_stmt = NULL;
3535 stmt_vec_info prev_stmt_info;
3536 int nunits_in;
3537 int nunits_out;
3538 tree vectype_out, vectype_in;
3539 int ncopies, j;
3540 tree expr;
3541 tree rhs_type, lhs_type;
3542 tree builtin_decl;
3543 enum { NARROW, NONE, WIDEN } modifier;
3544 int i;
3545 VEC(tree,heap) *vec_oprnds0 = NULL;
3546 tree vop0;
3547 tree integral_type;
3548 VEC(tree,heap) *dummy = NULL;
3549 int dummy_int;
3551 /* Is STMT a vectorizable conversion? */
3553 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3554 return false;
3556 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3557 return false;
3559 if (!is_gimple_assign (stmt))
3560 return false;
3562 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3563 return false;
3565 code = gimple_assign_rhs_code (stmt);
3566 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3567 return false;
3569 /* Check types of lhs and rhs. */
3570 op0 = gimple_assign_rhs1 (stmt);
3571 rhs_type = TREE_TYPE (op0);
3572 vectype_in = get_vectype_for_scalar_type (rhs_type);
3573 if (!vectype_in)
3574 return false;
3575 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3577 scalar_dest = gimple_assign_lhs (stmt);
3578 lhs_type = TREE_TYPE (scalar_dest);
3579 vectype_out = get_vectype_for_scalar_type (lhs_type);
3580 if (!vectype_out)
3581 return false;
3582 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3584 /* FORNOW */
3585 if (nunits_in == nunits_out / 2)
3586 modifier = NARROW;
3587 else if (nunits_out == nunits_in)
3588 modifier = NONE;
3589 else if (nunits_out == nunits_in / 2)
3590 modifier = WIDEN;
3591 else
3592 return false;
3594 if (modifier == NONE)
3595 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3597 /* Bail out if the types are both integral or non-integral. */
3598 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3599 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3600 return false;
3602 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3604 if (modifier == NARROW)
3605 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3606 else
3607 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
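  /* Example: a FLOAT_EXPR from int to double with V4SI/V2DF vectypes has
     nunits_in == 4 and nunits_out == 2, so the modifier is WIDEN and
     ncopies == VF / 4; a FIX_TRUNC_EXPR in the opposite direction would be
     NARROW with ncopies == VF / nunits_out.  (Illustrative vector modes.)  */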
3609 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3610 this, so we can safely override NCOPIES with 1 here. */
3611 if (slp_node)
3612 ncopies = 1;
3614 /* Sanity check: make sure that at least one copy of the vectorized stmt
3615 needs to be generated. */
3616 gcc_assert (ncopies >= 1);
3618 /* Check the operands of the operation. */
3619 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3621 if (vect_print_dump_info (REPORT_DETAILS))
3622 fprintf (vect_dump, "use not simple.");
3623 return false;
3626 /* Supportable by target? */
3627 if ((modifier == NONE
3628 && !targetm.vectorize.builtin_conversion (code, integral_type))
3629 || (modifier == WIDEN
3630 && !supportable_widening_operation (code, stmt, vectype_in,
3631 &decl1, &decl2,
3632 &code1, &code2,
3633 &dummy_int, &dummy))
3634 || (modifier == NARROW
3635 && !supportable_narrowing_operation (code, stmt, vectype_in,
3636 &code1, &dummy_int, &dummy)))
3638 if (vect_print_dump_info (REPORT_DETAILS))
3639 fprintf (vect_dump, "conversion not supported by target.");
3640 return false;
3643 if (modifier != NONE)
3645 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3646 /* FORNOW: SLP not supported. */
3647 if (STMT_SLP_TYPE (stmt_info))
3648 return false;
3651 if (!vec_stmt) /* transformation not required. */
3653 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3654 return true;
3657 /** Transform. **/
3658 if (vect_print_dump_info (REPORT_DETAILS))
3659 fprintf (vect_dump, "transform conversion.");
3661 /* Handle def. */
3662 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3664 if (modifier == NONE && !slp_node)
3665 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3667 prev_stmt_info = NULL;
3668 switch (modifier)
3670 case NONE:
3671 for (j = 0; j < ncopies; j++)
3673 tree sym;
3674 ssa_op_iter iter;
3676 if (j == 0)
3677 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3678 else
3679 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3681 builtin_decl =
3682 targetm.vectorize.builtin_conversion (code, integral_type);
3683 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3685 /* Arguments are ready. Create the new vector stmt. */
3686 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3687 new_temp = make_ssa_name (vec_dest, new_stmt);
3688 gimple_call_set_lhs (new_stmt, new_temp);
3689 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3690 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3691 SSA_OP_ALL_VIRTUALS)
3693 if (TREE_CODE (sym) == SSA_NAME)
3694 sym = SSA_NAME_VAR (sym);
3695 mark_sym_for_renaming (sym);
3697 if (slp_node)
3698 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3701 if (j == 0)
3702 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3703 else
3704 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3705 prev_stmt_info = vinfo_for_stmt (new_stmt);
3707 break;
3709 case WIDEN:
3710 /* In case the vectorization factor (VF) is bigger than the number
3711 of elements that we can fit in a vectype (nunits), we have to
3712 generate more than one vector stmt - i.e - we need to "unroll"
3713 the vector stmt by a factor VF/nunits. */
3714 for (j = 0; j < ncopies; j++)
3716 if (j == 0)
3717 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3718 else
3719 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3721 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3723 /* Generate first half of the widened result: */
3724 new_stmt
3725 = vect_gen_widened_results_half (code1, decl1,
3726 vec_oprnd0, vec_oprnd1,
3727 unary_op, vec_dest, gsi, stmt);
3728 if (j == 0)
3729 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3730 else
3731 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3732 prev_stmt_info = vinfo_for_stmt (new_stmt);
3734 /* Generate second half of the widened result: */
3735 new_stmt
3736 = vect_gen_widened_results_half (code2, decl2,
3737 vec_oprnd0, vec_oprnd1,
3738 unary_op, vec_dest, gsi, stmt);
3739 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3740 prev_stmt_info = vinfo_for_stmt (new_stmt);
3742 break;
3744 case NARROW:
3745 /* In case the vectorization factor (VF) is bigger than the number
3746 of elements that we can fit in a vectype (nunits), we have to
3747 generate more than one vector stmt - i.e - we need to "unroll"
3748 the vector stmt by a factor VF/nunits. */
3749 for (j = 0; j < ncopies; j++)
3751 /* Handle uses. */
3752 if (j == 0)
3754 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3755 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3757 else
3759 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3760 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3763 /* Arguments are ready. Create the new vector stmt. */
3764 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3765 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3766 vec_oprnd1);
3767 new_temp = make_ssa_name (vec_dest, new_stmt);
3768 gimple_assign_set_lhs (new_stmt, new_temp);
3769 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3771 if (j == 0)
3772 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3773 else
3774 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3776 prev_stmt_info = vinfo_for_stmt (new_stmt);
3779 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3782 if (vec_oprnds0)
3783 VEC_free (tree, heap, vec_oprnds0);
3785 return true;
3789 /* Function vectorizable_assignment.
3791 Check if STMT performs an assignment (copy) that can be vectorized.
3792 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3793 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3794 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3796 bool
3797 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3798 gimple *vec_stmt, slp_tree slp_node)
3800 tree vec_dest;
3801 tree scalar_dest;
3802 tree op;
3803 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3804 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3805 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3806 tree new_temp;
3807 tree def;
3808 gimple def_stmt;
3809 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3810 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3811 int ncopies;
3812 int i;
3813 VEC(tree,heap) *vec_oprnds = NULL;
3814 tree vop;
3816 /* Multiple types in SLP are handled by creating the appropriate number of
3817 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3818 case of SLP. */
3819 if (slp_node)
3820 ncopies = 1;
3821 else
3822 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3824 gcc_assert (ncopies >= 1);
3825 if (ncopies > 1)
3826 return false; /* FORNOW */
3828 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3829 return false;
3831 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3832 return false;
3834 /* Is vectorizable assignment? */
3835 if (!is_gimple_assign (stmt))
3836 return false;
3838 scalar_dest = gimple_assign_lhs (stmt);
3839 if (TREE_CODE (scalar_dest) != SSA_NAME)
3840 return false;
3842 if (gimple_assign_single_p (stmt)
3843 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3844 op = gimple_assign_rhs1 (stmt);
3845 else
3846 return false;
3848 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3850 if (vect_print_dump_info (REPORT_DETAILS))
3851 fprintf (vect_dump, "use not simple.");
3852 return false;
3855 if (!vec_stmt) /* transformation not required. */
3857 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3858 if (vect_print_dump_info (REPORT_DETAILS))
3859 fprintf (vect_dump, "=== vectorizable_assignment ===");
3860 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3861 return true;
3864 /** Transform. **/
3865 if (vect_print_dump_info (REPORT_DETAILS))
3866 fprintf (vect_dump, "transform assignment.");
3868 /* Handle def. */
3869 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3871 /* Handle use. */
3872 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3874 /* Arguments are ready. Create the new vector stmt. */
3875 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3877 *vec_stmt = gimple_build_assign (vec_dest, vop);
3878 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3879 gimple_assign_set_lhs (*vec_stmt, new_temp);
3880 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3881 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3883 if (slp_node)
3884 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3887 VEC_free (tree, heap, vec_oprnds);
3888 return true;
3892 /* Function vect_min_worthwhile_factor.
3894 For a loop where we could vectorize the operation indicated by CODE,
3895 return the minimum vectorization factor that makes it worthwhile
3896 to use generic vectors. */
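   /* E.g., an addition is only considered worth emulating with word-mode
      "generic vectors" when at least four elements are handled at once,
      while the cheaper bitwise operations already pay off at two.  */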
3897 static int
3898 vect_min_worthwhile_factor (enum tree_code code)
3900 switch (code)
3902 case PLUS_EXPR:
3903 case MINUS_EXPR:
3904 case NEGATE_EXPR:
3905 return 4;
3907 case BIT_AND_EXPR:
3908 case BIT_IOR_EXPR:
3909 case BIT_XOR_EXPR:
3910 case BIT_NOT_EXPR:
3911 return 2;
3913 default:
3914 return INT_MAX;
3919 /* Function vectorizable_induction
3921 Check if PHI performs an induction computation that can be vectorized.
3922 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3923 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3924 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3926 bool
3927 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3928 gimple *vec_stmt)
3930 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3931 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3932 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3934 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3935 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3936 tree vec_def;
3938 gcc_assert (ncopies >= 1);
3939 /* FORNOW. This restriction should be relaxed. */
3940 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3942 if (vect_print_dump_info (REPORT_DETAILS))
3943 fprintf (vect_dump, "multiple types in nested loop.");
3944 return false;
3947 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3948 return false;
3950 /* FORNOW: SLP not supported. */
3951 if (STMT_SLP_TYPE (stmt_info))
3952 return false;
3954 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3956 if (gimple_code (phi) != GIMPLE_PHI)
3957 return false;
3959 if (!vec_stmt) /* transformation not required. */
3961 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3962 if (vect_print_dump_info (REPORT_DETAILS))
3963 fprintf (vect_dump, "=== vectorizable_induction ===");
3964 vect_model_induction_cost (stmt_info, ncopies);
3965 return true;
3968 /** Transform. **/
3970 if (vect_print_dump_info (REPORT_DETAILS))
3971 fprintf (vect_dump, "transform induction phi.");
3973 vec_def = get_initial_def_for_induction (phi);
3974 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3975 return true;
3979 /* Function vectorizable_operation.
3981 Check if STMT performs a binary or unary operation that can be vectorized.
3982 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3983 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3984 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3986 bool
3987 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
3988 gimple *vec_stmt, slp_tree slp_node)
3990 tree vec_dest;
3991 tree scalar_dest;
3992 tree op0, op1 = NULL;
3993 tree vec_oprnd1 = NULL_TREE;
3994 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3995 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3996 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3997 enum tree_code code;
3998 enum machine_mode vec_mode;
3999 tree new_temp;
4000 int op_type;
4001 optab optab;
4002 int icode;
4003 enum machine_mode optab_op2_mode;
4004 tree def;
4005 gimple def_stmt;
4006 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4007 gimple new_stmt = NULL;
4008 stmt_vec_info prev_stmt_info;
4009 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4010 int nunits_out;
4011 tree vectype_out;
4012 int ncopies;
4013 int j, i;
4014 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4015 tree vop0, vop1;
4016 unsigned int k;
4017 bool shift_p = false;
4018 bool scalar_shift_arg = false;
4020 /* Multiple types in SLP are handled by creating the appropriate number of
4021 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4022 case of SLP. */
4023 if (slp_node)
4024 ncopies = 1;
4025 else
4026 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4028 gcc_assert (ncopies >= 1);
4030 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4031 return false;
4033 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4034 return false;
4036 /* Is STMT a vectorizable binary/unary operation? */
4037 if (!is_gimple_assign (stmt))
4038 return false;
4040 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4041 return false;
4043 scalar_dest = gimple_assign_lhs (stmt);
4044 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4045 if (!vectype_out)
4046 return false;
4047 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4048 if (nunits_out != nunits_in)
4049 return false;
4051 code = gimple_assign_rhs_code (stmt);
4053 /* For pointer addition, we should use the normal plus for
4054 the vector addition. */
4055 if (code == POINTER_PLUS_EXPR)
4056 code = PLUS_EXPR;
4058 /* Support only unary or binary operations. */
4059 op_type = TREE_CODE_LENGTH (code);
4060 if (op_type != unary_op && op_type != binary_op)
4062 if (vect_print_dump_info (REPORT_DETAILS))
4063 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4064 return false;
4067 op0 = gimple_assign_rhs1 (stmt);
4068 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4070 if (vect_print_dump_info (REPORT_DETAILS))
4071 fprintf (vect_dump, "use not simple.");
4072 return false;
4075 if (op_type == binary_op)
4077 op1 = gimple_assign_rhs2 (stmt);
4078 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4080 if (vect_print_dump_info (REPORT_DETAILS))
4081 fprintf (vect_dump, "use not simple.");
4082 return false;
4086 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4087 or scalar. If the shift/rotate amount is a vector, use the vector/vector
4088 shift optabs. */
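  /* E.g., in  a[i] << b[i]  the shift amount varies inside the loop, so the
     vector/vector optab is required, whereas  a[i] << 3  (or a shift by a
     loop-invariant scalar) can use the vector/scalar optab when the target
     provides one.  (Illustrative expressions.)  */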
4089 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4090 || code == RROTATE_EXPR)
4092 shift_p = true;
4094 /* vector shifted by vector */
4095 if (dt[1] == vect_loop_def)
4097 optab = optab_for_tree_code (code, vectype, optab_vector);
4098 if (vect_print_dump_info (REPORT_DETAILS))
4099 fprintf (vect_dump, "vector/vector shift/rotate found.");
4102 /* See if the machine has a vector shifted by scalar insn, and if not,
4103 then see if it has a vector shifted by vector insn. */
4104 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4106 optab = optab_for_tree_code (code, vectype, optab_scalar);
4107 if (optab
4108 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4109 != CODE_FOR_nothing))
4111 scalar_shift_arg = true;
4112 if (vect_print_dump_info (REPORT_DETAILS))
4113 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4115 else
4117 optab = optab_for_tree_code (code, vectype, optab_vector);
4118 if (vect_print_dump_info (REPORT_DETAILS)
4119 && optab
4120 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4121 != CODE_FOR_nothing))
4122 fprintf (vect_dump, "vector/vector shift/rotate found.");
4126 else
4128 if (vect_print_dump_info (REPORT_DETAILS))
4129 fprintf (vect_dump, "operand mode requires invariant argument.");
4130 return false;
4133 else
4134 optab = optab_for_tree_code (code, vectype, optab_default);
4136 /* Supportable by target? */
4137 if (!optab)
4139 if (vect_print_dump_info (REPORT_DETAILS))
4140 fprintf (vect_dump, "no optab.");
4141 return false;
4143 vec_mode = TYPE_MODE (vectype);
4144 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4145 if (icode == CODE_FOR_nothing)
4147 if (vect_print_dump_info (REPORT_DETAILS))
4148 fprintf (vect_dump, "op not supported by target.");
4149 /* Check only during analysis. */
4150 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4151 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4152 < vect_min_worthwhile_factor (code)
4153 && !vec_stmt))
4154 return false;
4155 if (vect_print_dump_info (REPORT_DETAILS))
4156 fprintf (vect_dump, "proceeding using word mode.");
4159 /* Worthwhile without SIMD support? Check only during analysis. */
4160 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4161 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4162 < vect_min_worthwhile_factor (code)
4163 && !vec_stmt)
4165 if (vect_print_dump_info (REPORT_DETAILS))
4166 fprintf (vect_dump, "not worthwhile without SIMD support.");
4167 return false;
4170 if (!vec_stmt) /* transformation not required. */
4172 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4173 if (vect_print_dump_info (REPORT_DETAILS))
4174 fprintf (vect_dump, "=== vectorizable_operation ===");
4175 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4176 return true;
4179 /** Transform. **/
4181 if (vect_print_dump_info (REPORT_DETAILS))
4182 fprintf (vect_dump, "transform binary/unary operation.");
4184 /* Handle def. */
4185 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4187 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4188 created in the previous stages of the recursion, so no allocation is
4189 needed, except for the case of shift with scalar shift argument. In that
4190 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4191 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4192 In case of loop-based vectorization we allocate VECs of size 1. We
4193 allocate VEC_OPRNDS1 only in case of binary operation. */
4194 if (!slp_node)
4196 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4197 if (op_type == binary_op)
4198 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4200 else if (scalar_shift_arg)
4201 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4203 /* In case the vectorization factor (VF) is bigger than the number
4204 of elements that we can fit in a vectype (nunits), we have to generate
4205 more than one vector stmt - i.e - we need to "unroll" the
4206 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4207 from one copy of the vector stmt to the next, in the field
4208 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4209 stages to find the correct vector defs to be used when vectorizing
4210 stmts that use the defs of the current stmt. The example below illustrates
4211 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4212 4 vectorized stmts):
4214 before vectorization:
4215 RELATED_STMT VEC_STMT
4216 S1: x = memref - -
4217 S2: z = x + 1 - -
4219 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4220 there):
4221 RELATED_STMT VEC_STMT
4222 VS1_0: vx0 = memref0 VS1_1 -
4223 VS1_1: vx1 = memref1 VS1_2 -
4224 VS1_2: vx2 = memref2 VS1_3 -
4225 VS1_3: vx3 = memref3 - -
4226 S1: x = load - VS1_0
4227 S2: z = x + 1 - -
4229 step2: vectorize stmt S2 (done here):
4230 To vectorize stmt S2 we first need to find the relevant vector
4231 def for the first operand 'x'. This is, as usual, obtained from
4232 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4233 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4234 relevant vector def 'vx0'. Having found 'vx0' we can generate
4235 the vector stmt VS2_0, and as usual, record it in the
4236 STMT_VINFO_VEC_STMT of stmt S2.
4237 When creating the second copy (VS2_1), we obtain the relevant vector
4238 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4239 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4240 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4241 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4242 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4243 chain of stmts and pointers:
4244 RELATED_STMT VEC_STMT
4245 VS1_0: vx0 = memref0 VS1_1 -
4246 VS1_1: vx1 = memref1 VS1_2 -
4247 VS1_2: vx2 = memref2 VS1_3 -
4248 VS1_3: vx3 = memref3 - -
4249 S1: x = load - VS1_0
4250 VS2_0: vz0 = vx0 + v1 VS2_1 -
4251 VS2_1: vz1 = vx1 + v1 VS2_2 -
4252 VS2_2: vz2 = vx2 + v1 VS2_3 -
4253 VS2_3: vz3 = vx3 + v1 - -
4254 S2: z = x + 1 - VS2_0 */
4256 prev_stmt_info = NULL;
4257 for (j = 0; j < ncopies; j++)
4259 /* Handle uses. */
4260 if (j == 0)
4262 if (op_type == binary_op && scalar_shift_arg)
4264 /* Vector shl and shr insn patterns can be defined with scalar
4265 operand 2 (shift operand). In this case, use constant or loop
4266 invariant op1 directly, without extending it to vector mode
4267 first. */
4268 optab_op2_mode = insn_data[icode].operand[2].mode;
4269 if (!VECTOR_MODE_P (optab_op2_mode))
4271 if (vect_print_dump_info (REPORT_DETAILS))
4272 fprintf (vect_dump, "operand 1 using scalar mode.");
4273 vec_oprnd1 = op1;
4274 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4275 if (slp_node)
4277 /* Store vec_oprnd1 for every vector stmt to be created
4278 for SLP_NODE. We check during the analysis that all the
4279 shift arguments are the same.
4280 TODO: Allow different constants for different vector
4281 stmts generated for an SLP instance. */
4282 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4283 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4288 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4289 (a special case for certain kinds of vector shifts); otherwise,
4290 operand 1 should be of a vector type (the usual case). */
4291 if (op_type == binary_op && !vec_oprnd1)
4292 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4293 slp_node);
4294 else
4295 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4296 slp_node);
4298 else
4299 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4301 /* Arguments are ready. Create the new vector stmt. */
4302 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4304 vop1 = ((op_type == binary_op)
4305 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4306 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4307 new_temp = make_ssa_name (vec_dest, new_stmt);
4308 gimple_assign_set_lhs (new_stmt, new_temp);
4309 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4310 if (slp_node)
4311 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4314 if (slp_node)
4315 continue;
4317 if (j == 0)
4318 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4319 else
4320 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4321 prev_stmt_info = vinfo_for_stmt (new_stmt);
4324 VEC_free (tree, heap, vec_oprnds0);
4325 if (vec_oprnds1)
4326 VEC_free (tree, heap, vec_oprnds1);
4328 return true;
4332 /* Get vectorized definitions for loop-based vectorization. For the first
4333 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4334 scalar operand), and for the rest we get a copy with
4335 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4336 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4337 The vectors are collected into VEC_OPRNDS. */
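   /* Note that each invocation pushes two vector defs and recurses while
      MULTI_STEP_CVT is nonzero, so 2 * (MULTI_STEP_CVT + 1) defs are
      collected in VEC_OPRNDS overall.  */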
4339 static void
4340 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4341 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4343 tree vec_oprnd;
4345 /* Get first vector operand. */
4346 /* All the vector operands except the very first one (that is scalar oprnd)
4347 are stmt copies. */
4348 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4349 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4350 else
4351 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4353 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4355 /* Get second vector operand. */
4356 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4357 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4359 *oprnd = vec_oprnd;
4361 /* For conversion in multiple steps, continue to get operands
4362 recursively. */
4363 if (multi_step_cvt)
4364 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4368 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4369 For multi-step conversions store the resulting vectors and call the function
4370 recursively. */
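/* E.g., a two-step demotion from int to char on a target with 128-bit
   vectors packs four V4SI operands pairwise into two V8HI intermediates,
   and then packs those into a single V16QI result.  (Illustrative modes.)  */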
4372 static void
4373 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4374 int multi_step_cvt, gimple stmt,
4375 VEC (tree, heap) *vec_dsts,
4376 gimple_stmt_iterator *gsi,
4377 slp_tree slp_node, enum tree_code code,
4378 stmt_vec_info *prev_stmt_info)
4380 unsigned int i;
4381 tree vop0, vop1, new_tmp, vec_dest;
4382 gimple new_stmt;
4383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4385 vec_dest = VEC_pop (tree, vec_dsts);
4387 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4389 /* Create demotion operation. */
4390 vop0 = VEC_index (tree, *vec_oprnds, i);
4391 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4392 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4393 new_tmp = make_ssa_name (vec_dest, new_stmt);
4394 gimple_assign_set_lhs (new_stmt, new_tmp);
4395 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4397 if (multi_step_cvt)
4398 /* Store the resulting vector for next recursive call. */
4399 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4400 else
4402 /* This is the last step of the conversion sequence. Store the
4403 vectors in SLP_NODE or in vector info of the scalar statement
4404 (or in STMT_VINFO_RELATED_STMT chain). */
4405 if (slp_node)
4406 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4407 else
4409 if (!*prev_stmt_info)
4410 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4411 else
4412 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4414 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4419 /* For multi-step demotion operations we first generate demotion operations
4420 from the source type to the intermediate types, and then combine the
4421 results (stored in VEC_OPRNDS) in a demotion operation to the destination
4422 type. */
4423 if (multi_step_cvt)
4425 /* At each level of recursion we have half of the operands we had at the
4426 previous level. */
4427 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4428 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4429 stmt, vec_dsts, gsi, slp_node,
4430 code, prev_stmt_info);
4435 /* Function vectorizable_type_demotion
4437 Check if STMT performs a binary or unary operation that involves
4438 type demotion, and if it can be vectorized.
4439 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4440 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4441 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4443 bool
4444 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4445 gimple *vec_stmt, slp_tree slp_node)
4447 tree vec_dest;
4448 tree scalar_dest;
4449 tree op0;
4450 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4451 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4452 enum tree_code code, code1 = ERROR_MARK;
4453 tree def;
4454 gimple def_stmt;
4455 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4456 stmt_vec_info prev_stmt_info;
4457 int nunits_in;
4458 int nunits_out;
4459 tree vectype_out;
4460 int ncopies;
4461 int j, i;
4462 tree vectype_in;
4463 int multi_step_cvt = 0;
4464 VEC (tree, heap) *vec_oprnds0 = NULL;
4465 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4466 tree last_oprnd, intermediate_type;
4468 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4469 return false;
4471 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4472 return false;
4474 /* Is STMT a vectorizable type-demotion operation? */
4475 if (!is_gimple_assign (stmt))
4476 return false;
4478 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4479 return false;
4481 code = gimple_assign_rhs_code (stmt);
4482 if (!CONVERT_EXPR_CODE_P (code))
4483 return false;
4485 op0 = gimple_assign_rhs1 (stmt);
4486 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4487 if (!vectype_in)
4488 return false;
4489 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4491 scalar_dest = gimple_assign_lhs (stmt);
4492 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4493 if (!vectype_out)
4494 return false;
4495 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4496 if (nunits_in >= nunits_out)
4497 return false;
4499 /* Multiple types in SLP are handled by creating the appropriate number of
4500 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4501 case of SLP. */
4502 if (slp_node)
4503 ncopies = 1;
4504 else
4505 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4507 gcc_assert (ncopies >= 1);
4509 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4510 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4511 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4512 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4513 && CONVERT_EXPR_CODE_P (code))))
4514 return false;
4516 /* Check the operands of the operation. */
4517 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4519 if (vect_print_dump_info (REPORT_DETAILS))
4520 fprintf (vect_dump, "use not simple.");
4521 return false;
4524 /* Supportable by target? */
4525 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4526 &multi_step_cvt, &interm_types))
4527 return false;
4529 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4531 if (!vec_stmt) /* transformation not required. */
4533 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4534 if (vect_print_dump_info (REPORT_DETAILS))
4535 fprintf (vect_dump, "=== vectorizable_demotion ===");
4536 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4537 return true;
4540 /** Transform. **/
4541 if (vect_print_dump_info (REPORT_DETAILS))
4542 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4543 ncopies);
4545 /* In case of multi-step demotion, we first generate demotion operations to
4546 the intermediate types, and then from those types to the final one.
4547 We create vector destinations for the intermediate types (TYPES) received
4548 from supportable_narrowing_operation, and store them in the correct order
4549 for future use in vect_create_vectorized_demotion_stmts(). */
4550 if (multi_step_cvt)
4551 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4552 else
4553 vec_dsts = VEC_alloc (tree, heap, 1);
4555 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4556 VEC_quick_push (tree, vec_dsts, vec_dest);
4558 if (multi_step_cvt)
4560 for (i = VEC_length (tree, interm_types) - 1;
4561 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4563 vec_dest = vect_create_destination_var (scalar_dest,
4564 intermediate_type);
4565 VEC_quick_push (tree, vec_dsts, vec_dest);
4569 /* In case the vectorization factor (VF) is bigger than the number
4570 of elements that we can fit in a vectype (nunits), we have to generate
4571 more than one vector stmt - i.e - we need to "unroll" the
4572 vector stmt by a factor VF/nunits. */
4573 last_oprnd = op0;
4574 prev_stmt_info = NULL;
4575 for (j = 0; j < ncopies; j++)
4577 /* Handle uses. */
4578 if (slp_node)
4579 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4580 else
4582 VEC_free (tree, heap, vec_oprnds0);
4583 vec_oprnds0 = VEC_alloc (tree, heap,
4584 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4585 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4586 vect_pow2 (multi_step_cvt) - 1);
4589 /* Arguments are ready. Create the new vector stmts. */
4590 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4591 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4592 multi_step_cvt, stmt, tmp_vec_dsts,
4593 gsi, slp_node, code1,
4594 &prev_stmt_info);
4597 VEC_free (tree, heap, vec_oprnds0);
4598 VEC_free (tree, heap, vec_dsts);
4599 VEC_free (tree, heap, tmp_vec_dsts);
4600 VEC_free (tree, heap, interm_types);
4602 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4603 return true;
4607 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4608 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4609 the resulting vectors and call the function recursively. */
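/* E.g., a two-step promotion from char to int on a target with 128-bit
   vectors unpacks one V16QI operand into two V8HI halves, and then unpacks
   each of those into two V4SI vectors, four in total.  (Illustrative
   modes.)  */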
4611 static void
4612 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4613 VEC (tree, heap) **vec_oprnds1,
4614 int multi_step_cvt, gimple stmt,
4615 VEC (tree, heap) *vec_dsts,
4616 gimple_stmt_iterator *gsi,
4617 slp_tree slp_node, enum tree_code code1,
4618 enum tree_code code2, tree decl1,
4619 tree decl2, int op_type,
4620 stmt_vec_info *prev_stmt_info)
4622 int i;
4623 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4624 gimple new_stmt1, new_stmt2;
4625 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4626 VEC (tree, heap) *vec_tmp;
4628 vec_dest = VEC_pop (tree, vec_dsts);
4629 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4631 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4633 if (op_type == binary_op)
4634 vop1 = VEC_index (tree, *vec_oprnds1, i);
4635 else
4636 vop1 = NULL_TREE;
4638 /* Generate the two halves of promotion operation. */
4639 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4640 op_type, vec_dest, gsi, stmt);
4641 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4642 op_type, vec_dest, gsi, stmt);
4643 if (is_gimple_call (new_stmt1))
4645 new_tmp1 = gimple_call_lhs (new_stmt1);
4646 new_tmp2 = gimple_call_lhs (new_stmt2);
4648 else
4650 new_tmp1 = gimple_assign_lhs (new_stmt1);
4651 new_tmp2 = gimple_assign_lhs (new_stmt2);
4654 if (multi_step_cvt)
4656 /* Store the results for the recursive call. */
4657 VEC_quick_push (tree, vec_tmp, new_tmp1);
4658 VEC_quick_push (tree, vec_tmp, new_tmp2);
4660 else
4662 /* Last step of promotion sequence - store the results. */
4663 if (slp_node)
4665 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4666 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4668 else
4670 if (!*prev_stmt_info)
4671 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4672 else
4673 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4675 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4676 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4677 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4682 if (multi_step_cvt)
4684 /* For a multi-step promotion operation we call the
4685 function recursively for every stage. We start from the input type,
4686 create promotion operations to the intermediate types, and then
4687 create promotions to the output type. */
4688 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4689 VEC_free (tree, heap, vec_tmp);
4690 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4691 multi_step_cvt - 1, stmt,
4692 vec_dsts, gsi, slp_node, code1,
4693 code2, decl1, decl2, op_type,
4694 prev_stmt_info);
4699 /* Function vectorizable_type_promotion
4701 Check if STMT performs a binary or unary operation that involves
4702 type promotion, and if it can be vectorized.
4703 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4704 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4705 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4707 bool
4708 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4709 gimple *vec_stmt, slp_tree slp_node)
4711 tree vec_dest;
4712 tree scalar_dest;
4713 tree op0, op1 = NULL;
4714 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4715 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4716 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4717 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4718 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4719 int op_type;
4720 tree def;
4721 gimple def_stmt;
4722 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4723 stmt_vec_info prev_stmt_info;
4724 int nunits_in;
4725 int nunits_out;
4726 tree vectype_out;
4727 int ncopies;
4728 int j, i;
4729 tree vectype_in;
4730 tree intermediate_type = NULL_TREE;
4731 int multi_step_cvt = 0;
4732 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4733 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4735 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4736 return false;
4738 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4739 return false;
4741 /* Is STMT a vectorizable type-promotion operation? */
4742 if (!is_gimple_assign (stmt))
4743 return false;
4745 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4746 return false;
4748 code = gimple_assign_rhs_code (stmt);
4749 if (!CONVERT_EXPR_CODE_P (code)
4750 && code != WIDEN_MULT_EXPR)
4751 return false;
4753 op0 = gimple_assign_rhs1 (stmt);
4754 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4755 if (!vectype_in)
4756 return false;
4757 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4759 scalar_dest = gimple_assign_lhs (stmt);
4760 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4761 if (!vectype_out)
4762 return false;
4763 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4764 if (nunits_in <= nunits_out)
4765 return false;
4767 /* Multiple types in SLP are handled by creating the appropriate number of
4768 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4769 case of SLP. */
4770 if (slp_node)
4771 ncopies = 1;
4772 else
4773 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4775 gcc_assert (ncopies >= 1);
4777 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4778 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4779 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4780 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4781 && CONVERT_EXPR_CODE_P (code))))
4782 return false;
4784 /* Check the operands of the operation. */
4785 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4787 if (vect_print_dump_info (REPORT_DETAILS))
4788 fprintf (vect_dump, "use not simple.");
4789 return false;
4792 op_type = TREE_CODE_LENGTH (code);
4793 if (op_type == binary_op)
4795 op1 = gimple_assign_rhs2 (stmt);
4796 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4798 if (vect_print_dump_info (REPORT_DETAILS))
4799 fprintf (vect_dump, "use not simple.");
4800 return false;
4804 /* Supportable by target? */
4805 if (!supportable_widening_operation (code, stmt, vectype_in,
4806 &decl1, &decl2, &code1, &code2,
4807 &multi_step_cvt, &interm_types))
4808 return false;
4810 /* Binary widening operation can only be supported directly by the
4811 architecture. */
4812 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4814 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4816 if (!vec_stmt) /* transformation not required. */
4818 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4819 if (vect_print_dump_info (REPORT_DETAILS))
4820 fprintf (vect_dump, "=== vectorizable_promotion ===");
4821 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4822 return true;
4825 /** Transform. **/
4827 if (vect_print_dump_info (REPORT_DETAILS))
4828 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4829 ncopies);
4831 /* Handle def. */
4832 /* In case of multi-step promotion, we first generate promotion operations
4833 to the intermediate types, and then from those types to the final one.
4834 We store vector destination in VEC_DSTS in the correct order for
4835 recursive creation of promotion operations in
4836 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4837 according to TYPES received from supportable_widening_operation(). */
4838 if (multi_step_cvt)
4839 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4840 else
4841 vec_dsts = VEC_alloc (tree, heap, 1);
4843 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4844 VEC_quick_push (tree, vec_dsts, vec_dest);
4846 if (multi_step_cvt)
4848 for (i = VEC_length (tree, interm_types) - 1;
4849 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4851 vec_dest = vect_create_destination_var (scalar_dest,
4852 intermediate_type);
4853 VEC_quick_push (tree, vec_dsts, vec_dest);
4857 if (!slp_node)
4859 vec_oprnds0 = VEC_alloc (tree, heap,
4860 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4861 if (op_type == binary_op)
4862 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4865 /* In case the vectorization factor (VF) is bigger than the number
4866 of elements that we can fit in a vectype (nunits), we have to generate
4867 more than one vector stmt - i.e - we need to "unroll" the
4868 vector stmt by a factor VF/nunits. */
4870 prev_stmt_info = NULL;
4871 for (j = 0; j < ncopies; j++)
4873 /* Handle uses. */
4874 if (j == 0)
4876 if (slp_node)
4877 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4878 else
4880 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4881 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4882 if (op_type == binary_op)
4884 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4885 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4889 else
4891 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4892 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4893 if (op_type == binary_op)
4895 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4896 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4900 /* Arguments are ready. Create the new vector stmts. */
4901 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4902 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4903 multi_step_cvt, stmt,
4904 tmp_vec_dsts,
4905 gsi, slp_node, code1, code2,
4906 decl1, decl2, op_type,
4907 &prev_stmt_info);
4910 VEC_free (tree, heap, vec_dsts);
4911 VEC_free (tree, heap, tmp_vec_dsts);
4912 VEC_free (tree, heap, interm_types);
4913 VEC_free (tree, heap, vec_oprnds0);
4914 VEC_free (tree, heap, vec_oprnds1);
4916 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4917 return true;
4921 /* Function vect_strided_store_supported.
4923 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4924 and FALSE otherwise. */
4926 static bool
4927 vect_strided_store_supported (tree vectype)
4929 optab interleave_high_optab, interleave_low_optab;
4930 int mode;
4932 mode = (int) TYPE_MODE (vectype);
4934 /* Check that the operation is supported. */
4935 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4936 vectype, optab_default);
4937 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4938 vectype, optab_default);
4939 if (!interleave_high_optab || !interleave_low_optab)
4941 if (vect_print_dump_info (REPORT_DETAILS))
4942 fprintf (vect_dump, "no optab for interleave.");
4943 return false;
4946 if (optab_handler (interleave_high_optab, mode)->insn_code
4947 == CODE_FOR_nothing
4948 || optab_handler (interleave_low_optab, mode)->insn_code
4949 == CODE_FOR_nothing)
4951 if (vect_print_dump_info (REPORT_DETAILS))
4952 fprintf (vect_dump, "interleave op not supported by target.");
4953 return false;
4956 return true;
4960 /* Function vect_permute_store_chain.
4962 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4963 a power of 2, generate interleave_high/low stmts to reorder the data
4964 correctly for the stores. Return the final references for stores in
4965 RESULT_CHAIN.
4967 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4968 The input is 4 vectors each containing 8 elements. We assign a number to each
4969 element, the input sequence is:
4971 1st vec: 0 1 2 3 4 5 6 7
4972 2nd vec: 8 9 10 11 12 13 14 15
4973 3rd vec: 16 17 18 19 20 21 22 23
4974 4th vec: 24 25 26 27 28 29 30 31
4976 The output sequence should be:
4978 1st vec: 0 8 16 24 1 9 17 25
4979 2nd vec: 2 10 18 26 3 11 19 27
4980 3rd vec: 4 12 20 28 5 13 21 29
4981 4th vec: 6 14 22 30 7 15 23 31
4983 i.e., we interleave the contents of the four vectors in their order.
4985 We use interleave_high/low instructions to create such output. The input of
4986 each interleave_high/low operation is two vectors:
4987 1st vec 2nd vec
4988 0 1 2 3 4 5 6 7
4989 the even elements of the result vector are obtained left-to-right from the
4990 high/low elements of the first vector. The odd elements of the result are
4991 obtained left-to-right from the high/low elements of the second vector.
4992 The output of interleave_high will be: 0 4 1 5
4993 and of interleave_low: 2 6 3 7
4996 The permutation is done in log LENGTH stages. In each stage interleave_high
4997 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4998 where the first argument is taken from the first half of DR_CHAIN and the
4999 second argument from its second half.
5000 In our example,
5002 I1: interleave_high (1st vec, 3rd vec)
5003 I2: interleave_low (1st vec, 3rd vec)
5004 I3: interleave_high (2nd vec, 4th vec)
5005 I4: interleave_low (2nd vec, 4th vec)
5007 The output for the first stage is:
5009 I1: 0 16 1 17 2 18 3 19
5010 I2: 4 20 5 21 6 22 7 23
5011 I3: 8 24 9 25 10 26 11 27
5012 I4: 12 28 13 29 14 30 15 31
5014 The output of the second stage, i.e. the final result is:
5016 I1: 0 8 16 24 1 9 17 25
5017 I2: 2 10 18 26 3 11 19 27
5018 I3: 4 12 20 28 5 13 21 29
5019 I4: 6 14 22 30 7 15 23 31. */
5021 static bool
5022 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5023 unsigned int length,
5024 gimple stmt,
5025 gimple_stmt_iterator *gsi,
5026 VEC(tree,heap) **result_chain)
5028 tree perm_dest, vect1, vect2, high, low;
5029 gimple perm_stmt;
5030 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5031 tree scalar_dest;
5032 int i;
5033 unsigned int j;
5034 enum tree_code high_code, low_code;
5036 scalar_dest = gimple_assign_lhs (stmt);
5038 /* Check that the operation is supported. */
5039 if (!vect_strided_store_supported (vectype))
5040 return false;
5042 *result_chain = VEC_copy (tree, heap, dr_chain);
5044 for (i = 0; i < exact_log2 (length); i++)
5046 for (j = 0; j < length/2; j++)
5048 vect1 = VEC_index (tree, dr_chain, j);
5049 vect2 = VEC_index (tree, dr_chain, j+length/2);
5051 /* Create interleaving stmt:
5052 in the case of big endian:
5053 high = interleave_high (vect1, vect2)
5054 and in the case of little endian:
5055 high = interleave_low (vect1, vect2). */
5056 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5057 DECL_GIMPLE_REG_P (perm_dest) = 1;
5058 add_referenced_var (perm_dest);
5059 if (BYTES_BIG_ENDIAN)
5061 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5062 low_code = VEC_INTERLEAVE_LOW_EXPR;
5064 else
5066 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5067 high_code = VEC_INTERLEAVE_LOW_EXPR;
5069 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5070 vect1, vect2);
5071 high = make_ssa_name (perm_dest, perm_stmt);
5072 gimple_assign_set_lhs (perm_stmt, high);
5073 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5074 VEC_replace (tree, *result_chain, 2*j, high);
5076 /* Create interleaving stmt:
5077 in the case of big endian:
5078 low = interleave_low (vect1, vect2)
5079 and in the case of little endian:
5080 low = interleave_high (vect1, vect2). */
5081 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5082 DECL_GIMPLE_REG_P (perm_dest) = 1;
5083 add_referenced_var (perm_dest);
5084 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5085 vect1, vect2);
5086 low = make_ssa_name (perm_dest, perm_stmt);
5087 gimple_assign_set_lhs (perm_stmt, low);
5088 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5089 VEC_replace (tree, *result_chain, 2*j+1, low);
5091 dr_chain = VEC_copy (tree, heap, *result_chain);
5093 return true;
5097 /* Function vectorizable_store.
5099 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
5100 can be vectorized.
5101 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5102 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5103 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5105 bool
5106 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5107 slp_tree slp_node)
5109 tree scalar_dest;
5110 tree data_ref;
5111 tree op;
5112 tree vec_oprnd = NULL_TREE;
5113 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5114 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5115 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5116 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5117 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5118 enum machine_mode vec_mode;
5119 tree dummy;
5120 enum dr_alignment_support alignment_support_scheme;
5121 tree def;
5122 gimple def_stmt;
5123 enum vect_def_type dt;
5124 stmt_vec_info prev_stmt_info = NULL;
5125 tree dataref_ptr = NULL_TREE;
5126 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5127 int ncopies;
5128 int j;
5129 gimple next_stmt, first_stmt = NULL;
5130 bool strided_store = false;
5131 unsigned int group_size, i;
5132 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5133 bool inv_p;
5134 VEC(tree,heap) *vec_oprnds = NULL;
5135 bool slp = (slp_node != NULL);
5136 stmt_vec_info first_stmt_vinfo;
5137 unsigned int vec_num;
5139 /* Multiple types in SLP are handled by creating the appropriate number of
5140 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5141 case of SLP. */
5142 if (slp)
5143 ncopies = 1;
5144 else
5145 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5147 gcc_assert (ncopies >= 1);
5149 /* FORNOW. This restriction should be relaxed. */
5150 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5152 if (vect_print_dump_info (REPORT_DETAILS))
5153 fprintf (vect_dump, "multiple types in nested loop.");
5154 return false;
5157 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5158 return false;
5160 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5161 return false;
5163 /* Is vectorizable store? */
5165 if (!is_gimple_assign (stmt))
5166 return false;
5168 scalar_dest = gimple_assign_lhs (stmt);
5169 if (TREE_CODE (scalar_dest) != ARRAY_REF
5170 && TREE_CODE (scalar_dest) != INDIRECT_REF
5171 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5172 return false;
5174 gcc_assert (gimple_assign_single_p (stmt));
5175 op = gimple_assign_rhs1 (stmt);
5176 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5178 if (vect_print_dump_info (REPORT_DETAILS))
5179 fprintf (vect_dump, "use not simple.");
5180 return false;
5183 /* If accesses through a pointer to vectype do not alias the original
5184 memory reference we have a problem. */
5185 if (get_alias_set (vectype) != get_alias_set (TREE_TYPE (scalar_dest))
5186 && !alias_set_subset_of (get_alias_set (vectype),
5187 get_alias_set (TREE_TYPE (scalar_dest))))
5189 if (vect_print_dump_info (REPORT_DETAILS))
5190 fprintf (vect_dump, "vector type does not alias scalar type");
5191 return false;
5194 if (!useless_type_conversion_p (TREE_TYPE (op), TREE_TYPE (scalar_dest)))
5196 if (vect_print_dump_info (REPORT_DETAILS))
5197 fprintf (vect_dump, "operands of different types");
5198 return false;
5201 vec_mode = TYPE_MODE (vectype);
5202 /* FORNOW. In some cases can vectorize even if data-type not supported
5203 (e.g. - array initialization with 0). */
5204 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5205 return false;
5207 if (!STMT_VINFO_DATA_REF (stmt_info))
5208 return false;
5210 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5212 strided_store = true;
5213 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5214 if (!vect_strided_store_supported (vectype)
5215 && !PURE_SLP_STMT (stmt_info) && !slp)
5216 return false;
5218 if (first_stmt == stmt)
5220 /* STMT is the leader of the group. Check the operands of all the
5221 stmts of the group. */
5222 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5223 while (next_stmt)
5225 gcc_assert (gimple_assign_single_p (next_stmt));
5226 op = gimple_assign_rhs1 (next_stmt);
5227 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5229 if (vect_print_dump_info (REPORT_DETAILS))
5230 fprintf (vect_dump, "use not simple.");
5231 return false;
5233 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5238 if (!vec_stmt) /* transformation not required. */
5240 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5241 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5242 return true;
5245 /** Transform. **/
5247 if (strided_store)
5249 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5250 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5252 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5254 /* FORNOW */
5255 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5257 /* We vectorize all the stmts of the interleaving group when we
5258 reach the last stmt in the group. */
5259 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5260 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5261 && !slp)
5263 *vec_stmt = NULL;
5264 return true;
5267 if (slp)
5268 strided_store = false;
5270 /* VEC_NUM is the number of vect stmts to be created for this group. */
5271 if (slp)
5272 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5273 else
5274 vec_num = group_size;
5276 else
5278 first_stmt = stmt;
5279 first_dr = dr;
5280 group_size = vec_num = 1;
5281 first_stmt_vinfo = stmt_info;
5284 if (vect_print_dump_info (REPORT_DETAILS))
5285 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5287 dr_chain = VEC_alloc (tree, heap, group_size);
5288 oprnds = VEC_alloc (tree, heap, group_size);
5290 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5291 gcc_assert (alignment_support_scheme);
5292 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5294 /* In case the vectorization factor (VF) is bigger than the number
5295 of elements that we can fit in a vectype (nunits), we have to generate
5296 more than one vector stmt - i.e - we need to "unroll" the
5297 vector stmt by a factor VF/nunits. For more details see documentation in
5298 vect_get_vec_def_for_stmt_copy. */
5300 /* In case of interleaving (non-unit strided access):
5302 S1: &base + 2 = x2
5303 S2: &base = x0
5304 S3: &base + 1 = x1
5305 S4: &base + 3 = x3
5307 We create vectorized stores starting from the base address (the access of
5308 the first stmt in the chain - S2 in the above example) when the last store
5309 stmt of the chain (S4) is reached:
5311 VS1: &base = vx2
5312 VS2: &base + vec_size*1 = vx0
5313 VS3: &base + vec_size*2 = vx1
5314 VS4: &base + vec_size*3 = vx3
5316 Then permutation statements are generated:
5318 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5319 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5322 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5323 (the order of the data-refs in the output of vect_permute_store_chain
5324 corresponds to the order of scalar stmts in the interleaving chain - see
5325 the documentation of vect_permute_store_chain()).
5327 In case of both multiple types and interleaving, above vector stores and
5328 permutation stmts are created for every copy. The result vector stmts are
5329 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5330 STMT_VINFO_RELATED_STMT for the next copies.
5333 prev_stmt_info = NULL;
5334 for (j = 0; j < ncopies; j++)
5336 gimple new_stmt;
5337 gimple ptr_incr;
5339 if (j == 0)
5341 if (slp)
5343 /* Get vectorized arguments for SLP_NODE. */
5344 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5346 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5348 else
5350 /* For interleaved stores we collect vectorized defs for all the
5351 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5352 used as an input to vect_permute_store_chain(), and OPRNDS as
5353 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5355 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5356 OPRNDS are of size 1. */
5357 next_stmt = first_stmt;
5358 for (i = 0; i < group_size; i++)
5360 /* Since gaps are not supported for interleaved stores,
5361 GROUP_SIZE is the exact number of stmts in the chain.
5362 Therefore, NEXT_STMT can't be NULL. In case that
5363 there is no interleaving, GROUP_SIZE is 1, and only one
5364 iteration of the loop will be executed. */
5365 gcc_assert (next_stmt);
5366 gcc_assert (gimple_assign_single_p (next_stmt));
5367 op = gimple_assign_rhs1 (next_stmt);
5369 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5370 NULL);
5371 VEC_quick_push(tree, dr_chain, vec_oprnd);
5372 VEC_quick_push(tree, oprnds, vec_oprnd);
5373 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5377 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5378 &dummy, &ptr_incr, false,
5379 &inv_p, TREE_TYPE (vec_oprnd));
5380 gcc_assert (!inv_p);
5382 else
5384 /* For interleaved stores we created vectorized defs for all the
5385 defs stored in OPRNDS in the previous iteration (previous copy).
5386 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5387 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5388 next copy.
5389 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5390 OPRNDS are of size 1. */
5391 for (i = 0; i < group_size; i++)
5393 op = VEC_index (tree, oprnds, i);
5394 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5395 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5396 VEC_replace(tree, dr_chain, i, vec_oprnd);
5397 VEC_replace(tree, oprnds, i, vec_oprnd);
5399 dataref_ptr =
5400 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5403 if (strided_store)
5405 result_chain = VEC_alloc (tree, heap, group_size);
5406 /* Permute. */
5407 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5408 &result_chain))
5409 return false;
5412 next_stmt = first_stmt;
5413 for (i = 0; i < vec_num; i++)
5415 if (i > 0)
5416 /* Bump the vector pointer. */
5417 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5418 NULL_TREE);
5420 if (slp)
5421 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5422 else if (strided_store)
5423 /* For strided stores vectorized defs are interleaved in
5424 vect_permute_store_chain(). */
5425 vec_oprnd = VEC_index (tree, result_chain, i);
5427 data_ref = build_fold_indirect_ref (dataref_ptr);
5428 /* Arguments are ready. Create the new vector stmt. */
5429 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5430 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5431 mark_symbols_for_renaming (new_stmt);
5433 if (slp)
5434 continue;
5436 if (j == 0)
5437 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5438 else
5439 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5441 prev_stmt_info = vinfo_for_stmt (new_stmt);
5442 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5443 if (!next_stmt)
5444 break;
5448 VEC_free (tree, heap, dr_chain);
5449 VEC_free (tree, heap, oprnds);
5450 if (result_chain)
5451 VEC_free (tree, heap, result_chain);
5453 return true;
5457 /* Function vect_setup_realignment
5459 This function is called when vectorizing an unaligned load using
5460 the dr_explicit_realign[_optimized] scheme.
5461 This function generates the following code at the loop prolog:
5463 p = initial_addr;
5464 x msq_init = *(floor(p)); # prolog load
5465 realignment_token = call target_builtin;
5466 loop:
5467 x msq = phi (msq_init, ---)
5469 The stmts marked with x are generated only for the case of
5470 dr_explicit_realign_optimized.
5472 The code above sets up a new (vector) pointer, pointing to the first
5473 location accessed by STMT, and a "floor-aligned" load using that pointer.
5474 It also generates code to compute the "realignment-token" (if the relevant
5475 target hook was defined), and creates a phi-node at the loop-header bb
5476 whose arguments are the result of the prolog-load (created by this
5477 function) and the result of a load that takes place in the loop (to be
5478 created by the caller to this function).
5480 For the case of dr_explicit_realign_optimized:
5481 The caller to this function uses the phi-result (msq) to create the
5482 realignment code inside the loop, and sets up the missing phi argument,
5483 as follows:
5484 loop:
5485 msq = phi (msq_init, lsq)
5486 lsq = *(floor(p')); # load in loop
5487 result = realign_load (msq, lsq, realignment_token);
5489 For the case of dr_explicit_realign:
5490 loop:
5491 msq = *(floor(p)); # load in loop
5492 p' = p + (VS-1);
5493 lsq = *(floor(p')); # load in loop
5494 result = realign_load (msq, lsq, realignment_token);
5496 Input:
5497 STMT - (scalar) load stmt to be vectorized. This load accesses
5498 a memory location that may be unaligned.
5499 BSI - place where new code is to be inserted.
5500 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5501 is used.
5503 Output:
5504 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5505 target hook, if defined.
5506 Return value - the result of the loop-header phi node. */
5508 static tree
5509 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5510 tree *realignment_token,
5511 enum dr_alignment_support alignment_support_scheme,
5512 tree init_addr,
5513 struct loop **at_loop)
5515 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5516 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5517 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5519 edge pe;
5520 tree scalar_dest = gimple_assign_lhs (stmt);
5521 tree vec_dest;
5522 gimple inc;
5523 tree ptr;
5524 tree data_ref;
5525 gimple new_stmt;
5526 basic_block new_bb;
5527 tree msq_init = NULL_TREE;
5528 tree new_temp;
5529 gimple phi_stmt;
5530 tree msq = NULL_TREE;
5531 gimple_seq stmts = NULL;
5532 bool inv_p;
5533 bool compute_in_loop = false;
5534 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5535 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5536 struct loop *loop_for_initial_load;
5538 gcc_assert (alignment_support_scheme == dr_explicit_realign
5539 || alignment_support_scheme == dr_explicit_realign_optimized);
5541 /* We need to generate three things:
5542 1. the misalignment computation
5543 2. the extra vector load (for the optimized realignment scheme).
5544 3. the phi node for the two vectors from which the realignment is
5545 done (for the optimized realignment scheme).
5548 /* 1. Determine where to generate the misalignment computation.
5550 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5551 calculation will be generated by this function, outside the loop (in the
5552 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5553 caller, inside the loop.
5555 Background: If the misalignment remains fixed throughout the iterations of
5556 the loop, then both realignment schemes are applicable, and also the
5557 misalignment computation can be done outside LOOP. This is because we are
5558 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5559 are a multiple of VS (the Vector Size), and therefore the misalignment in
5560 different vectorized LOOP iterations is always the same.
5561 The problem arises only if the memory access is in an inner-loop nested
5562 inside LOOP, which is now being vectorized using outer-loop vectorization.
5563 This is the only case when the misalignment of the memory access may not
5564 remain fixed throughout the iterations of the inner-loop (as explained in
5565 detail in vect_supportable_dr_alignment). In this case, not only is the
5566 optimized realignment scheme not applicable, but also the misalignment
5567 computation (and generation of the realignment token that is passed to
5568 REALIGN_LOAD) have to be done inside the loop.
5570 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5571 or not, which in turn determines if the misalignment is computed inside
5572 the inner-loop, or outside LOOP. */
5574 if (init_addr != NULL_TREE)
5576 compute_in_loop = true;
5577 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5581 /* 2. Determine where to generate the extra vector load.
5583 For the optimized realignment scheme, instead of generating two vector
5584 loads in each iteration, we generate a single extra vector load in the
5585 preheader of the loop, and in each iteration reuse the result of the
5586 vector load from the previous iteration. In case the memory access is in
5587 an inner-loop nested inside LOOP, which is now being vectorized using
5588 outer-loop vectorization, we need to determine whether this initial vector
5589 load should be generated at the preheader of the inner-loop, or can be
5590 generated at the preheader of LOOP. If the memory access has no evolution
5591 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5592 to be generated inside LOOP (in the preheader of the inner-loop). */
5594 if (nested_in_vect_loop)
5596 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5597 bool invariant_in_outerloop =
5598 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5599 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5601 else
5602 loop_for_initial_load = loop;
5603 if (at_loop)
5604 *at_loop = loop_for_initial_load;
5606 /* 3. For the case of the optimized realignment, create the first vector
5607 load at the loop preheader. */
5609 if (alignment_support_scheme == dr_explicit_realign_optimized)
5611 /* Create msq_init = *(floor(p1)) in the loop preheader */
5613 gcc_assert (!compute_in_loop);
5614 pe = loop_preheader_edge (loop_for_initial_load);
5615 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5616 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5617 &init_addr, &inc, true, &inv_p, NULL_TREE);
5618 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5619 new_stmt = gimple_build_assign (vec_dest, data_ref);
5620 new_temp = make_ssa_name (vec_dest, new_stmt);
5621 gimple_assign_set_lhs (new_stmt, new_temp);
5622 mark_symbols_for_renaming (new_stmt);
5623 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5624 gcc_assert (!new_bb);
5625 msq_init = gimple_assign_lhs (new_stmt);
5628 /* 4. Create realignment token using a target builtin, if available.
5629 It is done either inside the containing loop, or before LOOP (as
5630 determined above). */
5632 if (targetm.vectorize.builtin_mask_for_load)
5634 tree builtin_decl;
5636 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5637 if (compute_in_loop)
5638 gcc_assert (init_addr); /* already computed by the caller. */
5639 else
5641 /* Generate the INIT_ADDR computation outside LOOP. */
5642 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5643 NULL_TREE, loop);
5644 pe = loop_preheader_edge (loop);
5645 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5646 gcc_assert (!new_bb);
5649 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5650 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5651 vec_dest =
5652 vect_create_destination_var (scalar_dest,
5653 gimple_call_return_type (new_stmt));
5654 new_temp = make_ssa_name (vec_dest, new_stmt);
5655 gimple_call_set_lhs (new_stmt, new_temp);
5657 if (compute_in_loop)
5658 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5659 else
5661 /* Generate the misalignment computation outside LOOP. */
5662 pe = loop_preheader_edge (loop);
5663 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5664 gcc_assert (!new_bb);
5667 *realignment_token = gimple_call_lhs (new_stmt);
5669 /* The result of the CALL_EXPR to this builtin is determined from
5670 the value of the parameter and no global variables are touched
5671 which makes the builtin a "const" function. Requiring the
5672 builtin to have the "const" attribute makes it unnecessary
5673 to call mark_call_clobbered. */
5674 gcc_assert (TREE_READONLY (builtin_decl));
5677 if (alignment_support_scheme == dr_explicit_realign)
5678 return msq;
5680 gcc_assert (!compute_in_loop);
5681 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5684 /* 5. Create msq = phi <msq_init, lsq> in loop */
5686 pe = loop_preheader_edge (containing_loop);
5687 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5688 msq = make_ssa_name (vec_dest, NULL);
5689 phi_stmt = create_phi_node (msq, containing_loop->header);
5690 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5691 add_phi_arg (phi_stmt, msq_init, pe);
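  /* The loop-latch argument (LSQ) is added later by the caller once the load
     inside the loop has been created, as described in the function comment.  */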
5693 return msq;
5697 /* Function vect_strided_load_supported.
5699 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5700 and FALSE otherwise. */
5702 static bool
5703 vect_strided_load_supported (tree vectype)
5705 optab perm_even_optab, perm_odd_optab;
5706 int mode;
5708 mode = (int) TYPE_MODE (vectype);
5710 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5711 optab_default);
5712 if (!perm_even_optab)
5714 if (vect_print_dump_info (REPORT_DETAILS))
5715 fprintf (vect_dump, "no optab for perm_even.");
5716 return false;
5719 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5721 if (vect_print_dump_info (REPORT_DETAILS))
5722 fprintf (vect_dump, "perm_even op not supported by target.");
5723 return false;
5726 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5727 optab_default);
5728 if (!perm_odd_optab)
5730 if (vect_print_dump_info (REPORT_DETAILS))
5731 fprintf (vect_dump, "no optab for perm_odd.");
5732 return false;
5735 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5737 if (vect_print_dump_info (REPORT_DETAILS))
5738 fprintf (vect_dump, "perm_odd op not supported by target.");
5739 return false;
5741 return true;
5745 /* Function vect_permute_load_chain.
5747 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5748 a power of 2, generate extract_even/odd stmts to reorder the input data
5749 correctly. Return the final references for loads in RESULT_CHAIN.
5751 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5752 The input is 4 vectors each containing 8 elements. We assign a number to each
5753 element; the input sequence is:
5755 1st vec: 0 1 2 3 4 5 6 7
5756 2nd vec: 8 9 10 11 12 13 14 15
5757 3rd vec: 16 17 18 19 20 21 22 23
5758 4th vec: 24 25 26 27 28 29 30 31
5760 The output sequence should be:
5762 1st vec: 0 4 8 12 16 20 24 28
5763 2nd vec: 1 5 9 13 17 21 25 29
5764 3rd vec: 2 6 10 14 18 22 26 30
5765 4th vec: 3 7 11 15 19 23 27 31
5767 i.e., the first output vector should contain the first elements of each
5768 interleaving group, etc.
5770 We use extract_even/odd instructions to create such output. The input of each
5771 extract_even/odd operation is two vectors
5772 1st vec 2nd vec
5773 0 1 2 3 4 5 6 7
5775 and the output is the vector of extracted even/odd elements. The output of
5776 extract_even will be: 0 2 4 6
5777 and of extract_odd: 1 3 5 7
5780 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5781 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5782 order. In our example,
5784 E1: extract_even (1st vec, 2nd vec)
5785 E2: extract_odd (1st vec, 2nd vec)
5786 E3: extract_even (3rd vec, 4th vec)
5787 E4: extract_odd (3rd vec, 4th vec)
5789 The output for the first stage will be:
5791 E1: 0 2 4 6 8 10 12 14
5792 E2: 1 3 5 7 9 11 13 15
5793 E3: 16 18 20 22 24 26 28 30
5794 E4: 17 19 21 23 25 27 29 31
5796 In order to proceed and create the correct sequence for the next stage (or
5797 for the correct output, if the second stage is the last one, as in our
5798 example), we first put the output of extract_even operation and then the
5799 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5800 The input for the second stage is:
5802 1st vec (E1): 0 2 4 6 8 10 12 14
5803 2nd vec (E3): 16 18 20 22 24 26 28 30
5804 3rd vec (E2): 1 3 5 7 9 11 13 15
5805 4th vec (E4): 17 19 21 23 25 27 29 31
5807 The output of the second stage:
5809 E1: 0 4 8 12 16 20 24 28
5810 E2: 2 6 10 14 18 22 26 30
5811 E3: 1 5 9 13 17 21 25 29
5812 E4: 3 7 11 15 19 23 27 31
5814 And RESULT_CHAIN after reordering:
5816 1st vec (E1): 0 4 8 12 16 20 24 28
5817 2nd vec (E3): 1 5 9 13 17 21 25 29
5818 3rd vec (E2): 2 6 10 14 18 22 26 30
5819 4th vec (E4): 3 7 11 15 19 23 27 31. */
5821 static bool
5822 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5823 unsigned int length,
5824 gimple stmt,
5825 gimple_stmt_iterator *gsi,
5826 VEC(tree,heap) **result_chain)
5828 tree perm_dest, data_ref, first_vect, second_vect;
5829 gimple perm_stmt;
5830 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5831 int i;
5832 unsigned int j;
5834 /* Check that the operation is supported. */
5835 if (!vect_strided_load_supported (vectype))
5836 return false;
5838 *result_chain = VEC_copy (tree, heap, dr_chain);
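  /* exact_log2 (LENGTH) permutation stages are generated; e.g., the 4-vector
     example in the function comment above needs two stages of
     extract_even/extract_odd statements.  */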
5839 for (i = 0; i < exact_log2 (length); i++)
5841 for (j = 0; j < length; j +=2)
5843 first_vect = VEC_index (tree, dr_chain, j);
5844 second_vect = VEC_index (tree, dr_chain, j+1);
5846 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5847 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5848 DECL_GIMPLE_REG_P (perm_dest) = 1;
5849 add_referenced_var (perm_dest);
5851 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5852 perm_dest, first_vect,
5853 second_vect);
5855 data_ref = make_ssa_name (perm_dest, perm_stmt);
5856 gimple_assign_set_lhs (perm_stmt, data_ref);
5857 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5858 mark_symbols_for_renaming (perm_stmt);
5860 VEC_replace (tree, *result_chain, j/2, data_ref);
5862 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5863 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5864 DECL_GIMPLE_REG_P (perm_dest) = 1;
5865 add_referenced_var (perm_dest);
5867 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5868 perm_dest, first_vect,
5869 second_vect);
5870 data_ref = make_ssa_name (perm_dest, perm_stmt);
5871 gimple_assign_set_lhs (perm_stmt, data_ref);
5872 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5873 mark_symbols_for_renaming (perm_stmt);
5875 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5877 dr_chain = VEC_copy (tree, heap, *result_chain);
5879 return true;
5883 /* Function vect_transform_strided_load.
5885 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5886 to perform their permutation and ascribe the resulting vectorized statements to
5887 the scalar statements.
5890 static bool
5891 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5892 gimple_stmt_iterator *gsi)
5894 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5895 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5896 gimple next_stmt, new_stmt;
5897 VEC(tree,heap) *result_chain = NULL;
5898 unsigned int i, gap_count;
5899 tree tmp_data_ref;
5901 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5902 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
5903 vectors, which are ready for vector computation. */
5904 result_chain = VEC_alloc (tree, heap, size);
5905 /* Permute. */
5906 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5907 return false;
5909 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5910 Since we scan the chain starting from its first node, their order
5911 corresponds to the order of data-refs in RESULT_CHAIN. */
5912 next_stmt = first_stmt;
5913 gap_count = 1;
5914 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5916 if (!next_stmt)
5917 break;
5919 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5920 code elimination pass later. No need to check for the first stmt in
5921 the group, since it always exists.
5922 DR_GROUP_GAP is the number of steps in elements from the previous
5923 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5924 correspond to the gaps.
5926 if (next_stmt != first_stmt
5927 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5929 gap_count++;
5930 continue;
5933 while (next_stmt)
5935 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5936 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5937 copies, and we put the new vector statement in the first available
5938 RELATED_STMT. */
5939 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5940 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5941 else
5943 gimple prev_stmt =
5944 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5945 gimple rel_stmt =
5946 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5947 while (rel_stmt)
5949 prev_stmt = rel_stmt;
5950 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5952 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5954 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5955 gap_count = 1;
5956 /* If NEXT_STMT accesses the same DR as the previous statement,
5957 put the same TMP_DATA_REF as its vectorized statement; otherwise
5958 get the next data-ref from RESULT_CHAIN. */
5959 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5960 break;
5964 VEC_free (tree, heap, result_chain);
5965 return true;
5969 /* Create NCOPIES permutation statements using the mask MASK_ARRAY (by
5970 building a vector of type MASK_TYPE from it) and two input vectors placed in
5971 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5972 shifting by STRIDE elements of DR_CHAIN for every copy.
5973 (STRIDE is the number of vectorized stmts for NODE divided by the number of
5974 copies).
5975 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5976 the created stmts must be inserted. */
5978 static inline void
5979 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
5980 int *mask_array, int mask_nunits,
5981 tree mask_element_type, tree mask_type,
5982 int first_vec_indx, int second_vec_indx,
5983 gimple_stmt_iterator *gsi, slp_tree node,
5984 tree builtin_decl, tree vectype,
5985 VEC(tree,heap) *dr_chain,
5986 int ncopies, int vect_stmts_counter)
5988 tree t = NULL_TREE, mask_vec, mask, perm_dest;
5989 gimple perm_stmt = NULL;
5990 stmt_vec_info next_stmt_info;
5991 int i, group_size, stride, dr_chain_size;
5992 tree first_vec, second_vec, data_ref;
5993 tree sym;
5994 ssa_op_iter iter;
5995 VEC (tree, heap) *params = NULL;
5997 /* Create a vector mask. */
5998 for (i = mask_nunits - 1; i >= 0; --i)
5999 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]), t);
6002 mask_vec = build_vector (mask_type, t);
6003 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6005 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6006 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6007 dr_chain_size = VEC_length (tree, dr_chain);
6009 /* Initialize the vect stmts of NODE to properly insert the generated
6010 stmts later. */
6011 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6012 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6013 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6015 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6016 for (i = 0; i < ncopies; i++)
6018 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6019 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6021 /* Build argument list for the vectorized call. */
6022 VEC_free (tree, heap, params);
6023 params = VEC_alloc (tree, heap, 3);
6024 VEC_quick_push (tree, params, first_vec);
6025 VEC_quick_push (tree, params, second_vec);
6026 VEC_quick_push (tree, params, mask);
6028 /* Generate the permute statement. */
6029 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6030 data_ref = make_ssa_name (perm_dest, perm_stmt);
6031 gimple_call_set_lhs (perm_stmt, data_ref);
6032 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6033 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6035 if (TREE_CODE (sym) == SSA_NAME)
6036 sym = SSA_NAME_VAR (sym);
6037 mark_sym_for_renaming (sym);
6040 /* Store the vector statement in NODE. */
6041 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6042 stride * i + vect_stmts_counter, perm_stmt);
6044 first_vec_indx += stride;
6045 second_vec_indx += stride;
6048 /* Mark the scalar stmt as vectorized. */
6049 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6050 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6054 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6055 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6056 representation. Check that the mask is valid and return FALSE if not.
6057 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
6058 the next vector, i.e., the current first vector is not needed. */
6060 static bool
6061 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6062 int mask_nunits, bool only_one_vec, int index,
6063 int *mask, int *current_mask_element,
6064 bool *need_next_vector)
6066 int i;
6067 static int number_of_mask_fixes = 1;
6068 static bool mask_fixed = false;
6069 static bool needs_first_vector = false;
6071 /* Convert to target specific representation. */
6072 *current_mask_element = first_mask_element + m;
6073 /* Adjust the value in case it's a mask for second and third vectors. */
6074 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6076 if (*current_mask_element < mask_nunits)
6077 needs_first_vector = true;
6079 /* We have only one input vector to permute but the mask accesses values in
6080 the next vector as well. */
6081 if (only_one_vec && *current_mask_element >= mask_nunits)
6083 if (vect_print_dump_info (REPORT_DETAILS))
6085 fprintf (vect_dump, "permutation requires at least two vectors ");
6086 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6089 return false;
6092 /* The mask requires the next vector. */
6093 if (*current_mask_element >= mask_nunits * 2)
6095 if (needs_first_vector || mask_fixed)
6097 /* We either need the first vector too or have already moved to the
6098 next vector. In both cases, this permutation needs three
6099 vectors. */
6100 if (vect_print_dump_info (REPORT_DETAILS))
6102 fprintf (vect_dump, "permutation requires at "
6103 "least three vectors ");
6104 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6107 return false;
6110 /* We move to the next vector, dropping the first one and working with
6111 the second and the third - we need to adjust the values of the mask
6112 accordingly. */
6113 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6115 for (i = 0; i < index; i++)
6116 mask[i] -= mask_nunits * number_of_mask_fixes;
6118 (number_of_mask_fixes)++;
6119 mask_fixed = true;
6122 *need_next_vector = mask_fixed;
6124 /* This was the last element of this mask. Start a new one. */
6125 if (index == mask_nunits - 1)
6127 number_of_mask_fixes = 1;
6128 mask_fixed = false;
6129 needs_first_vector = false;
6132 return true;
6136 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6137 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6138 permute statements for SLP_NODE_INSTANCE. */
6139 bool
6140 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6141 gimple_stmt_iterator *gsi, int vf,
6142 slp_instance slp_node_instance, bool analyze_only)
6144 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6145 tree mask_element_type = NULL_TREE, mask_type;
6146 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6147 slp_tree node;
6148 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6149 gimple next_scalar_stmt;
6150 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6151 int first_mask_element;
6152 int index, unroll_factor, *mask, current_mask_element, ncopies;
6153 bool only_one_vec = false, need_next_vector = false;
6154 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6156 if (!targetm.vectorize.builtin_vec_perm)
6158 if (vect_print_dump_info (REPORT_DETAILS))
6160 fprintf (vect_dump, "no builtin for vect permute for ");
6161 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6164 return false;
6167 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6168 &mask_element_type);
6169 if (!builtin_decl || !mask_element_type)
6171 if (vect_print_dump_info (REPORT_DETAILS))
6173 fprintf (vect_dump, "no builtin for vect permute for ");
6174 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6177 return false;
6180 mask_type = get_vectype_for_scalar_type (mask_element_type);
6181 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6182 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6183 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6184 scale = mask_nunits / nunits;
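  /* SCALE is the number of mask elements per vector element; e.g., for a
     byte permute mask (as on Altivec) over a vector of four 32-bit elements,
     SCALE would be 4 (assuming a 16-byte mask type).  The actual values come
     from the target's builtin_vec_perm hook.  */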
6185 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6187 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6188 unrolling factor. */
6189 orig_vec_stmts_num = group_size *
6190 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6191 if (orig_vec_stmts_num == 1)
6192 only_one_vec = true;
6194 /* The number of copies is determined by the final vectorization factor
6195 relative to the SLP_NODE_INSTANCE unrolling factor. */
6196 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
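  /* E.g., for the example below (GROUP_SIZE == 3, NUNITS == 4, SLP unrolling
     factor 4), ORIG_VEC_STMTS_NUM is 3 * 4 / 4 == 3; if the loop
     vectorization factor VF is also 4, NCOPIES is 1.  */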
6198 /* Generate permutation masks for every NODE. The number of masks for each NODE
6199 is equal to GROUP_SIZE.
6200 E.g., we have a group of three nodes with three loads from the same
6201 location in each node, and the vector size is 4. I.e., we have an
6202 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6203 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6204 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6207 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6208 specific type, e.g., in bytes for Altivec).
6209 The last mask is illegal since we assume two operands for permute
6210 operation, and the mask element values can't be outside that range. Hence,
6211 the last mask must be converted into {2,5,5,5}.
6212 For the first two permutations we need the first and the second input
6213 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6214 we need the second and the third vectors: {b1,c1,a2,b2} and
6215 {c2,a3,b3,c3}. */
6217 for (i = 0;
6218 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6219 i, node);
6220 i++)
6222 scalar_index = 0;
6223 index = 0;
6224 vect_stmts_counter = 0;
6225 vec_index = 0;
6226 first_vec_index = vec_index++;
6227 if (only_one_vec)
6228 second_vec_index = first_vec_index;
6229 else
6230 second_vec_index = vec_index++;
6232 for (j = 0; j < unroll_factor; j++)
6234 for (k = 0; k < group_size; k++)
6236 first_mask_element = (i + j * group_size) * scale;
6237 for (m = 0; m < scale; m++)
6239 if (!vect_get_mask_element (stmt, first_mask_element, m,
6240 mask_nunits, only_one_vec, index, mask,
6241 &current_mask_element, &need_next_vector))
6242 return false;
6244 mask[index++] = current_mask_element;
6247 if (index == mask_nunits)
6249 index = 0;
6250 if (!analyze_only)
6252 if (need_next_vector)
6254 first_vec_index = second_vec_index;
6255 second_vec_index = vec_index;
6258 next_scalar_stmt = VEC_index (gimple,
6259 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6261 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6262 mask, mask_nunits, mask_element_type, mask_type,
6263 first_vec_index, second_vec_index, gsi, node,
6264 builtin_decl, vectype, dr_chain, ncopies,
6265 vect_stmts_counter++);
6272 free (mask);
6273 return true;
6276 /* vectorizable_load.
6278 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
6279 can be vectorized.
6280 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6281 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6282 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6284 bool
6285 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6286 slp_tree slp_node, slp_instance slp_node_instance)
6288 tree scalar_dest;
6289 tree vec_dest = NULL;
6290 tree data_ref = NULL;
6291 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6292 stmt_vec_info prev_stmt_info;
6293 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6295 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6296 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6297 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6298 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6299 tree new_temp;
6300 int mode;
6301 gimple new_stmt = NULL;
6302 tree dummy;
6303 enum dr_alignment_support alignment_support_scheme;
6304 tree dataref_ptr = NULL_TREE;
6305 gimple ptr_incr;
6306 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6307 int ncopies;
6308 int i, j, group_size;
6309 tree msq = NULL_TREE, lsq;
6310 tree offset = NULL_TREE;
6311 tree realignment_token = NULL_TREE;
6312 gimple phi = NULL;
6313 VEC(tree,heap) *dr_chain = NULL;
6314 bool strided_load = false;
6315 gimple first_stmt;
6316 tree scalar_type;
6317 bool inv_p;
6318 bool compute_in_loop = false;
6319 struct loop *at_loop;
6320 int vec_num;
6321 bool slp = (slp_node != NULL);
6322 bool slp_perm = false;
6323 enum tree_code code;
6325 /* Multiple types in SLP are handled by creating the appropriate number of
6326 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6327 case of SLP. */
6328 if (slp)
6329 ncopies = 1;
6330 else
6331 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6333 gcc_assert (ncopies >= 1);
6335 /* FORNOW. This restriction should be relaxed. */
6336 if (nested_in_vect_loop && ncopies > 1)
6338 if (vect_print_dump_info (REPORT_DETAILS))
6339 fprintf (vect_dump, "multiple types in nested loop.");
6340 return false;
6343 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6344 slp_perm = true;
6346 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6347 return false;
6349 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6350 return false;
6352 /* Is vectorizable load? */
6353 if (!is_gimple_assign (stmt))
6354 return false;
6356 scalar_dest = gimple_assign_lhs (stmt);
6357 if (TREE_CODE (scalar_dest) != SSA_NAME)
6358 return false;
6360 code = gimple_assign_rhs_code (stmt);
6361 if (code != ARRAY_REF
6362 && code != INDIRECT_REF
6363 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6364 return false;
6366 if (!STMT_VINFO_DATA_REF (stmt_info))
6367 return false;
6369 scalar_type = TREE_TYPE (DR_REF (dr));
6370 mode = (int) TYPE_MODE (vectype);
6372 /* FORNOW. In some cases can vectorize even if data-type not supported
6373 (e.g. - data copies). */
6374 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6376 if (vect_print_dump_info (REPORT_DETAILS))
6377 fprintf (vect_dump, "Aligned load, but unsupported type.");
6378 return false;
6381 /* If accesses through a pointer to vectype do not alias the original
6382 memory reference we have a problem. */
6383 if (get_alias_set (vectype) != get_alias_set (scalar_type)
6384 && !alias_set_subset_of (get_alias_set (vectype),
6385 get_alias_set (scalar_type)))
6387 if (vect_print_dump_info (REPORT_DETAILS))
6388 fprintf (vect_dump, "vector type does not alias scalar type");
6389 return false;
6392 /* Check if the load is a part of an interleaving chain. */
6393 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6395 strided_load = true;
6396 /* FORNOW */
6397 gcc_assert (! nested_in_vect_loop);
6399 /* Check if interleaving is supported. */
6400 if (!vect_strided_load_supported (vectype)
6401 && !PURE_SLP_STMT (stmt_info) && !slp)
6402 return false;
6405 if (!vec_stmt) /* transformation not required. */
6407 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6408 vect_model_load_cost (stmt_info, ncopies, NULL);
6409 return true;
6412 if (vect_print_dump_info (REPORT_DETAILS))
6413 fprintf (vect_dump, "transform load.");
6415 /** Transform. **/
6417 if (strided_load)
6419 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6420 /* Check if the chain of loads is already vectorized. */
6421 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6423 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6424 return true;
6426 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6427 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6429 /* VEC_NUM is the number of vect stmts to be created for this group. */
6430 if (slp)
6432 strided_load = false;
6433 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6435 else
6436 vec_num = group_size;
6438 dr_chain = VEC_alloc (tree, heap, vec_num);
6440 else
6442 first_stmt = stmt;
6443 first_dr = dr;
6444 group_size = vec_num = 1;
6447 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6448 gcc_assert (alignment_support_scheme);
6450 /* In case the vectorization factor (VF) is bigger than the number
6451 of elements that we can fit in a vectype (nunits), we have to generate
6452 more than one vector stmt - i.e - we need to "unroll" the
6453 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6454 from one copy of the vector stmt to the next, in the field
6455 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6456 stages to find the correct vector defs to be used when vectorizing
6457 stmts that use the defs of the current stmt. The example below illustrates
6458 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6459 4 vectorized stmts):
6461 before vectorization:
6462 RELATED_STMT VEC_STMT
6463 S1: x = memref - -
6464 S2: z = x + 1 - -
6466 step 1: vectorize stmt S1:
6467 We first create the vector stmt VS1_0, and, as usual, record a
6468 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6469 Next, we create the vector stmt VS1_1, and record a pointer to
6470 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6471 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6472 stmts and pointers:
6473 RELATED_STMT VEC_STMT
6474 VS1_0: vx0 = memref0 VS1_1 -
6475 VS1_1: vx1 = memref1 VS1_2 -
6476 VS1_2: vx2 = memref2 VS1_3 -
6477 VS1_3: vx3 = memref3 - -
6478 S1: x = load - VS1_0
6479 S2: z = x + 1 - -
6481 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6482 information we recorded in RELATED_STMT field is used to vectorize
6483 stmt S2. */
6485 /* In case of interleaving (non-unit strided access):
6487 S1: x2 = &base + 2
6488 S2: x0 = &base
6489 S3: x1 = &base + 1
6490 S4: x3 = &base + 3
6492 Vectorized loads are created in the order of memory accesses
6493 starting from the access of the first stmt of the chain:
6495 VS1: vx0 = &base
6496 VS2: vx1 = &base + vec_size*1
6497 VS3: vx3 = &base + vec_size*2
6498 VS4: vx4 = &base + vec_size*3
6500 Then permutation statements are generated:
6502 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6503 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6506 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6507 (the order of the data-refs in the output of vect_permute_load_chain
6508 corresponds to the order of scalar stmts in the interleaving chain - see
6509 the documentation of vect_permute_load_chain()).
6510 The generation of permutation stmts and recording them in
6511 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6513 In case of both multiple types and interleaving, the vector loads and
6514 permutation stmts above are created for every copy. The result vector stmts
6515 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6516 STMT_VINFO_RELATED_STMT for the next copies. */
6518 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6519 on a target that supports unaligned accesses (dr_unaligned_supported)
6520 we generate the following code:
6521 p = initial_addr;
6522 indx = 0;
6523 loop {
6524 p = p + indx * vectype_size;
6525 vec_dest = *(p);
6526 indx = indx + 1;
6529 Otherwise, the data reference is potentially unaligned on a target that
6530 does not support unaligned accesses (dr_explicit_realign_optimized) -
6531 then generate the following code, in which the data in each iteration is
6532 obtained by two vector loads, one from the previous iteration, and one
6533 from the current iteration:
6534 p1 = initial_addr;
6535 msq_init = *(floor(p1))
6536 p2 = initial_addr + VS - 1;
6537 realignment_token = call target_builtin;
6538 indx = 0;
6539 loop {
6540 p2 = p2 + indx * vectype_size
6541 lsq = *(floor(p2))
6542 vec_dest = realign_load (msq, lsq, realignment_token)
6543 indx = indx + 1;
6544 msq = lsq;
6545 } */
6547 /* If the misalignment remains the same throughout the execution of the
6548 loop, we can create the init_addr and permutation mask at the loop
6549 preheader. Otherwise, it needs to be created inside the loop.
6550 This can only occur when vectorizing memory accesses in the inner-loop
6551 nested within an outer-loop that is being vectorized. */
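  /* E.g., if the step of the access is half the vector size, the misalignment
     alternates between two values from one inner-loop iteration to the next,
     so init_addr and the permutation mask must be computed inside the loop.  */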
6553 if (nested_in_vect_loop_p (loop, stmt)
6554 && (TREE_INT_CST_LOW (DR_STEP (dr))
6555 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6557 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6558 compute_in_loop = true;
6561 if ((alignment_support_scheme == dr_explicit_realign_optimized
6562 || alignment_support_scheme == dr_explicit_realign)
6563 && !compute_in_loop)
6565 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6566 alignment_support_scheme, NULL_TREE,
6567 &at_loop);
6568 if (alignment_support_scheme == dr_explicit_realign_optimized)
6570 phi = SSA_NAME_DEF_STMT (msq);
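          /* The OFFSET of VS - 1 elements corresponds to p2 = initial_addr + VS - 1
             in the dr_explicit_realign_optimized scheme above: flooring that
             address in the loop yields the second vector (LSQ).  */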
6571 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6574 else
6575 at_loop = loop;
6577 prev_stmt_info = NULL;
6578 for (j = 0; j < ncopies; j++)
6580 /* 1. Create the vector pointer update chain. */
6581 if (j == 0)
6582 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6583 at_loop, offset,
6584 &dummy, &ptr_incr, false,
6585 &inv_p, NULL_TREE);
6586 else
6587 dataref_ptr =
6588 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6590 for (i = 0; i < vec_num; i++)
6592 if (i > 0)
6593 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6594 NULL_TREE);
6596 /* 2. Create the vector-load in the loop. */
6597 switch (alignment_support_scheme)
6599 case dr_aligned:
6600 gcc_assert (aligned_access_p (first_dr));
6601 data_ref = build_fold_indirect_ref (dataref_ptr);
6602 break;
6603 case dr_unaligned_supported:
6605 int mis = DR_MISALIGNMENT (first_dr);
6606 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6608 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
6609 data_ref =
6610 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6611 break;
6613 case dr_explicit_realign:
6615 tree ptr, bump;
6616 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6618 if (compute_in_loop)
6619 msq = vect_setup_realignment (first_stmt, gsi,
6620 &realignment_token,
6621 dr_explicit_realign,
6622 dataref_ptr, NULL);
6624 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6625 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6626 new_stmt = gimple_build_assign (vec_dest, data_ref);
6627 new_temp = make_ssa_name (vec_dest, new_stmt);
6628 gimple_assign_set_lhs (new_stmt, new_temp);
6629 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6630 copy_virtual_operands (new_stmt, stmt);
6631 mark_symbols_for_renaming (new_stmt);
6632 msq = new_temp;
6634 bump = size_binop (MULT_EXPR, vs_minus_1,
6635 TYPE_SIZE_UNIT (scalar_type));
6636 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6637 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6638 break;
6640 case dr_explicit_realign_optimized:
6641 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6642 break;
6643 default:
6644 gcc_unreachable ();
6646 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6647 new_stmt = gimple_build_assign (vec_dest, data_ref);
6648 new_temp = make_ssa_name (vec_dest, new_stmt);
6649 gimple_assign_set_lhs (new_stmt, new_temp);
6650 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6651 mark_symbols_for_renaming (new_stmt);
6653 /* 3. Handle explicit realignment if necessary/supported. Create in
6654 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6655 if (alignment_support_scheme == dr_explicit_realign_optimized
6656 || alignment_support_scheme == dr_explicit_realign)
6658 tree tmp;
6660 lsq = gimple_assign_lhs (new_stmt);
6661 if (!realignment_token)
6662 realignment_token = dataref_ptr;
6663 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6664 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6665 realignment_token);
6666 new_stmt = gimple_build_assign (vec_dest, tmp);
6667 new_temp = make_ssa_name (vec_dest, new_stmt);
6668 gimple_assign_set_lhs (new_stmt, new_temp);
6669 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6671 if (alignment_support_scheme == dr_explicit_realign_optimized)
6673 gcc_assert (phi);
6674 if (i == vec_num - 1 && j == ncopies - 1)
6675 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6676 msq = lsq;
6680 /* 4. Handle invariant-load. */
6681 if (inv_p)
6683 gcc_assert (!strided_load);
6684 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6685 if (j == 0)
6687 int k;
6688 tree t = NULL_TREE;
6689 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6691 /* CHECKME: bitpos depends on endianness? */
6692 bitpos = bitsize_zero_node;
6693 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6694 bitsize, bitpos);
6695 vec_dest =
6696 vect_create_destination_var (scalar_dest, NULL_TREE);
6697 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6698 new_temp = make_ssa_name (vec_dest, new_stmt);
6699 gimple_assign_set_lhs (new_stmt, new_temp);
6700 vect_finish_stmt_generation (stmt, new_stmt, gsi);
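              /* Replicate the loaded scalar value NUNITS times and build a
                 vector constructor from the copies.  */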
6702 for (k = nunits - 1; k >= 0; --k)
6703 t = tree_cons (NULL_TREE, new_temp, t);
6704 /* FIXME: use build_constructor directly. */
6705 vec_inv = build_constructor_from_list (vectype, t);
6706 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6707 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6709 else
6710 gcc_unreachable (); /* FORNOW. */
6713 /* Collect vector loads and later create their permutation in
6714 vect_transform_strided_load (). */
6715 if (strided_load || slp_perm)
6716 VEC_quick_push (tree, dr_chain, new_temp);
6718 /* Store vector loads in the corresponding SLP_NODE. */
6719 if (slp && !slp_perm)
6720 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6723 if (slp && !slp_perm)
6724 continue;
6726 if (slp_perm)
6728 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6729 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6730 slp_node_instance, false))
6732 VEC_free (tree, heap, dr_chain);
6733 return false;
6736 else
6738 if (strided_load)
6740 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6741 return false;
6743 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6744 VEC_free (tree, heap, dr_chain);
6745 dr_chain = VEC_alloc (tree, heap, group_size);
6747 else
6749 if (j == 0)
6750 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6751 else
6752 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6753 prev_stmt_info = vinfo_for_stmt (new_stmt);
6758 if (dr_chain)
6759 VEC_free (tree, heap, dr_chain);
6761 return true;
6765 /* Function vectorizable_live_operation.
6767 STMT computes a value that is used outside the loop. Check if
6768 it can be supported. */
6770 bool
6771 vectorizable_live_operation (gimple stmt,
6772 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6773 gimple *vec_stmt ATTRIBUTE_UNUSED)
6775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6776 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6777 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6778 int i;
6779 int op_type;
6780 tree op;
6781 tree def;
6782 gimple def_stmt;
6783 enum vect_def_type dt;
6784 enum tree_code code;
6785 enum gimple_rhs_class rhs_class;
6787 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6789 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6790 return false;
6792 if (!is_gimple_assign (stmt))
6793 return false;
6795 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6796 return false;
6798 /* FORNOW. CHECKME. */
6799 if (nested_in_vect_loop_p (loop, stmt))
6800 return false;
6802 code = gimple_assign_rhs_code (stmt);
6803 op_type = TREE_CODE_LENGTH (code);
6804 rhs_class = get_gimple_rhs_class (code);
6805 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6806 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6808 /* FORNOW: support only if all uses are invariant. This means
6809 that the scalar operations can remain in place, unvectorized.
6810 The original last scalar value that they compute will be used. */
6812 for (i = 0; i < op_type; i++)
6814 if (rhs_class == GIMPLE_SINGLE_RHS)
6815 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6816 else
6817 op = gimple_op (stmt, i + 1);
6818 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6820 if (vect_print_dump_info (REPORT_DETAILS))
6821 fprintf (vect_dump, "use not simple.");
6822 return false;
6825 if (dt != vect_invariant_def && dt != vect_constant_def)
6826 return false;
6829 /* No transformation is required for the cases we currently support. */
6830 return true;
6834 /* Function vect_is_simple_cond.
6836 Input:
6837 LOOP - the loop that is being vectorized.
6838 COND - Condition that is checked for simple use.
6840 Returns whether a COND can be vectorized. Checks whether
6841 condition operands are supportable using vect_is_simple_use. */
6843 static bool
6844 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6846 tree lhs, rhs;
6847 tree def;
6848 enum vect_def_type dt;
6850 if (!COMPARISON_CLASS_P (cond))
6851 return false;
6853 lhs = TREE_OPERAND (cond, 0);
6854 rhs = TREE_OPERAND (cond, 1);
6856 if (TREE_CODE (lhs) == SSA_NAME)
6858 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6859 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6860 return false;
6862 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6863 && TREE_CODE (lhs) != FIXED_CST)
6864 return false;
6866 if (TREE_CODE (rhs) == SSA_NAME)
6868 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6869 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6870 return false;
6872 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6873 && TREE_CODE (rhs) != FIXED_CST)
6874 return false;
6876 return true;
6879 /* vectorizable_condition.
6881 Check if STMT is a conditional modify expression that can be vectorized.
6882 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6883 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6884 at BSI.
6886 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6888 bool
6889 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6890 gimple *vec_stmt)
6892 tree scalar_dest = NULL_TREE;
6893 tree vec_dest = NULL_TREE;
6894 tree op = NULL_TREE;
6895 tree cond_expr, then_clause, else_clause;
6896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6898 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6899 tree vec_compare, vec_cond_expr;
6900 tree new_temp;
6901 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6902 enum machine_mode vec_mode;
6903 tree def;
6904 enum vect_def_type dt;
6905 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6906 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6907 enum tree_code code;
6909 gcc_assert (ncopies >= 1);
6910 if (ncopies > 1)
6911 return false; /* FORNOW */
6913 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6914 return false;
6916 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6917 return false;
6919 /* FORNOW: SLP not supported. */
6920 if (STMT_SLP_TYPE (stmt_info))
6921 return false;
6923 /* FORNOW: not yet supported. */
6924 if (STMT_VINFO_LIVE_P (stmt_info))
6926 if (vect_print_dump_info (REPORT_DETAILS))
6927 fprintf (vect_dump, "value used after loop.");
6928 return false;
6931 /* Is this a vectorizable conditional operation? */
6932 if (!is_gimple_assign (stmt))
6933 return false;
6935 code = gimple_assign_rhs_code (stmt);
6937 if (code != COND_EXPR)
6938 return false;
6940 gcc_assert (gimple_assign_single_p (stmt));
6941 op = gimple_assign_rhs1 (stmt);
6942 cond_expr = TREE_OPERAND (op, 0);
6943 then_clause = TREE_OPERAND (op, 1);
6944 else_clause = TREE_OPERAND (op, 2);
6946 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6947 return false;
6949 /* We do not handle two different vector types for the condition
6950 and the values. */
6951 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6952 return false;
6954 if (TREE_CODE (then_clause) == SSA_NAME)
6956 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6957 if (!vect_is_simple_use (then_clause, loop_vinfo,
6958 &then_def_stmt, &def, &dt))
6959 return false;
6961 else if (TREE_CODE (then_clause) != INTEGER_CST
6962 && TREE_CODE (then_clause) != REAL_CST
6963 && TREE_CODE (then_clause) != FIXED_CST)
6964 return false;
6966 if (TREE_CODE (else_clause) == SSA_NAME)
6968 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6969 if (!vect_is_simple_use (else_clause, loop_vinfo,
6970 &else_def_stmt, &def, &dt))
6971 return false;
6973 else if (TREE_CODE (else_clause) != INTEGER_CST
6974 && TREE_CODE (else_clause) != REAL_CST
6975 && TREE_CODE (else_clause) != FIXED_CST)
6976 return false;
6979 vec_mode = TYPE_MODE (vectype);
6981 if (!vec_stmt)
6983 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6984 return expand_vec_cond_expr_p (op, vec_mode);
6987 /* Transform */
6989 /* Handle def. */
6990 scalar_dest = gimple_assign_lhs (stmt);
6991 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6993 /* Handle cond expr. */
6994 vec_cond_lhs =
6995 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6996 vec_cond_rhs =
6997 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6998 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6999 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7001 /* Arguments are ready. Create the new vector stmt. */
7002 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7003 vec_cond_lhs, vec_cond_rhs);
7004 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7005 vec_compare, vec_then_clause, vec_else_clause);
7007 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7008 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7009 gimple_assign_set_lhs (*vec_stmt, new_temp);
7010 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7012 return true;
7016 /* Function vect_transform_stmt.
7018 Create a vectorized stmt to replace STMT, and insert it at BSI. */
7020 static bool
7021 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7022 bool *strided_store, slp_tree slp_node,
7023 slp_instance slp_node_instance)
7025 bool is_store = false;
7026 gimple vec_stmt = NULL;
7027 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7028 gimple orig_stmt_in_pattern;
7029 bool done;
7031 switch (STMT_VINFO_TYPE (stmt_info))
7033 case type_demotion_vec_info_type:
7034 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7035 gcc_assert (done);
7036 break;
7038 case type_promotion_vec_info_type:
7039 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7040 gcc_assert (done);
7041 break;
7043 case type_conversion_vec_info_type:
7044 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7045 gcc_assert (done);
7046 break;
7048 case induc_vec_info_type:
7049 gcc_assert (!slp_node);
7050 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7051 gcc_assert (done);
7052 break;
7054 case op_vec_info_type:
7055 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7056 gcc_assert (done);
7057 break;
7059 case assignment_vec_info_type:
7060 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7061 gcc_assert (done);
7062 break;
7064 case load_vec_info_type:
7065 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7066 slp_node_instance);
7067 gcc_assert (done);
7068 break;
7070 case store_vec_info_type:
7071 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7072 gcc_assert (done);
7073 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7075 /* In case of interleaving, the whole chain is vectorized when the
7076 last store in the chain is reached. Store stmts before the last
7077 one are skipped, and their vec_stmt_info shouldn't be freed
7078 meanwhile. */
7079 *strided_store = true;
7080 if (STMT_VINFO_VEC_STMT (stmt_info))
7081 is_store = true;
7083 else
7084 is_store = true;
7085 break;
7087 case condition_vec_info_type:
7088 gcc_assert (!slp_node);
7089 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7090 gcc_assert (done);
7091 break;
7093 case call_vec_info_type:
7094 gcc_assert (!slp_node);
7095 done = vectorizable_call (stmt, gsi, &vec_stmt);
7096 break;
7098 case reduc_vec_info_type:
7099 gcc_assert (!slp_node);
7100 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7101 gcc_assert (done);
7102 break;
7104 default:
7105 if (!STMT_VINFO_LIVE_P (stmt_info))
7107 if (vect_print_dump_info (REPORT_DETAILS))
7108 fprintf (vect_dump, "stmt not supported.");
7109 gcc_unreachable ();
7113 if (STMT_VINFO_LIVE_P (stmt_info)
7114 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7116 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7117 gcc_assert (done);
7120 if (vec_stmt)
7122 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7123 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7124 if (orig_stmt_in_pattern)
7126 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7127 /* STMT was inserted by the vectorizer to replace a computation idiom.
7128 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7129 computed this idiom. We need to record a pointer to VEC_STMT in
7130 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7131 documentation of vect_pattern_recog. */
7132 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7134 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7135 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7140 return is_store;
7144 /* This function builds ni_name = number of iterations the loop executes,
7145 on the loop preheader. */
7147 static tree
7148 vect_build_loop_niters (loop_vec_info loop_vinfo)
7150 tree ni_name, var;
7151 gimple_seq stmts = NULL;
7152 edge pe;
7153 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7154 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7156 var = create_tmp_var (TREE_TYPE (ni), "niters");
7157 add_referenced_var (var);
7158 ni_name = force_gimple_operand (ni, &stmts, false, var);
7160 pe = loop_preheader_edge (loop);
7161 if (stmts)
7163 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7164 gcc_assert (!new_bb);
7167 return ni_name;
7171 /* This function generates the following statements:
7173 ni_name = number of iterations the loop executes
7174 ratio = ni_name / vf
7175 ratio_mult_vf_name = ratio * vf
7177 and places them at the loop preheader edge. */
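/* Worked example (added for exposition; the values are hypothetical):
   with ni_name = 103 and vf = 4 the statements emitted on the preheader
   compute

     ratio              = 103 >> log2 (4) = 25
     ratio_mult_vf_name = 25 << log2 (4)  = 100

   so the vectorized loop runs 25 times and covers the first 100 scalar
   iterations; the remaining 3 are handled by the epilog loop.  */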
7179 static void
7180 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7181 tree *ni_name_ptr,
7182 tree *ratio_mult_vf_name_ptr,
7183 tree *ratio_name_ptr)
7186 edge pe;
7187 basic_block new_bb;
7188 gimple_seq stmts;
7189 tree ni_name;
7190 tree var;
7191 tree ratio_name;
7192 tree ratio_mult_vf_name;
7193 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7194 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7195 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7196 tree log_vf;
7198 pe = loop_preheader_edge (loop);
7200 /* Generate a temporary variable that contains
7201 the number of iterations the loop executes. */
7203 ni_name = vect_build_loop_niters (loop_vinfo);
7204 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7206 /* Create: ratio = ni >> log2(vf) */
7208 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7209 if (!is_gimple_val (ratio_name))
7211 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7212 add_referenced_var (var);
7214 stmts = NULL;
7215 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7216 pe = loop_preheader_edge (loop);
7217 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7218 gcc_assert (!new_bb);
7221 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7223 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7224 ratio_name, log_vf);
7225 if (!is_gimple_val (ratio_mult_vf_name))
7227 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7228 add_referenced_var (var);
7230 stmts = NULL;
7231 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7232 true, var);
7233 pe = loop_preheader_edge (loop);
7234 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7235 gcc_assert (!new_bb);
7238 *ni_name_ptr = ni_name;
7239 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7240 *ratio_name_ptr = ratio_name;
7242 return;
7246 /* Function vect_update_ivs_after_vectorizer.
7248 "Advance" the induction variables of LOOP to the value they should take
7249 after the execution of LOOP. This is currently necessary because the
7250 vectorizer does not handle induction variables that are used after the
7251 loop. Such a situation occurs when the last iterations of LOOP are
7252 peeled, because:
7253 1. We introduced new uses after LOOP for IVs that were not originally used
7254 after LOOP: the IVs of LOOP are now used by an epilog loop.
7255 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7256 times, whereas the loop IVs should be bumped N times.
7258 Input:
7259 - LOOP - a loop that is going to be vectorized. The last few iterations
7260 of LOOP were peeled.
7261 - NITERS - the number of iterations that LOOP executes (before it is
7262 vectorized), i.e., the number of times the ivs should be bumped.
7263 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7264 coming out from LOOP on which there are uses of the LOOP ivs
7265 (this is the path from LOOP->exit to epilog_loop->preheader).
7267 The new definitions of the ivs are placed in LOOP->exit.
7268 The phi args associated with the edge UPDATE_E in the bb
7269 UPDATE_E->dest are updated accordingly.
7271 Assumption 1: Like the rest of the vectorizer, this function assumes
7272 a single loop exit that has a single predecessor.
7274 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7275 organized in the same order.
7277 Assumption 3: The access function of the ivs is simple enough (see
7278 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7280 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7281 coming out of LOOP on which the ivs of LOOP are used (this is the path
7282 that leads to the epilog loop; other paths skip the epilog loop). This
7283 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7284 needs to have its phis updated.
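/* Worked example (added for exposition; the values are hypothetical):
   for an IV with initial value 5 and step 2, and NITERS = 100, the code
   below computes

     ni = init_expr + niters * step_expr = 5 + 100 * 2 = 205

   and uses 205 as the phi argument on edge UPDATE_E, so the epilog loop
   resumes from the value the scalar loop would have produced.  */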
7287 static void
7288 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7289 edge update_e)
7291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7292 basic_block exit_bb = single_exit (loop)->dest;
7293 gimple phi, phi1;
7294 gimple_stmt_iterator gsi, gsi1;
7295 basic_block update_bb = update_e->dest;
7297 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7299 /* Make sure there exists a single-predecessor exit bb: */
7300 gcc_assert (single_pred_p (exit_bb));
7302 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7303 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7304 gsi_next (&gsi), gsi_next (&gsi1))
7306 tree access_fn = NULL;
7307 tree evolution_part;
7308 tree init_expr;
7309 tree step_expr;
7310 tree var, ni, ni_name;
7311 gimple_stmt_iterator last_gsi;
7313 phi = gsi_stmt (gsi);
7314 phi1 = gsi_stmt (gsi1);
7315 if (vect_print_dump_info (REPORT_DETAILS))
7317 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7318 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7321 /* Skip virtual phi's. */
7322 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7324 if (vect_print_dump_info (REPORT_DETAILS))
7325 fprintf (vect_dump, "virtual phi. skip.");
7326 continue;
7329 /* Skip reduction phis. */
7330 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7332 if (vect_print_dump_info (REPORT_DETAILS))
7333 fprintf (vect_dump, "reduc phi. skip.");
7334 continue;
7337 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7338 gcc_assert (access_fn);
7339 STRIP_NOPS (access_fn);
7340 evolution_part =
7341 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7342 gcc_assert (evolution_part != NULL_TREE);
7344 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7345 of degree >= 2 or exponential. */
7346 gcc_assert (!tree_is_chrec (evolution_part));
7348 step_expr = evolution_part;
7349 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7350 loop->num));
7352 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7353 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7354 init_expr,
7355 fold_convert (sizetype,
7356 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7357 niters, step_expr)));
7358 else
7359 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7360 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7361 fold_convert (TREE_TYPE (init_expr),
7362 niters),
7363 step_expr),
7364 init_expr);
7368 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7369 add_referenced_var (var);
7371 last_gsi = gsi_last_bb (exit_bb);
7372 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7373 true, GSI_SAME_STMT);
7375 /* Fix phi expressions in the successor bb. */
7376 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7380 /* Return the more conservative threshold between the
7381 min_profitable_iters returned by the cost model and the user
7382 specified threshold, if provided. */
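/* Example (hypothetical numbers): if the cost model returns
   min_profitable_iters = 10 and the user-specified bound works out to 7,
   the more conservative value 10 is returned as the threshold.  */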
7384 static unsigned int
7385 conservative_cost_threshold (loop_vec_info loop_vinfo,
7386 int min_profitable_iters)
7388 unsigned int th;
7389 int min_scalar_loop_bound;
7391 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7392 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7394 /* Use the cost model only if it is more conservative than the user-specified
7395 threshold. */
7396 th = (unsigned) min_scalar_loop_bound;
7397 if (min_profitable_iters
7398 && (!min_scalar_loop_bound
7399 || min_profitable_iters > min_scalar_loop_bound))
7400 th = (unsigned) min_profitable_iters;
7402 if (th && vect_print_dump_info (REPORT_COST))
7403 fprintf (vect_dump, "Vectorization may not be profitable.");
7405 return th;
7408 /* Function vect_do_peeling_for_loop_bound
7410 Peel the last iterations of the loop represented by LOOP_VINFO.
7411 The peeled iterations form a new epilog loop. Given that the loop now
7412 iterates NITERS times, the new epilog loop iterates
7413 NITERS % VECTORIZATION_FACTOR times.
7415 The original loop will later be made to iterate
7416 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
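/* Example (hypothetical numbers): with NITERS = 1003 and a vectorization
   factor of 8, the vectorized loop executes 1003 / 8 = 125 iterations
   (RATIO), and the peeled epilog loop executes the remaining
   1003 % 8 = 3 scalar iterations.  */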
7418 static void
7419 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7421 tree ni_name, ratio_mult_vf_name;
7422 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7423 struct loop *new_loop;
7424 edge update_e;
7425 basic_block preheader;
7426 int loop_num;
7427 bool check_profitability = false;
7428 unsigned int th = 0;
7429 int min_profitable_iters;
7431 if (vect_print_dump_info (REPORT_DETAILS))
7432 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7434 initialize_original_copy_tables ();
7436 /* Generate the following variables on the preheader of the original loop:
7438 ni_name = number of iterations the original loop executes
7439 ratio = ni_name / vf
7440 ratio_mult_vf_name = ratio * vf */
7441 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7442 &ratio_mult_vf_name, ratio);
7444 loop_num = loop->num;
7446 /* If the cost model check was not done during versioning or
7447 peeling for alignment, do it here. */
7448 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7449 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7450 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7452 check_profitability = true;
7454 /* Get profitability threshold for vectorized loop. */
7455 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7457 th = conservative_cost_threshold (loop_vinfo,
7458 min_profitable_iters);
7461 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7462 ratio_mult_vf_name, ni_name, false,
7463 th, check_profitability);
7464 gcc_assert (new_loop);
7465 gcc_assert (loop_num == loop->num);
7466 #ifdef ENABLE_CHECKING
7467 slpeel_verify_cfg_after_peeling (loop, new_loop);
7468 #endif
7470 /* A guard that controls whether the new_loop is to be executed or skipped
7471 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7472 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7473 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7474 is on the path where the LOOP IVs are used and need to be updated. */
7476 preheader = loop_preheader_edge (new_loop)->src;
7477 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7478 update_e = EDGE_PRED (preheader, 0);
7479 else
7480 update_e = EDGE_PRED (preheader, 1);
7482 /* Update IVs of original loop as if they were advanced
7483 by ratio_mult_vf_name steps. */
7484 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7486 /* After peeling we have to reset scalar evolution analyzer. */
7487 scev_reset ();
7489 free_original_copy_tables ();
7493 /* Function vect_gen_niters_for_prolog_loop
7495 Set the number of iterations for the loop represented by LOOP_VINFO
7496 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7497 and the misalignment of DR - the data reference recorded in
7498 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7499 this loop, the data reference DR will refer to an aligned location.
7501 The following computation is generated:
7503 If the misalignment of DR is known at compile time:
7504 addr_mis = int mis = DR_MISALIGNMENT (dr);
7505 Else, compute address misalignment in bytes:
7506 addr_mis = addr & (vectype_size - 1)
7508 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7510 (elem_size = element type size; an element is the scalar element whose type
7511 is the inner type of the vectype)
7513 When the step of the data-ref in the loop is not 1 (as in interleaved data
7514 and SLP), the number of iterations of the prolog must be divided by the step
7515 (which is equal to the size of the interleaved group).
7517 The above formulas assume that VF == number of elements in the vector. This
7518 may not hold when there are multiple types in the loop.
7519 In this case, for some data-references in the loop the VF does not represent
7520 the number of elements that fit in the vector. Therefore, instead of VF we
7521 use TYPE_VECTOR_SUBPARTS. */
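/* Worked example (added for exposition; the values are hypothetical):
   for a data reference with 4-byte elements, VF = 4 (a 16-byte vector),
   a known byte misalignment of 8, and step 1:

     elem_misalign = 8 / 4 = 2
     prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2

   i.e. two scalar iterations are peeled so that the access becomes
   16-byte aligned.  For an interleaved group of size 2 (step 2) the
   result would instead be 2 / 2 = 1.  */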
7523 static tree
7524 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7526 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7527 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7528 tree var;
7529 gimple_seq stmts;
7530 tree iters, iters_name;
7531 edge pe;
7532 basic_block new_bb;
7533 gimple dr_stmt = DR_STMT (dr);
7534 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7535 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7536 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7537 tree niters_type = TREE_TYPE (loop_niters);
7538 int step = 1;
7539 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7540 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7542 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7543 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7545 pe = loop_preheader_edge (loop);
7547 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7549 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7550 int elem_misalign = byte_misalign / element_size;
7552 if (vect_print_dump_info (REPORT_DETAILS))
7553 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7555 iters = build_int_cst (niters_type,
7556 (((nelements - elem_misalign) & (nelements - 1)) / step));
7558 else
7560 gimple_seq new_stmts = NULL;
7561 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7562 &new_stmts, NULL_TREE, loop);
7563 tree ptr_type = TREE_TYPE (start_addr);
7564 tree size = TYPE_SIZE (ptr_type);
7565 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7566 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7567 tree elem_size_log =
7568 build_int_cst (type, exact_log2 (vectype_align/nelements));
7569 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7570 tree nelements_tree = build_int_cst (type, nelements);
7571 tree byte_misalign;
7572 tree elem_misalign;
7574 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7575 gcc_assert (!new_bb);
7577 /* Create: byte_misalign = addr & (vectype_size - 1) */
7578 byte_misalign =
7579 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7581 /* Create: elem_misalign = byte_misalign / element_size */
7582 elem_misalign =
7583 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7585 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7586 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7587 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7588 iters = fold_convert (niters_type, iters);
7591 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7592 /* If the loop bound is known at compile time we already verified that it is
7593 greater than vf; since the misalignment ('iters') is at most vf, there's
7594 no need to generate the MIN_EXPR in this case. */
7595 if (TREE_CODE (loop_niters) != INTEGER_CST)
7596 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7598 if (vect_print_dump_info (REPORT_DETAILS))
7600 fprintf (vect_dump, "niters for prolog loop: ");
7601 print_generic_expr (vect_dump, iters, TDF_SLIM);
7604 var = create_tmp_var (niters_type, "prolog_loop_niters");
7605 add_referenced_var (var);
7606 stmts = NULL;
7607 iters_name = force_gimple_operand (iters, &stmts, false, var);
7609 /* Insert stmt on loop preheader edge. */
7610 if (stmts)
7612 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7613 gcc_assert (!new_bb);
7616 return iters_name;
7620 /* Function vect_update_init_of_dr
7622 NITERS iterations were peeled from LOOP. DR represents a data reference
7623 in LOOP. This function updates the information recorded in DR to
7624 account for the fact that the first NITERS iterations had already been
7625 executed. Specifically, it updates the OFFSET field of DR. */
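/* Example (hypothetical numbers): if NITERS = 3 iterations were peeled
   and DR_STEP is 4 (bytes), the code below adds 3 * 4 = 12 to DR_OFFSET,
   so DR now describes the first access made by the unpeeled iterations.  */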
7627 static void
7628 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7630 tree offset = DR_OFFSET (dr);
7632 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
7633 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
7634 DR_OFFSET (dr) = offset;
7638 /* Function vect_update_inits_of_drs
7640 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
7641 This function updates the information recorded for the data references in
7642 the loop to account for the fact that the first NITERS iterations had
7643 already been executed. Specifically, it updates the initial_condition of
7644 the access_function of all the data_references in the loop. */
7646 static void
7647 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
7649 unsigned int i;
7650 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
7651 struct data_reference *dr;
7653 if (vect_print_dump_info (REPORT_DETAILS))
7654 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
7656 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
7657 vect_update_init_of_dr (dr, niters);
7661 /* Function vect_do_peeling_for_alignment
7663 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
7664 'niters' is set to the misalignment of one of the data references in the
7665 loop, thereby forcing it to refer to an aligned location at the beginning
7666 of the execution of this loop. The data reference for which we are
7667 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
7669 static void
7670 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
7672 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7673 tree niters_of_prolog_loop, ni_name;
7674 tree n_iters;
7675 struct loop *new_loop;
7676 bool check_profitability = false;
7677 unsigned int th = 0;
7678 int min_profitable_iters;
7680 if (vect_print_dump_info (REPORT_DETAILS))
7681 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
7683 initialize_original_copy_tables ();
7685 ni_name = vect_build_loop_niters (loop_vinfo);
7686 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
7689 /* If the cost model check was not done during versioning, do it here. */
7690 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7691 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7693 check_profitability = true;
7695 /* Get profitability threshold for vectorized loop. */
7696 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7698 th = conservative_cost_threshold (loop_vinfo,
7699 min_profitable_iters);
7702 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
7703 new_loop =
7704 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7705 niters_of_prolog_loop, ni_name, true,
7706 th, check_profitability);
7708 gcc_assert (new_loop);
7709 #ifdef ENABLE_CHECKING
7710 slpeel_verify_cfg_after_peeling (new_loop, loop);
7711 #endif
7713 /* Update number of times loop executes. */
7714 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7715 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7716 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7718 /* Update the init conditions of the access functions of all data refs. */
7719 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
7721 /* After peeling we have to reset scalar evolution analyzer. */
7722 scev_reset ();
7724 free_original_copy_tables ();
7728 /* Function vect_create_cond_for_align_checks.
7730 Create a conditional expression that represents the alignment checks for
7731 all of the data references (array element references) whose alignment must be
7732 checked at runtime.
7734 Input:
7735 COND_EXPR - input conditional expression. New conditions will be chained
7736 with logical AND operation.
7737 LOOP_VINFO - two fields of the loop information are used.
7738 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
7739 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
7741 Output:
7742 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7743 expression.
7744 The returned value is the conditional expression to be used in the if
7745 statement that controls which version of the loop gets executed at runtime.
7747 The algorithm makes two assumptions:
7748 1) The number of bytes "n" in a vector is a power of 2.
7749 2) An address "a" is aligned if a%n is zero, and this
7750 test can be done as a&(n-1) == 0. For example, for
7751 16-byte vectors the test is a&0xf == 0. */
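/* Example (hypothetical addresses): with 16-byte vectors the mask is 0xf,
   and for two data references whose first-vector addresses are a1 and a2
   the generated test is

     ((a1 | a2) & 0xf) == 0

   which holds only if both addresses are 16-byte aligned.  */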
7753 static void
7754 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7755 tree *cond_expr,
7756 gimple_seq *cond_expr_stmt_list)
7758 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7759 VEC(gimple,heap) *may_misalign_stmts
7760 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7761 gimple ref_stmt;
7762 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7763 tree mask_cst;
7764 unsigned int i;
7765 tree psize;
7766 tree int_ptrsize_type;
7767 char tmp_name[20];
7768 tree or_tmp_name = NULL_TREE;
7769 tree and_tmp, and_tmp_name;
7770 gimple and_stmt;
7771 tree ptrsize_zero;
7772 tree part_cond_expr;
7774 /* Check that mask is one less than a power of 2, i.e., mask is
7775 all zeros followed by all ones. */
7776 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7778 /* CHECKME: what is the best integer or unsigned type to use to hold a
7779 cast from a pointer value? */
7780 psize = TYPE_SIZE (ptr_type_node);
7781 int_ptrsize_type
7782 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7784 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
7785 of the first vector of the i'th data reference. */
7787 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7789 gimple_seq new_stmt_list = NULL;
7790 tree addr_base;
7791 tree addr_tmp, addr_tmp_name;
7792 tree or_tmp, new_or_tmp_name;
7793 gimple addr_stmt, or_stmt;
7795 /* create: addr_tmp = (int)(address_of_first_vector) */
7796 addr_base =
7797 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
7798 NULL_TREE, loop);
7799 if (new_stmt_list != NULL)
7800 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7802 sprintf (tmp_name, "%s%d", "addr2int", i);
7803 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7804 add_referenced_var (addr_tmp);
7805 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7806 addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
7807 addr_base, NULL_TREE);
7808 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7809 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
7811 /* The addresses are ORed together. */
7813 if (or_tmp_name != NULL_TREE)
7815 /* create: or_tmp = or_tmp | addr_tmp */
7816 sprintf (tmp_name, "%s%d", "orptrs", i);
7817 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7818 add_referenced_var (or_tmp);
7819 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7820 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7821 new_or_tmp_name,
7822 or_tmp_name, addr_tmp_name);
7823 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7824 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7825 or_tmp_name = new_or_tmp_name;
7827 else
7828 or_tmp_name = addr_tmp_name;
7830 } /* end for i */
7832 mask_cst = build_int_cst (int_ptrsize_type, mask);
7834 /* create: and_tmp = or_tmp & mask */
7835 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7836 add_referenced_var (and_tmp);
7837 and_tmp_name = make_ssa_name (and_tmp, NULL);
7839 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7840 or_tmp_name, mask_cst);
7841 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7842 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
7844 /* Make and_tmp the left operand of the conditional test against zero.
7845 If and_tmp has a nonzero bit then some address is unaligned. */
7846 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7847 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7848 and_tmp_name, ptrsize_zero);
7849 if (*cond_expr)
7850 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7851 *cond_expr, part_cond_expr);
7852 else
7853 *cond_expr = part_cond_expr;
7856 /* Function vect_vfa_segment_size.
7858 Create an expression that computes the size of the segment
7859 that will be accessed for a data reference. The function takes into
7860 account that realignment loads may access one more vector.
7862 Input:
7863 DR: The data reference.
7864 VECT_FACTOR: vectorization factor.
7866 Return an expression whose value is the size of segment which will be
7867 accessed by DR. */
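/* Example (hypothetical numbers): for a data reference with DR_STEP = 4
   (bytes) and VECT_FACTOR = 8, the segment covers 4 * 8 = 32 bytes; if a
   dr_explicit_realign_optimized access is used, one more vector (here
   32 bytes) is added, giving 64.  */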
7869 static tree
7870 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7872 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7873 DR_STEP (dr), vect_factor);
7875 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7877 tree vector_size = TYPE_SIZE_UNIT
7878 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7880 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7881 segment_length, vector_size);
7883 return fold_convert (sizetype, segment_length);
7886 /* Function vect_create_cond_for_alias_checks.
7888 Create a conditional expression that represents the run-time checks for
7889 overlapping of the address ranges represented by a list of data reference
7890 relations passed as input.
7892 Input:
7893 COND_EXPR - input conditional expression. New conditions will be chained
7894 with logical AND operation.
7895 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7896 to be checked.
7898 Output:
7899 COND_EXPR - conditional expression.
7900 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7901 expression.
7904 The returned value is the conditional expression to be used in the if
7905 statement that controls which version of the loop gets executed at runtime.
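/* Example (hypothetical values): for a store to a[] and a load from b[],
   each with a segment length of 32 bytes, the generated runtime test is
   roughly of the form

     (&a[0] + 32 < &b[0]) || (&b[0] + 32 < &a[0])

   i.e. the two accessed ranges must not overlap for the vectorized
   version of the loop to be taken.  */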
7908 static void
7909 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7910 tree * cond_expr,
7911 gimple_seq * cond_expr_stmt_list)
7913 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7914 VEC (ddr_p, heap) * may_alias_ddrs =
7915 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7916 tree vect_factor =
7917 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7919 ddr_p ddr;
7920 unsigned int i;
7921 tree part_cond_expr;
7923 /* Create expression
7924 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7925 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7929 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7930 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7932 if (VEC_empty (ddr_p, may_alias_ddrs))
7933 return;
7935 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7937 struct data_reference *dr_a, *dr_b;
7938 gimple dr_group_first_a, dr_group_first_b;
7939 tree addr_base_a, addr_base_b;
7940 tree segment_length_a, segment_length_b;
7941 gimple stmt_a, stmt_b;
7943 dr_a = DDR_A (ddr);
7944 stmt_a = DR_STMT (DDR_A (ddr));
7945 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7946 if (dr_group_first_a)
7948 stmt_a = dr_group_first_a;
7949 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7952 dr_b = DDR_B (ddr);
7953 stmt_b = DR_STMT (DDR_B (ddr));
7954 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7955 if (dr_group_first_b)
7957 stmt_b = dr_group_first_b;
7958 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7961 addr_base_a =
7962 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7963 NULL_TREE, loop);
7964 addr_base_b =
7965 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7966 NULL_TREE, loop);
7968 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7969 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7971 if (vect_print_dump_info (REPORT_DR_DETAILS))
7973 fprintf (vect_dump,
7974 "create runtime check for data references ");
7975 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7976 fprintf (vect_dump, " and ");
7977 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7981 part_cond_expr =
7982 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7983 fold_build2 (LT_EXPR, boolean_type_node,
7984 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7985 addr_base_a,
7986 segment_length_a),
7987 addr_base_b),
7988 fold_build2 (LT_EXPR, boolean_type_node,
7989 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7990 addr_base_b,
7991 segment_length_b),
7992 addr_base_a));
7994 if (*cond_expr)
7995 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7996 *cond_expr, part_cond_expr);
7997 else
7998 *cond_expr = part_cond_expr;
8000 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8001 fprintf (vect_dump, "created %u versioning for alias checks.\n",
8002 VEC_length (ddr_p, may_alias_ddrs));
8006 /* Function vect_loop_versioning.
8008 If the loop has data references that may or may not be aligned and/or
8009 has data reference relations whose independence was not proven, then
8010 two versions of the loop need to be generated, one which is vectorized
8011 and one which isn't. A test is then generated to control which of the
8012 loops is executed. The test checks for the alignment of all of the
8013 data references that may or may not be aligned. An additional
8014 sequence of runtime tests is generated for each pair of DDRs whose
8015 independence was not proven. The vectorized version of the loop is
8016 executed only if both the alias and the alignment tests pass.
8018 The test generated to check which version of the loop is executed
8019 is modified to also check for profitability, as indicated by the
8020 cost model. */
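/* Sketch of the generated guard (added for exposition; ALIGN_OK and
   NO_ALIAS stand for the conditions built below):

     if (scalar_loop_iters > th && ALIGN_OK && NO_ALIAS)
       ... vectorized loop ...
     else
       ... scalar loop ...  */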
8022 static void
8023 vect_loop_versioning (loop_vec_info loop_vinfo)
8025 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8026 struct loop *nloop;
8027 tree cond_expr = NULL_TREE;
8028 gimple_seq cond_expr_stmt_list = NULL;
8029 basic_block condition_bb;
8030 gimple_stmt_iterator gsi, cond_exp_gsi;
8031 basic_block merge_bb;
8032 basic_block new_exit_bb;
8033 edge new_exit_e, e;
8034 gimple orig_phi, new_phi;
8035 tree arg;
8036 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
8037 gimple_seq gimplify_stmt_list = NULL;
8038 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
8039 int min_profitable_iters = 0;
8040 unsigned int th;
8042 /* Get profitability threshold for vectorized loop. */
8043 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
8045 th = conservative_cost_threshold (loop_vinfo,
8046 min_profitable_iters);
8048 cond_expr =
8049 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
8050 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
8052 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
8053 false, NULL_TREE);
8055 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
8056 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
8057 &cond_expr_stmt_list);
8059 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8060 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
8061 &cond_expr_stmt_list);
8063 cond_expr =
8064 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
8065 cond_expr =
8066 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
8067 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
8069 initialize_original_copy_tables ();
8070 nloop = loop_version (loop, cond_expr, &condition_bb,
8071 prob, prob, REG_BR_PROB_BASE - prob, true);
8072 free_original_copy_tables();
8074 /* Loop versioning violates an assumption we try to maintain during
8075 vectorization - that the loop exit block has a single predecessor.
8076 After versioning, the exit block of both loop versions is the same
8077 basic block (i.e. it has two predecessors). Just in order to simplify
8078 following transformations in the vectorizer, we fix this situation
8079 here by adding a new (empty) block on the exit-edge of the loop,
8080 with the proper loop-exit phis to maintain loop-closed-form. */
8082 merge_bb = single_exit (loop)->dest;
8083 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
8084 new_exit_bb = split_edge (single_exit (loop));
8085 new_exit_e = single_exit (loop);
8086 e = EDGE_SUCC (new_exit_bb, 0);
8088 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8090 orig_phi = gsi_stmt (gsi);
8091 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
8092 new_exit_bb);
8093 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
8094 add_phi_arg (new_phi, arg, new_exit_e);
8095 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
8098 /* End loop-exit-fixes after versioning. */
8100 update_ssa (TODO_update_ssa);
8101 if (cond_expr_stmt_list)
8103 cond_exp_gsi = gsi_last_bb (condition_bb);
8104 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
8108 /* Remove a group of stores (for SLP or interleaving) and free their
8109 stmt_vec_info. */
8111 static void
8112 vect_remove_stores (gimple first_stmt)
8114 gimple next = first_stmt;
8115 gimple tmp;
8116 gimple_stmt_iterator next_si;
8118 while (next)
8120 /* Free the attached stmt_vec_info and remove the stmt. */
8121 next_si = gsi_for_stmt (next);
8122 gsi_remove (&next_si, true);
8123 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
8124 free_stmt_vec_info (next);
8125 next = tmp;
8130 /* Vectorize SLP instance tree in postorder. */
8132 static bool
8133 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
8134 unsigned int vectorization_factor)
8136 gimple stmt;
8137 bool strided_store, is_store;
8138 gimple_stmt_iterator si;
8139 stmt_vec_info stmt_info;
8140 unsigned int vec_stmts_size, nunits, group_size;
8141 tree vectype;
8142 int i;
8143 slp_tree loads_node;
8145 if (!node)
8146 return false;
8148 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
8149 vectorization_factor);
8150 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
8151 vectorization_factor);
8153 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
8154 stmt_info = vinfo_for_stmt (stmt);
8155 /* VECTYPE is the type of the destination. */
8156 vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
8157 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
8158 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
8160 /* For each SLP instance calculate the number of vector stmts to be created
8161 for the scalar stmts in each node of the SLP tree. The number of vector
8162 elements in one vector iteration is the number of scalar elements in
8163 one scalar iteration (GROUP_SIZE) multiplied by VF and divided by the
8164 vector size. */
8165 vec_stmts_size = (vectorization_factor * group_size) / nunits;
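/* Example (hypothetical numbers): with GROUP_SIZE = 2, a vectorization
   factor of 8 and NUNITS = 4, each node needs (8 * 2) / 4 = 4 vector
   statements.  */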
8167 /* In case of load permutation we have to allocate vectorized statements for
8168 all the nodes that participate in that permutation. */
8169 if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
8171 for (i = 0;
8172 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
8173 i++)
8175 if (!SLP_TREE_VEC_STMTS (loads_node))
8177 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
8178 vec_stmts_size);
8179 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
8184 if (!SLP_TREE_VEC_STMTS (node))
8186 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
8187 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
8190 if (vect_print_dump_info (REPORT_DETAILS))
8192 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
8193 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8196 si = gsi_for_stmt (stmt);
8197 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
8198 if (is_store)
8200 if (DR_GROUP_FIRST_DR (stmt_info))
8201 /* If IS_STORE is TRUE, the vectorization of the
8202 interleaving chain was completed - free all the stores in
8203 the chain. */
8204 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8205 else
8206 /* FORNOW: SLP originates only from strided stores. */
8207 gcc_unreachable ();
8209 return true;
8212 /* FORNOW: SLP originates only from strided stores. */
8213 return false;
8217 static bool
8218 vect_schedule_slp (loop_vec_info loop_vinfo)
8220 VEC (slp_instance, heap) *slp_instances =
8221 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8222 slp_instance instance;
8223 unsigned int i;
8224 bool is_store = false;
8226 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
8228 /* Schedule the tree of INSTANCE. */
8229 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
8230 instance,
8231 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
8233 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
8234 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
8235 fprintf (vect_dump, "vectorizing stmts using SLP.");
8238 return is_store;
8241 /* Function vect_transform_loop.
8243 The analysis phase has determined that the loop is vectorizable.
8244 Vectorize the loop - create vectorized stmts to replace the scalar
8245 stmts in the loop, and update the loop exit condition. */
8247 void
8248 vect_transform_loop (loop_vec_info loop_vinfo)
8250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8251 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8252 int nbbs = loop->num_nodes;
8253 gimple_stmt_iterator si;
8254 int i;
8255 tree ratio = NULL;
8256 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8257 bool strided_store;
8258 bool slp_scheduled = false;
8259 unsigned int nunits;
8261 if (vect_print_dump_info (REPORT_DETAILS))
8262 fprintf (vect_dump, "=== vect_transform_loop ===");
8264 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
8265 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8266 vect_loop_versioning (loop_vinfo);
8268 /* CHECKME: we wouldn't need this if we called update_ssa once
8269 for all loops. */
8270 bitmap_zero (vect_memsyms_to_rename);
8272 /* Peel the loop if there are data refs with unknown alignment.
8273 Only one data ref with unknown alignment is allowed. */
8275 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
8276 vect_do_peeling_for_alignment (loop_vinfo);
8278 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
8279 compile time constant), or it is a constant that doesn't divide by the
8280 vectorization factor, then an epilog loop needs to be created.
8281 We therefore duplicate the loop: the original loop will be vectorized,
8282 and will compute the first (n/VF) iterations. The second copy of the loop
8283 will remain scalar and will compute the remaining (n%VF) iterations.
8284 (VF is the vectorization factor). */
8286 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8287 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8288 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
8289 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
8290 else
8291 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8292 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
8294 /* 1) Make sure the loop header has exactly two entries
8295 2) Make sure we have a preheader basic block. */
8297 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8299 split_edge (loop_preheader_edge (loop));
8301 /* FORNOW: the vectorizer supports only loops whose body consists
8302 of one basic block (header + empty latch). When the vectorizer
8303 supports more involved loop forms, the order in which the BBs are
8304 traversed will need to be reconsidered. */
8306 for (i = 0; i < nbbs; i++)
8308 basic_block bb = bbs[i];
8309 stmt_vec_info stmt_info;
8310 gimple phi;
8312 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
8314 phi = gsi_stmt (si);
8315 if (vect_print_dump_info (REPORT_DETAILS))
8317 fprintf (vect_dump, "------>vectorizing phi: ");
8318 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
8320 stmt_info = vinfo_for_stmt (phi);
8321 if (!stmt_info)
8322 continue;
8324 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8325 && !STMT_VINFO_LIVE_P (stmt_info))
8326 continue;
8328 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
8329 != (unsigned HOST_WIDE_INT) vectorization_factor)
8330 && vect_print_dump_info (REPORT_DETAILS))
8331 fprintf (vect_dump, "multiple-types.");
8333 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
8335 if (vect_print_dump_info (REPORT_DETAILS))
8336 fprintf (vect_dump, "transform phi.");
8337 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8341 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
8343 gimple stmt = gsi_stmt (si);
8344 bool is_store;
8346 if (vect_print_dump_info (REPORT_DETAILS))
8348 fprintf (vect_dump, "------>vectorizing statement: ");
8349 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8352 stmt_info = vinfo_for_stmt (stmt);
8354 /* vector stmts created in the outer-loop during vectorization of
8355 stmts in an inner-loop may not have a stmt_info, and do not
8356 need to be vectorized. */
8357 if (!stmt_info)
8359 gsi_next (&si);
8360 continue;
8363 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8364 && !STMT_VINFO_LIVE_P (stmt_info))
8366 gsi_next (&si);
8367 continue;
8370 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
8371 nunits =
8372 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8373 if (!STMT_SLP_TYPE (stmt_info)
8374 && nunits != (unsigned int) vectorization_factor
8375 && vect_print_dump_info (REPORT_DETAILS))
8376 /* For SLP, VF is set according to the unrolling factor rather than the
8377 vector size, hence for SLP this print is not valid. */
8378 fprintf (vect_dump, "multiple-types.");
8380 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8381 reached. */
8382 if (STMT_SLP_TYPE (stmt_info))
8384 if (!slp_scheduled)
8386 slp_scheduled = true;
8388 if (vect_print_dump_info (REPORT_DETAILS))
8389 fprintf (vect_dump, "=== scheduling SLP instances ===");
8391 is_store = vect_schedule_slp (loop_vinfo);
8393 /* IS_STORE is true if STMT is a store. Stores cannot be of
8394 hybrid SLP type. They are removed in
8395 vect_schedule_slp_instance and their vinfo is destroyed. */
8396 if (is_store)
8398 gsi_next (&si);
8399 continue;
8403 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8404 if (PURE_SLP_STMT (stmt_info))
8406 gsi_next (&si);
8407 continue;
8411 /* -------- vectorize statement ------------ */
8412 if (vect_print_dump_info (REPORT_DETAILS))
8413 fprintf (vect_dump, "transform statement.");
8415 strided_store = false;
8416 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
8417 if (is_store)
8419 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
8421 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8422 interleaving chain was completed - free all the stores in
8423 the chain. */
8424 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8425 gsi_remove (&si, true);
8426 continue;
8428 else
8430 /* Free the attached stmt_vec_info and remove the stmt. */
8431 free_stmt_vec_info (stmt);
8432 gsi_remove (&si, true);
8433 continue;
8436 gsi_next (&si);
8437 } /* stmts in BB */
8438 } /* BBs in loop */
8440 slpeel_make_loop_iterate_ntimes (loop, ratio);
8442 mark_set_for_renaming (vect_memsyms_to_rename);
8444 /* The memory tags and pointers in vectorized statements need to
8445 have their SSA forms updated. FIXME, why can't this be delayed
8446 until all the loops have been transformed? */
8447 update_ssa (TODO_update_ssa);
8449 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8450 fprintf (vect_dump, "LOOP VECTORIZED.");
8451 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8452 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");