PR libgomp/38270
[official-gcc.git] / gcc / tree-vect-transform.c
blob18c22e7eac7a0133a737162719d380248bae2f0c
1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "ggc.h"
26 #include "tree.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
50 slp_tree, slp_instance);
51 static tree vect_create_destination_var (tree, tree);
52 static tree vect_create_data_ref_ptr
53 (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
54 static tree vect_create_addr_base_for_vector_ref
55 (gimple, gimple_seq *, tree, struct loop *);
56 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
57 static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
58 static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
59 static void vect_finish_stmt_generation
60 (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
61 static bool vect_is_simple_cond (tree, loop_vec_info);
62 static void vect_create_epilog_for_reduction
63 (tree, gimple, int, enum tree_code, gimple);
64 static tree get_initial_def_for_reduction (gimple, tree, tree *);
66 /* Utility function dealing with loop peeling (not peeling itself). */
67 static void vect_generate_tmps_on_preheader
68 (loop_vec_info, tree *, tree *, tree *);
69 static tree vect_build_loop_niters (loop_vec_info);
70 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
71 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
72 static void vect_update_init_of_dr (struct data_reference *, tree niters);
73 static void vect_update_inits_of_drs (loop_vec_info, tree);
74 static int vect_min_worthwhile_factor (enum tree_code);
77 static int
78 cost_for_stmt (gimple stmt)
80 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
82 switch (STMT_VINFO_TYPE (stmt_info))
84 case load_vec_info_type:
85 return TARG_SCALAR_LOAD_COST;
86 case store_vec_info_type:
87 return TARG_SCALAR_STORE_COST;
88 case op_vec_info_type:
89 case condition_vec_info_type:
90 case assignment_vec_info_type:
91 case reduc_vec_info_type:
92 case induc_vec_info_type:
93 case type_promotion_vec_info_type:
94 case type_demotion_vec_info_type:
95 case type_conversion_vec_info_type:
96 case call_vec_info_type:
97 return TARG_SCALAR_STMT_COST;
98 case undef_vec_info_type:
99 default:
100 gcc_unreachable ();
105 /* Function vect_estimate_min_profitable_iters
107 Return the number of iterations required for the vector version of the
108 loop to be profitable relative to the cost of the scalar version of the
109 loop.
111 TODO: Take profile info into account before making vectorization
112 decisions, if available. */
115 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
117 int i;
118 int min_profitable_iters;
119 int peel_iters_prologue;
120 int peel_iters_epilogue;
121 int vec_inside_cost = 0;
122 int vec_outside_cost = 0;
123 int scalar_single_iter_cost = 0;
124 int scalar_outside_cost = 0;
125 bool runtime_test = false;
126 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
128 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
129 int nbbs = loop->num_nodes;
130 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
131 int peel_guard_costs = 0;
132 int innerloop_iters = 0, factor;
133 VEC (slp_instance, heap) *slp_instances;
134 slp_instance instance;
136 /* Cost model disabled. */
137 if (!flag_vect_cost_model)
139 if (vect_print_dump_info (REPORT_COST))
140 fprintf (vect_dump, "cost model disabled.");
141 return 0;
144 /* If the number of iterations is unknown, or the
145 peeling-for-misalignment amount is unknown, we will have to generate
146 a runtime test to test the loop count against the threshold. */
147 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
148 || (byte_misalign < 0))
149 runtime_test = true;
151 /* Requires loop versioning tests to handle misalignment. */
153 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
155 /* FIXME: Make cost depend on complexity of individual check. */
156 vec_outside_cost +=
157 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
158 if (vect_print_dump_info (REPORT_COST))
159 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
160 "versioning to treat misalignment.\n");
163 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 /* FIXME: Make cost depend on complexity of individual check. */
166 vec_outside_cost +=
167 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
168 if (vect_print_dump_info (REPORT_COST))
169 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
170 "versioning aliasing.\n");
173 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
174 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
176 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
179 /* Count statements in scalar loop. Using this as scalar cost for a single
180 iteration for now.
182 TODO: Add outer loop support.
184 TODO: Consider assigning different costs to different scalar
185 statements. */
187 /* FORNOW. */
188 if (loop->inner)
189 innerloop_iters = 50; /* FIXME */
191 for (i = 0; i < nbbs; i++)
193 gimple_stmt_iterator si;
194 basic_block bb = bbs[i];
196 if (bb->loop_father == loop->inner)
197 factor = innerloop_iters;
198 else
199 factor = 1;
201 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
203 gimple stmt = gsi_stmt (si);
204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
205 /* Skip stmts that are not vectorized inside the loop. */
206 if (!STMT_VINFO_RELEVANT_P (stmt_info)
207 && (!STMT_VINFO_LIVE_P (stmt_info)
208 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
209 continue;
210 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
211 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
212 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
213 some of the "outside" costs are generated inside the outer-loop. */
214 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
218 /* Add additional cost for the peeled instructions in prologue and epilogue
219 loop.
221 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
222 at compile-time - we assume it's vf/2 (the worst would be vf-1).
224 TODO: Build an expression that represents peel_iters for prologue and
225 epilogue to be used in a run-time test. */
227 if (byte_misalign < 0)
229 peel_iters_prologue = vf/2;
230 if (vect_print_dump_info (REPORT_COST))
231 fprintf (vect_dump, "cost model: "
232 "prologue peel iters set to vf/2.");
234 /* If peeling for alignment is unknown, loop bound of main loop becomes
235 unknown. */
236 peel_iters_epilogue = vf/2;
237 if (vect_print_dump_info (REPORT_COST))
238 fprintf (vect_dump, "cost model: "
239 "epilogue peel iters set to vf/2 because "
240 "peeling for alignment is unknown .");
242 /* If peeled iterations are unknown, count a taken branch and a not taken
243 branch per peeled loop. Even if scalar loop iterations are known,
244 vector iterations are not known since peeled prologue iterations are
245 not known. Hence guards remain the same. */
246 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
247 + TARG_COND_NOT_TAKEN_BRANCH_COST);
250 else
252 if (byte_misalign)
254 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
255 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
256 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
257 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
259 peel_iters_prologue = nelements - (byte_misalign / element_size);
261 else
262 peel_iters_prologue = 0;
264 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
266 peel_iters_epilogue = vf/2;
267 if (vect_print_dump_info (REPORT_COST))
268 fprintf (vect_dump, "cost model: "
269 "epilogue peel iters set to vf/2 because "
270 "loop iterations are unknown .");
272 /* If peeled iterations are known but number of scalar loop
273 iterations are unknown, count a taken branch per peeled loop. */
274 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
277 else
279 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
280 peel_iters_prologue = niters < peel_iters_prologue ?
281 niters : peel_iters_prologue;
282 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
286 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
287 + (peel_iters_epilogue * scalar_single_iter_cost)
288 + peel_guard_costs;
290 /* FORNOW: The scalar outside cost is incremented in one of the
291 following ways:
293 1. The vectorizer checks for alignment and aliasing and generates
294 a condition that allows dynamic vectorization. A cost model
295 check is ANDED with the versioning condition. Hence scalar code
296 path now has the added cost of the versioning check.
298 if (cost > th & versioning_check)
299 jmp to vector code
301 Hence run-time scalar is incremented by not-taken branch cost.
303 2. The vectorizer then checks if a prologue is required. If the
304 cost model check was not done before during versioning, it has to
305 be done before the prologue check.
307 if (cost <= th)
308 prologue = scalar_iters
309 if (prologue == 0)
310 jmp to vector code
311 else
312 execute prologue
313 if (prologue == num_iters)
314 go to exit
316 Hence the run-time scalar cost is incremented by a taken branch,
317 plus a not-taken branch, plus a taken branch cost.
319 3. The vectorizer then checks if an epilogue is required. If the
320 cost model check was not done before during prologue check, it
321 has to be done with the epilogue check.
323 if (prologue == 0)
324 jmp to vector code
325 else
326 execute prologue
327 if (prologue == num_iters)
328 go to exit
329 vector code:
330 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
331 jmp to epilogue
333 Hence the run-time scalar cost should be incremented by 2 taken
334 branches.
336 TODO: The back end may reorder the BBS's differently and reverse
337 conditions/branch directions. Change the estimates below to
338 something more reasonable. */
340 if (runtime_test)
342 /* Cost model check occurs at versioning. */
343 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
344 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
345 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
346 else
348 /* Cost model occurs at prologue generation. */
349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
351 + TARG_COND_NOT_TAKEN_BRANCH_COST;
352 /* Cost model check occurs at epilogue generation. */
353 else
354 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
358 /* Add SLP costs. */
359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
360 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
366 /* Calculate number of iterations required to make the vector version
367 profitable, relative to the loop bodies only. The following condition
368 must hold true:
369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
370 where
371 SIC = scalar iteration cost, VIC = vector iteration cost,
372 VOC = vector outside cost, VF = vectorization factor,
373 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
374 SOC = scalar outside cost for run time cost model check. */
376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
378 if (vec_outside_cost <= 0)
379 min_profitable_iters = 1;
380 else
382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
383 - vec_inside_cost * peel_iters_prologue
384 - vec_inside_cost * peel_iters_epilogue)
385 / ((scalar_single_iter_cost * vf)
386 - vec_inside_cost);
388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
389 <= ((vec_inside_cost * min_profitable_iters)
390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
391 min_profitable_iters++;
394 /* vector version will never be profitable. */
395 else
397 if (vect_print_dump_info (REPORT_COST))
398 fprintf (vect_dump, "cost model: vector iteration cost = %d "
399 "is divisible by scalar iteration cost = %d by a factor "
400 "greater than or equal to the vectorization factor = %d .",
401 vec_inside_cost, scalar_single_iter_cost, vf);
402 return -1;
405 if (vect_print_dump_info (REPORT_COST))
407 fprintf (vect_dump, "Cost model analysis: \n");
408 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
409 vec_inside_cost);
410 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
411 vec_outside_cost);
412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
413 scalar_single_iter_cost);
414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
415 fprintf (vect_dump, " prologue iterations: %d\n",
416 peel_iters_prologue);
417 fprintf (vect_dump, " epilogue iterations: %d\n",
418 peel_iters_epilogue);
419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
420 min_profitable_iters);
423 min_profitable_iters =
424 min_profitable_iters < vf ? vf : min_profitable_iters;
426 /* Because the condition we create is:
427 if (niters <= min_profitable_iters)
428 then skip the vectorized loop. */
429 min_profitable_iters--;
431 if (vect_print_dump_info (REPORT_COST))
432 fprintf (vect_dump, " Profitability threshold = %d\n",
433 min_profitable_iters);
435 return min_profitable_iters;
439 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
440 functions. Design better to avoid maintenance issues. */
442 /* Function vect_model_reduction_cost.
444 Models cost for a reduction operation, including the vector ops
445 generated within the strip-mine loop, the initial definition before
446 the loop, and the epilogue code that must be generated. */
448 static bool
449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
450 int ncopies)
452 int outer_cost = 0;
453 enum tree_code code;
454 optab optab;
455 tree vectype;
456 gimple stmt, orig_stmt;
457 tree reduction_op;
458 enum machine_mode mode;
459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
463 /* Cost of reduction op inside loop. */
464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
466 stmt = STMT_VINFO_STMT (stmt_info);
468 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
470 case GIMPLE_SINGLE_RHS:
471 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
472 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
473 break;
474 case GIMPLE_UNARY_RHS:
475 reduction_op = gimple_assign_rhs1 (stmt);
476 break;
477 case GIMPLE_BINARY_RHS:
478 reduction_op = gimple_assign_rhs2 (stmt);
479 break;
480 default:
481 gcc_unreachable ();
484 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
485 if (!vectype)
487 if (vect_print_dump_info (REPORT_COST))
489 fprintf (vect_dump, "unsupported data-type ");
490 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
492 return false;
495 mode = TYPE_MODE (vectype);
496 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
498 if (!orig_stmt)
499 orig_stmt = STMT_VINFO_STMT (stmt_info);
501 code = gimple_assign_rhs_code (orig_stmt);
503 /* Add in cost for initial definition. */
504 outer_cost += TARG_SCALAR_TO_VEC_COST;
506 /* Determine cost of epilogue code.
508 We have a reduction operator that will reduce the vector in one statement.
509 Also requires scalar extract. */
511 if (!nested_in_vect_loop_p (loop, orig_stmt))
513 if (reduc_code < NUM_TREE_CODES)
514 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
515 else
517 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
518 tree bitsize =
519 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
520 int element_bitsize = tree_low_cst (bitsize, 1);
521 int nelements = vec_size_in_bits / element_bitsize;
523 optab = optab_for_tree_code (code, vectype, optab_default);
525 /* We have a whole vector shift available. */
526 if (VECTOR_MODE_P (mode)
527 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
528 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
529 /* Final reduction via vector shifts and the reduction operator. Also
530 requires scalar extract. */
531 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
532 + TARG_VEC_TO_SCALAR_COST);
533 else
534 /* Use extracts and reduction op for final reduction. For N elements,
535 we have N extracts and N-1 reduction ops. */
536 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
540 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
542 if (vect_print_dump_info (REPORT_COST))
543 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
544 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
545 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
547 return true;
551 /* Function vect_model_induction_cost.
553 Models cost for induction operations. */
555 static void
556 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
558 /* loop cost for vec_loop. */
559 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
560 /* prologue cost for vec_init and vec_step. */
561 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
563 if (vect_print_dump_info (REPORT_COST))
564 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
565 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
570 /* Function vect_model_simple_cost.
572 Models cost for simple operations, i.e. those that only emit ncopies of a
573 single op. Right now, this does not account for multiple insns that could
574 be generated for the single vector op. We will handle that shortly. */
576 void
577 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
578 enum vect_def_type *dt, slp_tree slp_node)
580 int i;
581 int inside_cost = 0, outside_cost = 0;
583 /* The SLP costs were already calculated during SLP tree build. */
584 if (PURE_SLP_STMT (stmt_info))
585 return;
587 inside_cost = ncopies * TARG_VEC_STMT_COST;
589 /* FORNOW: Assuming maximum 2 args per stmts. */
590 for (i = 0; i < 2; i++)
592 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
593 outside_cost += TARG_SCALAR_TO_VEC_COST;
596 if (vect_print_dump_info (REPORT_COST))
597 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
598 "outside_cost = %d .", inside_cost, outside_cost);
600 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
601 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
602 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
606 /* Function vect_cost_strided_group_size
608 For strided load or store, return the group_size only if it is the first
609 load or store of a group, else return 1. This ensures that group size is
610 only returned once per group. */
612 static int
613 vect_cost_strided_group_size (stmt_vec_info stmt_info)
615 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
617 if (first_stmt == STMT_VINFO_STMT (stmt_info))
618 return DR_GROUP_SIZE (stmt_info);
620 return 1;
624 /* Function vect_model_store_cost
626 Models cost for stores. In the case of strided accesses, one access
627 has the overhead of the strided access attributed to it. */
629 void
630 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
631 enum vect_def_type dt, slp_tree slp_node)
633 int group_size;
634 int inside_cost = 0, outside_cost = 0;
636 /* The SLP costs were already calculated during SLP tree build. */
637 if (PURE_SLP_STMT (stmt_info))
638 return;
640 if (dt == vect_constant_def || dt == vect_invariant_def)
641 outside_cost = TARG_SCALAR_TO_VEC_COST;
643 /* Strided access? */
644 if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
645 group_size = vect_cost_strided_group_size (stmt_info);
646 /* Not a strided access. */
647 else
648 group_size = 1;
650 /* Is this an access in a group of stores, which provide strided access?
651 If so, add in the cost of the permutes. */
652 if (group_size > 1)
654 /* Uses a high and low interleave operation for each needed permute. */
655 inside_cost = ncopies * exact_log2(group_size) * group_size
656 * TARG_VEC_STMT_COST;
658 if (vect_print_dump_info (REPORT_COST))
659 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
660 group_size);
664 /* Costs of the stores. */
665 inside_cost += ncopies * TARG_VEC_STORE_COST;
667 if (vect_print_dump_info (REPORT_COST))
668 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
669 "outside_cost = %d .", inside_cost, outside_cost);
671 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
672 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
673 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
677 /* Function vect_model_load_cost
679 Models cost for loads. In the case of strided accesses, the last access
680 has the overhead of the strided access attributed to it. Since unaligned
681 accesses are supported for loads, we also account for the costs of the
682 access scheme chosen. */
684 void
685 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
688 int group_size;
689 int alignment_support_cheme;
690 gimple first_stmt;
691 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
692 int inside_cost = 0, outside_cost = 0;
694 /* The SLP costs were already calculated during SLP tree build. */
695 if (PURE_SLP_STMT (stmt_info))
696 return;
698 /* Strided accesses? */
699 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
700 if (first_stmt && !slp_node)
702 group_size = vect_cost_strided_group_size (stmt_info);
703 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
705 /* Not a strided access. */
706 else
708 group_size = 1;
709 first_dr = dr;
712 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
714 /* Is this an access in a group of loads providing strided access?
715 If so, add in the cost of the permutes. */
716 if (group_size > 1)
718 /* Uses an even and odd extract operations for each needed permute. */
719 inside_cost = ncopies * exact_log2(group_size) * group_size
720 * TARG_VEC_STMT_COST;
722 if (vect_print_dump_info (REPORT_COST))
723 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
724 group_size);
728 /* The loads themselves. */
729 switch (alignment_support_cheme)
731 case dr_aligned:
733 inside_cost += ncopies * TARG_VEC_LOAD_COST;
735 if (vect_print_dump_info (REPORT_COST))
736 fprintf (vect_dump, "vect_model_load_cost: aligned.");
738 break;
740 case dr_unaligned_supported:
742 /* Here, we assign an additional cost for the unaligned load. */
743 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
745 if (vect_print_dump_info (REPORT_COST))
746 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
747 "hardware.");
749 break;
751 case dr_explicit_realign:
753 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
755 /* FIXME: If the misalignment remains fixed across the iterations of
756 the containing loop, the following cost should be added to the
757 outside costs. */
758 if (targetm.vectorize.builtin_mask_for_load)
759 inside_cost += TARG_VEC_STMT_COST;
761 break;
763 case dr_explicit_realign_optimized:
765 if (vect_print_dump_info (REPORT_COST))
766 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
767 "pipelined.");
769 /* Unaligned software pipeline has a load of an address, an initial
770 load, and possibly a mask operation to "prime" the loop. However,
771 if this is an access in a group of loads, which provide strided
772 access, then the above cost should only be considered for one
773 access in the group. Inside the loop, there is a load op
774 and a realignment op. */
776 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
778 outside_cost = 2*TARG_VEC_STMT_COST;
779 if (targetm.vectorize.builtin_mask_for_load)
780 outside_cost += TARG_VEC_STMT_COST;
783 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
785 break;
788 default:
789 gcc_unreachable ();
792 if (vect_print_dump_info (REPORT_COST))
793 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
794 "outside_cost = %d .", inside_cost, outside_cost);
796 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
797 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
798 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
802 /* Function vect_get_new_vect_var.
804 Returns a name for a new variable. The current naming scheme appends the
805 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
806 the name of vectorizer generated variables, and appends that to NAME if
807 provided. */
809 static tree
810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
812 const char *prefix;
813 tree new_vect_var;
815 switch (var_kind)
817 case vect_simple_var:
818 prefix = "vect_";
819 break;
820 case vect_scalar_var:
821 prefix = "stmp_";
822 break;
823 case vect_pointer_var:
824 prefix = "vect_p";
825 break;
826 default:
827 gcc_unreachable ();
830 if (name)
832 char* tmp = concat (prefix, name, NULL);
833 new_vect_var = create_tmp_var (type, tmp);
834 free (tmp);
836 else
837 new_vect_var = create_tmp_var (type, prefix);
839 /* Mark vector typed variable as a gimple register variable. */
840 if (TREE_CODE (type) == VECTOR_TYPE)
841 DECL_GIMPLE_REG_P (new_vect_var) = true;
843 return new_vect_var;
847 /* Function vect_create_addr_base_for_vector_ref.
849 Create an expression that computes the address of the first memory location
850 that will be accessed for a data reference.
852 Input:
853 STMT: The statement containing the data reference.
854 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
855 OFFSET: Optional. If supplied, it is be added to the initial address.
856 LOOP: Specify relative to which loop-nest should the address be computed.
857 For example, when the dataref is in an inner-loop nested in an
858 outer-loop that is now being vectorized, LOOP can be either the
859 outer-loop, or the inner-loop. The first memory location accessed
860 by the following dataref ('in' points to short):
862 for (i=0; i<N; i++)
863 for (j=0; j<M; j++)
864 s += in[i+j]
866 is as follows:
867 if LOOP=i_loop: &in (relative to i_loop)
868 if LOOP=j_loop: &in+i*2B (relative to j_loop)
870 Output:
871 1. Return an SSA_NAME whose value is the address of the memory location of
872 the first vector of the data reference.
873 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
874 these statement(s) which define the returned SSA_NAME.
876 FORNOW: We are only handling array accesses with step 1. */
878 static tree
879 vect_create_addr_base_for_vector_ref (gimple stmt,
880 gimple_seq *new_stmt_list,
881 tree offset,
882 struct loop *loop)
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
886 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
887 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
888 tree base_name;
889 tree data_ref_base_var;
890 tree vec_stmt;
891 tree addr_base, addr_expr;
892 tree dest;
893 gimple_seq seq = NULL;
894 tree base_offset = unshare_expr (DR_OFFSET (dr));
895 tree init = unshare_expr (DR_INIT (dr));
896 tree vect_ptr_type, addr_expr2;
897 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
899 gcc_assert (loop);
900 if (loop != containing_loop)
902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
905 gcc_assert (nested_in_vect_loop_p (loop, stmt));
907 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
908 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
909 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
912 /* Create data_ref_base */
913 base_name = build_fold_indirect_ref (data_ref_base);
914 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
915 add_referenced_var (data_ref_base_var);
916 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
917 data_ref_base_var);
918 gimple_seq_add_seq (new_stmt_list, seq);
920 /* Create base_offset */
921 base_offset = size_binop (PLUS_EXPR,
922 fold_convert (sizetype, base_offset),
923 fold_convert (sizetype, init));
924 dest = create_tmp_var (sizetype, "base_off");
925 add_referenced_var (dest);
926 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
927 gimple_seq_add_seq (new_stmt_list, seq);
929 if (offset)
931 tree tmp = create_tmp_var (sizetype, "offset");
933 add_referenced_var (tmp);
934 offset = fold_build2 (MULT_EXPR, sizetype,
935 fold_convert (sizetype, offset), step);
936 base_offset = fold_build2 (PLUS_EXPR, sizetype,
937 base_offset, offset);
938 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
939 gimple_seq_add_seq (new_stmt_list, seq);
942 /* base + base_offset */
943 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
944 data_ref_base, base_offset);
946 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
948 /* addr_expr = addr_base */
949 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
950 get_name (base_name));
951 add_referenced_var (addr_expr);
952 vec_stmt = fold_convert (vect_ptr_type, addr_base);
953 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
954 get_name (base_name));
955 add_referenced_var (addr_expr2);
956 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
957 gimple_seq_add_seq (new_stmt_list, seq);
959 if (vect_print_dump_info (REPORT_DETAILS))
961 fprintf (vect_dump, "created ");
962 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
964 return vec_stmt;
968 /* Function vect_create_data_ref_ptr.
970 Create a new pointer to vector type (vp), that points to the first location
971 accessed in the loop by STMT, along with the def-use update chain to
972 appropriately advance the pointer through the loop iterations. Also set
973 aliasing information for the pointer. This vector pointer is used by the
974 callers to this function to create a memory reference expression for vector
975 load/store access.
977 Input:
978 1. STMT: a stmt that references memory. Expected to be of the form
979 GIMPLE_ASSIGN <name, data-ref> or
980 GIMPLE_ASSIGN <data-ref, name>.
981 2. AT_LOOP: the loop where the vector memref is to be created.
982 3. OFFSET (optional): an offset to be added to the initial address accessed
983 by the data-ref in STMT.
984 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
985 pointing to the initial address.
986 5. TYPE: if not NULL indicates the required type of the data-ref.
988 Output:
989 1. Declare a new ptr to vector_type, and have it point to the base of the
990 data reference (initial addressed accessed by the data reference).
991 For example, for vector of type V8HI, the following code is generated:
993 v8hi *vp;
994 vp = (v8hi *)initial_address;
996 if OFFSET is not supplied:
997 initial_address = &a[init];
998 if OFFSET is supplied:
999 initial_address = &a[init + OFFSET];
1001 Return the initial_address in INITIAL_ADDRESS.
1003 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
1004 update the pointer in each iteration of the loop.
1006 Return the increment stmt that updates the pointer in PTR_INCR.
1008 3. Set INV_P to true if the access pattern of the data reference in the
1009 vectorized loop is invariant. Set it to false otherwise.
1011 4. Return the pointer. */
1013 static tree
1014 vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
1015 tree offset, tree *initial_address, gimple *ptr_incr,
1016 bool only_init, bool *inv_p, tree type)
1018 tree base_name;
1019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1021 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1022 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
1023 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
1024 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1025 tree vect_ptr_type;
1026 tree vect_ptr;
1027 tree tag;
1028 tree new_temp;
1029 gimple vec_stmt;
1030 gimple_seq new_stmt_list = NULL;
1031 edge pe;
1032 basic_block new_bb;
1033 tree vect_ptr_init;
1034 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1035 tree vptr;
1036 gimple_stmt_iterator incr_gsi;
1037 bool insert_after;
1038 tree indx_before_incr, indx_after_incr;
1039 gimple incr;
1040 tree step;
1042 /* Check the step (evolution) of the load in LOOP, and record
1043 whether it's invariant. */
1044 if (nested_in_vect_loop)
1045 step = STMT_VINFO_DR_STEP (stmt_info);
1046 else
1047 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1049 if (tree_int_cst_compare (step, size_zero_node) == 0)
1050 *inv_p = true;
1051 else
1052 *inv_p = false;
1054 /* Create an expression for the first address accessed by this load
1055 in LOOP. */
1056 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1058 if (vect_print_dump_info (REPORT_DETAILS))
1060 tree data_ref_base = base_name;
1061 fprintf (vect_dump, "create vector-pointer variable to type: ");
1062 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1063 if (TREE_CODE (data_ref_base) == VAR_DECL)
1064 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1065 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1066 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1067 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1068 fprintf (vect_dump, " vectorizing a record based array ref: ");
1069 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1070 fprintf (vect_dump, " vectorizing a pointer ref: ");
1071 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1074 /** (1) Create the new vector-pointer variable: **/
1075 if (type)
1076 vect_ptr_type = build_pointer_type (type);
1077 else
1078 vect_ptr_type = build_pointer_type (vectype);
1080 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1081 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1082 vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
1083 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1084 get_name (base_name));
1085 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1086 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1088 get_alias_set (base_name);
1089 DECL_POINTER_ALIAS_SET (vect_ptr)
1090 = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
1093 add_referenced_var (vect_ptr);
1095 /** (2) Add aliasing information to the new vector-pointer:
1096 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1098 tag = DR_SYMBOL_TAG (dr);
1099 gcc_assert (tag);
1101 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
1102 tag must be created with tag added to its may alias list. */
1103 if (!MTAG_P (tag))
1104 new_type_alias (vect_ptr, tag, DR_REF (dr));
1105 else
1106 set_symbol_mem_tag (vect_ptr, tag);
1108 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1109 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1110 def-use update cycles for the pointer: One relative to the outer-loop
1111 (LOOP), which is what steps (3) and (4) below do. The other is relative
1112 to the inner-loop (which is the inner-most loop containing the dataref),
1113 and this is done be step (5) below.
1115 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1116 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1117 redundant. Steps (3),(4) create the following:
1119 vp0 = &base_addr;
1120 LOOP: vp1 = phi(vp0,vp2)
1121 ...
1123 vp2 = vp1 + step
1124 goto LOOP
1126 If there is an inner-loop nested in loop, then step (5) will also be
1127 applied, and an additional update in the inner-loop will be created:
1129 vp0 = &base_addr;
1130 LOOP: vp1 = phi(vp0,vp2)
1132 inner: vp3 = phi(vp1,vp4)
1133 vp4 = vp3 + inner_step
1134 if () goto inner
1136 vp2 = vp1 + step
1137 if () goto LOOP */
1139 /** (3) Calculate the initial address the vector-pointer, and set
1140 the vector-pointer to point to it before the loop: **/
1142 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1144 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1145 offset, loop);
1146 pe = loop_preheader_edge (loop);
1147 if (new_stmt_list)
1149 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
1150 gcc_assert (!new_bb);
1153 *initial_address = new_temp;
1155 /* Create: p = (vectype *) initial_base */
1156 vec_stmt = gimple_build_assign (vect_ptr,
1157 fold_convert (vect_ptr_type, new_temp));
1158 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1159 gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
1160 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
1161 gcc_assert (!new_bb);
1164 /** (4) Handle the updating of the vector-pointer inside the loop.
1165 This is needed when ONLY_INIT is false, and also when AT_LOOP
1166 is the inner-loop nested in LOOP (during outer-loop vectorization).
1169 if (only_init && at_loop == loop) /* No update in loop is required. */
1171 /* Copy the points-to information if it exists. */
1172 if (DR_PTR_INFO (dr))
1173 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1174 vptr = vect_ptr_init;
1176 else
1178 /* The step of the vector pointer is the Vector Size. */
1179 tree step = TYPE_SIZE_UNIT (vectype);
1180 /* One exception to the above is when the scalar step of the load in
1181 LOOP is zero. In this case the step here is also zero. */
1182 if (*inv_p)
1183 step = size_zero_node;
1185 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1187 create_iv (vect_ptr_init,
1188 fold_convert (vect_ptr_type, step),
1189 vect_ptr, loop, &incr_gsi, insert_after,
1190 &indx_before_incr, &indx_after_incr);
1191 incr = gsi_stmt (incr_gsi);
1192 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1194 /* Copy the points-to information if it exists. */
1195 if (DR_PTR_INFO (dr))
1197 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1198 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1200 merge_alias_info (vect_ptr_init, indx_before_incr);
1201 merge_alias_info (vect_ptr_init, indx_after_incr);
1202 if (ptr_incr)
1203 *ptr_incr = incr;
1205 vptr = indx_before_incr;
1208 if (!nested_in_vect_loop || only_init)
1209 return vptr;
1212 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1213 nested in LOOP, if exists: **/
1215 gcc_assert (nested_in_vect_loop);
1216 if (!only_init)
1218 standard_iv_increment_position (containing_loop, &incr_gsi,
1219 &insert_after);
1220 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
1221 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
1222 &indx_after_incr);
1223 incr = gsi_stmt (incr_gsi);
1224 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1226 /* Copy the points-to information if it exists. */
1227 if (DR_PTR_INFO (dr))
1229 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1230 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1232 merge_alias_info (vect_ptr_init, indx_before_incr);
1233 merge_alias_info (vect_ptr_init, indx_after_incr);
1234 if (ptr_incr)
1235 *ptr_incr = incr;
1237 return indx_before_incr;
1239 else
1240 gcc_unreachable ();
1244 /* Function bump_vector_ptr
1246 Increment a pointer (to a vector type) by vector-size. If requested,
1247 i.e. if PTR-INCR is given, then also connect the new increment stmt
1248 to the existing def-use update-chain of the pointer, by modifying
1249 the PTR_INCR as illustrated below:
1251 The pointer def-use update-chain before this function:
1252 DATAREF_PTR = phi (p_0, p_2)
1253 ....
1254 PTR_INCR: p_2 = DATAREF_PTR + step
1256 The pointer def-use update-chain after this function:
1257 DATAREF_PTR = phi (p_0, p_2)
1258 ....
1259 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1260 ....
1261 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1263 Input:
1264 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1265 in the loop.
1266 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1267 the loop. The increment amount across iterations is expected
1268 to be vector_size.
1269 BSI - location where the new update stmt is to be placed.
1270 STMT - the original scalar memory-access stmt that is being vectorized.
1271 BUMP - optional. The offset by which to bump the pointer. If not given,
1272 the offset is assumed to be vector_size.
1274 Output: Return NEW_DATAREF_PTR as illustrated above.
1278 static tree
1279 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
1280 gimple stmt, tree bump)
1282 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1283 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1284 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1285 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1286 tree update = TYPE_SIZE_UNIT (vectype);
1287 gimple incr_stmt;
1288 ssa_op_iter iter;
1289 use_operand_p use_p;
1290 tree new_dataref_ptr;
1292 if (bump)
1293 update = bump;
1295 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
1296 dataref_ptr, update);
1297 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1298 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
1299 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
1301 /* Copy the points-to information if it exists. */
1302 if (DR_PTR_INFO (dr))
1303 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1304 merge_alias_info (new_dataref_ptr, dataref_ptr);
1306 if (!ptr_incr)
1307 return new_dataref_ptr;
1309 /* Update the vector-pointer's cross-iteration increment. */
1310 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1312 tree use = USE_FROM_PTR (use_p);
1314 if (use == dataref_ptr)
1315 SET_USE (use_p, new_dataref_ptr);
1316 else
1317 gcc_assert (tree_int_cst_compare (use, update) == 0);
1320 return new_dataref_ptr;
1324 /* Function vect_create_destination_var.
1326 Create a new temporary of type VECTYPE. */
1328 static tree
1329 vect_create_destination_var (tree scalar_dest, tree vectype)
1331 tree vec_dest;
1332 const char *new_name;
1333 tree type;
1334 enum vect_var_kind kind;
1336 kind = vectype ? vect_simple_var : vect_scalar_var;
1337 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1339 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1341 new_name = get_name (scalar_dest);
1342 if (!new_name)
1343 new_name = "var_";
1344 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1345 add_referenced_var (vec_dest);
1347 return vec_dest;
1351 /* Function vect_init_vector.
1353 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1354 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1355 is not NULL. Otherwise, place the initialization at the loop preheader.
1356 Return the DEF of INIT_STMT.
1357 It will be used in the vectorization of STMT. */
1359 static tree
1360 vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
1361 gimple_stmt_iterator *gsi)
1363 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1364 tree new_var;
1365 gimple init_stmt;
1366 tree vec_oprnd;
1367 edge pe;
1368 tree new_temp;
1369 basic_block new_bb;
1371 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1372 add_referenced_var (new_var);
1373 init_stmt = gimple_build_assign (new_var, vector_var);
1374 new_temp = make_ssa_name (new_var, init_stmt);
1375 gimple_assign_set_lhs (init_stmt, new_temp);
1377 if (gsi)
1378 vect_finish_stmt_generation (stmt, init_stmt, gsi);
1379 else
1381 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1382 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1384 if (nested_in_vect_loop_p (loop, stmt))
1385 loop = loop->inner;
1386 pe = loop_preheader_edge (loop);
1387 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1388 gcc_assert (!new_bb);
1391 if (vect_print_dump_info (REPORT_DETAILS))
1393 fprintf (vect_dump, "created new init_stmt: ");
1394 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1397 vec_oprnd = gimple_assign_lhs (init_stmt);
1398 return vec_oprnd;
1402 /* For constant and loop invariant defs of SLP_NODE this function returns
1403 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1404 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1405 stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
1407 static void
1408 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1409 unsigned int op_num, unsigned int number_of_vectors)
1411 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1412 gimple stmt = VEC_index (gimple, stmts, 0);
1413 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1414 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1415 int nunits;
1416 tree vec_cst;
1417 tree t = NULL_TREE;
1418 int j, number_of_places_left_in_vector;
1419 tree vector_type;
1420 tree op, vop;
1421 int group_size = VEC_length (gimple, stmts);
1422 unsigned int vec_num, i;
1423 int number_of_copies = 1;
1424 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1425 bool constant_p, is_store;
1427 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1429 is_store = true;
1430 op = gimple_assign_rhs1 (stmt);
1432 else
1434 is_store = false;
1435 op = gimple_op (stmt, op_num + 1);
1438 if (CONSTANT_CLASS_P (op))
1440 vector_type = vectype;
1441 constant_p = true;
1443 else
1445 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1446 gcc_assert (vector_type);
1447 constant_p = false;
1450 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1452 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1453 created vectors. It is greater than 1 if unrolling is performed.
1455 For example, we have two scalar operands, s1 and s2 (e.g., group of
1456 strided accesses of size two), while NUNITS is four (i.e., four scalars
1457 of this type can be packed in a vector). The output vector will contain
1458 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1459 will be 2).
1461 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1462 containing the operands.
1464 For example, NUNITS is four as before, and the group size is 8
1465 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1466 {s5, s6, s7, s8}. */
1468 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1470 number_of_places_left_in_vector = nunits;
1471 for (j = 0; j < number_of_copies; j++)
1473 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
1475 if (is_store)
1476 op = gimple_assign_rhs1 (stmt);
1477 else
1478 op = gimple_op (stmt, op_num + 1);
1480 /* Create 'vect_ = {op0,op1,...,opn}'. */
1481 t = tree_cons (NULL_TREE, op, t);
1483 number_of_places_left_in_vector--;
1485 if (number_of_places_left_in_vector == 0)
1487 number_of_places_left_in_vector = nunits;
1489 if (constant_p)
1490 vec_cst = build_vector (vector_type, t);
1491 else
1492 vec_cst = build_constructor_from_list (vector_type, t);
1493 VEC_quick_push (tree, voprnds,
1494 vect_init_vector (stmt, vec_cst, vector_type, NULL));
1495 t = NULL_TREE;
1500 /* Since the vectors are created in the reverse order, we should invert
1501 them. */
1502 vec_num = VEC_length (tree, voprnds);
1503 for (j = vec_num - 1; j >= 0; j--)
1505 vop = VEC_index (tree, voprnds, j);
1506 VEC_quick_push (tree, *vec_oprnds, vop);
1509 VEC_free (tree, heap, voprnds);
1511 /* In case that VF is greater than the unrolling factor needed for the SLP
1512 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1513 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1514 to replicate the vectors. */
1515 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1517 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1518 VEC_quick_push (tree, *vec_oprnds, vop);
1523 /* Get vectorized definitions from SLP_NODE that contains corresponding
1524 vectorized def-stmts. */
1526 static void
1527 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1529 tree vec_oprnd;
1530 gimple vec_def_stmt;
1531 unsigned int i;
1533 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1535 for (i = 0;
1536 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1537 i++)
1539 gcc_assert (vec_def_stmt);
1540 vec_oprnd = gimple_get_lhs (vec_def_stmt);
1541 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1546 /* Get vectorized definitions for SLP_NODE.
1547 If the scalar definitions are loop invariants or constants, collect them and
1548 call vect_get_constant_vectors() to create vector stmts.
1549 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1550 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1551 vect_get_slp_vect_defs() to retrieve them.
1552 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1553 the right node. This is used when the second operand must remain scalar. */
1555 static void
1556 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1557 VEC (tree,heap) **vec_oprnds1)
1559 gimple first_stmt;
1560 enum tree_code code;
1561 int number_of_vects;
1562 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
1564 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1565 /* The number of vector defs is determined by the number of vector statements
1566 in the node from which we get those statements. */
1567 if (SLP_TREE_LEFT (slp_node))
1568 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
1569 else
1571 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1572 /* Number of vector stmts was calculated according to LHS in
1573 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
1574 necessary. See vect_get_smallest_scalar_type() for details. */
1575 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
1576 &rhs_size_unit);
1577 if (rhs_size_unit != lhs_size_unit)
1579 number_of_vects *= rhs_size_unit;
1580 number_of_vects /= lhs_size_unit;
1584 /* Allocate memory for vectorized defs. */
1585 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
1587 /* SLP_NODE corresponds either to a group of stores or to a group of
1588 unary/binary operations. We don't call this function for loads. */
1589 if (SLP_TREE_LEFT (slp_node))
1590 /* The defs are already vectorized. */
1591 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1592 else
1593 /* Build vectors from scalar defs. */
1594 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
1596 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1597 /* Since we don't call this function with loads, this is a group of
1598 stores. */
1599 return;
1601 code = gimple_assign_rhs_code (first_stmt);
1602 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
1603 return;
1605 /* The number of vector defs is determined by the number of vector statements
1606 in the node from which we get those statements. */
1607 if (SLP_TREE_RIGHT (slp_node))
1608 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
1609 else
1610 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1612 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
1614 if (SLP_TREE_RIGHT (slp_node))
1615 /* The defs are already vectorized. */
1616 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1617 else
1618 /* Build vectors from scalar defs. */
1619 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
1623 /* Function get_initial_def_for_induction
1625 Input:
1626 STMT - a stmt that performs an induction operation in the loop.
1627 IV_PHI - the initial value of the induction variable
1629 Output:
1630 Return a vector variable, initialized with the first VF values of
1631 the induction variable. E.g., for an iv with IV_PHI='X' and
1632 evolution S, for a vector of 4 units, we want to return:
1633 [X, X + S, X + 2*S, X + 3*S]. */
1635 static tree
1636 get_initial_def_for_induction (gimple iv_phi)
1638 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1639 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1640 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1641 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
1642 tree vectype;
1643 int nunits;
1644 edge pe = loop_preheader_edge (loop);
1645 struct loop *iv_loop;
1646 basic_block new_bb;
1647 tree vec, vec_init, vec_step, t;
1648 tree access_fn;
1649 tree new_var;
1650 tree new_name;
1651 gimple init_stmt, induction_phi, new_stmt;
1652 tree induc_def, vec_def, vec_dest;
1653 tree init_expr, step_expr;
1654 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1655 int i;
1656 bool ok;
1657 int ncopies;
1658 tree expr;
1659 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1660 bool nested_in_vect_loop = false;
1661 gimple_seq stmts = NULL;
1662 imm_use_iterator imm_iter;
1663 use_operand_p use_p;
1664 gimple exit_phi;
1665 edge latch_e;
1666 tree loop_arg;
1667 gimple_stmt_iterator si;
1668 basic_block bb = gimple_bb (iv_phi);
1670 vectype = get_vectype_for_scalar_type (scalar_type);
1671 gcc_assert (vectype);
1672 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1673 ncopies = vf / nunits;
1675 gcc_assert (phi_info);
1676 gcc_assert (ncopies >= 1);
1678 /* Find the first insertion point in the BB. */
1679 si = gsi_after_labels (bb);
1681 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
1682 step_expr = build_int_cst (scalar_type, 0);
1683 else
1684 step_expr = build_real (scalar_type, dconst0);
1686 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1687 if (nested_in_vect_loop_p (loop, iv_phi))
1689 nested_in_vect_loop = true;
1690 iv_loop = loop->inner;
1692 else
1693 iv_loop = loop;
1694 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
1696 latch_e = loop_latch_edge (iv_loop);
1697 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1699 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1700 gcc_assert (access_fn);
1701 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1702 &init_expr, &step_expr);
1703 gcc_assert (ok);
1704 pe = loop_preheader_edge (iv_loop);
1706 /* Create the vector that holds the initial_value of the induction. */
1707 if (nested_in_vect_loop)
1709 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1710 been created during vectorization of previous stmts; We obtain it from
1711 the STMT_VINFO_VEC_STMT of the defining stmt. */
1712 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1713 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1715 else
1717 /* iv_loop is the loop to be vectorized. Create:
1718 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1719 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1720 add_referenced_var (new_var);
1722 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1723 if (stmts)
1725 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
1726 gcc_assert (!new_bb);
1729 t = NULL_TREE;
1730 t = tree_cons (NULL_TREE, init_expr, t);
1731 for (i = 1; i < nunits; i++)
1733 /* Create: new_name_i = new_name + step_expr */
1734 enum tree_code code = POINTER_TYPE_P (scalar_type)
1735 ? POINTER_PLUS_EXPR : PLUS_EXPR;
1736 init_stmt = gimple_build_assign_with_ops (code, new_var,
1737 new_name, step_expr);
1738 new_name = make_ssa_name (new_var, init_stmt);
1739 gimple_assign_set_lhs (init_stmt, new_name);
1741 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1742 gcc_assert (!new_bb);
1744 if (vect_print_dump_info (REPORT_DETAILS))
1746 fprintf (vect_dump, "created new init_stmt: ");
1747 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1749 t = tree_cons (NULL_TREE, new_name, t);
1751 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1752 vec = build_constructor_from_list (vectype, nreverse (t));
1753 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1757 /* Create the vector that holds the step of the induction. */
1758 if (nested_in_vect_loop)
1759 /* iv_loop is nested in the loop to be vectorized. Generate:
1760 vec_step = [S, S, S, S] */
1761 new_name = step_expr;
1762 else
1764 /* iv_loop is the loop to be vectorized. Generate:
1765 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1766 expr = build_int_cst (scalar_type, vf);
1767 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1770 t = NULL_TREE;
1771 for (i = 0; i < nunits; i++)
1772 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1773 gcc_assert (CONSTANT_CLASS_P (new_name));
1774 vec = build_vector (vectype, t);
1775 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1778 /* Create the following def-use cycle:
1779 loop prolog:
1780 vec_init = ...
1781 vec_step = ...
1782 loop:
1783 vec_iv = PHI <vec_init, vec_loop>
1785 STMT
1787 vec_loop = vec_iv + vec_step; */
1789 /* Create the induction-phi that defines the induction-operand. */
1790 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1791 add_referenced_var (vec_dest);
1792 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1793 set_vinfo_for_stmt (induction_phi,
1794 new_stmt_vec_info (induction_phi, loop_vinfo));
1795 induc_def = PHI_RESULT (induction_phi);
1797 /* Create the iv update inside the loop */
1798 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1799 induc_def, vec_step);
1800 vec_def = make_ssa_name (vec_dest, new_stmt);
1801 gimple_assign_set_lhs (new_stmt, vec_def);
1802 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1803 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
1805 /* Set the arguments of the phi node: */
1806 add_phi_arg (induction_phi, vec_init, pe);
1807 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1810 /* In case that vectorization factor (VF) is bigger than the number
1811 of elements that we can fit in a vectype (nunits), we have to generate
1812 more than one vector stmt - i.e - we need to "unroll" the
1813 vector stmt by a factor VF/nunits. For more details see documentation
1814 in vectorizable_operation. */
1816 if (ncopies > 1)
1818 stmt_vec_info prev_stmt_vinfo;
1819 /* FORNOW. This restriction should be relaxed. */
1820 gcc_assert (!nested_in_vect_loop);
1822 /* Create the vector that holds the step of the induction. */
1823 expr = build_int_cst (scalar_type, nunits);
1824 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1825 t = NULL_TREE;
1826 for (i = 0; i < nunits; i++)
1827 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1828 gcc_assert (CONSTANT_CLASS_P (new_name));
1829 vec = build_vector (vectype, t);
1830 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1832 vec_def = induc_def;
1833 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1834 for (i = 1; i < ncopies; i++)
1836 /* vec_i = vec_prev + vec_step */
1837 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1838 vec_def, vec_step);
1839 vec_def = make_ssa_name (vec_dest, new_stmt);
1840 gimple_assign_set_lhs (new_stmt, vec_def);
1842 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1843 set_vinfo_for_stmt (new_stmt,
1844 new_stmt_vec_info (new_stmt, loop_vinfo));
1845 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1846 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1850 if (nested_in_vect_loop)
1852 /* Find the loop-closed exit-phi of the induction, and record
1853 the final vector of induction results: */
1854 exit_phi = NULL;
1855 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1857 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1859 exit_phi = USE_STMT (use_p);
1860 break;
1863 if (exit_phi)
1865 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1866 /* FORNOW. Currently not supporting the case that an inner-loop induction
1867 is not used in the outer-loop (i.e. only outside the outer-loop). */
1868 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1869 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1871 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1872 if (vect_print_dump_info (REPORT_DETAILS))
1874 fprintf (vect_dump, "vector of inductions after inner-loop:");
1875 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1881 if (vect_print_dump_info (REPORT_DETAILS))
1883 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1884 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1885 fprintf (vect_dump, "\n");
1886 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1889 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1890 return induc_def;
1894 /* Function vect_get_vec_def_for_operand.
1896 OP is an operand in STMT. This function returns a (vector) def that will be
1897 used in the vectorized stmt for STMT.
1899 In the case that OP is an SSA_NAME which is defined in the loop, then
1900 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1902 In case OP is an invariant or constant, a new stmt that creates a vector def
1903 needs to be introduced. */
1905 static tree
1906 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1908 tree vec_oprnd;
1909 gimple vec_stmt;
1910 gimple def_stmt;
1911 stmt_vec_info def_stmt_info = NULL;
1912 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1913 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1914 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1916 tree vec_inv;
1917 tree vec_cst;
1918 tree t = NULL_TREE;
1919 tree def;
1920 int i;
1921 enum vect_def_type dt;
1922 bool is_simple_use;
1923 tree vector_type;
1925 if (vect_print_dump_info (REPORT_DETAILS))
1927 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1928 print_generic_expr (vect_dump, op, TDF_SLIM);
1931 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1932 gcc_assert (is_simple_use);
1933 if (vect_print_dump_info (REPORT_DETAILS))
1935 if (def)
1937 fprintf (vect_dump, "def = ");
1938 print_generic_expr (vect_dump, def, TDF_SLIM);
1940 if (def_stmt)
1942 fprintf (vect_dump, " def_stmt = ");
1943 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1947 switch (dt)
1949 /* Case 1: operand is a constant. */
1950 case vect_constant_def:
1952 if (scalar_def)
1953 *scalar_def = op;
1955 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1956 if (vect_print_dump_info (REPORT_DETAILS))
1957 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1959 for (i = nunits - 1; i >= 0; --i)
1961 t = tree_cons (NULL_TREE, op, t);
1963 vec_cst = build_vector (vectype, t);
1964 return vect_init_vector (stmt, vec_cst, vectype, NULL);
1967 /* Case 2: operand is defined outside the loop - loop invariant. */
1968 case vect_invariant_def:
1970 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1971 gcc_assert (vector_type);
1972 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1974 if (scalar_def)
1975 *scalar_def = def;
1977 /* Create 'vec_inv = {inv,inv,..,inv}' */
1978 if (vect_print_dump_info (REPORT_DETAILS))
1979 fprintf (vect_dump, "Create vector_inv.");
1981 for (i = nunits - 1; i >= 0; --i)
1983 t = tree_cons (NULL_TREE, def, t);
1986 /* FIXME: use build_constructor directly. */
1987 vec_inv = build_constructor_from_list (vector_type, t);
1988 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1991 /* Case 3: operand is defined inside the loop. */
1992 case vect_loop_def:
1994 if (scalar_def)
1995 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1997 /* Get the def from the vectorized stmt. */
1998 def_stmt_info = vinfo_for_stmt (def_stmt);
1999 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2000 gcc_assert (vec_stmt);
2001 if (gimple_code (vec_stmt) == GIMPLE_PHI)
2002 vec_oprnd = PHI_RESULT (vec_stmt);
2003 else if (is_gimple_call (vec_stmt))
2004 vec_oprnd = gimple_call_lhs (vec_stmt);
2005 else
2006 vec_oprnd = gimple_assign_lhs (vec_stmt);
2007 return vec_oprnd;
2010 /* Case 4: operand is defined by a loop header phi - reduction */
2011 case vect_reduction_def:
2013 struct loop *loop;
2015 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2016 loop = (gimple_bb (def_stmt))->loop_father;
2018 /* Get the def before the loop */
2019 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
2020 return get_initial_def_for_reduction (stmt, op, scalar_def);
2023 /* Case 5: operand is defined by loop-header phi - induction. */
2024 case vect_induction_def:
2026 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2028 /* Get the def from the vectorized stmt. */
2029 def_stmt_info = vinfo_for_stmt (def_stmt);
2030 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2031 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2032 vec_oprnd = PHI_RESULT (vec_stmt);
2033 return vec_oprnd;
2036 default:
2037 gcc_unreachable ();
2042 /* Function vect_get_vec_def_for_stmt_copy
2044 Return a vector-def for an operand. This function is used when the
2045 vectorized stmt to be created (by the caller to this function) is a "copy"
2046 created in case the vectorized result cannot fit in one vector, and several
2047 copies of the vector-stmt are required. In this case the vector-def is
2048 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2049 of the stmt that defines VEC_OPRND.
2050 DT is the type of the vector def VEC_OPRND.
2052 Context:
2053 In case the vectorization factor (VF) is bigger than the number
2054 of elements that can fit in a vectype (nunits), we have to generate
2055 more than one vector stmt to vectorize the scalar stmt. This situation
2056 arises when there are multiple data-types operated upon in the loop; the
2057 smallest data-type determines the VF, and as a result, when vectorizing
2058 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2059 vector stmt (each computing a vector of 'nunits' results, and together
2060 computing 'VF' results in each iteration). This function is called when
2061 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2062 which VF=16 and nunits=4, so the number of copies required is 4):
2064 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2066 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2067 VS1.1: vx.1 = memref1 VS1.2
2068 VS1.2: vx.2 = memref2 VS1.3
2069 VS1.3: vx.3 = memref3
2071 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2072 VSnew.1: vz1 = vx.1 + ... VSnew.2
2073 VSnew.2: vz2 = vx.2 + ... VSnew.3
2074 VSnew.3: vz3 = vx.3 + ...
2076 The vectorization of S1 is explained in vectorizable_load.
2077 The vectorization of S2:
2078 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2079 the function 'vect_get_vec_def_for_operand' is called to
2080 get the relevant vector-def for each operand of S2. For operand x it
2081 returns the vector-def 'vx.0'.
2083 To create the remaining copies of the vector-stmt (VSnew.j), this
2084 function is called to get the relevant vector-def for each operand. It is
2085 obtained from the respective VS1.j stmt, which is recorded in the
2086 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2088 For example, to obtain the vector-def 'vx.1' in order to create the
2089 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2090 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
2091 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2092 and return its def ('vx.1').
2093 Overall, to create the above sequence this function will be called 3 times:
2094 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2095 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2096 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2098 static tree
2099 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2101 gimple vec_stmt_for_operand;
2102 stmt_vec_info def_stmt_info;
2104 /* Do nothing; can reuse same def. */
2105 if (dt == vect_invariant_def || dt == vect_constant_def )
2106 return vec_oprnd;
2108 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2109 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2110 gcc_assert (def_stmt_info);
2111 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2112 gcc_assert (vec_stmt_for_operand);
2113 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2114 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2115 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2116 else
2117 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2118 return vec_oprnd;
2122 /* Get vectorized definitions for the operands to create a copy of an original
2123 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2125 static void
2126 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2127 VEC(tree,heap) **vec_oprnds0,
2128 VEC(tree,heap) **vec_oprnds1)
2130 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2132 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2133 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2135 if (vec_oprnds1 && *vec_oprnds1)
2137 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2138 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2139 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2144 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2146 static void
2147 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2148 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2149 slp_tree slp_node)
2151 if (slp_node)
2152 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2153 else
2155 tree vec_oprnd;
2157 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2158 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2159 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2161 if (op1)
2163 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2164 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2165 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2171 /* Function vect_finish_stmt_generation.
2173 Insert a new stmt. */
2175 static void
2176 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2177 gimple_stmt_iterator *gsi)
2179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2182 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2184 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2186 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2188 if (vect_print_dump_info (REPORT_DETAILS))
2190 fprintf (vect_dump, "add new stmt: ");
2191 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2194 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
2198 /* Function get_initial_def_for_reduction
2200 Input:
2201 STMT - a stmt that performs a reduction operation in the loop.
2202 INIT_VAL - the initial value of the reduction variable
2204 Output:
2205 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2206 of the reduction (used for adjusting the epilog - see below).
2207 Return a vector variable, initialized according to the operation that STMT
2208 performs. This vector will be used as the initial value of the
2209 vector of partial results.
2211 Option1 (adjust in epilog): Initialize the vector as follows:
2212 add: [0,0,...,0,0]
2213 mult: [1,1,...,1,1]
2214 min/max: [init_val,init_val,..,init_val,init_val]
2215 bit and/or: [init_val,init_val,..,init_val,init_val]
2216 and when necessary (e.g. add/mult case) let the caller know
2217 that it needs to adjust the result by init_val.
2219 Option2: Initialize the vector as follows:
2220 add: [0,0,...,0,init_val]
2221 mult: [1,1,...,1,init_val]
2222 min/max: [init_val,init_val,...,init_val]
2223 bit and/or: [init_val,init_val,...,init_val]
2224 and no adjustments are needed.
2226 For example, for the following code:
2228 s = init_val;
2229 for (i=0;i<n;i++)
2230 s = s + a[i];
2232 STMT is 's = s + a[i]', and the reduction variable is 's'.
2233 For a vector of 4 units, we want to return either [0,0,0,init_val],
2234 or [0,0,0,0] and let the caller know that it needs to adjust
2235 the result at the end by 'init_val'.
2237 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2238 initialization vector is simpler (same element in all entries).
2239 A cost model should help decide between these two schemes. */
2241 static tree
2242 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2244 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2245 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2246 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2247 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2248 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2249 tree scalar_type = TREE_TYPE (vectype);
2250 enum tree_code code = gimple_assign_rhs_code (stmt);
2251 tree type = TREE_TYPE (init_val);
2252 tree vecdef;
2253 tree def_for_init;
2254 tree init_def;
2255 tree t = NULL_TREE;
2256 int i;
2257 bool nested_in_vect_loop = false;
2259 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2260 if (nested_in_vect_loop_p (loop, stmt))
2261 nested_in_vect_loop = true;
2262 else
2263 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2265 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2267 switch (code)
2269 case WIDEN_SUM_EXPR:
2270 case DOT_PROD_EXPR:
2271 case PLUS_EXPR:
2272 if (nested_in_vect_loop)
2273 *adjustment_def = vecdef;
2274 else
2275 *adjustment_def = init_val;
2276 /* Create a vector of zeros for init_def. */
2277 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2278 def_for_init = build_real (scalar_type, dconst0);
2279 else
2280 def_for_init = build_int_cst (scalar_type, 0);
2282 for (i = nunits - 1; i >= 0; --i)
2283 t = tree_cons (NULL_TREE, def_for_init, t);
2284 init_def = build_vector (vectype, t);
2285 break;
2287 case MIN_EXPR:
2288 case MAX_EXPR:
2289 *adjustment_def = NULL_TREE;
2290 init_def = vecdef;
2291 break;
2293 default:
2294 gcc_unreachable ();
2297 return init_def;
2301 /* Function vect_create_epilog_for_reduction
2303 Create code at the loop-epilog to finalize the result of a reduction
2304 computation.
2306 VECT_DEF is a vector of partial results.
2307 REDUC_CODE is the tree-code for the epilog reduction.
2308 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2309 number of elements that we can fit in a vectype (nunits). In this case
2310 we have to generate more than one vector stmt - i.e - we need to "unroll"
2311 the vector stmt by a factor VF/nunits. For more details see documentation
2312 in vectorizable_operation.
2313 STMT is the scalar reduction stmt that is being vectorized.
2314 REDUCTION_PHI is the phi-node that carries the reduction computation.
2316 This function:
2317 1. Creates the reduction def-use cycle: sets the arguments for
2318 REDUCTION_PHI:
2319 The loop-entry argument is the vectorized initial-value of the reduction.
2320 The loop-latch argument is VECT_DEF - the vector of partial sums.
2321 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2322 by applying the operation specified by REDUC_CODE if available, or by
2323 other means (whole-vector shifts or a scalar loop).
2324 The function also creates a new phi node at the loop exit to preserve
2325 loop-closed form, as illustrated below.
2327 The flow at the entry to this function:
2329 loop:
2330 vec_def = phi <null, null> # REDUCTION_PHI
2331 VECT_DEF = vector_stmt # vectorized form of STMT
2332 s_loop = scalar_stmt # (scalar) STMT
2333 loop_exit:
2334 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2335 use <s_out0>
2336 use <s_out0>
2338 The above is transformed by this function into:
2340 loop:
2341 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2342 VECT_DEF = vector_stmt # vectorized form of STMT
2343 s_loop = scalar_stmt # (scalar) STMT
2344 loop_exit:
2345 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2346 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2347 v_out2 = reduce <v_out1>
2348 s_out3 = extract_field <v_out2, 0>
2349 s_out4 = adjust_result <s_out3>
2350 use <s_out4>
2351 use <s_out4>
2354 static void
2355 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2356 int ncopies,
2357 enum tree_code reduc_code,
2358 gimple reduction_phi)
2360 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2361 stmt_vec_info prev_phi_info;
2362 tree vectype;
2363 enum machine_mode mode;
2364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2366 basic_block exit_bb;
2367 tree scalar_dest;
2368 tree scalar_type;
2369 gimple new_phi = NULL, phi;
2370 gimple_stmt_iterator exit_gsi;
2371 tree vec_dest;
2372 tree new_temp = NULL_TREE;
2373 tree new_name;
2374 gimple epilog_stmt = NULL;
2375 tree new_scalar_dest, new_dest;
2376 gimple exit_phi;
2377 tree bitsize, bitpos, bytesize;
2378 enum tree_code code = gimple_assign_rhs_code (stmt);
2379 tree adjustment_def;
2380 tree vec_initial_def, def;
2381 tree orig_name;
2382 imm_use_iterator imm_iter;
2383 use_operand_p use_p;
2384 bool extract_scalar_result = false;
2385 tree reduction_op, expr;
2386 gimple orig_stmt;
2387 gimple use_stmt;
2388 bool nested_in_vect_loop = false;
2389 VEC(gimple,heap) *phis = NULL;
2390 enum vect_def_type dt = vect_unknown_def_type;
2391 int j, i;
2393 if (nested_in_vect_loop_p (loop, stmt))
2395 loop = loop->inner;
2396 nested_in_vect_loop = true;
2399 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2401 case GIMPLE_SINGLE_RHS:
2402 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2403 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2404 break;
2405 case GIMPLE_UNARY_RHS:
2406 reduction_op = gimple_assign_rhs1 (stmt);
2407 break;
2408 case GIMPLE_BINARY_RHS:
2409 reduction_op = gimple_assign_rhs2 (stmt);
2410 break;
2411 default:
2412 gcc_unreachable ();
2415 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2416 gcc_assert (vectype);
2417 mode = TYPE_MODE (vectype);
2419 /*** 1. Create the reduction def-use cycle ***/
2421 /* For the case of reduction, vect_get_vec_def_for_operand returns
2422 the scalar def before the loop, that defines the initial value
2423 of the reduction variable. */
2424 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2425 &adjustment_def);
2427 phi = reduction_phi;
2428 def = vect_def;
2429 for (j = 0; j < ncopies; j++)
2431 /* 1.1 set the loop-entry arg of the reduction-phi: */
2432 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2434 /* 1.2 set the loop-latch arg for the reduction-phi: */
2435 if (j > 0)
2436 def = vect_get_vec_def_for_stmt_copy (dt, def);
2437 add_phi_arg (phi, def, loop_latch_edge (loop));
2439 if (vect_print_dump_info (REPORT_DETAILS))
2441 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2442 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2443 fprintf (vect_dump, "\n");
2444 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2447 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2450 /*** 2. Create epilog code
2451 The reduction epilog code operates across the elements of the vector
2452 of partial results computed by the vectorized loop.
2453 The reduction epilog code consists of:
2454 step 1: compute the scalar result in a vector (v_out2)
2455 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2456 step 3: adjust the scalar result (s_out3) if needed.
2458 Step 1 can be accomplished using one the following three schemes:
2459 (scheme 1) using reduc_code, if available.
2460 (scheme 2) using whole-vector shifts, if available.
2461 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2462 combined.
2464 The overall epilog code looks like this:
2466 s_out0 = phi <s_loop> # original EXIT_PHI
2467 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2468 v_out2 = reduce <v_out1> # step 1
2469 s_out3 = extract_field <v_out2, 0> # step 2
2470 s_out4 = adjust_result <s_out3> # step 3
2472 (step 3 is optional, and steps 1 and 2 may be combined).
2473 Lastly, the uses of s_out0 are replaced by s_out4.
2475 ***/
2477 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2478 v_out1 = phi <v_loop> */
2480 exit_bb = single_exit (loop)->dest;
2481 def = vect_def;
2482 prev_phi_info = NULL;
2483 for (j = 0; j < ncopies; j++)
2485 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2486 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2487 if (j == 0)
2488 new_phi = phi;
2489 else
2491 def = vect_get_vec_def_for_stmt_copy (dt, def);
2492 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2494 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2495 prev_phi_info = vinfo_for_stmt (phi);
2497 exit_gsi = gsi_after_labels (exit_bb);
2499 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2500 (i.e. when reduc_code is not available) and in the final adjustment
2501 code (if needed). Also get the original scalar reduction variable as
2502 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2503 represents a reduction pattern), the tree-code and scalar-def are
2504 taken from the original stmt that the pattern-stmt (STMT) replaces.
2505 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2506 are taken from STMT. */
2508 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2509 if (!orig_stmt)
2511 /* Regular reduction */
2512 orig_stmt = stmt;
2514 else
2516 /* Reduction pattern */
2517 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2518 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2519 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2521 code = gimple_assign_rhs_code (orig_stmt);
2522 scalar_dest = gimple_assign_lhs (orig_stmt);
2523 scalar_type = TREE_TYPE (scalar_dest);
2524 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2525 bitsize = TYPE_SIZE (scalar_type);
2526 bytesize = TYPE_SIZE_UNIT (scalar_type);
2529 /* In case this is a reduction in an inner-loop while vectorizing an outer
2530 loop - we don't need to extract a single scalar result at the end of the
2531 inner-loop. The final vector of partial results will be used in the
2532 vectorized outer-loop, or reduced to a scalar result at the end of the
2533 outer-loop. */
2534 if (nested_in_vect_loop)
2535 goto vect_finalize_reduction;
2537 /* FORNOW */
2538 gcc_assert (ncopies == 1);
2540 /* 2.3 Create the reduction code, using one of the three schemes described
2541 above. */
2543 if (reduc_code < NUM_TREE_CODES)
2545 tree tmp;
2547 /*** Case 1: Create:
2548 v_out2 = reduc_expr <v_out1> */
2550 if (vect_print_dump_info (REPORT_DETAILS))
2551 fprintf (vect_dump, "Reduce using direct vector reduction.");
2553 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2554 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2555 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2556 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2557 gimple_assign_set_lhs (epilog_stmt, new_temp);
2558 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2560 extract_scalar_result = true;
2562 else
2564 enum tree_code shift_code = 0;
2565 bool have_whole_vector_shift = true;
2566 int bit_offset;
2567 int element_bitsize = tree_low_cst (bitsize, 1);
2568 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2569 tree vec_temp;
2571 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2572 shift_code = VEC_RSHIFT_EXPR;
2573 else
2574 have_whole_vector_shift = false;
2576 /* Regardless of whether we have a whole vector shift, if we're
2577 emulating the operation via tree-vect-generic, we don't want
2578 to use it. Only the first round of the reduction is likely
2579 to still be profitable via emulation. */
2580 /* ??? It might be better to emit a reduction tree code here, so that
2581 tree-vect-generic can expand the first round via bit tricks. */
2582 if (!VECTOR_MODE_P (mode))
2583 have_whole_vector_shift = false;
2584 else
2586 optab optab = optab_for_tree_code (code, vectype, optab_default);
2587 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2588 have_whole_vector_shift = false;
2591 if (have_whole_vector_shift)
2593 /*** Case 2: Create:
2594 for (offset = VS/2; offset >= element_size; offset/=2)
2596 Create: va' = vec_shift <va, offset>
2597 Create: va = vop <va, va'>
2598 } */
2600 if (vect_print_dump_info (REPORT_DETAILS))
2601 fprintf (vect_dump, "Reduce using vector shifts");
2603 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2604 new_temp = PHI_RESULT (new_phi);
2606 for (bit_offset = vec_size_in_bits/2;
2607 bit_offset >= element_bitsize;
2608 bit_offset /= 2)
2610 tree bitpos = size_int (bit_offset);
2611 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2612 new_temp, bitpos);
2613 new_name = make_ssa_name (vec_dest, epilog_stmt);
2614 gimple_assign_set_lhs (epilog_stmt, new_name);
2615 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2617 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2618 new_name, new_temp);
2619 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2620 gimple_assign_set_lhs (epilog_stmt, new_temp);
2621 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2624 extract_scalar_result = true;
2626 else
2628 tree rhs;
2630 /*** Case 3: Create:
2631 s = extract_field <v_out2, 0>
2632 for (offset = element_size;
2633 offset < vector_size;
2634 offset += element_size;)
2636 Create: s' = extract_field <v_out2, offset>
2637 Create: s = op <s, s'>
2638 } */
2640 if (vect_print_dump_info (REPORT_DETAILS))
2641 fprintf (vect_dump, "Reduce using scalar code. ");
2643 vec_temp = PHI_RESULT (new_phi);
2644 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2645 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2646 bitsize_zero_node);
2647 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2648 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2649 gimple_assign_set_lhs (epilog_stmt, new_temp);
2650 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2652 for (bit_offset = element_bitsize;
2653 bit_offset < vec_size_in_bits;
2654 bit_offset += element_bitsize)
2656 tree bitpos = bitsize_int (bit_offset);
2657 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2658 bitpos);
2660 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2661 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2662 gimple_assign_set_lhs (epilog_stmt, new_name);
2663 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2665 epilog_stmt = gimple_build_assign_with_ops (code,
2666 new_scalar_dest,
2667 new_name, new_temp);
2668 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2669 gimple_assign_set_lhs (epilog_stmt, new_temp);
2670 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2673 extract_scalar_result = false;
2677 /* 2.4 Extract the final scalar result. Create:
2678 s_out3 = extract_field <v_out2, bitpos> */
2680 if (extract_scalar_result)
2682 tree rhs;
2684 gcc_assert (!nested_in_vect_loop);
2685 if (vect_print_dump_info (REPORT_DETAILS))
2686 fprintf (vect_dump, "extract scalar result");
2688 if (BYTES_BIG_ENDIAN)
2689 bitpos = size_binop (MULT_EXPR,
2690 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2691 TYPE_SIZE (scalar_type));
2692 else
2693 bitpos = bitsize_zero_node;
2695 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2696 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2697 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2698 gimple_assign_set_lhs (epilog_stmt, new_temp);
2699 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2702 vect_finalize_reduction:
2704 /* 2.5 Adjust the final result by the initial value of the reduction
2705 variable. (When such adjustment is not needed, then
2706 'adjustment_def' is zero). For example, if code is PLUS we create:
2707 new_temp = loop_exit_def + adjustment_def */
2709 if (adjustment_def)
2711 if (nested_in_vect_loop)
2713 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2714 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2715 new_dest = vect_create_destination_var (scalar_dest, vectype);
2717 else
2719 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2720 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2721 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2723 epilog_stmt = gimple_build_assign (new_dest, expr);
2724 new_temp = make_ssa_name (new_dest, epilog_stmt);
2725 gimple_assign_set_lhs (epilog_stmt, new_temp);
2726 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2727 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2731 /* 2.6 Handle the loop-exit phi */
2733 /* Replace uses of s_out0 with uses of s_out3:
2734 Find the loop-closed-use at the loop exit of the original scalar result.
2735 (The reduction result is expected to have two immediate uses - one at the
2736 latch block, and one at the loop exit). */
2737 phis = VEC_alloc (gimple, heap, 10);
2738 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2740 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2742 exit_phi = USE_STMT (use_p);
2743 VEC_quick_push (gimple, phis, exit_phi);
2746 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2747 gcc_assert (!VEC_empty (gimple, phis));
2749 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2751 if (nested_in_vect_loop)
2753 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2755 /* FORNOW. Currently not supporting the case that an inner-loop
2756 reduction is not used in the outer-loop (but only outside the
2757 outer-loop). */
2758 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2759 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2761 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2762 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2763 set_vinfo_for_stmt (epilog_stmt,
2764 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2765 if (adjustment_def)
2766 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2767 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2768 continue;
2771 /* Replace the uses: */
2772 orig_name = PHI_RESULT (exit_phi);
2773 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2774 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2775 SET_USE (use_p, new_temp);
2777 VEC_free (gimple, heap, phis);
2781 /* Function vectorizable_reduction.
2783 Check if STMT performs a reduction operation that can be vectorized.
2784 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2785 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2786 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2788 This function also handles reduction idioms (patterns) that have been
2789 recognized in advance during vect_pattern_recog. In this case, STMT may be
2790 of this form:
2791 X = pattern_expr (arg0, arg1, ..., X)
2792 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
2793 sequence that had been detected and replaced by the pattern-stmt (STMT).
2795 In some cases of reduction patterns, the type of the reduction variable X is
2796 different than the type of the other arguments of STMT.
2797 In such cases, the vectype that is used when transforming STMT into a vector
2798 stmt is different than the vectype that is used to determine the
2799 vectorization factor, because it consists of a different number of elements
2800 than the actual number of elements that are being operated upon in parallel.
2802 For example, consider an accumulation of shorts into an int accumulator.
2803 On some targets it's possible to vectorize this pattern operating on 8
2804 shorts at a time (hence, the vectype for purposes of determining the
2805 vectorization factor should be V8HI); on the other hand, the vectype that
2806 is used to create the vector form is actually V4SI (the type of the result).
2808 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2809 indicates what is the actual level of parallelism (V8HI in the example), so
2810 that the right vectorization factor would be derived. This vectype
2811 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2812 be used to create the vectorized stmt. The right vectype for the vectorized
2813 stmt is obtained from the type of the result X:
2814 get_vectype_for_scalar_type (TREE_TYPE (X))
2816 This means that, contrary to "regular" reductions (or "regular" stmts in
2817 general), the following equation:
2818 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2819 does *NOT* necessarily hold for reduction patterns. */
2821 bool
2822 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2823 gimple *vec_stmt)
2825 tree vec_dest;
2826 tree scalar_dest;
2827 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2828 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2832 enum tree_code code, orig_code, epilog_reduc_code = 0;
2833 enum machine_mode vec_mode;
2834 int op_type;
2835 optab optab, reduc_optab;
2836 tree new_temp = NULL_TREE;
2837 tree def;
2838 gimple def_stmt;
2839 enum vect_def_type dt;
2840 gimple new_phi = NULL;
2841 tree scalar_type;
2842 bool is_simple_use;
2843 gimple orig_stmt;
2844 stmt_vec_info orig_stmt_info;
2845 tree expr = NULL_TREE;
2846 int i;
2847 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2848 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2849 int epilog_copies;
2850 stmt_vec_info prev_stmt_info, prev_phi_info;
2851 gimple first_phi = NULL;
2852 bool single_defuse_cycle = false;
2853 tree reduc_def;
2854 gimple new_stmt = NULL;
2855 int j;
2856 tree ops[3];
2858 if (nested_in_vect_loop_p (loop, stmt))
2859 loop = loop->inner;
2861 gcc_assert (ncopies >= 1);
2863 /* FORNOW: SLP not supported. */
2864 if (STMT_SLP_TYPE (stmt_info))
2865 return false;
2867 /* 1. Is vectorizable reduction? */
2869 /* Not supportable if the reduction variable is used in the loop. */
2870 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2871 return false;
2873 /* Reductions that are not used even in an enclosing outer-loop,
2874 are expected to be "live" (used out of the loop). */
2875 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2876 && !STMT_VINFO_LIVE_P (stmt_info))
2877 return false;
2879 /* Make sure it was already recognized as a reduction computation. */
2880 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2881 return false;
2883 /* 2. Has this been recognized as a reduction pattern?
2885 Check if STMT represents a pattern that has been recognized
2886 in earlier analysis stages. For stmts that represent a pattern,
2887 the STMT_VINFO_RELATED_STMT field records the last stmt in
2888 the original sequence that constitutes the pattern. */
2890 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2891 if (orig_stmt)
2893 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2894 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2895 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2896 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2899 /* 3. Check the operands of the operation. The first operands are defined
2900 inside the loop body. The last operand is the reduction variable,
2901 which is defined by the loop-header-phi. */
2903 gcc_assert (is_gimple_assign (stmt));
2905 /* Flatten RHS */
2906 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2908 case GIMPLE_SINGLE_RHS:
2909 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2910 if (op_type == ternary_op)
2912 tree rhs = gimple_assign_rhs1 (stmt);
2913 ops[0] = TREE_OPERAND (rhs, 0);
2914 ops[1] = TREE_OPERAND (rhs, 1);
2915 ops[2] = TREE_OPERAND (rhs, 2);
2916 code = TREE_CODE (rhs);
2918 else
2919 return false;
2920 break;
2922 case GIMPLE_BINARY_RHS:
2923 code = gimple_assign_rhs_code (stmt);
2924 op_type = TREE_CODE_LENGTH (code);
2925 gcc_assert (op_type == binary_op);
2926 ops[0] = gimple_assign_rhs1 (stmt);
2927 ops[1] = gimple_assign_rhs2 (stmt);
2928 break;
2930 case GIMPLE_UNARY_RHS:
2931 return false;
2933 default:
2934 gcc_unreachable ();
2937 scalar_dest = gimple_assign_lhs (stmt);
2938 scalar_type = TREE_TYPE (scalar_dest);
2939 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2940 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2941 return false;
2943 /* All uses but the last are expected to be defined in the loop.
2944 The last use is the reduction variable. */
2945 for (i = 0; i < op_type-1; i++)
2947 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2948 &def, &dt);
2949 gcc_assert (is_simple_use);
2950 if (dt != vect_loop_def
2951 && dt != vect_invariant_def
2952 && dt != vect_constant_def
2953 && dt != vect_induction_def)
2954 return false;
2957 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2958 gcc_assert (is_simple_use);
2959 gcc_assert (dt == vect_reduction_def);
2960 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2961 if (orig_stmt)
2962 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2963 else
2964 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2966 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2967 return false;
2969 /* 4. Supportable by target? */
2971 /* 4.1. check support for the operation in the loop */
2972 optab = optab_for_tree_code (code, vectype, optab_default);
2973 if (!optab)
2975 if (vect_print_dump_info (REPORT_DETAILS))
2976 fprintf (vect_dump, "no optab.");
2977 return false;
2979 vec_mode = TYPE_MODE (vectype);
2980 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2982 if (vect_print_dump_info (REPORT_DETAILS))
2983 fprintf (vect_dump, "op not supported by target.");
2984 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2985 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2986 < vect_min_worthwhile_factor (code))
2987 return false;
2988 if (vect_print_dump_info (REPORT_DETAILS))
2989 fprintf (vect_dump, "proceeding using word mode.");
2992 /* Worthwhile without SIMD support? */
2993 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2994 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2995 < vect_min_worthwhile_factor (code))
2997 if (vect_print_dump_info (REPORT_DETAILS))
2998 fprintf (vect_dump, "not worthwhile without SIMD support.");
2999 return false;
3002 /* 4.2. Check support for the epilog operation.
3004 If STMT represents a reduction pattern, then the type of the
3005 reduction variable may be different than the type of the rest
3006 of the arguments. For example, consider the case of accumulation
3007 of shorts into an int accumulator; The original code:
3008 S1: int_a = (int) short_a;
3009 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3011 was replaced with:
3012 STMT: int_acc = widen_sum <short_a, int_acc>
3014 This means that:
3015 1. The tree-code that is used to create the vector operation in the
3016 epilog code (that reduces the partial results) is not the
3017 tree-code of STMT, but is rather the tree-code of the original
3018 stmt from the pattern that STMT is replacing. I.e, in the example
3019 above we want to use 'widen_sum' in the loop, but 'plus' in the
3020 epilog.
3021 2. The type (mode) we use to check available target support
3022 for the vector operation to be created in the *epilog*, is
3023 determined by the type of the reduction variable (in the example
3024 above we'd check this: plus_optab[vect_int_mode]).
3025 However the type (mode) we use to check available target support
3026 for the vector operation to be created *inside the loop*, is
3027 determined by the type of the other arguments to STMT (in the
3028 example we'd check this: widen_sum_optab[vect_short_mode]).
3030 This is contrary to "regular" reductions, in which the types of all
3031 the arguments are the same as the type of the reduction variable.
3032 For "regular" reductions we can therefore use the same vector type
3033 (and also the same tree-code) when generating the epilog code and
3034 when generating the code inside the loop. */
3036 if (orig_stmt)
3038 /* This is a reduction pattern: get the vectype from the type of the
3039 reduction variable, and get the tree-code from orig_stmt. */
3040 orig_code = gimple_assign_rhs_code (orig_stmt);
3041 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3042 if (!vectype)
3044 if (vect_print_dump_info (REPORT_DETAILS))
3046 fprintf (vect_dump, "unsupported data-type ");
3047 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3049 return false;
3052 vec_mode = TYPE_MODE (vectype);
3054 else
3056 /* Regular reduction: use the same vectype and tree-code as used for
3057 the vector code inside the loop can be used for the epilog code. */
3058 orig_code = code;
3061 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3062 return false;
3063 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3064 if (!reduc_optab)
3066 if (vect_print_dump_info (REPORT_DETAILS))
3067 fprintf (vect_dump, "no optab for reduction.");
3068 epilog_reduc_code = NUM_TREE_CODES;
3070 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3072 if (vect_print_dump_info (REPORT_DETAILS))
3073 fprintf (vect_dump, "reduc op not supported by target.");
3074 epilog_reduc_code = NUM_TREE_CODES;
3077 if (!vec_stmt) /* transformation not required. */
3079 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3080 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3081 return false;
3082 return true;
3085 /** Transform. **/
3087 if (vect_print_dump_info (REPORT_DETAILS))
3088 fprintf (vect_dump, "transform reduction.");
3090 /* Create the destination vector */
3091 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3093 /* In case the vectorization factor (VF) is bigger than the number
3094 of elements that we can fit in a vectype (nunits), we have to generate
3095 more than one vector stmt - i.e - we need to "unroll" the
3096 vector stmt by a factor VF/nunits. For more details see documentation
3097 in vectorizable_operation. */
3099 /* If the reduction is used in an outer loop we need to generate
3100 VF intermediate results, like so (e.g. for ncopies=2):
3101 r0 = phi (init, r0)
3102 r1 = phi (init, r1)
3103 r0 = x0 + r0;
3104 r1 = x1 + r1;
3105 (i.e. we generate VF results in 2 registers).
3106 In this case we have a separate def-use cycle for each copy, and therefore
3107 for each copy we get the vector def for the reduction variable from the
3108 respective phi node created for this copy.
3110 Otherwise (the reduction is unused in the loop nest), we can combine
3111 together intermediate results, like so (e.g. for ncopies=2):
3112 r = phi (init, r)
3113 r = x0 + r;
3114 r = x1 + r;
3115 (i.e. we generate VF/2 results in a single register).
3116 In this case for each copy we get the vector def for the reduction variable
3117 from the vectorized reduction operation generated in the previous iteration.
3120 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3122 single_defuse_cycle = true;
3123 epilog_copies = 1;
3125 else
3126 epilog_copies = ncopies;
3128 prev_stmt_info = NULL;
3129 prev_phi_info = NULL;
3130 for (j = 0; j < ncopies; j++)
3132 if (j == 0 || !single_defuse_cycle)
3134 /* Create the reduction-phi that defines the reduction-operand. */
3135 new_phi = create_phi_node (vec_dest, loop->header);
3136 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3139 /* Handle uses. */
3140 if (j == 0)
3142 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3143 if (op_type == ternary_op)
3145 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3148 /* Get the vector def for the reduction variable from the phi node */
3149 reduc_def = PHI_RESULT (new_phi);
3150 first_phi = new_phi;
3152 else
3154 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3155 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3156 if (op_type == ternary_op)
3157 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3159 if (single_defuse_cycle)
3160 reduc_def = gimple_assign_lhs (new_stmt);
3161 else
3162 reduc_def = PHI_RESULT (new_phi);
3164 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3167 /* Arguments are ready. create the new vector stmt. */
3168 if (op_type == binary_op)
3169 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3170 else
3171 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3172 reduc_def);
3173 new_stmt = gimple_build_assign (vec_dest, expr);
3174 new_temp = make_ssa_name (vec_dest, new_stmt);
3175 gimple_assign_set_lhs (new_stmt, new_temp);
3176 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3178 if (j == 0)
3179 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3180 else
3181 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3182 prev_stmt_info = vinfo_for_stmt (new_stmt);
3183 prev_phi_info = vinfo_for_stmt (new_phi);
3186 /* Finalize the reduction-phi (set its arguments) and create the
3187 epilog reduction code. */
3188 if (!single_defuse_cycle)
3189 new_temp = gimple_assign_lhs (*vec_stmt);
3190 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3191 epilog_reduc_code, first_phi);
3192 return true;
3195 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3196 a function declaration if the target has a vectorized version
3197 of the function, or NULL_TREE if the function cannot be vectorized. */
3199 tree
3200 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3202 tree fndecl = gimple_call_fndecl (call);
3203 enum built_in_function code;
3205 /* We only handle functions that do not read or clobber memory -- i.e.
3206 const or novops ones. */
3207 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3208 return NULL_TREE;
3210 if (!fndecl
3211 || TREE_CODE (fndecl) != FUNCTION_DECL
3212 || !DECL_BUILT_IN (fndecl))
3213 return NULL_TREE;
3215 code = DECL_FUNCTION_CODE (fndecl);
3216 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3217 vectype_in);
3220 /* Function vectorizable_call.
3222 Check if STMT performs a function call that can be vectorized.
3223 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3224 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3225 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3227 bool
3228 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3230 tree vec_dest;
3231 tree scalar_dest;
3232 tree op, type;
3233 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3234 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3235 tree vectype_out, vectype_in;
3236 int nunits_in;
3237 int nunits_out;
3238 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3239 tree fndecl, new_temp, def, rhs_type, lhs_type;
3240 gimple def_stmt;
3241 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3242 gimple new_stmt;
3243 int ncopies, j;
3244 VEC(tree, heap) *vargs = NULL;
3245 enum { NARROW, NONE, WIDEN } modifier;
3246 size_t i, nargs;
3248 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3249 return false;
3251 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3252 return false;
3254 /* FORNOW: SLP not supported. */
3255 if (STMT_SLP_TYPE (stmt_info))
3256 return false;
3258 /* Is STMT a vectorizable call? */
3259 if (!is_gimple_call (stmt))
3260 return false;
3262 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3263 return false;
3265 /* Process function arguments. */
3266 rhs_type = NULL_TREE;
3267 nargs = gimple_call_num_args (stmt);
3269 /* Bail out if the function has more than two arguments, we
3270 do not have interesting builtin functions to vectorize with
3271 more than two arguments. No arguments is also not good. */
3272 if (nargs == 0 || nargs > 2)
3273 return false;
3275 for (i = 0; i < nargs; i++)
3277 op = gimple_call_arg (stmt, i);
3279 /* We can only handle calls with arguments of the same type. */
3280 if (rhs_type
3281 && rhs_type != TREE_TYPE (op))
3283 if (vect_print_dump_info (REPORT_DETAILS))
3284 fprintf (vect_dump, "argument types differ.");
3285 return false;
3287 rhs_type = TREE_TYPE (op);
3289 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3291 if (vect_print_dump_info (REPORT_DETAILS))
3292 fprintf (vect_dump, "use not simple.");
3293 return false;
3297 vectype_in = get_vectype_for_scalar_type (rhs_type);
3298 if (!vectype_in)
3299 return false;
3300 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3302 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3303 vectype_out = get_vectype_for_scalar_type (lhs_type);
3304 if (!vectype_out)
3305 return false;
3306 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3308 /* FORNOW */
3309 if (nunits_in == nunits_out / 2)
3310 modifier = NARROW;
3311 else if (nunits_out == nunits_in)
3312 modifier = NONE;
3313 else if (nunits_out == nunits_in / 2)
3314 modifier = WIDEN;
3315 else
3316 return false;
3318 /* For now, we only vectorize functions if a target specific builtin
3319 is available. TODO -- in some cases, it might be profitable to
3320 insert the calls for pieces of the vector, in order to be able
3321 to vectorize other operations in the loop. */
3322 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3323 if (fndecl == NULL_TREE)
3325 if (vect_print_dump_info (REPORT_DETAILS))
3326 fprintf (vect_dump, "function is not vectorizable.");
3328 return false;
3331 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3333 if (modifier == NARROW)
3334 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3335 else
3336 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3338 /* Sanity check: make sure that at least one copy of the vectorized stmt
3339 needs to be generated. */
3340 gcc_assert (ncopies >= 1);
3342 if (!vec_stmt) /* transformation not required. */
3344 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3345 if (vect_print_dump_info (REPORT_DETAILS))
3346 fprintf (vect_dump, "=== vectorizable_call ===");
3347 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3348 return true;
3351 /** Transform. **/
3353 if (vect_print_dump_info (REPORT_DETAILS))
3354 fprintf (vect_dump, "transform operation.");
3356 /* Handle def. */
3357 scalar_dest = gimple_call_lhs (stmt);
3358 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3360 prev_stmt_info = NULL;
3361 switch (modifier)
3363 case NONE:
3364 for (j = 0; j < ncopies; ++j)
3366 /* Build argument list for the vectorized call. */
3367 if (j == 0)
3368 vargs = VEC_alloc (tree, heap, nargs);
3369 else
3370 VEC_truncate (tree, vargs, 0);
3372 for (i = 0; i < nargs; i++)
3374 op = gimple_call_arg (stmt, i);
3375 if (j == 0)
3376 vec_oprnd0
3377 = vect_get_vec_def_for_operand (op, stmt, NULL);
3378 else
3379 vec_oprnd0
3380 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3382 VEC_quick_push (tree, vargs, vec_oprnd0);
3385 new_stmt = gimple_build_call_vec (fndecl, vargs);
3386 new_temp = make_ssa_name (vec_dest, new_stmt);
3387 gimple_call_set_lhs (new_stmt, new_temp);
3389 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3391 if (j == 0)
3392 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3393 else
3394 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3396 prev_stmt_info = vinfo_for_stmt (new_stmt);
3399 break;
3401 case NARROW:
3402 for (j = 0; j < ncopies; ++j)
3404 /* Build argument list for the vectorized call. */
3405 if (j == 0)
3406 vargs = VEC_alloc (tree, heap, nargs * 2);
3407 else
3408 VEC_truncate (tree, vargs, 0);
3410 for (i = 0; i < nargs; i++)
3412 op = gimple_call_arg (stmt, i);
3413 if (j == 0)
3415 vec_oprnd0
3416 = vect_get_vec_def_for_operand (op, stmt, NULL);
3417 vec_oprnd1
3418 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3420 else
3422 vec_oprnd0
3423 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3424 vec_oprnd1
3425 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3428 VEC_quick_push (tree, vargs, vec_oprnd0);
3429 VEC_quick_push (tree, vargs, vec_oprnd1);
3432 new_stmt = gimple_build_call_vec (fndecl, vargs);
3433 new_temp = make_ssa_name (vec_dest, new_stmt);
3434 gimple_call_set_lhs (new_stmt, new_temp);
3436 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3438 if (j == 0)
3439 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3440 else
3441 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3443 prev_stmt_info = vinfo_for_stmt (new_stmt);
3446 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3448 break;
3450 case WIDEN:
3451 /* No current target implements this case. */
3452 return false;
3455 VEC_free (tree, heap, vargs);
3457 /* The call in STMT might prevent it from being removed in dce.
3458 We however cannot remove it here, due to the way the ssa name
3459 it defines is mapped to the new definition. So just replace
3460 rhs of the statement with something harmless. */
3462 type = TREE_TYPE (scalar_dest);
3463 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3464 fold_convert (type, integer_zero_node));
3465 set_vinfo_for_stmt (new_stmt, stmt_info);
3466 set_vinfo_for_stmt (stmt, NULL);
3467 STMT_VINFO_STMT (stmt_info) = new_stmt;
3468 gsi_replace (gsi, new_stmt, false);
3469 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3471 return true;
3475 /* Function vect_gen_widened_results_half
3477 Create a vector stmt whose code, type, number of arguments, and result
3478 variable are CODE, OP_TYPE, and VEC_DEST, and its arguments are
3479 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3480 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3481 needs to be created (DECL is a function-decl of a target-builtin).
3482 STMT is the original scalar stmt that we are vectorizing. */
3484 static gimple
3485 vect_gen_widened_results_half (enum tree_code code,
3486 tree decl,
3487 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3488 tree vec_dest, gimple_stmt_iterator *gsi,
3489 gimple stmt)
3491 gimple new_stmt;
3492 tree new_temp;
3493 tree sym;
3494 ssa_op_iter iter;
3496 /* Generate half of the widened result: */
3497 if (code == CALL_EXPR)
3499 /* Target specific support */
3500 if (op_type == binary_op)
3501 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3502 else
3503 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3504 new_temp = make_ssa_name (vec_dest, new_stmt);
3505 gimple_call_set_lhs (new_stmt, new_temp);
3507 else
3509 /* Generic support */
3510 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3511 if (op_type != binary_op)
3512 vec_oprnd1 = NULL;
3513 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3514 vec_oprnd1);
3515 new_temp = make_ssa_name (vec_dest, new_stmt);
3516 gimple_assign_set_lhs (new_stmt, new_temp);
3518 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3520 if (code == CALL_EXPR)
3522 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3524 if (TREE_CODE (sym) == SSA_NAME)
3525 sym = SSA_NAME_VAR (sym);
3526 mark_sym_for_renaming (sym);
3530 return new_stmt;
3534 /* Check if STMT performs a conversion operation, that can be vectorized.
3535 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3536 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3537 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3539 bool
3540 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3541 gimple *vec_stmt, slp_tree slp_node)
3543 tree vec_dest;
3544 tree scalar_dest;
3545 tree op0;
3546 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3547 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3548 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3549 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3550 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3551 tree new_temp;
3552 tree def;
3553 gimple def_stmt;
3554 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3555 gimple new_stmt = NULL;
3556 stmt_vec_info prev_stmt_info;
3557 int nunits_in;
3558 int nunits_out;
3559 tree vectype_out, vectype_in;
3560 int ncopies, j;
3561 tree expr;
3562 tree rhs_type, lhs_type;
3563 tree builtin_decl;
3564 enum { NARROW, NONE, WIDEN } modifier;
3565 int i;
3566 VEC(tree,heap) *vec_oprnds0 = NULL;
3567 tree vop0;
3568 tree integral_type;
3569 VEC(tree,heap) *dummy = NULL;
3570 int dummy_int;
3572 /* Is STMT a vectorizable conversion? */
3574 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3575 return false;
3577 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3578 return false;
3580 if (!is_gimple_assign (stmt))
3581 return false;
3583 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3584 return false;
3586 code = gimple_assign_rhs_code (stmt);
3587 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3588 return false;
3590 /* Check types of lhs and rhs. */
3591 op0 = gimple_assign_rhs1 (stmt);
3592 rhs_type = TREE_TYPE (op0);
3593 vectype_in = get_vectype_for_scalar_type (rhs_type);
3594 if (!vectype_in)
3595 return false;
3596 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3598 scalar_dest = gimple_assign_lhs (stmt);
3599 lhs_type = TREE_TYPE (scalar_dest);
3600 vectype_out = get_vectype_for_scalar_type (lhs_type);
3601 if (!vectype_out)
3602 return false;
3603 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3605 /* FORNOW */
3606 if (nunits_in == nunits_out / 2)
3607 modifier = NARROW;
3608 else if (nunits_out == nunits_in)
3609 modifier = NONE;
3610 else if (nunits_out == nunits_in / 2)
3611 modifier = WIDEN;
3612 else
3613 return false;
3615 if (modifier == NONE)
3616 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3618 /* Bail out if the types are both integral or non-integral. */
3619 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3620 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3621 return false;
3623 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3625 if (modifier == NARROW)
3626 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3627 else
3628 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3630 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3631 this, so we can safely override NCOPIES with 1 here. */
3632 if (slp_node)
3633 ncopies = 1;
3635 /* Sanity check: make sure that at least one copy of the vectorized stmt
3636 needs to be generated. */
3637 gcc_assert (ncopies >= 1);
3639 /* Check the operands of the operation. */
3640 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3642 if (vect_print_dump_info (REPORT_DETAILS))
3643 fprintf (vect_dump, "use not simple.");
3644 return false;
3647 /* Supportable by target? */
3648 if ((modifier == NONE
3649 && !targetm.vectorize.builtin_conversion (code, integral_type))
3650 || (modifier == WIDEN
3651 && !supportable_widening_operation (code, stmt, vectype_in,
3652 &decl1, &decl2,
3653 &code1, &code2,
3654 &dummy_int, &dummy))
3655 || (modifier == NARROW
3656 && !supportable_narrowing_operation (code, stmt, vectype_in,
3657 &code1, &dummy_int, &dummy)))
3659 if (vect_print_dump_info (REPORT_DETAILS))
3660 fprintf (vect_dump, "conversion not supported by target.");
3661 return false;
3664 if (modifier != NONE)
3666 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3667 /* FORNOW: SLP not supported. */
3668 if (STMT_SLP_TYPE (stmt_info))
3669 return false;
3672 if (!vec_stmt) /* transformation not required. */
3674 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3675 return true;
3678 /** Transform. **/
3679 if (vect_print_dump_info (REPORT_DETAILS))
3680 fprintf (vect_dump, "transform conversion.");
3682 /* Handle def. */
3683 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3685 if (modifier == NONE && !slp_node)
3686 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3688 prev_stmt_info = NULL;
3689 switch (modifier)
3691 case NONE:
3692 for (j = 0; j < ncopies; j++)
3694 tree sym;
3695 ssa_op_iter iter;
3697 if (j == 0)
3698 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3699 else
3700 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3702 builtin_decl =
3703 targetm.vectorize.builtin_conversion (code, integral_type);
3704 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3706 /* Arguments are ready. create the new vector stmt. */
3707 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3708 new_temp = make_ssa_name (vec_dest, new_stmt);
3709 gimple_call_set_lhs (new_stmt, new_temp);
3710 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3711 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3712 SSA_OP_ALL_VIRTUALS)
3714 if (TREE_CODE (sym) == SSA_NAME)
3715 sym = SSA_NAME_VAR (sym);
3716 mark_sym_for_renaming (sym);
3718 if (slp_node)
3719 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3722 if (j == 0)
3723 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3724 else
3725 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3726 prev_stmt_info = vinfo_for_stmt (new_stmt);
3728 break;
3730 case WIDEN:
3731 /* In case the vectorization factor (VF) is bigger than the number
3732 of elements that we can fit in a vectype (nunits), we have to
3733 generate more than one vector stmt - i.e - we need to "unroll"
3734 the vector stmt by a factor VF/nunits. */
3735 for (j = 0; j < ncopies; j++)
3737 if (j == 0)
3738 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3739 else
3740 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3742 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3744 /* Generate first half of the widened result: */
3745 new_stmt
3746 = vect_gen_widened_results_half (code1, decl1,
3747 vec_oprnd0, vec_oprnd1,
3748 unary_op, vec_dest, gsi, stmt);
3749 if (j == 0)
3750 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3751 else
3752 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3753 prev_stmt_info = vinfo_for_stmt (new_stmt);
3755 /* Generate second half of the widened result: */
3756 new_stmt
3757 = vect_gen_widened_results_half (code2, decl2,
3758 vec_oprnd0, vec_oprnd1,
3759 unary_op, vec_dest, gsi, stmt);
3760 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3761 prev_stmt_info = vinfo_for_stmt (new_stmt);
3763 break;
3765 case NARROW:
3766 /* In case the vectorization factor (VF) is bigger than the number
3767 of elements that we can fit in a vectype (nunits), we have to
3768 generate more than one vector stmt - i.e - we need to "unroll"
3769 the vector stmt by a factor VF/nunits. */
3770 for (j = 0; j < ncopies; j++)
3772 /* Handle uses. */
3773 if (j == 0)
3775 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3776 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3778 else
3780 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3781 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3784 /* Arguments are ready. Create the new vector stmt. */
3785 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3786 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3787 vec_oprnd1);
3788 new_temp = make_ssa_name (vec_dest, new_stmt);
3789 gimple_assign_set_lhs (new_stmt, new_temp);
3790 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3792 if (j == 0)
3793 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3794 else
3795 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3797 prev_stmt_info = vinfo_for_stmt (new_stmt);
3800 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3803 if (vec_oprnds0)
3804 VEC_free (tree, heap, vec_oprnds0);
3806 return true;
3810 /* Function vectorizable_assignment.
3812 Check if STMT performs an assignment (copy) that can be vectorized.
3813 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3814 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3815 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3817 bool
3818 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3819 gimple *vec_stmt, slp_tree slp_node)
3821 tree vec_dest;
3822 tree scalar_dest;
3823 tree op;
3824 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3825 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3827 tree new_temp;
3828 tree def;
3829 gimple def_stmt;
3830 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3831 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 int ncopies;
3833 int i;
3834 VEC(tree,heap) *vec_oprnds = NULL;
3835 tree vop;
3837 /* Multiple types in SLP are handled by creating the appropriate number of
3838 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3839 case of SLP. */
3840 if (slp_node)
3841 ncopies = 1;
3842 else
3843 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3845 gcc_assert (ncopies >= 1);
3846 if (ncopies > 1)
3847 return false; /* FORNOW */
3849 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3850 return false;
3852 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3853 return false;
3855 /* Is vectorizable assignment? */
3856 if (!is_gimple_assign (stmt))
3857 return false;
3859 scalar_dest = gimple_assign_lhs (stmt);
3860 if (TREE_CODE (scalar_dest) != SSA_NAME)
3861 return false;
3863 if (gimple_assign_single_p (stmt)
3864 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3865 op = gimple_assign_rhs1 (stmt);
3866 else
3867 return false;
3869 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3871 if (vect_print_dump_info (REPORT_DETAILS))
3872 fprintf (vect_dump, "use not simple.");
3873 return false;
3876 if (!vec_stmt) /* transformation not required. */
3878 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3879 if (vect_print_dump_info (REPORT_DETAILS))
3880 fprintf (vect_dump, "=== vectorizable_assignment ===");
3881 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3882 return true;
3885 /** Transform. **/
3886 if (vect_print_dump_info (REPORT_DETAILS))
3887 fprintf (vect_dump, "transform assignment.");
3889 /* Handle def. */
3890 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3892 /* Handle use. */
3893 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3895 /* Arguments are ready. create the new vector stmt. */
3896 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3898 *vec_stmt = gimple_build_assign (vec_dest, vop);
3899 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3900 gimple_assign_set_lhs (*vec_stmt, new_temp);
3901 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3902 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3904 if (slp_node)
3905 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3908 VEC_free (tree, heap, vec_oprnds);
3909 return true;
3913 /* Function vect_min_worthwhile_factor.
3915 For a loop where we could vectorize the operation indicated by CODE,
3916 return the minimum vectorization factor that makes it worthwhile
3917 to use generic vectors. */
3918 static int
3919 vect_min_worthwhile_factor (enum tree_code code)
3921 switch (code)
3923 case PLUS_EXPR:
3924 case MINUS_EXPR:
3925 case NEGATE_EXPR:
3926 return 4;
3928 case BIT_AND_EXPR:
3929 case BIT_IOR_EXPR:
3930 case BIT_XOR_EXPR:
3931 case BIT_NOT_EXPR:
3932 return 2;
3934 default:
3935 return INT_MAX;
3940 /* Function vectorizable_induction
3942 Check if PHI performs an induction computation that can be vectorized.
3943 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3944 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3945 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3947 bool
3948 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3949 gimple *vec_stmt)
3951 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3953 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3954 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3955 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3956 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3957 tree vec_def;
3959 gcc_assert (ncopies >= 1);
3960 /* FORNOW. This restriction should be relaxed. */
3961 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3963 if (vect_print_dump_info (REPORT_DETAILS))
3964 fprintf (vect_dump, "multiple types in nested loop.");
3965 return false;
3968 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3969 return false;
3971 /* FORNOW: SLP not supported. */
3972 if (STMT_SLP_TYPE (stmt_info))
3973 return false;
3975 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3977 if (gimple_code (phi) != GIMPLE_PHI)
3978 return false;
3980 if (!vec_stmt) /* transformation not required. */
3982 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3983 if (vect_print_dump_info (REPORT_DETAILS))
3984 fprintf (vect_dump, "=== vectorizable_induction ===");
3985 vect_model_induction_cost (stmt_info, ncopies);
3986 return true;
3989 /** Transform. **/
3991 if (vect_print_dump_info (REPORT_DETAILS))
3992 fprintf (vect_dump, "transform induction phi.");
3994 vec_def = get_initial_def_for_induction (phi);
3995 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3996 return true;
4000 /* Function vectorizable_operation.
4002 Check if STMT performs a binary or unary operation that can be vectorized.
4003 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4004 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4005 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4007 bool
4008 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4009 gimple *vec_stmt, slp_tree slp_node)
4011 tree vec_dest;
4012 tree scalar_dest;
4013 tree op0, op1 = NULL;
4014 tree vec_oprnd1 = NULL_TREE;
4015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4016 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4018 enum tree_code code;
4019 enum machine_mode vec_mode;
4020 tree new_temp;
4021 int op_type;
4022 optab optab;
4023 int icode;
4024 enum machine_mode optab_op2_mode;
4025 tree def;
4026 gimple def_stmt;
4027 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4028 gimple new_stmt = NULL;
4029 stmt_vec_info prev_stmt_info;
4030 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4031 int nunits_out;
4032 tree vectype_out;
4033 int ncopies;
4034 int j, i;
4035 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4036 tree vop0, vop1;
4037 unsigned int k;
4038 bool shift_p = false;
4039 bool scalar_shift_arg = false;
4041 /* Multiple types in SLP are handled by creating the appropriate number of
4042 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4043 case of SLP. */
4044 if (slp_node)
4045 ncopies = 1;
4046 else
4047 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4049 gcc_assert (ncopies >= 1);
4051 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4052 return false;
4054 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4055 return false;
4057 /* Is STMT a vectorizable binary/unary operation? */
4058 if (!is_gimple_assign (stmt))
4059 return false;
4061 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4062 return false;
4064 scalar_dest = gimple_assign_lhs (stmt);
4065 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4066 if (!vectype_out)
4067 return false;
4068 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4069 if (nunits_out != nunits_in)
4070 return false;
4072 code = gimple_assign_rhs_code (stmt);
4074 /* For pointer addition, we should use the normal plus for
4075 the vector addition. */
4076 if (code == POINTER_PLUS_EXPR)
4077 code = PLUS_EXPR;
4079 /* Support only unary or binary operations. */
4080 op_type = TREE_CODE_LENGTH (code);
4081 if (op_type != unary_op && op_type != binary_op)
4083 if (vect_print_dump_info (REPORT_DETAILS))
4084 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4085 return false;
4088 op0 = gimple_assign_rhs1 (stmt);
4089 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4091 if (vect_print_dump_info (REPORT_DETAILS))
4092 fprintf (vect_dump, "use not simple.");
4093 return false;
4096 if (op_type == binary_op)
4098 op1 = gimple_assign_rhs2 (stmt);
4099 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4101 if (vect_print_dump_info (REPORT_DETAILS))
4102 fprintf (vect_dump, "use not simple.");
4103 return false;
4107 /* If this is a shift/rotate, determine whether the shift amount is a vector,
4108 or scalar. If the shift/rotate amount is a vector, use the vector/vector
4109 shift optabs. */
4110 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4111 || code == RROTATE_EXPR)
4113 shift_p = true;
4115 /* vector shifted by vector */
4116 if (dt[1] == vect_loop_def)
4118 optab = optab_for_tree_code (code, vectype, optab_vector);
4119 if (vect_print_dump_info (REPORT_DETAILS))
4120 fprintf (vect_dump, "vector/vector shift/rotate found.");
4123 /* See if the machine has a vector shifted by scalar insn and if not
4124 then see if it has a vector shifted by vector insn */
4125 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4127 optab = optab_for_tree_code (code, vectype, optab_scalar);
4128 if (optab
4129 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4130 != CODE_FOR_nothing))
4132 scalar_shift_arg = true;
4133 if (vect_print_dump_info (REPORT_DETAILS))
4134 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4136 else
4138 optab = optab_for_tree_code (code, vectype, optab_vector);
4139 if (vect_print_dump_info (REPORT_DETAILS)
4140 && optab
4141 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4142 != CODE_FOR_nothing))
4143 fprintf (vect_dump, "vector/vector shift/rotate found.");
4147 else
4149 if (vect_print_dump_info (REPORT_DETAILS))
4150 fprintf (vect_dump, "operand mode requires invariant argument.");
4151 return false;
4154 else
4155 optab = optab_for_tree_code (code, vectype, optab_default);
4157 /* Supportable by target? */
4158 if (!optab)
4160 if (vect_print_dump_info (REPORT_DETAILS))
4161 fprintf (vect_dump, "no optab.");
4162 return false;
4164 vec_mode = TYPE_MODE (vectype);
4165 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4166 if (icode == CODE_FOR_nothing)
4168 if (vect_print_dump_info (REPORT_DETAILS))
4169 fprintf (vect_dump, "op not supported by target.");
4170 /* Check only during analysis. */
4171 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4172 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4173 < vect_min_worthwhile_factor (code)
4174 && !vec_stmt))
4175 return false;
4176 if (vect_print_dump_info (REPORT_DETAILS))
4177 fprintf (vect_dump, "proceeding using word mode.");
4180 /* Worthwhile without SIMD support? Check only during analysis. */
4181 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4182 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4183 < vect_min_worthwhile_factor (code)
4184 && !vec_stmt)
4186 if (vect_print_dump_info (REPORT_DETAILS))
4187 fprintf (vect_dump, "not worthwhile without SIMD support.");
4188 return false;
4191 if (!vec_stmt) /* transformation not required. */
4193 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4194 if (vect_print_dump_info (REPORT_DETAILS))
4195 fprintf (vect_dump, "=== vectorizable_operation ===");
4196 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4197 return true;
4200 /** Transform. **/
4202 if (vect_print_dump_info (REPORT_DETAILS))
4203 fprintf (vect_dump, "transform binary/unary operation.");
4205 /* Handle def. */
4206 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4208 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4209 created in the previous stages of the recursion, so no allocation is
4210 needed, except for the case of shift with scalar shift argument. In that
4211 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4212 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4213 In case of loop-based vectorization we allocate VECs of size 1. We
4214 allocate VEC_OPRNDS1 only in case of binary operation. */
4215 if (!slp_node)
4217 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4218 if (op_type == binary_op)
4219 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4221 else if (scalar_shift_arg)
4222 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4224 /* In case the vectorization factor (VF) is bigger than the number
4225 of elements that we can fit in a vectype (nunits), we have to generate
4226 more than one vector stmt - i.e - we need to "unroll" the
4227 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4228 from one copy of the vector stmt to the next, in the field
4229 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4230 stages to find the correct vector defs to be used when vectorizing
4231 stmts that use the defs of the current stmt. The example below illustrates
4232 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4233 4 vectorized stmts):
4235 before vectorization:
4236 RELATED_STMT VEC_STMT
4237 S1: x = memref - -
4238 S2: z = x + 1 - -
4240 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4241 there):
4242 RELATED_STMT VEC_STMT
4243 VS1_0: vx0 = memref0 VS1_1 -
4244 VS1_1: vx1 = memref1 VS1_2 -
4245 VS1_2: vx2 = memref2 VS1_3 -
4246 VS1_3: vx3 = memref3 - -
4247 S1: x = load - VS1_0
4248 S2: z = x + 1 - -
4250 step2: vectorize stmt S2 (done here):
4251 To vectorize stmt S2 we first need to find the relevant vector
4252 def for the first operand 'x'. This is, as usual, obtained from
4253 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4254 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4255 relevant vector def 'vx0'. Having found 'vx0' we can generate
4256 the vector stmt VS2_0, and as usual, record it in the
4257 STMT_VINFO_VEC_STMT of stmt S2.
4258 When creating the second copy (VS2_1), we obtain the relevant vector
4259 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4260 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4261 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4262 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4263 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4264 chain of stmts and pointers:
4265 RELATED_STMT VEC_STMT
4266 VS1_0: vx0 = memref0 VS1_1 -
4267 VS1_1: vx1 = memref1 VS1_2 -
4268 VS1_2: vx2 = memref2 VS1_3 -
4269 VS1_3: vx3 = memref3 - -
4270 S1: x = load - VS1_0
4271 VS2_0: vz0 = vx0 + v1 VS2_1 -
4272 VS2_1: vz1 = vx1 + v1 VS2_2 -
4273 VS2_2: vz2 = vx2 + v1 VS2_3 -
4274 VS2_3: vz3 = vx3 + v1 - -
4275 S2: z = x + 1 - VS2_0 */
4277 prev_stmt_info = NULL;
4278 for (j = 0; j < ncopies; j++)
4280 /* Handle uses. */
4281 if (j == 0)
4283 if (op_type == binary_op && scalar_shift_arg)
4285 /* Vector shl and shr insn patterns can be defined with scalar
4286 operand 2 (shift operand). In this case, use constant or loop
4287 invariant op1 directly, without extending it to vector mode
4288 first. */
4289 optab_op2_mode = insn_data[icode].operand[2].mode;
4290 if (!VECTOR_MODE_P (optab_op2_mode))
4292 if (vect_print_dump_info (REPORT_DETAILS))
4293 fprintf (vect_dump, "operand 1 using scalar mode.");
4294 vec_oprnd1 = op1;
4295 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4296 if (slp_node)
4298 /* Store vec_oprnd1 for every vector stmt to be created
4299 for SLP_NODE. We check during the analysis that all the
4300 shift arguments are the same.
4301 TODO: Allow different constants for different vector
4302 stmts generated for an SLP instance. */
4303 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4304 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4309 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4310 (a special case for certain kind of vector shifts); otherwise,
4311 operand 1 should be of a vector type (the usual case). */
4312 if (op_type == binary_op && !vec_oprnd1)
4313 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4314 slp_node);
4315 else
4316 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4317 slp_node);
4319 else
4320 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4322 /* Arguments are ready. Create the new vector stmt. */
4323 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4325 vop1 = ((op_type == binary_op)
4326 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4327 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4328 new_temp = make_ssa_name (vec_dest, new_stmt);
4329 gimple_assign_set_lhs (new_stmt, new_temp);
4330 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4331 if (slp_node)
4332 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4335 if (slp_node)
4336 continue;
4338 if (j == 0)
4339 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4340 else
4341 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4342 prev_stmt_info = vinfo_for_stmt (new_stmt);
4345 VEC_free (tree, heap, vec_oprnds0);
4346 if (vec_oprnds1)
4347 VEC_free (tree, heap, vec_oprnds1);
4349 return true;
4353 /* Get vectorized definitions for loop-based vectorization. For the first
4354 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4355 scalar operand), and for the rest we get a copy with
4356 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4357 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4358 The vectors are collected into VEC_OPRNDS. */
4360 static void
4361 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4362 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4364 tree vec_oprnd;
4366 /* Get first vector operand. */
4367 /* All the vector operands except the very first one (that is scalar oprnd)
4368 are stmt copies. */
4369 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4370 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4371 else
4372 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4374 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4376 /* Get second vector operand. */
4377 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4378 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4380 *oprnd = vec_oprnd;
4382 /* For conversion in multiple steps, continue to get operands
4383 recursively. */
4384 if (multi_step_cvt)
4385 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4389 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4390 For multi-step conversions store the resulting vectors and call the function
4391 recursively. */
4393 static void
4394 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4395 int multi_step_cvt, gimple stmt,
4396 VEC (tree, heap) *vec_dsts,
4397 gimple_stmt_iterator *gsi,
4398 slp_tree slp_node, enum tree_code code,
4399 stmt_vec_info *prev_stmt_info)
4401 unsigned int i;
4402 tree vop0, vop1, new_tmp, vec_dest;
4403 gimple new_stmt;
4404 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4406 vec_dest = VEC_pop (tree, vec_dsts);
4408 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4410 /* Create demotion operation. */
4411 vop0 = VEC_index (tree, *vec_oprnds, i);
4412 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4413 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4414 new_tmp = make_ssa_name (vec_dest, new_stmt);
4415 gimple_assign_set_lhs (new_stmt, new_tmp);
4416 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4418 if (multi_step_cvt)
4419 /* Store the resulting vector for next recursive call. */
4420 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4421 else
4423 /* This is the last step of the conversion sequence. Store the
4424 vectors in SLP_NODE or in vector info of the scalar statement
4425 (or in STMT_VINFO_RELATED_STMT chain). */
4426 if (slp_node)
4427 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4428 else
4430 if (!*prev_stmt_info)
4431 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4432 else
4433 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4435 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4440 /* For multi-step demotion operations we first generate demotion operations
4441 from the source type to the intermediate types, and then combine the
4442 results (stored in VEC_OPRNDS) in demotion operation to the destination
4443 type. */
4444 if (multi_step_cvt)
4446 /* At each level of recursion we have have of the operands we had at the
4447 previous level. */
4448 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4449 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4450 stmt, vec_dsts, gsi, slp_node,
4451 code, prev_stmt_info);
4456 /* Function vectorizable_type_demotion
4458 Check if STMT performs a binary or unary operation that involves
4459 type demotion, and if it can be vectorized.
4460 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4461 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4462 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4464 bool
4465 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4466 gimple *vec_stmt, slp_tree slp_node)
4468 tree vec_dest;
4469 tree scalar_dest;
4470 tree op0;
4471 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4472 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4473 enum tree_code code, code1 = ERROR_MARK;
4474 tree def;
4475 gimple def_stmt;
4476 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4477 stmt_vec_info prev_stmt_info;
4478 int nunits_in;
4479 int nunits_out;
4480 tree vectype_out;
4481 int ncopies;
4482 int j, i;
4483 tree vectype_in;
4484 int multi_step_cvt = 0;
4485 VEC (tree, heap) *vec_oprnds0 = NULL;
4486 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4487 tree last_oprnd, intermediate_type;
4489 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4490 return false;
4492 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4493 return false;
4495 /* Is STMT a vectorizable type-demotion operation? */
4496 if (!is_gimple_assign (stmt))
4497 return false;
4499 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4500 return false;
4502 code = gimple_assign_rhs_code (stmt);
4503 if (!CONVERT_EXPR_CODE_P (code))
4504 return false;
4506 op0 = gimple_assign_rhs1 (stmt);
4507 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4508 if (!vectype_in)
4509 return false;
4510 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4512 scalar_dest = gimple_assign_lhs (stmt);
4513 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4514 if (!vectype_out)
4515 return false;
4516 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4517 if (nunits_in >= nunits_out)
4518 return false;
4520 /* Multiple types in SLP are handled by creating the appropriate number of
4521 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4522 case of SLP. */
4523 if (slp_node)
4524 ncopies = 1;
4525 else
4526 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4528 gcc_assert (ncopies >= 1);
4530 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4531 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4532 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4533 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4534 && CONVERT_EXPR_CODE_P (code))))
4535 return false;
4537 /* Check the operands of the operation. */
4538 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4540 if (vect_print_dump_info (REPORT_DETAILS))
4541 fprintf (vect_dump, "use not simple.");
4542 return false;
4545 /* Supportable by target? */
4546 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4547 &multi_step_cvt, &interm_types))
4548 return false;
4550 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4552 if (!vec_stmt) /* transformation not required. */
4554 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4555 if (vect_print_dump_info (REPORT_DETAILS))
4556 fprintf (vect_dump, "=== vectorizable_demotion ===");
4557 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4558 return true;
4561 /** Transform. **/
4562 if (vect_print_dump_info (REPORT_DETAILS))
4563 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4564 ncopies);
4566 /* In case of multi-step demotion, we first generate demotion operations to
4567 the intermediate types, and then from that types to the final one.
4568 We create vector destinations for the intermediate type (TYPES) received
4569 from supportable_narrowing_operation, and store them in the correct order
4570 for future use in vect_create_vectorized_demotion_stmts(). */
4571 if (multi_step_cvt)
4572 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4573 else
4574 vec_dsts = VEC_alloc (tree, heap, 1);
4576 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4577 VEC_quick_push (tree, vec_dsts, vec_dest);
4579 if (multi_step_cvt)
4581 for (i = VEC_length (tree, interm_types) - 1;
4582 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4584 vec_dest = vect_create_destination_var (scalar_dest,
4585 intermediate_type);
4586 VEC_quick_push (tree, vec_dsts, vec_dest);
4590 /* In case the vectorization factor (VF) is bigger than the number
4591 of elements that we can fit in a vectype (nunits), we have to generate
4592 more than one vector stmt - i.e - we need to "unroll" the
4593 vector stmt by a factor VF/nunits. */
4594 last_oprnd = op0;
4595 prev_stmt_info = NULL;
4596 for (j = 0; j < ncopies; j++)
4598 /* Handle uses. */
4599 if (slp_node)
4600 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4601 else
4603 VEC_free (tree, heap, vec_oprnds0);
4604 vec_oprnds0 = VEC_alloc (tree, heap,
4605 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4606 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4607 vect_pow2 (multi_step_cvt) - 1);
4610 /* Arguments are ready. Create the new vector stmts. */
4611 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4612 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4613 multi_step_cvt, stmt, tmp_vec_dsts,
4614 gsi, slp_node, code1,
4615 &prev_stmt_info);
4618 VEC_free (tree, heap, vec_oprnds0);
4619 VEC_free (tree, heap, vec_dsts);
4620 VEC_free (tree, heap, tmp_vec_dsts);
4621 VEC_free (tree, heap, interm_types);
4623 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4624 return true;
4628 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4629 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4630 the resulting vectors and call the function recursively. */
4632 static void
4633 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4634 VEC (tree, heap) **vec_oprnds1,
4635 int multi_step_cvt, gimple stmt,
4636 VEC (tree, heap) *vec_dsts,
4637 gimple_stmt_iterator *gsi,
4638 slp_tree slp_node, enum tree_code code1,
4639 enum tree_code code2, tree decl1,
4640 tree decl2, int op_type,
4641 stmt_vec_info *prev_stmt_info)
4643 int i;
4644 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4645 gimple new_stmt1, new_stmt2;
4646 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4647 VEC (tree, heap) *vec_tmp;
4649 vec_dest = VEC_pop (tree, vec_dsts);
4650 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4652 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4654 if (op_type == binary_op)
4655 vop1 = VEC_index (tree, *vec_oprnds1, i);
4656 else
4657 vop1 = NULL_TREE;
4659 /* Generate the two halves of promotion operation. */
4660 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4661 op_type, vec_dest, gsi, stmt);
4662 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4663 op_type, vec_dest, gsi, stmt);
4664 if (is_gimple_call (new_stmt1))
4666 new_tmp1 = gimple_call_lhs (new_stmt1);
4667 new_tmp2 = gimple_call_lhs (new_stmt2);
4669 else
4671 new_tmp1 = gimple_assign_lhs (new_stmt1);
4672 new_tmp2 = gimple_assign_lhs (new_stmt2);
4675 if (multi_step_cvt)
4677 /* Store the results for the recursive call. */
4678 VEC_quick_push (tree, vec_tmp, new_tmp1);
4679 VEC_quick_push (tree, vec_tmp, new_tmp2);
4681 else
4683 /* Last step of promotion sequience - store the results. */
4684 if (slp_node)
4686 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4687 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4689 else
4691 if (!*prev_stmt_info)
4692 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4693 else
4694 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4696 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4697 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4698 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4703 if (multi_step_cvt)
4705 /* For multi-step promotion operation we first generate we call the
4706 function recurcively for every stage. We start from the input type,
4707 create promotion operations to the intermediate types, and then
4708 create promotions to the output type. */
4709 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4710 VEC_free (tree, heap, vec_tmp);
4711 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4712 multi_step_cvt - 1, stmt,
4713 vec_dsts, gsi, slp_node, code1,
4714 code2, decl2, decl2, op_type,
4715 prev_stmt_info);
4720 /* Function vectorizable_type_promotion
4722 Check if STMT performs a binary or unary operation that involves
4723 type promotion, and if it can be vectorized.
4724 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4725 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4726 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4728 bool
4729 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4730 gimple *vec_stmt, slp_tree slp_node)
4732 tree vec_dest;
4733 tree scalar_dest;
4734 tree op0, op1 = NULL;
4735 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4736 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4737 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4738 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4739 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4740 int op_type;
4741 tree def;
4742 gimple def_stmt;
4743 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4744 stmt_vec_info prev_stmt_info;
4745 int nunits_in;
4746 int nunits_out;
4747 tree vectype_out;
4748 int ncopies;
4749 int j, i;
4750 tree vectype_in;
4751 tree intermediate_type = NULL_TREE;
4752 int multi_step_cvt = 0;
4753 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4754 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4756 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4757 return false;
4759 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4760 return false;
4762 /* Is STMT a vectorizable type-promotion operation? */
4763 if (!is_gimple_assign (stmt))
4764 return false;
4766 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4767 return false;
4769 code = gimple_assign_rhs_code (stmt);
4770 if (!CONVERT_EXPR_CODE_P (code)
4771 && code != WIDEN_MULT_EXPR)
4772 return false;
4774 op0 = gimple_assign_rhs1 (stmt);
4775 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4776 if (!vectype_in)
4777 return false;
4778 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4780 scalar_dest = gimple_assign_lhs (stmt);
4781 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4782 if (!vectype_out)
4783 return false;
4784 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4785 if (nunits_in <= nunits_out)
4786 return false;
4788 /* Multiple types in SLP are handled by creating the appropriate number of
4789 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4790 case of SLP. */
4791 if (slp_node)
4792 ncopies = 1;
4793 else
4794 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4796 gcc_assert (ncopies >= 1);
4798 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4799 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4800 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4801 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4802 && CONVERT_EXPR_CODE_P (code))))
4803 return false;
4805 /* Check the operands of the operation. */
4806 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4808 if (vect_print_dump_info (REPORT_DETAILS))
4809 fprintf (vect_dump, "use not simple.");
4810 return false;
4813 op_type = TREE_CODE_LENGTH (code);
4814 if (op_type == binary_op)
4816 op1 = gimple_assign_rhs2 (stmt);
4817 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4819 if (vect_print_dump_info (REPORT_DETAILS))
4820 fprintf (vect_dump, "use not simple.");
4821 return false;
4825 /* Supportable by target? */
4826 if (!supportable_widening_operation (code, stmt, vectype_in,
4827 &decl1, &decl2, &code1, &code2,
4828 &multi_step_cvt, &interm_types))
4829 return false;
4831 /* Binary widening operation can only be supported directly by the
4832 architecture. */
4833 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4835 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4837 if (!vec_stmt) /* transformation not required. */
4839 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4840 if (vect_print_dump_info (REPORT_DETAILS))
4841 fprintf (vect_dump, "=== vectorizable_promotion ===");
4842 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4843 return true;
4846 /** Transform. **/
4848 if (vect_print_dump_info (REPORT_DETAILS))
4849 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4850 ncopies);
4852 /* Handle def. */
4853 /* In case of multi-step promotion, we first generate promotion operations
4854 to the intermediate types, and then from that types to the final one.
4855 We store vector destination in VEC_DSTS in the correct order for
4856 recursive creation of promotion operations in
4857 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4858 according to TYPES recieved from supportable_widening_operation(). */
4859 if (multi_step_cvt)
4860 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4861 else
4862 vec_dsts = VEC_alloc (tree, heap, 1);
4864 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4865 VEC_quick_push (tree, vec_dsts, vec_dest);
4867 if (multi_step_cvt)
4869 for (i = VEC_length (tree, interm_types) - 1;
4870 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4872 vec_dest = vect_create_destination_var (scalar_dest,
4873 intermediate_type);
4874 VEC_quick_push (tree, vec_dsts, vec_dest);
4878 if (!slp_node)
4880 vec_oprnds0 = VEC_alloc (tree, heap,
4881 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4882 if (op_type == binary_op)
4883 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4886 /* In case the vectorization factor (VF) is bigger than the number
4887 of elements that we can fit in a vectype (nunits), we have to generate
4888 more than one vector stmt - i.e - we need to "unroll" the
4889 vector stmt by a factor VF/nunits. */
4891 prev_stmt_info = NULL;
4892 for (j = 0; j < ncopies; j++)
4894 /* Handle uses. */
4895 if (j == 0)
4897 if (slp_node)
4898 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4899 else
4901 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4902 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4903 if (op_type == binary_op)
4905 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4906 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4910 else
4912 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4913 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4914 if (op_type == binary_op)
4916 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4917 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4921 /* Arguments are ready. Create the new vector stmts. */
4922 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4923 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4924 multi_step_cvt, stmt,
4925 tmp_vec_dsts,
4926 gsi, slp_node, code1, code2,
4927 decl1, decl2, op_type,
4928 &prev_stmt_info);
4931 VEC_free (tree, heap, vec_dsts);
4932 VEC_free (tree, heap, tmp_vec_dsts);
4933 VEC_free (tree, heap, interm_types);
4934 VEC_free (tree, heap, vec_oprnds0);
4935 VEC_free (tree, heap, vec_oprnds1);
4937 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4938 return true;
4942 /* Function vect_strided_store_supported.
4944 Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4945 and FALSE otherwise. */
4947 static bool
4948 vect_strided_store_supported (tree vectype)
4950 optab interleave_high_optab, interleave_low_optab;
4951 int mode;
4953 mode = (int) TYPE_MODE (vectype);
4955 /* Check that the operation is supported. */
4956 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4957 vectype, optab_default);
4958 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4959 vectype, optab_default);
4960 if (!interleave_high_optab || !interleave_low_optab)
4962 if (vect_print_dump_info (REPORT_DETAILS))
4963 fprintf (vect_dump, "no optab for interleave.");
4964 return false;
4967 if (optab_handler (interleave_high_optab, mode)->insn_code
4968 == CODE_FOR_nothing
4969 || optab_handler (interleave_low_optab, mode)->insn_code
4970 == CODE_FOR_nothing)
4972 if (vect_print_dump_info (REPORT_DETAILS))
4973 fprintf (vect_dump, "interleave op not supported by target.");
4974 return false;
4977 return true;
4981 /* Function vect_permute_store_chain.
4983 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4984 a power of 2, generate interleave_high/low stmts to reorder the data
4985 correctly for the stores. Return the final references for stores in
4986 RESULT_CHAIN.
4988 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4989 The input is 4 vectors each containing 8 elements. We assign a number to each
4990 element, the input sequence is:
4992 1st vec: 0 1 2 3 4 5 6 7
4993 2nd vec: 8 9 10 11 12 13 14 15
4994 3rd vec: 16 17 18 19 20 21 22 23
4995 4th vec: 24 25 26 27 28 29 30 31
4997 The output sequence should be:
4999 1st vec: 0 8 16 24 1 9 17 25
5000 2nd vec: 2 10 18 26 3 11 19 27
5001 3rd vec: 4 12 20 28 5 13 21 30
5002 4th vec: 6 14 22 30 7 15 23 31
5004 i.e., we interleave the contents of the four vectors in their order.
5006 We use interleave_high/low instructions to create such output. The input of
5007 each interleave_high/low operation is two vectors:
5008 1st vec 2nd vec
5009 0 1 2 3 4 5 6 7
5010 the even elements of the result vector are obtained left-to-right from the
5011 high/low elements of the first vector. The odd elements of the result are
5012 obtained left-to-right from the high/low elements of the second vector.
5013 The output of interleave_high will be: 0 4 1 5
5014 and of interleave_low: 2 6 3 7
5017 The permutation is done in log LENGTH stages. In each stage interleave_high
5018 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5019 where the first argument is taken from the first half of DR_CHAIN and the
5020 second argument from it's second half.
5021 In our example,
5023 I1: interleave_high (1st vec, 3rd vec)
5024 I2: interleave_low (1st vec, 3rd vec)
5025 I3: interleave_high (2nd vec, 4th vec)
5026 I4: interleave_low (2nd vec, 4th vec)
5028 The output for the first stage is:
5030 I1: 0 16 1 17 2 18 3 19
5031 I2: 4 20 5 21 6 22 7 23
5032 I3: 8 24 9 25 10 26 11 27
5033 I4: 12 28 13 29 14 30 15 31
5035 The output of the second stage, i.e. the final result is:
5037 I1: 0 8 16 24 1 9 17 25
5038 I2: 2 10 18 26 3 11 19 27
5039 I3: 4 12 20 28 5 13 21 30
5040 I4: 6 14 22 30 7 15 23 31. */
5042 static bool
5043 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5044 unsigned int length,
5045 gimple stmt,
5046 gimple_stmt_iterator *gsi,
5047 VEC(tree,heap) **result_chain)
5049 tree perm_dest, vect1, vect2, high, low;
5050 gimple perm_stmt;
5051 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5052 tree scalar_dest;
5053 int i;
5054 unsigned int j;
5055 enum tree_code high_code, low_code;
5057 scalar_dest = gimple_assign_lhs (stmt);
5059 /* Check that the operation is supported. */
5060 if (!vect_strided_store_supported (vectype))
5061 return false;
5063 *result_chain = VEC_copy (tree, heap, dr_chain);
5065 for (i = 0; i < exact_log2 (length); i++)
5067 for (j = 0; j < length/2; j++)
5069 vect1 = VEC_index (tree, dr_chain, j);
5070 vect2 = VEC_index (tree, dr_chain, j+length/2);
5072 /* Create interleaving stmt:
5073 in the case of big endian:
5074 high = interleave_high (vect1, vect2)
5075 and in the case of little endian:
5076 high = interleave_low (vect1, vect2). */
5077 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5078 DECL_GIMPLE_REG_P (perm_dest) = 1;
5079 add_referenced_var (perm_dest);
5080 if (BYTES_BIG_ENDIAN)
5082 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5083 low_code = VEC_INTERLEAVE_LOW_EXPR;
5085 else
5087 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5088 high_code = VEC_INTERLEAVE_LOW_EXPR;
5090 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5091 vect1, vect2);
5092 high = make_ssa_name (perm_dest, perm_stmt);
5093 gimple_assign_set_lhs (perm_stmt, high);
5094 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5095 VEC_replace (tree, *result_chain, 2*j, high);
5097 /* Create interleaving stmt:
5098 in the case of big endian:
5099 low = interleave_low (vect1, vect2)
5100 and in the case of little endian:
5101 low = interleave_high (vect1, vect2). */
5102 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5103 DECL_GIMPLE_REG_P (perm_dest) = 1;
5104 add_referenced_var (perm_dest);
5105 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5106 vect1, vect2);
5107 low = make_ssa_name (perm_dest, perm_stmt);
5108 gimple_assign_set_lhs (perm_stmt, low);
5109 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5110 VEC_replace (tree, *result_chain, 2*j+1, low);
5112 dr_chain = VEC_copy (tree, heap, *result_chain);
5114 return true;
5118 /* Function vectorizable_store.
5120 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
5121 can be vectorized.
5122 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5123 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5124 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5126 bool
5127 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5128 slp_tree slp_node)
5130 tree scalar_dest;
5131 tree data_ref;
5132 tree op;
5133 tree vec_oprnd = NULL_TREE;
5134 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5135 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5136 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5137 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5138 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5139 enum machine_mode vec_mode;
5140 tree dummy;
5141 enum dr_alignment_support alignment_support_scheme;
5142 tree def;
5143 gimple def_stmt;
5144 enum vect_def_type dt;
5145 stmt_vec_info prev_stmt_info = NULL;
5146 tree dataref_ptr = NULL_TREE;
5147 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5148 int ncopies;
5149 int j;
5150 gimple next_stmt, first_stmt = NULL;
5151 bool strided_store = false;
5152 unsigned int group_size, i;
5153 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5154 bool inv_p;
5155 VEC(tree,heap) *vec_oprnds = NULL;
5156 bool slp = (slp_node != NULL);
5157 stmt_vec_info first_stmt_vinfo;
5158 unsigned int vec_num;
5160 /* Multiple types in SLP are handled by creating the appropriate number of
5161 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5162 case of SLP. */
5163 if (slp)
5164 ncopies = 1;
5165 else
5166 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5168 gcc_assert (ncopies >= 1);
5170 /* FORNOW. This restriction should be relaxed. */
5171 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5173 if (vect_print_dump_info (REPORT_DETAILS))
5174 fprintf (vect_dump, "multiple types in nested loop.");
5175 return false;
5178 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5179 return false;
5181 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5182 return false;
5184 /* Is vectorizable store? */
5186 if (!is_gimple_assign (stmt))
5187 return false;
5189 scalar_dest = gimple_assign_lhs (stmt);
5190 if (TREE_CODE (scalar_dest) != ARRAY_REF
5191 && TREE_CODE (scalar_dest) != INDIRECT_REF
5192 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5193 return false;
5195 gcc_assert (gimple_assign_single_p (stmt));
5196 op = gimple_assign_rhs1 (stmt);
5197 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5199 if (vect_print_dump_info (REPORT_DETAILS))
5200 fprintf (vect_dump, "use not simple.");
5201 return false;
5204 /* The scalar rhs type needs to be trivially convertible to the vector
5205 component type. This should always be the case. */
5206 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5208 if (vect_print_dump_info (REPORT_DETAILS))
5209 fprintf (vect_dump, "??? operands of different types");
5210 return false;
5213 vec_mode = TYPE_MODE (vectype);
5214 /* FORNOW. In some cases can vectorize even if data-type not supported
5215 (e.g. - array initialization with 0). */
5216 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5217 return false;
5219 if (!STMT_VINFO_DATA_REF (stmt_info))
5220 return false;
5222 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5224 strided_store = true;
5225 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5226 if (!vect_strided_store_supported (vectype)
5227 && !PURE_SLP_STMT (stmt_info) && !slp)
5228 return false;
5230 if (first_stmt == stmt)
5232 /* STMT is the leader of the group. Check the operands of all the
5233 stmts of the group. */
5234 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5235 while (next_stmt)
5237 gcc_assert (gimple_assign_single_p (next_stmt));
5238 op = gimple_assign_rhs1 (next_stmt);
5239 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5241 if (vect_print_dump_info (REPORT_DETAILS))
5242 fprintf (vect_dump, "use not simple.");
5243 return false;
5245 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5250 if (!vec_stmt) /* transformation not required. */
5252 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5253 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5254 return true;
5257 /** Transform. **/
5259 if (strided_store)
5261 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5262 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5264 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5266 /* FORNOW */
5267 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5269 /* We vectorize all the stmts of the interleaving group when we
5270 reach the last stmt in the group. */
5271 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5272 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5273 && !slp)
5275 *vec_stmt = NULL;
5276 return true;
5279 if (slp)
5280 strided_store = false;
5282 /* VEC_NUM is the number of vect stmts to be created for this group. */
5283 if (slp)
5284 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5285 else
5286 vec_num = group_size;
5288 else
5290 first_stmt = stmt;
5291 first_dr = dr;
5292 group_size = vec_num = 1;
5293 first_stmt_vinfo = stmt_info;
5296 if (vect_print_dump_info (REPORT_DETAILS))
5297 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5299 dr_chain = VEC_alloc (tree, heap, group_size);
5300 oprnds = VEC_alloc (tree, heap, group_size);
5302 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5303 gcc_assert (alignment_support_scheme);
5304 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5306 /* In case the vectorization factor (VF) is bigger than the number
5307 of elements that we can fit in a vectype (nunits), we have to generate
5308 more than one vector stmt - i.e - we need to "unroll" the
5309 vector stmt by a factor VF/nunits. For more details see documentation in
5310 vect_get_vec_def_for_copy_stmt. */
5312 /* In case of interleaving (non-unit strided access):
5314 S1: &base + 2 = x2
5315 S2: &base = x0
5316 S3: &base + 1 = x1
5317 S4: &base + 3 = x3
5319 We create vectorized stores starting from base address (the access of the
5320 first stmt in the chain (S2 in the above example), when the last store stmt
5321 of the chain (S4) is reached:
5323 VS1: &base = vx2
5324 VS2: &base + vec_size*1 = vx0
5325 VS3: &base + vec_size*2 = vx1
5326 VS4: &base + vec_size*3 = vx3
5328 Then permutation statements are generated:
5330 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5331 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5334 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5335 (the order of the data-refs in the output of vect_permute_store_chain
5336 corresponds to the order of scalar stmts in the interleaving chain - see
5337 the documentation of vect_permute_store_chain()).
5339 In case of both multiple types and interleaving, above vector stores and
5340 permutation stmts are created for every copy. The result vector stmts are
5341 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5342 STMT_VINFO_RELATED_STMT for the next copies.
5345 prev_stmt_info = NULL;
5346 for (j = 0; j < ncopies; j++)
5348 gimple new_stmt;
5349 gimple ptr_incr;
5351 if (j == 0)
5353 if (slp)
5355 /* Get vectorized arguments for SLP_NODE. */
5356 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5358 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5360 else
5362 /* For interleaved stores we collect vectorized defs for all the
5363 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5364 used as an input to vect_permute_store_chain(), and OPRNDS as
5365 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5367 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5368 OPRNDS are of size 1. */
5369 next_stmt = first_stmt;
5370 for (i = 0; i < group_size; i++)
5372 /* Since gaps are not supported for interleaved stores,
5373 GROUP_SIZE is the exact number of stmts in the chain.
5374 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5375 there is no interleaving, GROUP_SIZE is 1, and only one
5376 iteration of the loop will be executed. */
5377 gcc_assert (next_stmt
5378 && gimple_assign_single_p (next_stmt));
5379 op = gimple_assign_rhs1 (next_stmt);
5381 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5382 NULL);
5383 VEC_quick_push(tree, dr_chain, vec_oprnd);
5384 VEC_quick_push(tree, oprnds, vec_oprnd);
5385 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5389 /* We should have catched mismatched types earlier. */
5390 gcc_assert (useless_type_conversion_p (vectype,
5391 TREE_TYPE (vec_oprnd)));
5392 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5393 &dummy, &ptr_incr, false,
5394 &inv_p, NULL);
5395 gcc_assert (!inv_p);
5397 else
5399 /* For interleaved stores we created vectorized defs for all the
5400 defs stored in OPRNDS in the previous iteration (previous copy).
5401 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5402 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5403 next copy.
5404 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5405 OPRNDS are of size 1. */
5406 for (i = 0; i < group_size; i++)
5408 op = VEC_index (tree, oprnds, i);
5409 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5410 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5411 VEC_replace(tree, dr_chain, i, vec_oprnd);
5412 VEC_replace(tree, oprnds, i, vec_oprnd);
5414 dataref_ptr =
5415 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5418 if (strided_store)
5420 result_chain = VEC_alloc (tree, heap, group_size);
5421 /* Permute. */
5422 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5423 &result_chain))
5424 return false;
5427 next_stmt = first_stmt;
5428 for (i = 0; i < vec_num; i++)
5430 if (i > 0)
5431 /* Bump the vector pointer. */
5432 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5433 NULL_TREE);
5435 if (slp)
5436 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5437 else if (strided_store)
5438 /* For strided stores vectorized defs are interleaved in
5439 vect_permute_store_chain(). */
5440 vec_oprnd = VEC_index (tree, result_chain, i);
5442 data_ref = build_fold_indirect_ref (dataref_ptr);
5444 /* Arguments are ready. Create the new vector stmt. */
5445 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5446 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5447 mark_symbols_for_renaming (new_stmt);
5449 if (slp)
5450 continue;
5452 if (j == 0)
5453 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5454 else
5455 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5457 prev_stmt_info = vinfo_for_stmt (new_stmt);
5458 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5459 if (!next_stmt)
5460 break;
5464 VEC_free (tree, heap, dr_chain);
5465 VEC_free (tree, heap, oprnds);
5466 if (result_chain)
5467 VEC_free (tree, heap, result_chain);
5469 return true;
5473 /* Function vect_setup_realignment
5475 This function is called when vectorizing an unaligned load using
5476 the dr_explicit_realign[_optimized] scheme.
5477 This function generates the following code at the loop prolog:
5479 p = initial_addr;
5480 x msq_init = *(floor(p)); # prolog load
5481 realignment_token = call target_builtin;
5482 loop:
5483 x msq = phi (msq_init, ---)
5485 The stmts marked with x are generated only for the case of
5486 dr_explicit_realign_optimized.
5488 The code above sets up a new (vector) pointer, pointing to the first
5489 location accessed by STMT, and a "floor-aligned" load using that pointer.
5490 It also generates code to compute the "realignment-token" (if the relevant
5491 target hook was defined), and creates a phi-node at the loop-header bb
5492 whose arguments are the result of the prolog-load (created by this
5493 function) and the result of a load that takes place in the loop (to be
5494 created by the caller to this function).
5496 For the case of dr_explicit_realign_optimized:
5497 The caller to this function uses the phi-result (msq) to create the
5498 realignment code inside the loop, and sets up the missing phi argument,
5499 as follows:
5500 loop:
5501 msq = phi (msq_init, lsq)
5502 lsq = *(floor(p')); # load in loop
5503 result = realign_load (msq, lsq, realignment_token);
5505 For the case of dr_explicit_realign:
5506 loop:
5507 msq = *(floor(p)); # load in loop
5508 p' = p + (VS-1);
5509 lsq = *(floor(p')); # load in loop
5510 result = realign_load (msq, lsq, realignment_token);
5512 Input:
5513 STMT - (scalar) load stmt to be vectorized. This load accesses
5514 a memory location that may be unaligned.
5515 BSI - place where new code is to be inserted.
5516 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5517 is used.
5519 Output:
5520 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5521 target hook, if defined.
5522 Return value - the result of the loop-header phi node. */
5524 static tree
5525 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5526 tree *realignment_token,
5527 enum dr_alignment_support alignment_support_scheme,
5528 tree init_addr,
5529 struct loop **at_loop)
5531 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5532 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5533 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5534 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5535 edge pe;
5536 tree scalar_dest = gimple_assign_lhs (stmt);
5537 tree vec_dest;
5538 gimple inc;
5539 tree ptr;
5540 tree data_ref;
5541 gimple new_stmt;
5542 basic_block new_bb;
5543 tree msq_init = NULL_TREE;
5544 tree new_temp;
5545 gimple phi_stmt;
5546 tree msq = NULL_TREE;
5547 gimple_seq stmts = NULL;
5548 bool inv_p;
5549 bool compute_in_loop = false;
5550 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5551 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5552 struct loop *loop_for_initial_load;
5554 gcc_assert (alignment_support_scheme == dr_explicit_realign
5555 || alignment_support_scheme == dr_explicit_realign_optimized);
5557 /* We need to generate three things:
5558 1. the misalignment computation
5559 2. the extra vector load (for the optimized realignment scheme).
5560 3. the phi node for the two vectors from which the realignment is
5561 done (for the optimized realignment scheme).
5564 /* 1. Determine where to generate the misalignment computation.
5566 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5567 calculation will be generated by this function, outside the loop (in the
5568 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5569 caller, inside the loop.
5571 Background: If the misalignment remains fixed throughout the iterations of
5572 the loop, then both realignment schemes are applicable, and also the
5573 misalignment computation can be done outside LOOP. This is because we are
5574 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5575 are a multiple of VS (the Vector Size), and therefore the misalignment in
5576 different vectorized LOOP iterations is always the same.
5577 The problem arises only if the memory access is in an inner-loop nested
5578 inside LOOP, which is now being vectorized using outer-loop vectorization.
5579 This is the only case when the misalignment of the memory access may not
5580 remain fixed throughout the iterations of the inner-loop (as explained in
5581 detail in vect_supportable_dr_alignment). In this case, not only is the
5582 optimized realignment scheme not applicable, but also the misalignment
5583 computation (and generation of the realignment token that is passed to
5584 REALIGN_LOAD) have to be done inside the loop.
5586 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5587 or not, which in turn determines if the misalignment is computed inside
5588 the inner-loop, or outside LOOP. */
5590 if (init_addr != NULL_TREE)
5592 compute_in_loop = true;
5593 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5597 /* 2. Determine where to generate the extra vector load.
5599 For the optimized realignment scheme, instead of generating two vector
5600 loads in each iteration, we generate a single extra vector load in the
5601 preheader of the loop, and in each iteration reuse the result of the
5602 vector load from the previous iteration. In case the memory access is in
5603 an inner-loop nested inside LOOP, which is now being vectorized using
5604 outer-loop vectorization, we need to determine whether this initial vector
5605 load should be generated at the preheader of the inner-loop, or can be
5606 generated at the preheader of LOOP. If the memory access has no evolution
5607 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5608 to be generated inside LOOP (in the preheader of the inner-loop). */
5610 if (nested_in_vect_loop)
5612 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5613 bool invariant_in_outerloop =
5614 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5615 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5617 else
5618 loop_for_initial_load = loop;
5619 if (at_loop)
5620 *at_loop = loop_for_initial_load;
5622 /* 3. For the case of the optimized realignment, create the first vector
5623 load at the loop preheader. */
5625 if (alignment_support_scheme == dr_explicit_realign_optimized)
5627 /* Create msq_init = *(floor(p1)) in the loop preheader */
5629 gcc_assert (!compute_in_loop);
5630 pe = loop_preheader_edge (loop_for_initial_load);
5631 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5632 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5633 &init_addr, &inc, true, &inv_p, NULL_TREE);
5634 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5635 new_stmt = gimple_build_assign (vec_dest, data_ref);
5636 new_temp = make_ssa_name (vec_dest, new_stmt);
5637 gimple_assign_set_lhs (new_stmt, new_temp);
5638 mark_symbols_for_renaming (new_stmt);
5639 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5640 gcc_assert (!new_bb);
5641 msq_init = gimple_assign_lhs (new_stmt);
5644 /* 4. Create realignment token using a target builtin, if available.
5645 It is done either inside the containing loop, or before LOOP (as
5646 determined above). */
5648 if (targetm.vectorize.builtin_mask_for_load)
5650 tree builtin_decl;
5652 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5653 if (compute_in_loop)
5654 gcc_assert (init_addr); /* already computed by the caller. */
5655 else
5657 /* Generate the INIT_ADDR computation outside LOOP. */
5658 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5659 NULL_TREE, loop);
5660 pe = loop_preheader_edge (loop);
5661 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5662 gcc_assert (!new_bb);
5665 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5666 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5667 vec_dest =
5668 vect_create_destination_var (scalar_dest,
5669 gimple_call_return_type (new_stmt));
5670 new_temp = make_ssa_name (vec_dest, new_stmt);
5671 gimple_call_set_lhs (new_stmt, new_temp);
5673 if (compute_in_loop)
5674 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5675 else
5677 /* Generate the misalignment computation outside LOOP. */
5678 pe = loop_preheader_edge (loop);
5679 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5680 gcc_assert (!new_bb);
5683 *realignment_token = gimple_call_lhs (new_stmt);
5685 /* The result of the CALL_EXPR to this builtin is determined from
5686 the value of the parameter and no global variables are touched
5687 which makes the builtin a "const" function. Requiring the
5688 builtin to have the "const" attribute makes it unnecessary
5689 to call mark_call_clobbered. */
5690 gcc_assert (TREE_READONLY (builtin_decl));
5693 if (alignment_support_scheme == dr_explicit_realign)
5694 return msq;
5696 gcc_assert (!compute_in_loop);
5697 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5700 /* 5. Create msq = phi <msq_init, lsq> in loop */
5702 pe = loop_preheader_edge (containing_loop);
5703 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5704 msq = make_ssa_name (vec_dest, NULL);
5705 phi_stmt = create_phi_node (msq, containing_loop->header);
5706 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5707 add_phi_arg (phi_stmt, msq_init, pe);
5709 return msq;
5713 /* Function vect_strided_load_supported.
5715 Returns TRUE is EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5716 and FALSE otherwise. */
5718 static bool
5719 vect_strided_load_supported (tree vectype)
5721 optab perm_even_optab, perm_odd_optab;
5722 int mode;
5724 mode = (int) TYPE_MODE (vectype);
5726 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5727 optab_default);
5728 if (!perm_even_optab)
5730 if (vect_print_dump_info (REPORT_DETAILS))
5731 fprintf (vect_dump, "no optab for perm_even.");
5732 return false;
5735 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5737 if (vect_print_dump_info (REPORT_DETAILS))
5738 fprintf (vect_dump, "perm_even op not supported by target.");
5739 return false;
5742 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5743 optab_default);
5744 if (!perm_odd_optab)
5746 if (vect_print_dump_info (REPORT_DETAILS))
5747 fprintf (vect_dump, "no optab for perm_odd.");
5748 return false;
5751 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5753 if (vect_print_dump_info (REPORT_DETAILS))
5754 fprintf (vect_dump, "perm_odd op not supported by target.");
5755 return false;
5757 return true;
5761 /* Function vect_permute_load_chain.
5763 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5764 a power of 2, generate extract_even/odd stmts to reorder the input data
5765 correctly. Return the final references for loads in RESULT_CHAIN.
5767 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5768 The input is 4 vectors each containing 8 elements. We assign a number to each
5769 element, the input sequence is:
5771 1st vec: 0 1 2 3 4 5 6 7
5772 2nd vec: 8 9 10 11 12 13 14 15
5773 3rd vec: 16 17 18 19 20 21 22 23
5774 4th vec: 24 25 26 27 28 29 30 31
5776 The output sequence should be:
5778 1st vec: 0 4 8 12 16 20 24 28
5779 2nd vec: 1 5 9 13 17 21 25 29
5780 3rd vec: 2 6 10 14 18 22 26 30
5781 4th vec: 3 7 11 15 19 23 27 31
5783 i.e., the first output vector should contain the first elements of each
5784 interleaving group, etc.
5786 We use extract_even/odd instructions to create such output. The input of each
5787 extract_even/odd operation is two vectors
5788 1st vec 2nd vec
5789 0 1 2 3 4 5 6 7
5791 and the output is the vector of extracted even/odd elements. The output of
5792 extract_even will be: 0 2 4 6
5793 and of extract_odd: 1 3 5 7
5796 The permutation is done in log LENGTH stages. In each stage extract_even and
5797 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5798 order. In our example,
5800 E1: extract_even (1st vec, 2nd vec)
5801 E2: extract_odd (1st vec, 2nd vec)
5802 E3: extract_even (3rd vec, 4th vec)
5803 E4: extract_odd (3rd vec, 4th vec)
5805 The output for the first stage will be:
5807 E1: 0 2 4 6 8 10 12 14
5808 E2: 1 3 5 7 9 11 13 15
5809 E3: 16 18 20 22 24 26 28 30
5810 E4: 17 19 21 23 25 27 29 31
5812 In order to proceed and create the correct sequence for the next stage (or
5813 for the correct output, if the second stage is the last one, as in our
5814 example), we first put the output of extract_even operation and then the
5815 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5816 The input for the second stage is:
5818 1st vec (E1): 0 2 4 6 8 10 12 14
5819 2nd vec (E3): 16 18 20 22 24 26 28 30
5820 3rd vec (E2): 1 3 5 7 9 11 13 15
5821 4th vec (E4): 17 19 21 23 25 27 29 31
5823 The output of the second stage:
5825 E1: 0 4 8 12 16 20 24 28
5826 E2: 2 6 10 14 18 22 26 30
5827 E3: 1 5 9 13 17 21 25 29
5828 E4: 3 7 11 15 19 23 27 31
5830 And RESULT_CHAIN after reordering:
5832 1st vec (E1): 0 4 8 12 16 20 24 28
5833 2nd vec (E3): 1 5 9 13 17 21 25 29
5834 3rd vec (E2): 2 6 10 14 18 22 26 30
5835 4th vec (E4): 3 7 11 15 19 23 27 31. */
5837 static bool
5838 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5839 unsigned int length,
5840 gimple stmt,
5841 gimple_stmt_iterator *gsi,
5842 VEC(tree,heap) **result_chain)
5844 tree perm_dest, data_ref, first_vect, second_vect;
5845 gimple perm_stmt;
5846 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5847 int i;
5848 unsigned int j;
5850 /* Check that the operation is supported. */
5851 if (!vect_strided_load_supported (vectype))
5852 return false;
5854 *result_chain = VEC_copy (tree, heap, dr_chain);
5855 for (i = 0; i < exact_log2 (length); i++)
5857 for (j = 0; j < length; j +=2)
5859 first_vect = VEC_index (tree, dr_chain, j);
5860 second_vect = VEC_index (tree, dr_chain, j+1);
5862 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5863 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5864 DECL_GIMPLE_REG_P (perm_dest) = 1;
5865 add_referenced_var (perm_dest);
5867 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5868 perm_dest, first_vect,
5869 second_vect);
5871 data_ref = make_ssa_name (perm_dest, perm_stmt);
5872 gimple_assign_set_lhs (perm_stmt, data_ref);
5873 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5874 mark_symbols_for_renaming (perm_stmt);
5876 VEC_replace (tree, *result_chain, j/2, data_ref);
5878 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5879 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5880 DECL_GIMPLE_REG_P (perm_dest) = 1;
5881 add_referenced_var (perm_dest);
5883 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5884 perm_dest, first_vect,
5885 second_vect);
5886 data_ref = make_ssa_name (perm_dest, perm_stmt);
5887 gimple_assign_set_lhs (perm_stmt, data_ref);
5888 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5889 mark_symbols_for_renaming (perm_stmt);
5891 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5893 dr_chain = VEC_copy (tree, heap, *result_chain);
5895 return true;
5899 /* Function vect_transform_strided_load.
5901 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5902 to perform their permutation and ascribe the result vectorized statements to
5903 the scalar statements.
5906 static bool
5907 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5908 gimple_stmt_iterator *gsi)
5910 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5911 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5912 gimple next_stmt, new_stmt;
5913 VEC(tree,heap) *result_chain = NULL;
5914 unsigned int i, gap_count;
5915 tree tmp_data_ref;
5917 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5918 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5919 vectors, that are ready for vector computation. */
5920 result_chain = VEC_alloc (tree, heap, size);
5921 /* Permute. */
5922 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5923 return false;
5925 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5926 Since we scan the chain starting from it's first node, their order
5927 corresponds the order of data-refs in RESULT_CHAIN. */
5928 next_stmt = first_stmt;
5929 gap_count = 1;
5930 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5932 if (!next_stmt)
5933 break;
5935 /* Skip the gaps. Loads created for the gaps will be removed by dead
5936 code elimination pass later. No need to check for the first stmt in
5937 the group, since it always exists.
5938 DR_GROUP_GAP is the number of steps in elements from the previous
5939 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5940 correspond to the gaps.
5942 if (next_stmt != first_stmt
5943 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5945 gap_count++;
5946 continue;
5949 while (next_stmt)
5951 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5952 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5953 copies, and we put the new vector statement in the first available
5954 RELATED_STMT. */
5955 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5956 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5957 else
5959 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5961 gimple prev_stmt =
5962 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5963 gimple rel_stmt =
5964 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5965 while (rel_stmt)
5967 prev_stmt = rel_stmt;
5968 rel_stmt =
5969 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5972 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5973 new_stmt;
5977 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5978 gap_count = 1;
5979 /* If NEXT_STMT accesses the same DR as the previous statement,
5980 put the same TMP_DATA_REF as its vectorized statement; otherwise
5981 get the next data-ref from RESULT_CHAIN. */
5982 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5983 break;
5987 VEC_free (tree, heap, result_chain);
5988 return true;
5992 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
5993 building a vector of type MASK_TYPE from it) and two input vectors placed in
5994 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
5995 shifting by STRIDE elements of DR_CHAIN for every copy.
5996 (STRIDE is the number of vectorized stmts for NODE divided by the number of
5997 copies).
5998 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
5999 the created stmts must be inserted. */
6001 static inline void
6002 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
6003 int *mask_array, int mask_nunits,
6004 tree mask_element_type, tree mask_type,
6005 int first_vec_indx, int second_vec_indx,
6006 gimple_stmt_iterator *gsi, slp_tree node,
6007 tree builtin_decl, tree vectype,
6008 VEC(tree,heap) *dr_chain,
6009 int ncopies, int vect_stmts_counter)
6011 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6012 gimple perm_stmt = NULL;
6013 stmt_vec_info next_stmt_info;
6014 int i, group_size, stride, dr_chain_size;
6015 tree first_vec, second_vec, data_ref;
6016 tree sym;
6017 ssa_op_iter iter;
6018 VEC (tree, heap) *params = NULL;
6020 /* Create a vector mask. */
6021 for (i = mask_nunits - 1; i >= 0; --i)
6022 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6024 mask_vec = build_vector (mask_type, t);
6025 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6027 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6028 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6029 dr_chain_size = VEC_length (tree, dr_chain);
6031 /* Initialize the vect stmts of NODE to properly insert the generated
6032 stmts later. */
6033 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6034 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6035 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6037 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6038 for (i = 0; i < ncopies; i++)
6040 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6041 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6043 /* Build argument list for the vectorized call. */
6044 VEC_free (tree, heap, params);
6045 params = VEC_alloc (tree, heap, 3);
6046 VEC_quick_push (tree, params, first_vec);
6047 VEC_quick_push (tree, params, second_vec);
6048 VEC_quick_push (tree, params, mask);
6050 /* Generate the permute statement. */
6051 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6052 data_ref = make_ssa_name (perm_dest, perm_stmt);
6053 gimple_call_set_lhs (perm_stmt, data_ref);
6054 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6055 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6057 if (TREE_CODE (sym) == SSA_NAME)
6058 sym = SSA_NAME_VAR (sym);
6059 mark_sym_for_renaming (sym);
6062 /* Store the vector statement in NODE. */
6063 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6064 stride * i + vect_stmts_counter, perm_stmt);
6066 first_vec_indx += stride;
6067 second_vec_indx += stride;
6070 /* Mark the scalar stmt as vectorized. */
6071 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6072 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6076 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6077 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6078 representation. Check that the mask is valid and return FALSE if not.
6079 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
6080 the next vector, i.e., the current first vector is not needed. */
6082 static bool
6083 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6084 int mask_nunits, bool only_one_vec, int index,
6085 int *mask, int *current_mask_element,
6086 bool *need_next_vector)
6088 int i;
6089 static int number_of_mask_fixes = 1;
6090 static bool mask_fixed = false;
6091 static bool needs_first_vector = false;
6093 /* Convert to target specific representation. */
6094 *current_mask_element = first_mask_element + m;
6095 /* Adjust the value in case it's a mask for second and third vectors. */
6096 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6098 if (*current_mask_element < mask_nunits)
6099 needs_first_vector = true;
6101 /* We have only one input vector to permute but the mask accesses values in
6102 the next vector as well. */
6103 if (only_one_vec && *current_mask_element >= mask_nunits)
6105 if (vect_print_dump_info (REPORT_DETAILS))
6107 fprintf (vect_dump, "permutation requires at least two vectors ");
6108 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6111 return false;
6114 /* The mask requires the next vector. */
6115 if (*current_mask_element >= mask_nunits * 2)
6117 if (needs_first_vector || mask_fixed)
6119 /* We either need the first vector too or have already moved to the
6120 next vector. In both cases, this permutation needs three
6121 vectors. */
6122 if (vect_print_dump_info (REPORT_DETAILS))
6124 fprintf (vect_dump, "permutation requires at "
6125 "least three vectors ");
6126 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6129 return false;
6132 /* We move to the next vector, dropping the first one and working with
6133 the second and the third - we need to adjust the values of the mask
6134 accordingly. */
6135 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6137 for (i = 0; i < index; i++)
6138 mask[i] -= mask_nunits * number_of_mask_fixes;
6140 (number_of_mask_fixes)++;
6141 mask_fixed = true;
6144 *need_next_vector = mask_fixed;
6146 /* This was the last element of this mask. Start a new one. */
6147 if (index == mask_nunits - 1)
6149 number_of_mask_fixes = 1;
6150 mask_fixed = false;
6151 needs_first_vector = false;
6154 return true;
6158 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6159 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6160 permute statements for SLP_NODE_INSTANCE. */
6161 bool
6162 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6163 gimple_stmt_iterator *gsi, int vf,
6164 slp_instance slp_node_instance, bool analyze_only)
6166 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6167 tree mask_element_type = NULL_TREE, mask_type;
6168 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6169 slp_tree node;
6170 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6171 gimple next_scalar_stmt;
6172 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6173 int first_mask_element;
6174 int index, unroll_factor, *mask, current_mask_element, ncopies;
6175 bool only_one_vec = false, need_next_vector = false;
6176 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6178 if (!targetm.vectorize.builtin_vec_perm)
6180 if (vect_print_dump_info (REPORT_DETAILS))
6182 fprintf (vect_dump, "no builtin for vect permute for ");
6183 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6186 return false;
6189 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6190 &mask_element_type);
6191 if (!builtin_decl || !mask_element_type)
6193 if (vect_print_dump_info (REPORT_DETAILS))
6195 fprintf (vect_dump, "no builtin for vect permute for ");
6196 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6199 return false;
6202 mask_type = get_vectype_for_scalar_type (mask_element_type);
6203 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6204 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6205 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6206 scale = mask_nunits / nunits;
6207 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6209 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6210 unrolling factor. */
6211 orig_vec_stmts_num = group_size *
6212 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6213 if (orig_vec_stmts_num == 1)
6214 only_one_vec = true;
6216 /* Number of copies is determined by the final vectorization factor
6217 relatively to SLP_NODE_INSTANCE unrolling factor. */
6218 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6220 /* Generate permutation masks for every NODE. Number of masks for each NODE
6221 is equal to GROUP_SIZE.
6222 E.g., we have a group of three nodes with three loads from the same
6223 location in each node, and the vector size is 4. I.e., we have a
6224 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6225 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6226 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6229 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
6230 scpecific type, e.g., in bytes for Altivec.
6231 The last mask is illegal since we assume two operands for permute
6232 operation, and the mask element values can't be outside that range. Hence,
6233 the last mask must be converted into {2,5,5,5}.
6234 For the first two permutations we need the first and the second input
6235 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6236 we need the second and the third vectors: {b1,c1,a2,b2} and
6237 {c2,a3,b3,c3}. */
6239 for (i = 0;
6240 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6241 i, node);
6242 i++)
6244 scalar_index = 0;
6245 index = 0;
6246 vect_stmts_counter = 0;
6247 vec_index = 0;
6248 first_vec_index = vec_index++;
6249 if (only_one_vec)
6250 second_vec_index = first_vec_index;
6251 else
6252 second_vec_index = vec_index++;
6254 for (j = 0; j < unroll_factor; j++)
6256 for (k = 0; k < group_size; k++)
6258 first_mask_element = (i + j * group_size) * scale;
6259 for (m = 0; m < scale; m++)
6261 if (!vect_get_mask_element (stmt, first_mask_element, m,
6262 mask_nunits, only_one_vec, index, mask,
6263 &current_mask_element, &need_next_vector))
6264 return false;
6266 mask[index++] = current_mask_element;
6269 if (index == mask_nunits)
6271 index = 0;
6272 if (!analyze_only)
6274 if (need_next_vector)
6276 first_vec_index = second_vec_index;
6277 second_vec_index = vec_index;
6280 next_scalar_stmt = VEC_index (gimple,
6281 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6283 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6284 mask, mask_nunits, mask_element_type, mask_type,
6285 first_vec_index, second_vec_index, gsi, node,
6286 builtin_decl, vectype, dr_chain, ncopies,
6287 vect_stmts_counter++);
6294 free (mask);
6295 return true;
6298 /* vectorizable_load.
6300 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
6301 can be vectorized.
6302 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6303 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6304 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6306 bool
6307 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6308 slp_tree slp_node, slp_instance slp_node_instance)
6310 tree scalar_dest;
6311 tree vec_dest = NULL;
6312 tree data_ref = NULL;
6313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6314 stmt_vec_info prev_stmt_info;
6315 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6317 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6318 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6319 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6320 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6321 tree new_temp;
6322 int mode;
6323 gimple new_stmt = NULL;
6324 tree dummy;
6325 enum dr_alignment_support alignment_support_scheme;
6326 tree dataref_ptr = NULL_TREE;
6327 gimple ptr_incr;
6328 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6329 int ncopies;
6330 int i, j, group_size;
6331 tree msq = NULL_TREE, lsq;
6332 tree offset = NULL_TREE;
6333 tree realignment_token = NULL_TREE;
6334 gimple phi = NULL;
6335 VEC(tree,heap) *dr_chain = NULL;
6336 bool strided_load = false;
6337 gimple first_stmt;
6338 tree scalar_type;
6339 bool inv_p;
6340 bool compute_in_loop = false;
6341 struct loop *at_loop;
6342 int vec_num;
6343 bool slp = (slp_node != NULL);
6344 bool slp_perm = false;
6345 enum tree_code code;
6347 /* Multiple types in SLP are handled by creating the appropriate number of
6348 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6349 case of SLP. */
6350 if (slp)
6351 ncopies = 1;
6352 else
6353 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6355 gcc_assert (ncopies >= 1);
6357 /* FORNOW. This restriction should be relaxed. */
6358 if (nested_in_vect_loop && ncopies > 1)
6360 if (vect_print_dump_info (REPORT_DETAILS))
6361 fprintf (vect_dump, "multiple types in nested loop.");
6362 return false;
6365 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6366 slp_perm = true;
6368 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6369 return false;
6371 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6372 return false;
6374 /* Is vectorizable load? */
6375 if (!is_gimple_assign (stmt))
6376 return false;
6378 scalar_dest = gimple_assign_lhs (stmt);
6379 if (TREE_CODE (scalar_dest) != SSA_NAME)
6380 return false;
6382 code = gimple_assign_rhs_code (stmt);
6383 if (code != ARRAY_REF
6384 && code != INDIRECT_REF
6385 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6386 return false;
6388 if (!STMT_VINFO_DATA_REF (stmt_info))
6389 return false;
6391 scalar_type = TREE_TYPE (DR_REF (dr));
6392 mode = (int) TYPE_MODE (vectype);
6394 /* FORNOW. In some cases can vectorize even if data-type not supported
6395 (e.g. - data copies). */
6396 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6398 if (vect_print_dump_info (REPORT_DETAILS))
6399 fprintf (vect_dump, "Aligned load, but unsupported type.");
6400 return false;
6403 /* The vector component type needs to be trivially convertible to the
6404 scalar lhs. This should always be the case. */
6405 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6407 if (vect_print_dump_info (REPORT_DETAILS))
6408 fprintf (vect_dump, "??? operands of different types");
6409 return false;
6412 /* Check if the load is a part of an interleaving chain. */
6413 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6415 strided_load = true;
6416 /* FORNOW */
6417 gcc_assert (! nested_in_vect_loop);
6419 /* Check if interleaving is supported. */
6420 if (!vect_strided_load_supported (vectype)
6421 && !PURE_SLP_STMT (stmt_info) && !slp)
6422 return false;
6425 if (!vec_stmt) /* transformation not required. */
6427 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6428 vect_model_load_cost (stmt_info, ncopies, NULL);
6429 return true;
6432 if (vect_print_dump_info (REPORT_DETAILS))
6433 fprintf (vect_dump, "transform load.");
6435 /** Transform. **/
6437 if (strided_load)
6439 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6440 /* Check if the chain of loads is already vectorized. */
6441 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6443 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6444 return true;
6446 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6447 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6449 /* VEC_NUM is the number of vect stmts to be created for this group. */
6450 if (slp)
6452 strided_load = false;
6453 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6455 else
6456 vec_num = group_size;
6458 dr_chain = VEC_alloc (tree, heap, vec_num);
6460 else
6462 first_stmt = stmt;
6463 first_dr = dr;
6464 group_size = vec_num = 1;
6467 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6468 gcc_assert (alignment_support_scheme);
6470 /* In case the vectorization factor (VF) is bigger than the number
6471 of elements that we can fit in a vectype (nunits), we have to generate
6472 more than one vector stmt - i.e - we need to "unroll" the
6473 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6474 from one copy of the vector stmt to the next, in the field
6475 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6476 stages to find the correct vector defs to be used when vectorizing
6477 stmts that use the defs of the current stmt. The example below illustrates
6478 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6479 4 vectorized stmts):
6481 before vectorization:
6482 RELATED_STMT VEC_STMT
6483 S1: x = memref - -
6484 S2: z = x + 1 - -
6486 step 1: vectorize stmt S1:
6487 We first create the vector stmt VS1_0, and, as usual, record a
6488 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6489 Next, we create the vector stmt VS1_1, and record a pointer to
6490 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6491 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6492 stmts and pointers:
6493 RELATED_STMT VEC_STMT
6494 VS1_0: vx0 = memref0 VS1_1 -
6495 VS1_1: vx1 = memref1 VS1_2 -
6496 VS1_2: vx2 = memref2 VS1_3 -
6497 VS1_3: vx3 = memref3 - -
6498 S1: x = load - VS1_0
6499 S2: z = x + 1 - -
6501 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6502 information we recorded in RELATED_STMT field is used to vectorize
6503 stmt S2. */
6505 /* In case of interleaving (non-unit strided access):
6507 S1: x2 = &base + 2
6508 S2: x0 = &base
6509 S3: x1 = &base + 1
6510 S4: x3 = &base + 3
6512 Vectorized loads are created in the order of memory accesses
6513 starting from the access of the first stmt of the chain:
6515 VS1: vx0 = &base
6516 VS2: vx1 = &base + vec_size*1
6517 VS3: vx3 = &base + vec_size*2
6518 VS4: vx4 = &base + vec_size*3
6520 Then permutation statements are generated:
6522 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6523 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6526 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6527 (the order of the data-refs in the output of vect_permute_load_chain
6528 corresponds to the order of scalar stmts in the interleaving chain - see
6529 the documentation of vect_permute_load_chain()).
6530 The generation of permutation stmts and recording them in
6531 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6533 In case of both multiple types and interleaving, the vector loads and
6534 permutation stmts above are created for every copy. The result vector stmts
6535 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6536 STMT_VINFO_RELATED_STMT for the next copies. */
6538 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6539 on a target that supports unaligned accesses (dr_unaligned_supported)
6540 we generate the following code:
6541 p = initial_addr;
6542 indx = 0;
6543 loop {
6544 p = p + indx * vectype_size;
6545 vec_dest = *(p);
6546 indx = indx + 1;
6549 Otherwise, the data reference is potentially unaligned on a target that
6550 does not support unaligned accesses (dr_explicit_realign_optimized) -
6551 then generate the following code, in which the data in each iteration is
6552 obtained by two vector loads, one from the previous iteration, and one
6553 from the current iteration:
6554 p1 = initial_addr;
6555 msq_init = *(floor(p1))
6556 p2 = initial_addr + VS - 1;
6557 realignment_token = call target_builtin;
6558 indx = 0;
6559 loop {
6560 p2 = p2 + indx * vectype_size
6561 lsq = *(floor(p2))
6562 vec_dest = realign_load (msq, lsq, realignment_token)
6563 indx = indx + 1;
6564 msq = lsq;
6565 } */
6567 /* If the misalignment remains the same throughout the execution of the
6568 loop, we can create the init_addr and permutation mask at the loop
6569 preheader. Otherwise, it needs to be created inside the loop.
6570 This can only occur when vectorizing memory accesses in the inner-loop
6571 nested within an outer-loop that is being vectorized. */
6573 if (nested_in_vect_loop_p (loop, stmt)
6574 && (TREE_INT_CST_LOW (DR_STEP (dr))
6575 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6577 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6578 compute_in_loop = true;
6581 if ((alignment_support_scheme == dr_explicit_realign_optimized
6582 || alignment_support_scheme == dr_explicit_realign)
6583 && !compute_in_loop)
6585 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6586 alignment_support_scheme, NULL_TREE,
6587 &at_loop);
6588 if (alignment_support_scheme == dr_explicit_realign_optimized)
6590 phi = SSA_NAME_DEF_STMT (msq);
6591 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6594 else
6595 at_loop = loop;
6597 prev_stmt_info = NULL;
6598 for (j = 0; j < ncopies; j++)
6600 /* 1. Create the vector pointer update chain. */
6601 if (j == 0)
6602 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6603 at_loop, offset,
6604 &dummy, &ptr_incr, false,
6605 &inv_p, NULL_TREE);
6606 else
6607 dataref_ptr =
6608 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6610 for (i = 0; i < vec_num; i++)
6612 if (i > 0)
6613 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6614 NULL_TREE);
6616 /* 2. Create the vector-load in the loop. */
6617 switch (alignment_support_scheme)
6619 case dr_aligned:
6620 gcc_assert (aligned_access_p (first_dr));
6621 data_ref = build_fold_indirect_ref (dataref_ptr);
6622 break;
6623 case dr_unaligned_supported:
6625 int mis = DR_MISALIGNMENT (first_dr);
6626 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6628 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
6629 data_ref =
6630 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6631 break;
6633 case dr_explicit_realign:
6635 tree ptr, bump;
6636 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6638 if (compute_in_loop)
6639 msq = vect_setup_realignment (first_stmt, gsi,
6640 &realignment_token,
6641 dr_explicit_realign,
6642 dataref_ptr, NULL);
6644 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6645 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6646 new_stmt = gimple_build_assign (vec_dest, data_ref);
6647 new_temp = make_ssa_name (vec_dest, new_stmt);
6648 gimple_assign_set_lhs (new_stmt, new_temp);
6649 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6650 copy_virtual_operands (new_stmt, stmt);
6651 mark_symbols_for_renaming (new_stmt);
6652 msq = new_temp;
6654 bump = size_binop (MULT_EXPR, vs_minus_1,
6655 TYPE_SIZE_UNIT (scalar_type));
6656 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6657 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6658 break;
6660 case dr_explicit_realign_optimized:
6661 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6662 break;
6663 default:
6664 gcc_unreachable ();
6666 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6667 new_stmt = gimple_build_assign (vec_dest, data_ref);
6668 new_temp = make_ssa_name (vec_dest, new_stmt);
6669 gimple_assign_set_lhs (new_stmt, new_temp);
6670 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6671 mark_symbols_for_renaming (new_stmt);
6673 /* 3. Handle explicit realignment if necessary/supported. Create in
6674 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6675 if (alignment_support_scheme == dr_explicit_realign_optimized
6676 || alignment_support_scheme == dr_explicit_realign)
6678 tree tmp;
6680 lsq = gimple_assign_lhs (new_stmt);
6681 if (!realignment_token)
6682 realignment_token = dataref_ptr;
6683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6684 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6685 realignment_token);
6686 new_stmt = gimple_build_assign (vec_dest, tmp);
6687 new_temp = make_ssa_name (vec_dest, new_stmt);
6688 gimple_assign_set_lhs (new_stmt, new_temp);
6689 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6691 if (alignment_support_scheme == dr_explicit_realign_optimized)
6693 gcc_assert (phi);
6694 if (i == vec_num - 1 && j == ncopies - 1)
6695 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6696 msq = lsq;
6700 /* 4. Handle invariant-load. */
6701 if (inv_p)
6703 gcc_assert (!strided_load);
6704 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6705 if (j == 0)
6707 int k;
6708 tree t = NULL_TREE;
6709 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6711 /* CHECKME: bitpos depends on endianess? */
6712 bitpos = bitsize_zero_node;
6713 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6714 bitsize, bitpos);
6715 vec_dest =
6716 vect_create_destination_var (scalar_dest, NULL_TREE);
6717 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6718 new_temp = make_ssa_name (vec_dest, new_stmt);
6719 gimple_assign_set_lhs (new_stmt, new_temp);
6720 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6722 for (k = nunits - 1; k >= 0; --k)
6723 t = tree_cons (NULL_TREE, new_temp, t);
6724 /* FIXME: use build_constructor directly. */
6725 vec_inv = build_constructor_from_list (vectype, t);
6726 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6727 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6729 else
6730 gcc_unreachable (); /* FORNOW. */
6733 /* Collect vector loads and later create their permutation in
6734 vect_transform_strided_load (). */
6735 if (strided_load || slp_perm)
6736 VEC_quick_push (tree, dr_chain, new_temp);
6738 /* Store vector loads in the corresponding SLP_NODE. */
6739 if (slp && !slp_perm)
6740 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6743 if (slp && !slp_perm)
6744 continue;
6746 if (slp_perm)
6748 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6749 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6750 slp_node_instance, false))
6752 VEC_free (tree, heap, dr_chain);
6753 return false;
6756 else
6758 if (strided_load)
6760 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6761 return false;
6763 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6764 VEC_free (tree, heap, dr_chain);
6765 dr_chain = VEC_alloc (tree, heap, group_size);
6767 else
6769 if (j == 0)
6770 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6771 else
6772 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6773 prev_stmt_info = vinfo_for_stmt (new_stmt);
6778 if (dr_chain)
6779 VEC_free (tree, heap, dr_chain);
6781 return true;
6785 /* Function vectorizable_live_operation.
6787 STMT computes a value that is used outside the loop. Check if
6788 it can be supported. */
6790 bool
6791 vectorizable_live_operation (gimple stmt,
6792 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6793 gimple *vec_stmt ATTRIBUTE_UNUSED)
6795 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6798 int i;
6799 int op_type;
6800 tree op;
6801 tree def;
6802 gimple def_stmt;
6803 enum vect_def_type dt;
6804 enum tree_code code;
6805 enum gimple_rhs_class rhs_class;
6807 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6809 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6810 return false;
6812 if (!is_gimple_assign (stmt))
6813 return false;
6815 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6816 return false;
6818 /* FORNOW. CHECKME. */
6819 if (nested_in_vect_loop_p (loop, stmt))
6820 return false;
6822 code = gimple_assign_rhs_code (stmt);
6823 op_type = TREE_CODE_LENGTH (code);
6824 rhs_class = get_gimple_rhs_class (code);
6825 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6826 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6828 /* FORNOW: support only if all uses are invariant. This means
6829 that the scalar operations can remain in place, unvectorized.
6830 The original last scalar value that they compute will be used. */
6832 for (i = 0; i < op_type; i++)
6834 if (rhs_class == GIMPLE_SINGLE_RHS)
6835 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6836 else
6837 op = gimple_op (stmt, i + 1);
6838 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6840 if (vect_print_dump_info (REPORT_DETAILS))
6841 fprintf (vect_dump, "use not simple.");
6842 return false;
6845 if (dt != vect_invariant_def && dt != vect_constant_def)
6846 return false;
6849 /* No transformation is required for the cases we currently support. */
6850 return true;
6854 /* Function vect_is_simple_cond.
6856 Input:
6857 LOOP - the loop that is being vectorized.
6858 COND - Condition that is checked for simple use.
6860 Returns whether a COND can be vectorized. Checks whether
6861 condition operands are supportable using vec_is_simple_use. */
6863 static bool
6864 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6866 tree lhs, rhs;
6867 tree def;
6868 enum vect_def_type dt;
6870 if (!COMPARISON_CLASS_P (cond))
6871 return false;
6873 lhs = TREE_OPERAND (cond, 0);
6874 rhs = TREE_OPERAND (cond, 1);
6876 if (TREE_CODE (lhs) == SSA_NAME)
6878 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6879 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6880 return false;
6882 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6883 && TREE_CODE (lhs) != FIXED_CST)
6884 return false;
6886 if (TREE_CODE (rhs) == SSA_NAME)
6888 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6889 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6890 return false;
6892 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6893 && TREE_CODE (rhs) != FIXED_CST)
6894 return false;
6896 return true;
6899 /* vectorizable_condition.
6901 Check if STMT is conditional modify expression that can be vectorized.
6902 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6903 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6904 at BSI.
6906 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6908 bool
6909 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6910 gimple *vec_stmt)
6912 tree scalar_dest = NULL_TREE;
6913 tree vec_dest = NULL_TREE;
6914 tree op = NULL_TREE;
6915 tree cond_expr, then_clause, else_clause;
6916 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6918 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6919 tree vec_compare, vec_cond_expr;
6920 tree new_temp;
6921 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6922 enum machine_mode vec_mode;
6923 tree def;
6924 enum vect_def_type dt;
6925 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6926 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6927 enum tree_code code;
6929 gcc_assert (ncopies >= 1);
6930 if (ncopies > 1)
6931 return false; /* FORNOW */
6933 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6934 return false;
6936 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6937 return false;
6939 /* FORNOW: SLP not supported. */
6940 if (STMT_SLP_TYPE (stmt_info))
6941 return false;
6943 /* FORNOW: not yet supported. */
6944 if (STMT_VINFO_LIVE_P (stmt_info))
6946 if (vect_print_dump_info (REPORT_DETAILS))
6947 fprintf (vect_dump, "value used after loop.");
6948 return false;
6951 /* Is vectorizable conditional operation? */
6952 if (!is_gimple_assign (stmt))
6953 return false;
6955 code = gimple_assign_rhs_code (stmt);
6957 if (code != COND_EXPR)
6958 return false;
6960 gcc_assert (gimple_assign_single_p (stmt));
6961 op = gimple_assign_rhs1 (stmt);
6962 cond_expr = TREE_OPERAND (op, 0);
6963 then_clause = TREE_OPERAND (op, 1);
6964 else_clause = TREE_OPERAND (op, 2);
6966 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6967 return false;
6969 /* We do not handle two different vector types for the condition
6970 and the values. */
6971 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6972 return false;
6974 if (TREE_CODE (then_clause) == SSA_NAME)
6976 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6977 if (!vect_is_simple_use (then_clause, loop_vinfo,
6978 &then_def_stmt, &def, &dt))
6979 return false;
6981 else if (TREE_CODE (then_clause) != INTEGER_CST
6982 && TREE_CODE (then_clause) != REAL_CST
6983 && TREE_CODE (then_clause) != FIXED_CST)
6984 return false;
6986 if (TREE_CODE (else_clause) == SSA_NAME)
6988 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6989 if (!vect_is_simple_use (else_clause, loop_vinfo,
6990 &else_def_stmt, &def, &dt))
6991 return false;
6993 else if (TREE_CODE (else_clause) != INTEGER_CST
6994 && TREE_CODE (else_clause) != REAL_CST
6995 && TREE_CODE (else_clause) != FIXED_CST)
6996 return false;
6999 vec_mode = TYPE_MODE (vectype);
7001 if (!vec_stmt)
7003 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
7004 return expand_vec_cond_expr_p (op, vec_mode);
7007 /* Transform */
7009 /* Handle def. */
7010 scalar_dest = gimple_assign_lhs (stmt);
7011 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7013 /* Handle cond expr. */
7014 vec_cond_lhs =
7015 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7016 vec_cond_rhs =
7017 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7018 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7019 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7021 /* Arguments are ready. Create the new vector stmt. */
7022 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7023 vec_cond_lhs, vec_cond_rhs);
7024 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7025 vec_compare, vec_then_clause, vec_else_clause);
7027 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7028 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7029 gimple_assign_set_lhs (*vec_stmt, new_temp);
7030 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7032 return true;
7036 /* Function vect_transform_stmt.
7038 Create a vectorized stmt to replace STMT, and insert it at BSI. */
7040 static bool
7041 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7042 bool *strided_store, slp_tree slp_node,
7043 slp_instance slp_node_instance)
7045 bool is_store = false;
7046 gimple vec_stmt = NULL;
7047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7048 gimple orig_stmt_in_pattern;
7049 bool done;
7051 switch (STMT_VINFO_TYPE (stmt_info))
7053 case type_demotion_vec_info_type:
7054 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7055 gcc_assert (done);
7056 break;
7058 case type_promotion_vec_info_type:
7059 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7060 gcc_assert (done);
7061 break;
7063 case type_conversion_vec_info_type:
7064 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7065 gcc_assert (done);
7066 break;
7068 case induc_vec_info_type:
7069 gcc_assert (!slp_node);
7070 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7071 gcc_assert (done);
7072 break;
7074 case op_vec_info_type:
7075 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7076 gcc_assert (done);
7077 break;
7079 case assignment_vec_info_type:
7080 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7081 gcc_assert (done);
7082 break;
7084 case load_vec_info_type:
7085 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7086 slp_node_instance);
7087 gcc_assert (done);
7088 break;
7090 case store_vec_info_type:
7091 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7092 gcc_assert (done);
7093 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7095 /* In case of interleaving, the whole chain is vectorized when the
7096 last store in the chain is reached. Store stmts before the last
7097 one are skipped, and there vec_stmt_info shouldn't be freed
7098 meanwhile. */
7099 *strided_store = true;
7100 if (STMT_VINFO_VEC_STMT (stmt_info))
7101 is_store = true;
7103 else
7104 is_store = true;
7105 break;
7107 case condition_vec_info_type:
7108 gcc_assert (!slp_node);
7109 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7110 gcc_assert (done);
7111 break;
7113 case call_vec_info_type:
7114 gcc_assert (!slp_node);
7115 done = vectorizable_call (stmt, gsi, &vec_stmt);
7116 break;
7118 case reduc_vec_info_type:
7119 gcc_assert (!slp_node);
7120 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7121 gcc_assert (done);
7122 break;
7124 default:
7125 if (!STMT_VINFO_LIVE_P (stmt_info))
7127 if (vect_print_dump_info (REPORT_DETAILS))
7128 fprintf (vect_dump, "stmt not supported.");
7129 gcc_unreachable ();
7133 if (STMT_VINFO_LIVE_P (stmt_info)
7134 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7136 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7137 gcc_assert (done);
7140 if (vec_stmt)
7142 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7143 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7144 if (orig_stmt_in_pattern)
7146 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7147 /* STMT was inserted by the vectorizer to replace a computation idiom.
7148 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7149 computed this idiom. We need to record a pointer to VEC_STMT in
7150 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7151 documentation of vect_pattern_recog. */
7152 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7154 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7155 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7160 return is_store;
7164 /* This function builds ni_name = number of iterations loop executes
7165 on the loop preheader. */
7167 static tree
7168 vect_build_loop_niters (loop_vec_info loop_vinfo)
7170 tree ni_name, var;
7171 gimple_seq stmts = NULL;
7172 edge pe;
7173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7174 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7176 var = create_tmp_var (TREE_TYPE (ni), "niters");
7177 add_referenced_var (var);
7178 ni_name = force_gimple_operand (ni, &stmts, false, var);
7180 pe = loop_preheader_edge (loop);
7181 if (stmts)
7183 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7184 gcc_assert (!new_bb);
7187 return ni_name;
7191 /* This function generates the following statements:
7193 ni_name = number of iterations loop executes
7194 ratio = ni_name / vf
7195 ratio_mult_vf_name = ratio * vf
7197 and places them at the loop preheader edge. */
7199 static void
7200 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7201 tree *ni_name_ptr,
7202 tree *ratio_mult_vf_name_ptr,
7203 tree *ratio_name_ptr)
7206 edge pe;
7207 basic_block new_bb;
7208 gimple_seq stmts;
7209 tree ni_name;
7210 tree var;
7211 tree ratio_name;
7212 tree ratio_mult_vf_name;
7213 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7214 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7215 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7216 tree log_vf;
7218 pe = loop_preheader_edge (loop);
7220 /* Generate temporary variable that contains
7221 number of iterations loop executes. */
7223 ni_name = vect_build_loop_niters (loop_vinfo);
7224 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7226 /* Create: ratio = ni >> log2(vf) */
7228 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7229 if (!is_gimple_val (ratio_name))
7231 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7232 add_referenced_var (var);
7234 stmts = NULL;
7235 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7236 pe = loop_preheader_edge (loop);
7237 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7238 gcc_assert (!new_bb);
7241 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7243 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7244 ratio_name, log_vf);
7245 if (!is_gimple_val (ratio_mult_vf_name))
7247 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7248 add_referenced_var (var);
7250 stmts = NULL;
7251 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7252 true, var);
7253 pe = loop_preheader_edge (loop);
7254 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7255 gcc_assert (!new_bb);
7258 *ni_name_ptr = ni_name;
7259 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7260 *ratio_name_ptr = ratio_name;
7262 return;
7266 /* Function vect_update_ivs_after_vectorizer.
7268 "Advance" the induction variables of LOOP to the value they should take
7269 after the execution of LOOP. This is currently necessary because the
7270 vectorizer does not handle induction variables that are used after the
7271 loop. Such a situation occurs when the last iterations of LOOP are
7272 peeled, because:
7273 1. We introduced new uses after LOOP for IVs that were not originally used
7274 after LOOP: the IVs of LOOP are now used by an epilog loop.
7275 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7276 times, whereas the loop IVs should be bumped N times.
7278 Input:
7279 - LOOP - a loop that is going to be vectorized. The last few iterations
7280 of LOOP were peeled.
7281 - NITERS - the number of iterations that LOOP executes (before it is
7282 vectorized). i.e, the number of times the ivs should be bumped.
7283 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7284 coming out from LOOP on which there are uses of the LOOP ivs
7285 (this is the path from LOOP->exit to epilog_loop->preheader).
7287 The new definitions of the ivs are placed in LOOP->exit.
7288 The phi args associated with the edge UPDATE_E in the bb
7289 UPDATE_E->dest are updated accordingly.
7291 Assumption 1: Like the rest of the vectorizer, this function assumes
7292 a single loop exit that has a single predecessor.
7294 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7295 organized in the same order.
7297 Assumption 3: The access function of the ivs is simple enough (see
7298 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7300 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7301 coming out of LOOP on which the ivs of LOOP are used (this is the path
7302 that leads to the epilog loop; other paths skip the epilog loop). This
7303 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7304 needs to have its phis updated.
7307 static void
7308 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7309 edge update_e)
7311 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7312 basic_block exit_bb = single_exit (loop)->dest;
7313 gimple phi, phi1;
7314 gimple_stmt_iterator gsi, gsi1;
7315 basic_block update_bb = update_e->dest;
7317 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7319 /* Make sure there exists a single-predecessor exit bb: */
7320 gcc_assert (single_pred_p (exit_bb));
7322 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7323 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7324 gsi_next (&gsi), gsi_next (&gsi1))
7326 tree access_fn = NULL;
7327 tree evolution_part;
7328 tree init_expr;
7329 tree step_expr;
7330 tree var, ni, ni_name;
7331 gimple_stmt_iterator last_gsi;
7333 phi = gsi_stmt (gsi);
7334 phi1 = gsi_stmt (gsi1);
7335 if (vect_print_dump_info (REPORT_DETAILS))
7337 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7338 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7341 /* Skip virtual phi's. */
7342 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7344 if (vect_print_dump_info (REPORT_DETAILS))
7345 fprintf (vect_dump, "virtual phi. skip.");
7346 continue;
7349 /* Skip reduction phis. */
7350 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7352 if (vect_print_dump_info (REPORT_DETAILS))
7353 fprintf (vect_dump, "reduc phi. skip.");
7354 continue;
7357 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7358 gcc_assert (access_fn);
7359 STRIP_NOPS (access_fn);
7360 evolution_part =
7361 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7362 gcc_assert (evolution_part != NULL_TREE);
7364 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7365 of degree >= 2 or exponential. */
7366 gcc_assert (!tree_is_chrec (evolution_part));
7368 step_expr = evolution_part;
7369 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7370 loop->num));
7372 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7373 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7374 init_expr,
7375 fold_convert (sizetype,
7376 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7377 niters, step_expr)));
7378 else
7379 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7380 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7381 fold_convert (TREE_TYPE (init_expr),
7382 niters),
7383 step_expr),
7384 init_expr);
7388 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7389 add_referenced_var (var);
7391 last_gsi = gsi_last_bb (exit_bb);
7392 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7393 true, GSI_SAME_STMT);
7395 /* Fix phi expressions in the successor bb. */
7396 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7400 /* Return the more conservative threshold between the
7401 min_profitable_iters returned by the cost model and the user
7402 specified threshold, if provided. */
7404 static unsigned int
7405 conservative_cost_threshold (loop_vec_info loop_vinfo,
7406 int min_profitable_iters)
7408 unsigned int th;
7409 int min_scalar_loop_bound;
7411 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7412 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7414 /* Use the cost model only if it is more conservative than user specified
7415 threshold. */
7416 th = (unsigned) min_scalar_loop_bound;
7417 if (min_profitable_iters
7418 && (!min_scalar_loop_bound
7419 || min_profitable_iters > min_scalar_loop_bound))
7420 th = (unsigned) min_profitable_iters;
7422 if (th && vect_print_dump_info (REPORT_COST))
7423 fprintf (vect_dump, "Vectorization may not be profitable.");
7425 return th;
7428 /* Function vect_do_peeling_for_loop_bound
7430 Peel the last iterations of the loop represented by LOOP_VINFO.
7431 The peeled iterations form a new epilog loop. Given that the loop now
7432 iterates NITERS times, the new epilog loop iterates
7433 NITERS % VECTORIZATION_FACTOR times.
7435 The original loop will later be made to iterate
7436 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7438 static void
7439 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7441 tree ni_name, ratio_mult_vf_name;
7442 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7443 struct loop *new_loop;
7444 edge update_e;
7445 basic_block preheader;
7446 int loop_num;
7447 bool check_profitability = false;
7448 unsigned int th = 0;
7449 int min_profitable_iters;
7451 if (vect_print_dump_info (REPORT_DETAILS))
7452 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7454 initialize_original_copy_tables ();
7456 /* Generate the following variables on the preheader of original loop:
7458 ni_name = number of iteration the original loop executes
7459 ratio = ni_name / vf
7460 ratio_mult_vf_name = ratio * vf */
7461 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7462 &ratio_mult_vf_name, ratio);
7464 loop_num = loop->num;
7466 /* If cost model check not done during versioning and
7467 peeling for alignment. */
7468 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7469 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7470 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7472 check_profitability = true;
7474 /* Get profitability threshold for vectorized loop. */
7475 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7477 th = conservative_cost_threshold (loop_vinfo,
7478 min_profitable_iters);
7481 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7482 ratio_mult_vf_name, ni_name, false,
7483 th, check_profitability);
7484 gcc_assert (new_loop);
7485 gcc_assert (loop_num == loop->num);
7486 #ifdef ENABLE_CHECKING
7487 slpeel_verify_cfg_after_peeling (loop, new_loop);
7488 #endif
7490 /* A guard that controls whether the new_loop is to be executed or skipped
7491 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7492 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7493 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7494 is on the path where the LOOP IVs are used and need to be updated. */
7496 preheader = loop_preheader_edge (new_loop)->src;
7497 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7498 update_e = EDGE_PRED (preheader, 0);
7499 else
7500 update_e = EDGE_PRED (preheader, 1);
7502 /* Update IVs of original loop as if they were advanced
7503 by ratio_mult_vf_name steps. */
7504 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7506 /* After peeling we have to reset scalar evolution analyzer. */
7507 scev_reset ();
7509 free_original_copy_tables ();
7513 /* Function vect_gen_niters_for_prolog_loop
7515 Set the number of iterations for the loop represented by LOOP_VINFO
7516 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7517 and the misalignment of DR - the data reference recorded in
7518 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7519 this loop, the data reference DR will refer to an aligned location.
7521 The following computation is generated:
7523 If the misalignment of DR is known at compile time:
7524 addr_mis = int mis = DR_MISALIGNMENT (dr);
7525 Else, compute address misalignment in bytes:
7526 addr_mis = addr & (vectype_size - 1)
7528 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7530 (elem_size = element type size; an element is the scalar element whose type
7531 is the inner type of the vectype)
7533 When the step of the data-ref in the loop is not 1 (as in interleaved data
7534 and SLP), the number of iterations of the prolog must be divided by the step
7535 (which is equal to the size of interleaved group).
7537 The above formulas assume that VF == number of elements in the vector. This
7538 may not hold when there are multiple-types in the loop.
7539 In this case, for some data-references in the loop the VF does not represent
7540 the number of elements that fit in the vector. Therefore, instead of VF we
7541 use TYPE_VECTOR_SUBPARTS. */
7543 static tree
7544 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7546 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7547 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7548 tree var;
7549 gimple_seq stmts;
7550 tree iters, iters_name;
7551 edge pe;
7552 basic_block new_bb;
7553 gimple dr_stmt = DR_STMT (dr);
7554 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7555 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7556 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7557 tree niters_type = TREE_TYPE (loop_niters);
7558 int step = 1;
7559 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7560 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7562 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7563 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7565 pe = loop_preheader_edge (loop);
7567 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7569 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7570 int elem_misalign = byte_misalign / element_size;
7572 if (vect_print_dump_info (REPORT_DETAILS))
7573 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7575 iters = build_int_cst (niters_type,
7576 (((nelements - elem_misalign) & (nelements - 1)) / step));
7578 else
7580 gimple_seq new_stmts = NULL;
7581 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7582 &new_stmts, NULL_TREE, loop);
7583 tree ptr_type = TREE_TYPE (start_addr);
7584 tree size = TYPE_SIZE (ptr_type);
7585 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7586 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7587 tree elem_size_log =
7588 build_int_cst (type, exact_log2 (vectype_align/nelements));
7589 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7590 tree nelements_tree = build_int_cst (type, nelements);
7591 tree byte_misalign;
7592 tree elem_misalign;
7594 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7595 gcc_assert (!new_bb);
7597 /* Create: byte_misalign = addr & (vectype_size - 1) */
7598 byte_misalign =
7599 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7601 /* Create: elem_misalign = byte_misalign / element_size */
7602 elem_misalign =
7603 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7605 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7606 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7607 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7608 iters = fold_convert (niters_type, iters);
7611 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7612 /* If the loop bound is known at compile time we already verified that it is
7613 greater than vf; since the misalignment ('iters') is at most vf, there's
7614 no need to generate the MIN_EXPR in this case. */
7615 if (TREE_CODE (loop_niters) != INTEGER_CST)
7616 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7618 if (vect_print_dump_info (REPORT_DETAILS))
7620 fprintf (vect_dump, "niters for prolog loop: ");
7621 print_generic_expr (vect_dump, iters, TDF_SLIM);
7624 var = create_tmp_var (niters_type, "prolog_loop_niters");
7625 add_referenced_var (var);
7626 stmts = NULL;
7627 iters_name = force_gimple_operand (iters, &stmts, false, var);
7629 /* Insert stmt on loop preheader edge. */
7630 if (stmts)
7632 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7633 gcc_assert (!new_bb);
7636 return iters_name;
7640 /* Function vect_update_init_of_dr
7642 NITERS iterations were peeled from LOOP. DR represents a data reference
7643 in LOOP. This function updates the information recorded in DR to
7644 account for the fact that the first NITERS iterations had already been
7645 executed. Specifically, it updates the OFFSET field of DR. */
7647 static void
7648 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7650 tree offset = DR_OFFSET (dr);
7652 niters = fold_build2 (MULT_EXPR, sizetype,
7653 fold_convert (sizetype, niters),
7654 fold_convert (sizetype, DR_STEP (dr)));
7655 offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
7656 DR_OFFSET (dr) = offset;
7660 /* Function vect_update_inits_of_drs
7662 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
7663 This function updates the information recorded for the data references in
7664 the loop to account for the fact that the first NITERS iterations had
7665 already been executed. Specifically, it updates the initial_condition of
7666 the access_function of all the data_references in the loop. */
7668 static void
7669 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
7671 unsigned int i;
7672 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
7673 struct data_reference *dr;
7675 if (vect_print_dump_info (REPORT_DETAILS))
7676 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
7678 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
7679 vect_update_init_of_dr (dr, niters);
7683 /* Function vect_do_peeling_for_alignment
7685 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
7686 'niters' is set to the misalignment of one of the data references in the
7687 loop, thereby forcing it to refer to an aligned location at the beginning
7688 of the execution of this loop. The data reference for which we are
7689 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
7691 static void
7692 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
7694 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7695 tree niters_of_prolog_loop, ni_name;
7696 tree n_iters;
7697 struct loop *new_loop;
7698 bool check_profitability = false;
7699 unsigned int th = 0;
7700 int min_profitable_iters;
7702 if (vect_print_dump_info (REPORT_DETAILS))
7703 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
7705 initialize_original_copy_tables ();
7707 ni_name = vect_build_loop_niters (loop_vinfo);
7708 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
7711 /* If cost model check not done during versioning. */
7712 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7713 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7715 check_profitability = true;
7717 /* Get profitability threshold for vectorized loop. */
7718 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7720 th = conservative_cost_threshold (loop_vinfo,
7721 min_profitable_iters);
7724 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
7725 new_loop =
7726 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7727 niters_of_prolog_loop, ni_name, true,
7728 th, check_profitability);
7730 gcc_assert (new_loop);
7731 #ifdef ENABLE_CHECKING
7732 slpeel_verify_cfg_after_peeling (new_loop, loop);
7733 #endif
7735 /* Update number of times loop executes. */
7736 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7737 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7738 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7740 /* Update the init conditions of the access functions of all data refs. */
7741 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
7743 /* After peeling we have to reset scalar evolution analyzer. */
7744 scev_reset ();
7746 free_original_copy_tables ();
7750 /* Function vect_create_cond_for_align_checks.
7752 Create a conditional expression that represents the alignment checks for
7753 all of data references (array element references) whose alignment must be
7754 checked at runtime.
7756 Input:
7757 COND_EXPR - input conditional expression. New conditions will be chained
7758 with logical AND operation.
7759 LOOP_VINFO - two fields of the loop information are used.
7760 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
7761 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
7763 Output:
7764 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7765 expression.
7766 The returned value is the conditional expression to be used in the if
7767 statement that controls which version of the loop gets executed at runtime.
7769 The algorithm makes two assumptions:
7770 1) The number of bytes "n" in a vector is a power of 2.
7771 2) An address "a" is aligned if a%n is zero and that this
7772 test can be done as a&(n-1) == 0. For example, for 16
7773 byte vectors the test is a&0xf == 0. */
7775 static void
7776 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7777 tree *cond_expr,
7778 gimple_seq *cond_expr_stmt_list)
7780 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7781 VEC(gimple,heap) *may_misalign_stmts
7782 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7783 gimple ref_stmt;
7784 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7785 tree mask_cst;
7786 unsigned int i;
7787 tree psize;
7788 tree int_ptrsize_type;
7789 char tmp_name[20];
7790 tree or_tmp_name = NULL_TREE;
7791 tree and_tmp, and_tmp_name;
7792 gimple and_stmt;
7793 tree ptrsize_zero;
7794 tree part_cond_expr;
7796 /* Check that mask is one less than a power of 2, i.e., mask is
7797 all zeros followed by all ones. */
7798 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7800 /* CHECKME: what is the best integer or unsigned type to use to hold a
7801 cast from a pointer value? */
7802 psize = TYPE_SIZE (ptr_type_node);
7803 int_ptrsize_type
7804 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7806 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
7807 of the first vector of the i'th data reference. */
7809 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7811 gimple_seq new_stmt_list = NULL;
7812 tree addr_base;
7813 tree addr_tmp, addr_tmp_name;
7814 tree or_tmp, new_or_tmp_name;
7815 gimple addr_stmt, or_stmt;
7817 /* create: addr_tmp = (int)(address_of_first_vector) */
7818 addr_base =
7819 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
7820 NULL_TREE, loop);
7821 if (new_stmt_list != NULL)
7822 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7824 sprintf (tmp_name, "%s%d", "addr2int", i);
7825 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7826 add_referenced_var (addr_tmp);
7827 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7828 addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
7829 addr_base, NULL_TREE);
7830 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7831 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
7833 /* The addresses are OR together. */
7835 if (or_tmp_name != NULL_TREE)
7837 /* create: or_tmp = or_tmp | addr_tmp */
7838 sprintf (tmp_name, "%s%d", "orptrs", i);
7839 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7840 add_referenced_var (or_tmp);
7841 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7842 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7843 new_or_tmp_name,
7844 or_tmp_name, addr_tmp_name);
7845 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7846 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7847 or_tmp_name = new_or_tmp_name;
7849 else
7850 or_tmp_name = addr_tmp_name;
7852 } /* end for i */
7854 mask_cst = build_int_cst (int_ptrsize_type, mask);
7856 /* create: and_tmp = or_tmp & mask */
7857 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7858 add_referenced_var (and_tmp);
7859 and_tmp_name = make_ssa_name (and_tmp, NULL);
7861 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7862 or_tmp_name, mask_cst);
7863 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7864 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
7866 /* Make and_tmp the left operand of the conditional test against zero.
7867 if and_tmp has a nonzero bit then some address is unaligned. */
7868 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7869 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7870 and_tmp_name, ptrsize_zero);
7871 if (*cond_expr)
7872 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7873 *cond_expr, part_cond_expr);
7874 else
7875 *cond_expr = part_cond_expr;
7878 /* Function vect_vfa_segment_size.
7880 Create an expression that computes the size of segment
7881 that will be accessed for a data reference. The functions takes into
7882 account that realignment loads may access one more vector.
7884 Input:
7885 DR: The data reference.
7886 VECT_FACTOR: vectorization factor.
7888 Return an expression whose value is the size of segment which will be
7889 accessed by DR. */
7891 static tree
7892 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7894 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7895 DR_STEP (dr), vect_factor);
7897 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7899 tree vector_size = TYPE_SIZE_UNIT
7900 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7902 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7903 segment_length, vector_size);
7905 return fold_convert (sizetype, segment_length);
7908 /* Function vect_create_cond_for_alias_checks.
7910 Create a conditional expression that represents the run-time checks for
7911 overlapping of address ranges represented by a list of data references
7912 relations passed as input.
7914 Input:
7915 COND_EXPR - input conditional expression. New conditions will be chained
7916 with logical AND operation.
7917 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7918 to be checked.
7920 Output:
7921 COND_EXPR - conditional expression.
7922 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7923 expression.
7926 The returned value is the conditional expression to be used in the if
7927 statement that controls which version of the loop gets executed at runtime.
7930 static void
7931 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7932 tree * cond_expr,
7933 gimple_seq * cond_expr_stmt_list)
7935 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7936 VEC (ddr_p, heap) * may_alias_ddrs =
7937 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7938 tree vect_factor =
7939 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7941 ddr_p ddr;
7942 unsigned int i;
7943 tree part_cond_expr;
7945 /* Create expression
7946 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7947 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7951 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7952 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7954 if (VEC_empty (ddr_p, may_alias_ddrs))
7955 return;
7957 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7959 struct data_reference *dr_a, *dr_b;
7960 gimple dr_group_first_a, dr_group_first_b;
7961 tree addr_base_a, addr_base_b;
7962 tree segment_length_a, segment_length_b;
7963 gimple stmt_a, stmt_b;
7965 dr_a = DDR_A (ddr);
7966 stmt_a = DR_STMT (DDR_A (ddr));
7967 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7968 if (dr_group_first_a)
7970 stmt_a = dr_group_first_a;
7971 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7974 dr_b = DDR_B (ddr);
7975 stmt_b = DR_STMT (DDR_B (ddr));
7976 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7977 if (dr_group_first_b)
7979 stmt_b = dr_group_first_b;
7980 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7983 addr_base_a =
7984 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7985 NULL_TREE, loop);
7986 addr_base_b =
7987 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7988 NULL_TREE, loop);
7990 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7991 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7993 if (vect_print_dump_info (REPORT_DR_DETAILS))
7995 fprintf (vect_dump,
7996 "create runtime check for data references ");
7997 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7998 fprintf (vect_dump, " and ");
7999 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
8003 part_cond_expr =
8004 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
8005 fold_build2 (LT_EXPR, boolean_type_node,
8006 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
8007 addr_base_a,
8008 segment_length_a),
8009 addr_base_b),
8010 fold_build2 (LT_EXPR, boolean_type_node,
8011 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
8012 addr_base_b,
8013 segment_length_b),
8014 addr_base_a));
8016 if (*cond_expr)
8017 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
8018 *cond_expr, part_cond_expr);
8019 else
8020 *cond_expr = part_cond_expr;
8022 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8023 fprintf (vect_dump, "created %u versioning for alias checks.\n",
8024 VEC_length (ddr_p, may_alias_ddrs));
8028 /* Function vect_loop_versioning.
8030 If the loop has data references that may or may not be aligned or/and
8031 has data reference relations whose independence was not proven then
8032 two versions of the loop need to be generated, one which is vectorized
8033 and one which isn't. A test is then generated to control which of the
8034 loops is executed. The test checks for the alignment of all of the
8035 data references that may or may not be aligned. An additional
8036 sequence of runtime tests is generated for each pairs of DDRs whose
8037 independence was not proven. The vectorized version of loop is
8038 executed only if both alias and alignment tests are passed.
8040 The test generated to check which version of loop is executed
8041 is modified to also check for profitability as indicated by the
8042 cost model initially. */
8044 static void
8045 vect_loop_versioning (loop_vec_info loop_vinfo)
8047 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8048 struct loop *nloop;
8049 tree cond_expr = NULL_TREE;
8050 gimple_seq cond_expr_stmt_list = NULL;
8051 basic_block condition_bb;
8052 gimple_stmt_iterator gsi, cond_exp_gsi;
8053 basic_block merge_bb;
8054 basic_block new_exit_bb;
8055 edge new_exit_e, e;
8056 gimple orig_phi, new_phi;
8057 tree arg;
8058 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
8059 gimple_seq gimplify_stmt_list = NULL;
8060 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
8061 int min_profitable_iters = 0;
8062 unsigned int th;
8064 /* Get profitability threshold for vectorized loop. */
8065 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
8067 th = conservative_cost_threshold (loop_vinfo,
8068 min_profitable_iters);
8070 cond_expr =
8071 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
8072 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
8074 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
8075 false, NULL_TREE);
8077 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
8078 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
8079 &cond_expr_stmt_list);
8081 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8082 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
8083 &cond_expr_stmt_list);
8085 cond_expr =
8086 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
8087 cond_expr =
8088 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
8089 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
8091 initialize_original_copy_tables ();
8092 nloop = loop_version (loop, cond_expr, &condition_bb,
8093 prob, prob, REG_BR_PROB_BASE - prob, true);
8094 free_original_copy_tables();
8096 /* Loop versioning violates an assumption we try to maintain during
8097 vectorization - that the loop exit block has a single predecessor.
8098 After versioning, the exit block of both loop versions is the same
8099 basic block (i.e. it has two predecessors). Just in order to simplify
8100 following transformations in the vectorizer, we fix this situation
8101 here by adding a new (empty) block on the exit-edge of the loop,
8102 with the proper loop-exit phis to maintain loop-closed-form. */
8104 merge_bb = single_exit (loop)->dest;
8105 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
8106 new_exit_bb = split_edge (single_exit (loop));
8107 new_exit_e = single_exit (loop);
8108 e = EDGE_SUCC (new_exit_bb, 0);
8110 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8112 orig_phi = gsi_stmt (gsi);
8113 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
8114 new_exit_bb);
8115 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
8116 add_phi_arg (new_phi, arg, new_exit_e);
8117 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
8120 /* End loop-exit-fixes after versioning. */
8122 update_ssa (TODO_update_ssa);
8123 if (cond_expr_stmt_list)
8125 cond_exp_gsi = gsi_last_bb (condition_bb);
8126 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
8130 /* Remove a group of stores (for SLP or interleaving), free their
8131 stmt_vec_info. */
8133 static void
8134 vect_remove_stores (gimple first_stmt)
8136 gimple next = first_stmt;
8137 gimple tmp;
8138 gimple_stmt_iterator next_si;
8140 while (next)
8142 /* Free the attached stmt_vec_info and remove the stmt. */
8143 next_si = gsi_for_stmt (next);
8144 gsi_remove (&next_si, true);
8145 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
8146 free_stmt_vec_info (next);
8147 next = tmp;
8152 /* Vectorize SLP instance tree in postorder. */
8154 static bool
8155 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
8156 unsigned int vectorization_factor)
8158 gimple stmt;
8159 bool strided_store, is_store;
8160 gimple_stmt_iterator si;
8161 stmt_vec_info stmt_info;
8162 unsigned int vec_stmts_size, nunits, group_size;
8163 tree vectype;
8164 int i;
8165 slp_tree loads_node;
8167 if (!node)
8168 return false;
8170 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
8171 vectorization_factor);
8172 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
8173 vectorization_factor);
8175 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
8176 stmt_info = vinfo_for_stmt (stmt);
8178 /* VECTYPE is the type of the destination. */
8179 vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
8180 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
8181 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
8183 /* For each SLP instance calculate number of vector stmts to be created
8184 for the scalar stmts in each node of the SLP tree. Number of vector
8185 elements in one vector iteration is the number of scalar elements in
8186 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
8187 size. */
8188 vec_stmts_size = (vectorization_factor * group_size) / nunits;
8190 /* In case of load permutation we have to allocate vectorized statements for
8191 all the nodes that participate in that permutation. */
8192 if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
8194 for (i = 0;
8195 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
8196 i++)
8198 if (!SLP_TREE_VEC_STMTS (loads_node))
8200 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
8201 vec_stmts_size);
8202 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
8207 if (!SLP_TREE_VEC_STMTS (node))
8209 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
8210 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
8213 if (vect_print_dump_info (REPORT_DETAILS))
8215 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
8216 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8219 /* Loads should be inserted before the first load. */
8220 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
8221 && STMT_VINFO_STRIDED_ACCESS (stmt_info)
8222 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
8223 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
8224 else
8225 si = gsi_for_stmt (stmt);
8227 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
8228 if (is_store)
8230 if (DR_GROUP_FIRST_DR (stmt_info))
8231 /* If IS_STORE is TRUE, the vectorization of the
8232 interleaving chain was completed - free all the stores in
8233 the chain. */
8234 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8235 else
8236 /* FORNOW: SLP originates only from strided stores. */
8237 gcc_unreachable ();
8239 return true;
8242 /* FORNOW: SLP originates only from strided stores. */
8243 return false;
8247 static bool
8248 vect_schedule_slp (loop_vec_info loop_vinfo)
8250 VEC (slp_instance, heap) *slp_instances =
8251 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8252 slp_instance instance;
8253 unsigned int i;
8254 bool is_store = false;
8256 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
8258 /* Schedule the tree of INSTANCE. */
8259 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
8260 instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
8262 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
8263 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
8264 fprintf (vect_dump, "vectorizing stmts using SLP.");
8267 return is_store;
8270 /* Function vect_transform_loop.
8272 The analysis phase has determined that the loop is vectorizable.
8273 Vectorize the loop - created vectorized stmts to replace the scalar
8274 stmts in the loop, and update the loop exit condition. */
8276 void
8277 vect_transform_loop (loop_vec_info loop_vinfo)
8279 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8280 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8281 int nbbs = loop->num_nodes;
8282 gimple_stmt_iterator si;
8283 int i;
8284 tree ratio = NULL;
8285 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8286 bool strided_store;
8287 bool slp_scheduled = false;
8288 unsigned int nunits;
8290 if (vect_print_dump_info (REPORT_DETAILS))
8291 fprintf (vect_dump, "=== vec_transform_loop ===");
8293 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
8294 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8295 vect_loop_versioning (loop_vinfo);
8297 /* CHECKME: we wouldn't need this if we called update_ssa once
8298 for all loops. */
8299 bitmap_zero (vect_memsyms_to_rename);
8301 /* Peel the loop if there are data refs with unknown alignment.
8302 Only one data ref with unknown store is allowed. */
8304 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
8305 vect_do_peeling_for_alignment (loop_vinfo);
8307 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
8308 compile time constant), or it is a constant that doesn't divide by the
8309 vectorization factor, then an epilog loop needs to be created.
8310 We therefore duplicate the loop: the original loop will be vectorized,
8311 and will compute the first (n/VF) iterations. The second copy of the loop
8312 will remain scalar and will compute the remaining (n%VF) iterations.
8313 (VF is the vectorization factor). */
8315 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8316 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8317 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
8318 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
8319 else
8320 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8321 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
8323 /* 1) Make sure the loop header has exactly two entries
8324 2) Make sure we have a preheader basic block. */
8326 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8328 split_edge (loop_preheader_edge (loop));
8330 /* FORNOW: the vectorizer supports only loops which body consist
8331 of one basic block (header + empty latch). When the vectorizer will
8332 support more involved loop forms, the order by which the BBs are
8333 traversed need to be reconsidered. */
8335 for (i = 0; i < nbbs; i++)
8337 basic_block bb = bbs[i];
8338 stmt_vec_info stmt_info;
8339 gimple phi;
8341 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
8343 phi = gsi_stmt (si);
8344 if (vect_print_dump_info (REPORT_DETAILS))
8346 fprintf (vect_dump, "------>vectorizing phi: ");
8347 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
8349 stmt_info = vinfo_for_stmt (phi);
8350 if (!stmt_info)
8351 continue;
8353 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8354 && !STMT_VINFO_LIVE_P (stmt_info))
8355 continue;
8357 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
8358 != (unsigned HOST_WIDE_INT) vectorization_factor)
8359 && vect_print_dump_info (REPORT_DETAILS))
8360 fprintf (vect_dump, "multiple-types.");
8362 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
8364 if (vect_print_dump_info (REPORT_DETAILS))
8365 fprintf (vect_dump, "transform phi.");
8366 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8370 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
8372 gimple stmt = gsi_stmt (si);
8373 bool is_store;
8375 if (vect_print_dump_info (REPORT_DETAILS))
8377 fprintf (vect_dump, "------>vectorizing statement: ");
8378 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8381 stmt_info = vinfo_for_stmt (stmt);
8383 /* vector stmts created in the outer-loop during vectorization of
8384 stmts in an inner-loop may not have a stmt_info, and do not
8385 need to be vectorized. */
8386 if (!stmt_info)
8388 gsi_next (&si);
8389 continue;
8392 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8393 && !STMT_VINFO_LIVE_P (stmt_info))
8395 gsi_next (&si);
8396 continue;
8399 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
8400 nunits =
8401 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8402 if (!STMT_SLP_TYPE (stmt_info)
8403 && nunits != (unsigned int) vectorization_factor
8404 && vect_print_dump_info (REPORT_DETAILS))
8405 /* For SLP VF is set according to unrolling factor, and not to
8406 vector size, hence for SLP this print is not valid. */
8407 fprintf (vect_dump, "multiple-types.");
8409 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8410 reached. */
8411 if (STMT_SLP_TYPE (stmt_info))
8413 if (!slp_scheduled)
8415 slp_scheduled = true;
8417 if (vect_print_dump_info (REPORT_DETAILS))
8418 fprintf (vect_dump, "=== scheduling SLP instances ===");
8420 is_store = vect_schedule_slp (loop_vinfo);
8422 /* IS_STORE is true if STMT is a store. Stores cannot be of
8423 hybrid SLP type. They are removed in
8424 vect_schedule_slp_instance and their vinfo is destroyed. */
8425 if (is_store)
8427 gsi_next (&si);
8428 continue;
8432 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8433 if (PURE_SLP_STMT (stmt_info))
8435 gsi_next (&si);
8436 continue;
8440 /* -------- vectorize statement ------------ */
8441 if (vect_print_dump_info (REPORT_DETAILS))
8442 fprintf (vect_dump, "transform statement.");
8444 strided_store = false;
8445 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
8446 if (is_store)
8448 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
8450 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8451 interleaving chain was completed - free all the stores in
8452 the chain. */
8453 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8454 gsi_remove (&si, true);
8455 continue;
8457 else
8459 /* Free the attached stmt_vec_info and remove the stmt. */
8460 free_stmt_vec_info (stmt);
8461 gsi_remove (&si, true);
8462 continue;
8465 gsi_next (&si);
8466 } /* stmts in BB */
8467 } /* BBs in loop */
8469 slpeel_make_loop_iterate_ntimes (loop, ratio);
8471 mark_set_for_renaming (vect_memsyms_to_rename);
8473 /* The memory tags and pointers in vectorized statements need to
8474 have their SSA forms updated. FIXME, why can't this be delayed
8475 until all the loops have been transformed? */
8476 update_ssa (TODO_update_ssa);
8478 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8479 fprintf (vect_dump, "LOOP VECTORIZED.");
8480 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8481 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");