1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "ggc.h"
26 #include "tree.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
50 slp_tree, slp_instance);
51 static tree vect_create_destination_var (tree, tree);
52 static tree vect_create_data_ref_ptr
53 (gimple, struct loop*, tree, tree *, gimple *, bool, bool *, tree);
54 static tree vect_create_addr_base_for_vector_ref
55 (gimple, gimple_seq *, tree, struct loop *);
56 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
57 static tree vect_get_vec_def_for_operand (tree, gimple, tree *);
58 static tree vect_init_vector (gimple, tree, tree, gimple_stmt_iterator *);
59 static void vect_finish_stmt_generation
60 (gimple stmt, gimple vec_stmt, gimple_stmt_iterator *);
61 static bool vect_is_simple_cond (tree, loop_vec_info);
62 static void vect_create_epilog_for_reduction
63 (tree, gimple, int, enum tree_code, gimple);
64 static tree get_initial_def_for_reduction (gimple, tree, tree *);
66 /* Utility function dealing with loop peeling (not peeling itself). */
67 static void vect_generate_tmps_on_preheader
68 (loop_vec_info, tree *, tree *, tree *);
69 static tree vect_build_loop_niters (loop_vec_info);
70 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
71 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
72 static void vect_update_init_of_dr (struct data_reference *, tree niters);
73 static void vect_update_inits_of_drs (loop_vec_info, tree);
74 static int vect_min_worthwhile_factor (enum tree_code);
77 static int
78 cost_for_stmt (gimple stmt)
80 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
82 switch (STMT_VINFO_TYPE (stmt_info))
84 case load_vec_info_type:
85 return TARG_SCALAR_LOAD_COST;
86 case store_vec_info_type:
87 return TARG_SCALAR_STORE_COST;
88 case op_vec_info_type:
89 case condition_vec_info_type:
90 case assignment_vec_info_type:
91 case reduc_vec_info_type:
92 case induc_vec_info_type:
93 case type_promotion_vec_info_type:
94 case type_demotion_vec_info_type:
95 case type_conversion_vec_info_type:
96 case call_vec_info_type:
97 return TARG_SCALAR_STMT_COST;
98 case undef_vec_info_type:
99 default:
100 gcc_unreachable ();
105 /* Function vect_estimate_min_profitable_iters
107 Return the number of iterations required for the vector version of the
108 loop to be profitable relative to the cost of the scalar version of the
109 loop.
111 TODO: Take profile info into account before making vectorization
112 decisions, if available. */
115 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
117 int i;
118 int min_profitable_iters;
119 int peel_iters_prologue;
120 int peel_iters_epilogue;
121 int vec_inside_cost = 0;
122 int vec_outside_cost = 0;
123 int scalar_single_iter_cost = 0;
124 int scalar_outside_cost = 0;
125 bool runtime_test = false;
126 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
128 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
129 int nbbs = loop->num_nodes;
130 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
131 int peel_guard_costs = 0;
132 int innerloop_iters = 0, factor;
133 VEC (slp_instance, heap) *slp_instances;
134 slp_instance instance;
136 /* Cost model disabled. */
137 if (!flag_vect_cost_model)
139 if (vect_print_dump_info (REPORT_COST))
140 fprintf (vect_dump, "cost model disabled.");
141 return 0;
144 /* If the number of iterations is unknown, or the
145 peeling-for-misalignment amount is unknown, we will have to generate
146 a runtime test to test the loop count against the threshold. */
147 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
148 || (byte_misalign < 0))
149 runtime_test = true;
151 /* Requires loop versioning tests to handle misalignment. */
153 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
155 /* FIXME: Make cost depend on complexity of individual check. */
156 vec_outside_cost +=
157 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
158 if (vect_print_dump_info (REPORT_COST))
159 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
160 "versioning to treat misalignment.\n");
163 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 /* FIXME: Make cost depend on complexity of individual check. */
166 vec_outside_cost +=
167 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
168 if (vect_print_dump_info (REPORT_COST))
169 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
170 "versioning aliasing.\n");
173 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
174 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
176 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
179 /* Count statements in scalar loop. Using this as scalar cost for a single
180 iteration for now.
182 TODO: Add outer loop support.
184 TODO: Consider assigning different costs to different scalar
185 statements. */
187 /* FORNOW. */
188 if (loop->inner)
189 innerloop_iters = 50; /* FIXME */
191 for (i = 0; i < nbbs; i++)
193 gimple_stmt_iterator si;
194 basic_block bb = bbs[i];
196 if (bb->loop_father == loop->inner)
197 factor = innerloop_iters;
198 else
199 factor = 1;
201 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
203 gimple stmt = gsi_stmt (si);
204 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
205 /* Skip stmts that are not vectorized inside the loop. */
206 if (!STMT_VINFO_RELEVANT_P (stmt_info)
207 && (!STMT_VINFO_LIVE_P (stmt_info)
208 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
209 continue;
210 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
211 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
212 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
213 some of the "outside" costs are generated inside the outer-loop. */
214 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
218 /* Add additional cost for the peeled instructions in prologue and epilogue
219 loop.
221 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
222 at compile-time - we assume it's vf/2 (the worst would be vf-1).
224 TODO: Build an expression that represents peel_iters for prologue and
225 epilogue to be used in a run-time test. */
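/* Illustrative example (assumed numbers, not from the original source): with
   VF = 4 and unknown peel counts, both peel_iters_prologue and
   peel_iters_epilogue are assumed below to be 4/2 = 2 iterations, whereas
   the true worst case per peeled loop would be VF - 1 = 3.  */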
227 if (byte_misalign < 0)
229 peel_iters_prologue = vf/2;
230 if (vect_print_dump_info (REPORT_COST))
231 fprintf (vect_dump, "cost model: "
232 "prologue peel iters set to vf/2.");
234 /* If peeling for alignment is unknown, loop bound of main loop becomes
235 unknown. */
236 peel_iters_epilogue = vf/2;
237 if (vect_print_dump_info (REPORT_COST))
238 fprintf (vect_dump, "cost model: "
239 "epilogue peel iters set to vf/2 because "
240 "peeling for alignment is unknown .");
242 /* If peeled iterations are unknown, count a taken branch and a not taken
243 branch per peeled loop. Even if scalar loop iterations are known,
244 vector iterations are not known since peeled prologue iterations are
245 not known. Hence guards remain the same. */
246 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
247 + TARG_COND_NOT_TAKEN_BRANCH_COST);
250 else
252 if (byte_misalign)
254 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
255 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
256 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
257 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
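/* Illustrative example (assumed values, not from the original source): for a
   V4SI access (nelements = 4, element_size = 4 bytes) that is misaligned by
   8 bytes, the prologue below peels 4 - (8 / 4) = 2 scalar iterations to
   reach an aligned address.  */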
259 peel_iters_prologue = nelements - (byte_misalign / element_size);
261 else
262 peel_iters_prologue = 0;
264 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
266 peel_iters_epilogue = vf/2;
267 if (vect_print_dump_info (REPORT_COST))
268 fprintf (vect_dump, "cost model: "
269 "epilogue peel iters set to vf/2 because "
270 "loop iterations are unknown .");
 272	      /* If peeled iterations are known but the number of scalar loop
 273	         iterations is unknown, count a taken branch per peeled loop. */
274 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
277 else
279 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
280 peel_iters_prologue = niters < peel_iters_prologue ?
281 niters : peel_iters_prologue;
282 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
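/* Illustrative example (assumed values): with niters = 100, a prologue of 2
   peeled iterations and VF = 4, the epilogue handles (100 - 2) % 4 = 2
   leftover scalar iterations.  */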
286 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
287 + (peel_iters_epilogue * scalar_single_iter_cost)
288 + peel_guard_costs;
290 /* FORNOW: The scalar outside cost is incremented in one of the
291 following ways:
293 1. The vectorizer checks for alignment and aliasing and generates
294 a condition that allows dynamic vectorization. A cost model
 295	     check is ANDed with the versioning condition.  Hence the scalar code
 296	     path now has the added cost of the versioning check.
298 if (cost > th & versioning_check)
299 jmp to vector code
301 Hence run-time scalar is incremented by not-taken branch cost.
303 2. The vectorizer then checks if a prologue is required. If the
304 cost model check was not done before during versioning, it has to
305 be done before the prologue check.
307 if (cost <= th)
308 prologue = scalar_iters
309 if (prologue == 0)
310 jmp to vector code
311 else
312 execute prologue
313 if (prologue == num_iters)
314 go to exit
316 Hence the run-time scalar cost is incremented by a taken branch,
317 plus a not-taken branch, plus a taken branch cost.
319 3. The vectorizer then checks if an epilogue is required. If the
320 cost model check was not done before during prologue check, it
321 has to be done with the epilogue check.
323 if (prologue == 0)
324 jmp to vector code
325 else
326 execute prologue
327 if (prologue == num_iters)
328 go to exit
329 vector code:
330 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
331 jmp to epilogue
333 Hence the run-time scalar cost should be incremented by 2 taken
334 branches.
 336	     TODO: The back end may reorder the BBs differently and reverse
337 conditions/branch directions. Change the estimates below to
338 something more reasonable. */
340 if (runtime_test)
342 /* Cost model check occurs at versioning. */
343 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
344 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
345 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
346 else
348 /* Cost model occurs at prologue generation. */
349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
350 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
351 + TARG_COND_NOT_TAKEN_BRANCH_COST;
352 /* Cost model check occurs at epilogue generation. */
353 else
354 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
358 /* Add SLP costs. */
359 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
360 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
362 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
363 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
366 /* Calculate number of iterations required to make the vector version
367 profitable, relative to the loop bodies only. The following condition
368 must hold true:
369 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
370 where
371 SIC = scalar iteration cost, VIC = vector iteration cost,
372 VOC = vector outside cost, VF = vectorization factor,
373 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
374 SOC = scalar outside cost for run time cost model check. */
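/* Worked example with assumed costs (illustrative only): with SIC = 4,
   VIC = 6, VOC = 30, SOC = 0, VF = 4 and no peeled iterations, the division
   below gives (30 * 4) / (4 * 4 - 6) = 12, which the follow-up check rounds
   up to 13; the result is then further adjusted against VF and decremented
   for the branch condition.  */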
376 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
378 if (vec_outside_cost <= 0)
379 min_profitable_iters = 1;
380 else
382 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
383 - vec_inside_cost * peel_iters_prologue
384 - vec_inside_cost * peel_iters_epilogue)
385 / ((scalar_single_iter_cost * vf)
386 - vec_inside_cost);
388 if ((scalar_single_iter_cost * vf * min_profitable_iters)
389 <= ((vec_inside_cost * min_profitable_iters)
390 + ((vec_outside_cost - scalar_outside_cost) * vf)))
391 min_profitable_iters++;
394 /* vector version will never be profitable. */
395 else
397 if (vect_print_dump_info (REPORT_COST))
398 fprintf (vect_dump, "cost model: vector iteration cost = %d "
399 "is divisible by scalar iteration cost = %d by a factor "
400 "greater than or equal to the vectorization factor = %d .",
401 vec_inside_cost, scalar_single_iter_cost, vf);
402 return -1;
405 if (vect_print_dump_info (REPORT_COST))
407 fprintf (vect_dump, "Cost model analysis: \n");
408 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
409 vec_inside_cost);
410 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
411 vec_outside_cost);
412 fprintf (vect_dump, " Scalar iteration cost: %d\n",
413 scalar_single_iter_cost);
414 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
415 fprintf (vect_dump, " prologue iterations: %d\n",
416 peel_iters_prologue);
417 fprintf (vect_dump, " epilogue iterations: %d\n",
418 peel_iters_epilogue);
419 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
420 min_profitable_iters);
423 min_profitable_iters =
424 min_profitable_iters < vf ? vf : min_profitable_iters;
426 /* Because the condition we create is:
427 if (niters <= min_profitable_iters)
428 then skip the vectorized loop. */
429 min_profitable_iters--;
431 if (vect_print_dump_info (REPORT_COST))
432 fprintf (vect_dump, " Profitability threshold = %d\n",
433 min_profitable_iters);
435 return min_profitable_iters;
439 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
440 functions. Design better to avoid maintenance issues. */
442 /* Function vect_model_reduction_cost.
444 Models cost for a reduction operation, including the vector ops
445 generated within the strip-mine loop, the initial definition before
446 the loop, and the epilogue code that must be generated. */
448 static bool
449 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
450 int ncopies)
452 int outer_cost = 0;
453 enum tree_code code;
454 optab optab;
455 tree vectype;
456 gimple stmt, orig_stmt;
457 tree reduction_op;
458 enum machine_mode mode;
459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
463 /* Cost of reduction op inside loop. */
464 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
466 stmt = STMT_VINFO_STMT (stmt_info);
468 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
470 case GIMPLE_SINGLE_RHS:
471 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
472 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
473 break;
474 case GIMPLE_UNARY_RHS:
475 reduction_op = gimple_assign_rhs1 (stmt);
476 break;
477 case GIMPLE_BINARY_RHS:
478 reduction_op = gimple_assign_rhs2 (stmt);
479 break;
480 default:
481 gcc_unreachable ();
484 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
485 if (!vectype)
487 if (vect_print_dump_info (REPORT_COST))
489 fprintf (vect_dump, "unsupported data-type ");
490 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
492 return false;
495 mode = TYPE_MODE (vectype);
496 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
498 if (!orig_stmt)
499 orig_stmt = STMT_VINFO_STMT (stmt_info);
501 code = gimple_assign_rhs_code (orig_stmt);
503 /* Add in cost for initial definition. */
504 outer_cost += TARG_SCALAR_TO_VEC_COST;
506 /* Determine cost of epilogue code.
508 We have a reduction operator that will reduce the vector in one statement.
509 Also requires scalar extract. */
511 if (!nested_in_vect_loop_p (loop, orig_stmt))
513 if (reduc_code < NUM_TREE_CODES)
514 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
515 else
517 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
518 tree bitsize =
519 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
520 int element_bitsize = tree_low_cst (bitsize, 1);
521 int nelements = vec_size_in_bits / element_bitsize;
523 optab = optab_for_tree_code (code, vectype, optab_default);
525 /* We have a whole vector shift available. */
526 if (VECTOR_MODE_P (mode)
527 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
528 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
529 /* Final reduction via vector shifts and the reduction operator. Also
530 requires scalar extract. */
531 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
532 + TARG_VEC_TO_SCALAR_COST);
533 else
534 /* Use extracts and reduction op for final reduction. For N elements,
535 we have N extracts and N-1 reduction ops. */
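/* Illustrative example: a V4SF reduction without a whole-vector shift needs
   4 extracts plus 3 reduction ops, i.e. 4 + 4 - 1 = 7 statements, each
   charged at TARG_VEC_STMT_COST.  */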
536 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
540 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
542 if (vect_print_dump_info (REPORT_COST))
543 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
544 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
545 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
547 return true;
551 /* Function vect_model_induction_cost.
553 Models cost for induction operations. */
555 static void
556 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
558 /* loop cost for vec_loop. */
559 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
560 /* prologue cost for vec_init and vec_step. */
561 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
563 if (vect_print_dump_info (REPORT_COST))
564 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
565 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
566 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
570 /* Function vect_model_simple_cost.
572 Models cost for simple operations, i.e. those that only emit ncopies of a
573 single op. Right now, this does not account for multiple insns that could
574 be generated for the single vector op. We will handle that shortly. */
576 void
577 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
578 enum vect_def_type *dt, slp_tree slp_node)
580 int i;
581 int inside_cost = 0, outside_cost = 0;
583 /* The SLP costs were already calculated during SLP tree build. */
584 if (PURE_SLP_STMT (stmt_info))
585 return;
587 inside_cost = ncopies * TARG_VEC_STMT_COST;
 589	  /* FORNOW: Assuming maximum 2 args per stmt. */
590 for (i = 0; i < 2; i++)
592 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
593 outside_cost += TARG_SCALAR_TO_VEC_COST;
596 if (vect_print_dump_info (REPORT_COST))
597 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
598 "outside_cost = %d .", inside_cost, outside_cost);
600 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
601 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
602 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
606 /* Function vect_cost_strided_group_size
608 For strided load or store, return the group_size only if it is the first
609 load or store of a group, else return 1. This ensures that group size is
610 only returned once per group. */
612 static int
613 vect_cost_strided_group_size (stmt_vec_info stmt_info)
615 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
617 if (first_stmt == STMT_VINFO_STMT (stmt_info))
618 return DR_GROUP_SIZE (stmt_info);
620 return 1;
624 /* Function vect_model_store_cost
626 Models cost for stores. In the case of strided accesses, one access
627 has the overhead of the strided access attributed to it. */
629 void
630 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
631 enum vect_def_type dt, slp_tree slp_node)
633 int group_size;
634 int inside_cost = 0, outside_cost = 0;
636 /* The SLP costs were already calculated during SLP tree build. */
637 if (PURE_SLP_STMT (stmt_info))
638 return;
640 if (dt == vect_constant_def || dt == vect_invariant_def)
641 outside_cost = TARG_SCALAR_TO_VEC_COST;
643 /* Strided access? */
644 if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
645 group_size = vect_cost_strided_group_size (stmt_info);
646 /* Not a strided access. */
647 else
648 group_size = 1;
650 /* Is this an access in a group of stores, which provide strided access?
651 If so, add in the cost of the permutes. */
652 if (group_size > 1)
654 /* Uses a high and low interleave operation for each needed permute. */
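/* Illustrative example: for an interleaved store group of size 4,
   log2(4) * 4 = 8 high/low interleave statements are charged per copy.  */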
655 inside_cost = ncopies * exact_log2(group_size) * group_size
656 * TARG_VEC_STMT_COST;
658 if (vect_print_dump_info (REPORT_COST))
659 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
660 group_size);
664 /* Costs of the stores. */
665 inside_cost += ncopies * TARG_VEC_STORE_COST;
667 if (vect_print_dump_info (REPORT_COST))
668 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
669 "outside_cost = %d .", inside_cost, outside_cost);
671 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
672 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
673 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
677 /* Function vect_model_load_cost
679 Models cost for loads. In the case of strided accesses, the last access
680 has the overhead of the strided access attributed to it. Since unaligned
681 accesses are supported for loads, we also account for the costs of the
682 access scheme chosen. */
684 void
685 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
688 int group_size;
689 int alignment_support_cheme;
690 gimple first_stmt;
691 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
692 int inside_cost = 0, outside_cost = 0;
694 /* The SLP costs were already calculated during SLP tree build. */
695 if (PURE_SLP_STMT (stmt_info))
696 return;
698 /* Strided accesses? */
699 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
700 if (first_stmt && !slp_node)
702 group_size = vect_cost_strided_group_size (stmt_info);
703 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
705 /* Not a strided access. */
706 else
708 group_size = 1;
709 first_dr = dr;
712 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
714 /* Is this an access in a group of loads providing strided access?
715 If so, add in the cost of the permutes. */
716 if (group_size > 1)
 718	      /* Uses even and odd extract operations for each needed permute. */
719 inside_cost = ncopies * exact_log2(group_size) * group_size
720 * TARG_VEC_STMT_COST;
722 if (vect_print_dump_info (REPORT_COST))
723 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
724 group_size);
728 /* The loads themselves. */
729 switch (alignment_support_cheme)
731 case dr_aligned:
733 inside_cost += ncopies * TARG_VEC_LOAD_COST;
735 if (vect_print_dump_info (REPORT_COST))
736 fprintf (vect_dump, "vect_model_load_cost: aligned.");
738 break;
740 case dr_unaligned_supported:
742 /* Here, we assign an additional cost for the unaligned load. */
743 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
745 if (vect_print_dump_info (REPORT_COST))
746 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
747 "hardware.");
749 break;
751 case dr_explicit_realign:
753 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
755 /* FIXME: If the misalignment remains fixed across the iterations of
756 the containing loop, the following cost should be added to the
757 outside costs. */
758 if (targetm.vectorize.builtin_mask_for_load)
759 inside_cost += TARG_VEC_STMT_COST;
761 break;
763 case dr_explicit_realign_optimized:
765 if (vect_print_dump_info (REPORT_COST))
766 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
767 "pipelined.");
769 /* Unaligned software pipeline has a load of an address, an initial
770 load, and possibly a mask operation to "prime" the loop. However,
771 if this is an access in a group of loads, which provide strided
772 access, then the above cost should only be considered for one
773 access in the group. Inside the loop, there is a load op
774 and a realignment op. */
776 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
778 outside_cost = 2*TARG_VEC_STMT_COST;
779 if (targetm.vectorize.builtin_mask_for_load)
780 outside_cost += TARG_VEC_STMT_COST;
783 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
785 break;
788 default:
789 gcc_unreachable ();
792 if (vect_print_dump_info (REPORT_COST))
793 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
794 "outside_cost = %d .", inside_cost, outside_cost);
796 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
797 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
798 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
802 /* Function vect_get_new_vect_var.
 804	   Returns a name for a new variable.  The current naming scheme prepends
 805	   the prefix "vect_", "stmp_", or "vect_p" (depending on the value of
 806	   VAR_KIND) to NAME when it is provided, and otherwise uses the prefix
 807	   alone for the vectorizer-generated variable. */
809 static tree
810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
812 const char *prefix;
813 tree new_vect_var;
815 switch (var_kind)
817 case vect_simple_var:
818 prefix = "vect_";
819 break;
820 case vect_scalar_var:
821 prefix = "stmp_";
822 break;
823 case vect_pointer_var:
824 prefix = "vect_p";
825 break;
826 default:
827 gcc_unreachable ();
830 if (name)
832 char* tmp = concat (prefix, name, NULL);
833 new_vect_var = create_tmp_var (type, tmp);
834 free (tmp);
836 else
837 new_vect_var = create_tmp_var (type, prefix);
839 /* Mark vector typed variable as a gimple register variable. */
840 if (TREE_CODE (type) == VECTOR_TYPE)
841 DECL_GIMPLE_REG_P (new_vect_var) = true;
843 return new_vect_var;
847 /* Function vect_create_addr_base_for_vector_ref.
849 Create an expression that computes the address of the first memory location
850 that will be accessed for a data reference.
852 Input:
853 STMT: The statement containing the data reference.
854 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 855	   OFFSET: Optional.  If supplied, it is added to the initial address.
 856	   LOOP:    Specifies relative to which loop-nest the address should be computed.
857 For example, when the dataref is in an inner-loop nested in an
858 outer-loop that is now being vectorized, LOOP can be either the
859 outer-loop, or the inner-loop. The first memory location accessed
860 by the following dataref ('in' points to short):
862 for (i=0; i<N; i++)
863 for (j=0; j<M; j++)
864 s += in[i+j]
866 is as follows:
867 if LOOP=i_loop: &in (relative to i_loop)
868 if LOOP=j_loop: &in+i*2B (relative to j_loop)
870 Output:
871 1. Return an SSA_NAME whose value is the address of the memory location of
872 the first vector of the data reference.
 873	   2. If new_stmt_list is not NULL_TREE after return, then the caller must insert
 874	      the statement(s) that define the returned SSA_NAME.
876 FORNOW: We are only handling array accesses with step 1. */
878 static tree
879 vect_create_addr_base_for_vector_ref (gimple stmt,
880 gimple_seq *new_stmt_list,
881 tree offset,
882 struct loop *loop)
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
886 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
887 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
888 tree base_name;
889 tree data_ref_base_var;
890 tree vec_stmt;
891 tree addr_base, addr_expr;
892 tree dest;
893 gimple_seq seq = NULL;
894 tree base_offset = unshare_expr (DR_OFFSET (dr));
895 tree init = unshare_expr (DR_INIT (dr));
896 tree vect_ptr_type, addr_expr2;
897 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
899 gcc_assert (loop);
900 if (loop != containing_loop)
902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
903 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
905 gcc_assert (nested_in_vect_loop_p (loop, stmt));
907 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
908 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
909 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
912 /* Create data_ref_base */
913 base_name = build_fold_indirect_ref (data_ref_base);
914 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
915 add_referenced_var (data_ref_base_var);
916 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
917 data_ref_base_var);
918 gimple_seq_add_seq (new_stmt_list, seq);
920 /* Create base_offset */
921 base_offset = size_binop (PLUS_EXPR,
922 fold_convert (sizetype, base_offset),
923 fold_convert (sizetype, init));
924 dest = create_tmp_var (sizetype, "base_off");
925 add_referenced_var (dest);
926 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
927 gimple_seq_add_seq (new_stmt_list, seq);
929 if (offset)
931 tree tmp = create_tmp_var (sizetype, "offset");
933 add_referenced_var (tmp);
934 offset = fold_build2 (MULT_EXPR, sizetype,
935 fold_convert (sizetype, offset), step);
936 base_offset = fold_build2 (PLUS_EXPR, sizetype,
937 base_offset, offset);
938 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
939 gimple_seq_add_seq (new_stmt_list, seq);
942 /* base + base_offset */
943 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
944 data_ref_base, base_offset);
946 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
948 /* addr_expr = addr_base */
949 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
950 get_name (base_name));
951 add_referenced_var (addr_expr);
952 vec_stmt = fold_convert (vect_ptr_type, addr_base);
953 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
954 get_name (base_name));
955 add_referenced_var (addr_expr2);
956 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr2);
957 gimple_seq_add_seq (new_stmt_list, seq);
959 if (vect_print_dump_info (REPORT_DETAILS))
961 fprintf (vect_dump, "created ");
962 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
964 return vec_stmt;
968 /* Function vect_create_data_ref_ptr.
970 Create a new pointer to vector type (vp), that points to the first location
971 accessed in the loop by STMT, along with the def-use update chain to
972 appropriately advance the pointer through the loop iterations. Also set
973 aliasing information for the pointer. This vector pointer is used by the
974 callers to this function to create a memory reference expression for vector
975 load/store access.
977 Input:
978 1. STMT: a stmt that references memory. Expected to be of the form
979 GIMPLE_ASSIGN <name, data-ref> or
980 GIMPLE_ASSIGN <data-ref, name>.
981 2. AT_LOOP: the loop where the vector memref is to be created.
982 3. OFFSET (optional): an offset to be added to the initial address accessed
983 by the data-ref in STMT.
984 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
985 pointing to the initial address.
986 5. TYPE: if not NULL indicates the required type of the data-ref.
988 Output:
989 1. Declare a new ptr to vector_type, and have it point to the base of the
 990	      data reference (the initial address accessed by the data reference).
991 For example, for vector of type V8HI, the following code is generated:
993 v8hi *vp;
994 vp = (v8hi *)initial_address;
996 if OFFSET is not supplied:
997 initial_address = &a[init];
998 if OFFSET is supplied:
999 initial_address = &a[init + OFFSET];
1001 Return the initial_address in INITIAL_ADDRESS.
1003 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
1004 update the pointer in each iteration of the loop.
1006 Return the increment stmt that updates the pointer in PTR_INCR.
1008 3. Set INV_P to true if the access pattern of the data reference in the
1009 vectorized loop is invariant. Set it to false otherwise.
1011 4. Return the pointer. */
1013 static tree
1014 vect_create_data_ref_ptr (gimple stmt, struct loop *at_loop,
1015 tree offset, tree *initial_address, gimple *ptr_incr,
1016 bool only_init, bool *inv_p, tree type)
1018 tree base_name;
1019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1021 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1022 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
1023 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
1024 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1025 tree vect_ptr_type;
1026 tree vect_ptr;
1027 tree tag;
1028 tree new_temp;
1029 gimple vec_stmt;
1030 gimple_seq new_stmt_list = NULL;
1031 edge pe;
1032 basic_block new_bb;
1033 tree vect_ptr_init;
1034 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1035 tree vptr;
1036 gimple_stmt_iterator incr_gsi;
1037 bool insert_after;
1038 tree indx_before_incr, indx_after_incr;
1039 gimple incr;
1040 tree step;
1042 /* Check the step (evolution) of the load in LOOP, and record
1043 whether it's invariant. */
1044 if (nested_in_vect_loop)
1045 step = STMT_VINFO_DR_STEP (stmt_info);
1046 else
1047 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1049 if (tree_int_cst_compare (step, size_zero_node) == 0)
1050 *inv_p = true;
1051 else
1052 *inv_p = false;
1054 /* Create an expression for the first address accessed by this load
1055 in LOOP. */
1056 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1058 if (vect_print_dump_info (REPORT_DETAILS))
1060 tree data_ref_base = base_name;
1061 fprintf (vect_dump, "create vector-pointer variable to type: ");
1062 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1063 if (TREE_CODE (data_ref_base) == VAR_DECL)
1064 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1065 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1066 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1067 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1068 fprintf (vect_dump, " vectorizing a record based array ref: ");
1069 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1070 fprintf (vect_dump, " vectorizing a pointer ref: ");
1071 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1074 /** (1) Create the new vector-pointer variable: **/
1075 if (type)
1076 vect_ptr_type = build_pointer_type (type);
1077 else
1078 vect_ptr_type = build_pointer_type (vectype);
1080 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1081 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1082 vect_ptr_type = build_qualified_type (vect_ptr_type, TYPE_QUAL_RESTRICT);
1083 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1084 get_name (base_name));
1085 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == SSA_NAME
1086 && TYPE_RESTRICT (TREE_TYPE (DR_BASE_ADDRESS (dr))))
1088 get_alias_set (base_name);
1089 DECL_POINTER_ALIAS_SET (vect_ptr)
1090 = DECL_POINTER_ALIAS_SET (SSA_NAME_VAR (DR_BASE_ADDRESS (dr)));
1093 add_referenced_var (vect_ptr);
1095 /** (2) Add aliasing information to the new vector-pointer:
1096 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1098 tag = DR_SYMBOL_TAG (dr);
1099 gcc_assert (tag);
1101	  /* If the tag is a variable (and NOT_A_TAG), then a new symbol memory
1102	     tag must be created, with TAG added to its may-alias list. */
1103 if (!MTAG_P (tag))
1104 new_type_alias (vect_ptr, tag, DR_REF (dr));
1105 else
1106 set_symbol_mem_tag (vect_ptr, tag);
1108 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1109 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1110 def-use update cycles for the pointer: One relative to the outer-loop
1111 (LOOP), which is what steps (3) and (4) below do. The other is relative
1112 to the inner-loop (which is the inner-most loop containing the dataref),
1113	        and this is done by step (5) below.
1115 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1116 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1117 redundant. Steps (3),(4) create the following:
1119 vp0 = &base_addr;
1120 LOOP: vp1 = phi(vp0,vp2)
1121 ...
1123 vp2 = vp1 + step
1124 goto LOOP
1126 If there is an inner-loop nested in loop, then step (5) will also be
1127 applied, and an additional update in the inner-loop will be created:
1129 vp0 = &base_addr;
1130 LOOP: vp1 = phi(vp0,vp2)
1132 inner: vp3 = phi(vp1,vp4)
1133 vp4 = vp3 + inner_step
1134 if () goto inner
1136 vp2 = vp1 + step
1137 if () goto LOOP */
1139	  /** (3) Calculate the initial address of the vector-pointer, and set
1140 the vector-pointer to point to it before the loop: **/
1142 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1144 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1145 offset, loop);
1146 pe = loop_preheader_edge (loop);
1147 if (new_stmt_list)
1149 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
1150 gcc_assert (!new_bb);
1153 *initial_address = new_temp;
1155 /* Create: p = (vectype *) initial_base */
1156 vec_stmt = gimple_build_assign (vect_ptr,
1157 fold_convert (vect_ptr_type, new_temp));
1158 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1159 gimple_assign_set_lhs (vec_stmt, vect_ptr_init);
1160 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
1161 gcc_assert (!new_bb);
1164 /** (4) Handle the updating of the vector-pointer inside the loop.
1165 This is needed when ONLY_INIT is false, and also when AT_LOOP
1166	      is the inner-loop nested in LOOP (during outer-loop vectorization).  **/
1169 if (only_init && at_loop == loop) /* No update in loop is required. */
1171 /* Copy the points-to information if it exists. */
1172 if (DR_PTR_INFO (dr))
1173 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1174 vptr = vect_ptr_init;
1176 else
1178 /* The step of the vector pointer is the Vector Size. */
1179 tree step = TYPE_SIZE_UNIT (vectype);
1180 /* One exception to the above is when the scalar step of the load in
1181 LOOP is zero. In this case the step here is also zero. */
1182 if (*inv_p)
1183 step = size_zero_node;
1185 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
1187 create_iv (vect_ptr_init,
1188 fold_convert (vect_ptr_type, step),
1189 vect_ptr, loop, &incr_gsi, insert_after,
1190 &indx_before_incr, &indx_after_incr);
1191 incr = gsi_stmt (incr_gsi);
1192 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1194 /* Copy the points-to information if it exists. */
1195 if (DR_PTR_INFO (dr))
1197 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1198 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1200 merge_alias_info (vect_ptr_init, indx_before_incr);
1201 merge_alias_info (vect_ptr_init, indx_after_incr);
1202 if (ptr_incr)
1203 *ptr_incr = incr;
1205 vptr = indx_before_incr;
1208 if (!nested_in_vect_loop || only_init)
1209 return vptr;
1212 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1213	        nested in LOOP, if one exists: **/
1215 gcc_assert (nested_in_vect_loop);
1216 if (!only_init)
1218 standard_iv_increment_position (containing_loop, &incr_gsi,
1219 &insert_after);
1220 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), vect_ptr,
1221 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
1222 &indx_after_incr);
1223 incr = gsi_stmt (incr_gsi);
1224 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
1226 /* Copy the points-to information if it exists. */
1227 if (DR_PTR_INFO (dr))
1229 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1230 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1232 merge_alias_info (vect_ptr_init, indx_before_incr);
1233 merge_alias_info (vect_ptr_init, indx_after_incr);
1234 if (ptr_incr)
1235 *ptr_incr = incr;
1237 return indx_before_incr;
1239 else
1240 gcc_unreachable ();
1244 /* Function bump_vector_ptr
1246 Increment a pointer (to a vector type) by vector-size. If requested,
1247 i.e. if PTR-INCR is given, then also connect the new increment stmt
1248 to the existing def-use update-chain of the pointer, by modifying
1249 the PTR_INCR as illustrated below:
1251 The pointer def-use update-chain before this function:
1252 DATAREF_PTR = phi (p_0, p_2)
1253 ....
1254 PTR_INCR: p_2 = DATAREF_PTR + step
1256 The pointer def-use update-chain after this function:
1257 DATAREF_PTR = phi (p_0, p_2)
1258 ....
1259 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1260 ....
1261 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1263 Input:
1264 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1265 in the loop.
1266 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1267 the loop. The increment amount across iterations is expected
1268 to be vector_size.
1269 BSI - location where the new update stmt is to be placed.
1270 STMT - the original scalar memory-access stmt that is being vectorized.
1271 BUMP - optional. The offset by which to bump the pointer. If not given,
1272 the offset is assumed to be vector_size.
1274	   Output: Return NEW_DATAREF_PTR as illustrated above.  */
1278 static tree
1279 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
1280 gimple stmt, tree bump)
1282 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1283 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1284 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1285 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1286 tree update = TYPE_SIZE_UNIT (vectype);
1287 gimple incr_stmt;
1288 ssa_op_iter iter;
1289 use_operand_p use_p;
1290 tree new_dataref_ptr;
1292 if (bump)
1293 update = bump;
1295 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr_var,
1296 dataref_ptr, update);
1297 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1298 gimple_assign_set_lhs (incr_stmt, new_dataref_ptr);
1299 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
1301 /* Copy the points-to information if it exists. */
1302 if (DR_PTR_INFO (dr))
1303 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1304 merge_alias_info (new_dataref_ptr, dataref_ptr);
1306 if (!ptr_incr)
1307 return new_dataref_ptr;
1309 /* Update the vector-pointer's cross-iteration increment. */
1310 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1312 tree use = USE_FROM_PTR (use_p);
1314 if (use == dataref_ptr)
1315 SET_USE (use_p, new_dataref_ptr);
1316 else
1317 gcc_assert (tree_int_cst_compare (use, update) == 0);
1320 return new_dataref_ptr;
1324 /* Function vect_create_destination_var.
1326 Create a new temporary of type VECTYPE. */
1328 static tree
1329 vect_create_destination_var (tree scalar_dest, tree vectype)
1331 tree vec_dest;
1332 const char *new_name;
1333 tree type;
1334 enum vect_var_kind kind;
1336 kind = vectype ? vect_simple_var : vect_scalar_var;
1337 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1339 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1341 new_name = get_name (scalar_dest);
1342 if (!new_name)
1343 new_name = "var_";
1344 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1345 add_referenced_var (vec_dest);
1347 return vec_dest;
1351 /* Function vect_init_vector.
1353 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1354 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1355 is not NULL. Otherwise, place the initialization at the loop preheader.
1356 Return the DEF of INIT_STMT.
1357 It will be used in the vectorization of STMT. */
1359 static tree
1360 vect_init_vector (gimple stmt, tree vector_var, tree vector_type,
1361 gimple_stmt_iterator *gsi)
1363 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1364 tree new_var;
1365 gimple init_stmt;
1366 tree vec_oprnd;
1367 edge pe;
1368 tree new_temp;
1369 basic_block new_bb;
1371 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1372 add_referenced_var (new_var);
1373 init_stmt = gimple_build_assign (new_var, vector_var);
1374 new_temp = make_ssa_name (new_var, init_stmt);
1375 gimple_assign_set_lhs (init_stmt, new_temp);
1377 if (gsi)
1378 vect_finish_stmt_generation (stmt, init_stmt, gsi);
1379 else
1381 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1382 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1384 if (nested_in_vect_loop_p (loop, stmt))
1385 loop = loop->inner;
1386 pe = loop_preheader_edge (loop);
1387 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1388 gcc_assert (!new_bb);
1391 if (vect_print_dump_info (REPORT_DETAILS))
1393 fprintf (vect_dump, "created new init_stmt: ");
1394 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1397 vec_oprnd = gimple_assign_lhs (init_stmt);
1398 return vec_oprnd;
1402 /* For constant and loop invariant defs of SLP_NODE this function returns
1403 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1404 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1405 stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
1407 static void
1408 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1409 unsigned int op_num, unsigned int number_of_vectors)
1411 VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1412 gimple stmt = VEC_index (gimple, stmts, 0);
1413 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1414 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1415 int nunits;
1416 tree vec_cst;
1417 tree t = NULL_TREE;
1418 int j, number_of_places_left_in_vector;
1419 tree vector_type;
1420 tree op, vop;
1421 int group_size = VEC_length (gimple, stmts);
1422 unsigned int vec_num, i;
1423 int number_of_copies = 1;
1424 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1425 bool constant_p, is_store;
1427 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1429 is_store = true;
1430 op = gimple_assign_rhs1 (stmt);
1432 else
1434 is_store = false;
1435 op = gimple_op (stmt, op_num + 1);
1438 if (CONSTANT_CLASS_P (op))
1440 vector_type = vectype;
1441 constant_p = true;
1443 else
1445 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1446 gcc_assert (vector_type);
1447 constant_p = false;
1450 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1452 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1453 created vectors. It is greater than 1 if unrolling is performed.
1455 For example, we have two scalar operands, s1 and s2 (e.g., group of
1456 strided accesses of size two), while NUNITS is four (i.e., four scalars
1457 of this type can be packed in a vector). The output vector will contain
1458 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1459 will be 2).
1461 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1462 containing the operands.
1464 For example, NUNITS is four as before, and the group size is 8
1465 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1466 {s5, s6, s7, s8}. */
1468 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1470 number_of_places_left_in_vector = nunits;
1471 for (j = 0; j < number_of_copies; j++)
1473 for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--)
1475 if (is_store)
1476 op = gimple_assign_rhs1 (stmt);
1477 else
1478 op = gimple_op (stmt, op_num + 1);
1480 /* Create 'vect_ = {op0,op1,...,opn}'. */
1481 t = tree_cons (NULL_TREE, op, t);
1483 number_of_places_left_in_vector--;
1485 if (number_of_places_left_in_vector == 0)
1487 number_of_places_left_in_vector = nunits;
1489 if (constant_p)
1490 vec_cst = build_vector (vector_type, t);
1491 else
1492 vec_cst = build_constructor_from_list (vector_type, t);
1493 VEC_quick_push (tree, voprnds,
1494 vect_init_vector (stmt, vec_cst, vector_type, NULL));
1495 t = NULL_TREE;
1500 /* Since the vectors are created in the reverse order, we should invert
1501 them. */
1502 vec_num = VEC_length (tree, voprnds);
1503 for (j = vec_num - 1; j >= 0; j--)
1505 vop = VEC_index (tree, voprnds, j);
1506 VEC_quick_push (tree, *vec_oprnds, vop);
1509 VEC_free (tree, heap, voprnds);
1511	  /* If VF is greater than the unrolling factor needed for the SLP
1512	     group of stmts, the NUMBER_OF_VECTORS to be created is greater than
1513	     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1514	     to replicate the vectors. */
1515 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1517 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1518 VEC_quick_push (tree, *vec_oprnds, vop);
1523 /* Get vectorized definitions from SLP_NODE that contains corresponding
1524 vectorized def-stmts. */
1526 static void
1527 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1529 tree vec_oprnd;
1530 gimple vec_def_stmt;
1531 unsigned int i;
1533 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1535 for (i = 0;
1536 VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1537 i++)
1539 gcc_assert (vec_def_stmt);
1540 vec_oprnd = gimple_get_lhs (vec_def_stmt);
1541 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1546 /* Get vectorized definitions for SLP_NODE.
1547 If the scalar definitions are loop invariants or constants, collect them and
1548 call vect_get_constant_vectors() to create vector stmts.
1549 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1550 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1551 vect_get_slp_vect_defs() to retrieve them.
1552 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1553	   the right node).  This is used when the second operand must remain scalar. */
1555 static void
1556 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1557 VEC (tree,heap) **vec_oprnds1)
1559 gimple first_stmt;
1560 enum tree_code code;
1561 int number_of_vects;
1562 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
1564 first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1565 /* The number of vector defs is determined by the number of vector statements
1566 in the node from which we get those statements. */
1567 if (SLP_TREE_LEFT (slp_node))
1568 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node));
1569 else
1571 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1572 /* Number of vector stmts was calculated according to LHS in
1573 vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if
1574 necessary. See vect_get_smallest_scalar_type() for details. */
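/* Illustrative example (assumed types, not from the original source): if the
   stmts convert a 2-byte short RHS to a 4-byte int LHS, the number of
   vectors computed from the LHS is scaled by 2/4 (halved) below, since twice
   as many of the smaller RHS scalars fit in one vector.  */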
1575 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
1576 &rhs_size_unit);
1577 if (rhs_size_unit != lhs_size_unit)
1579 number_of_vects *= rhs_size_unit;
1580 number_of_vects /= lhs_size_unit;
1584 /* Allocate memory for vectorized defs. */
1585 *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
1587 /* SLP_NODE corresponds either to a group of stores or to a group of
1588 unary/binary operations. We don't call this function for loads. */
1589 if (SLP_TREE_LEFT (slp_node))
1590 /* The defs are already vectorized. */
1591 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1592 else
1593 /* Build vectors from scalar defs. */
1594 vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
1596 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1597 /* Since we don't call this function with loads, this is a group of
1598 stores. */
1599 return;
1601 code = gimple_assign_rhs_code (first_stmt);
1602 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
1603 return;
1605 /* The number of vector defs is determined by the number of vector statements
1606 in the node from which we get those statements. */
1607 if (SLP_TREE_RIGHT (slp_node))
1608 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node));
1609 else
1610 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1612 *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects);
1614 if (SLP_TREE_RIGHT (slp_node))
1615 /* The defs are already vectorized. */
1616 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1617 else
1618 /* Build vectors from scalar defs. */
1619 vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
1623 /* Function get_initial_def_for_induction
1625 Input:
1626 STMT - a stmt that performs an induction operation in the loop.
1627 IV_PHI - the initial value of the induction variable
1629 Output:
1630 Return a vector variable, initialized with the first VF values of
1631 the induction variable. E.g., for an iv with IV_PHI='X' and
1632 evolution S, for a vector of 4 units, we want to return:
1633 [X, X + S, X + 2*S, X + 3*S]. */
1635 static tree
1636 get_initial_def_for_induction (gimple iv_phi)
1638 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1639 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1640 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1641 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
1642 tree vectype;
1643 int nunits;
1644 edge pe = loop_preheader_edge (loop);
1645 struct loop *iv_loop;
1646 basic_block new_bb;
1647 tree vec, vec_init, vec_step, t;
1648 tree access_fn;
1649 tree new_var;
1650 tree new_name;
1651 gimple init_stmt, induction_phi, new_stmt;
1652 tree induc_def, vec_def, vec_dest;
1653 tree init_expr, step_expr;
1654 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1655 int i;
1656 bool ok;
1657 int ncopies;
1658 tree expr;
1659 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1660 bool nested_in_vect_loop = false;
1661 gimple_seq stmts = NULL;
1662 imm_use_iterator imm_iter;
1663 use_operand_p use_p;
1664 gimple exit_phi;
1665 edge latch_e;
1666 tree loop_arg;
1667 gimple_stmt_iterator si;
1668 basic_block bb = gimple_bb (iv_phi);
1670 vectype = get_vectype_for_scalar_type (scalar_type);
1671 gcc_assert (vectype);
1672 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1673 ncopies = vf / nunits;
1675 gcc_assert (phi_info);
1676 gcc_assert (ncopies >= 1);
1678 /* Find the first insertion point in the BB. */
1679 si = gsi_after_labels (bb);
1681 if (INTEGRAL_TYPE_P (scalar_type) || POINTER_TYPE_P (scalar_type))
1682 step_expr = build_int_cst (scalar_type, 0);
1683 else
1684 step_expr = build_real (scalar_type, dconst0);
1686 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1687 if (nested_in_vect_loop_p (loop, iv_phi))
1689 nested_in_vect_loop = true;
1690 iv_loop = loop->inner;
1692 else
1693 iv_loop = loop;
1694 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
1696 latch_e = loop_latch_edge (iv_loop);
1697 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1699 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1700 gcc_assert (access_fn);
1701 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1702 &init_expr, &step_expr);
1703 gcc_assert (ok);
1704 pe = loop_preheader_edge (iv_loop);
1706 /* Create the vector that holds the initial_value of the induction. */
1707 if (nested_in_vect_loop)
1709 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1710 been created during vectorization of previous stmts; We obtain it from
1711 the STMT_VINFO_VEC_STMT of the defining stmt. */
1712 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1713 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1715 else
1717 /* iv_loop is the loop to be vectorized. Create:
1718 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1719 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1720 add_referenced_var (new_var);
1722 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1723 if (stmts)
1725 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
1726 gcc_assert (!new_bb);
1729 t = NULL_TREE;
1730 t = tree_cons (NULL_TREE, init_expr, t);
1731 for (i = 1; i < nunits; i++)
1733 /* Create: new_name_i = new_name + step_expr */
1734 enum tree_code code = POINTER_TYPE_P (scalar_type)
1735 ? POINTER_PLUS_EXPR : PLUS_EXPR;
1736 init_stmt = gimple_build_assign_with_ops (code, new_var,
1737 new_name, step_expr);
1738 new_name = make_ssa_name (new_var, init_stmt);
1739 gimple_assign_set_lhs (init_stmt, new_name);
1741 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
1742 gcc_assert (!new_bb);
1744 if (vect_print_dump_info (REPORT_DETAILS))
1746 fprintf (vect_dump, "created new init_stmt: ");
1747 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
1749 t = tree_cons (NULL_TREE, new_name, t);
1751 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1752 vec = build_constructor_from_list (vectype, nreverse (t));
1753 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1757 /* Create the vector that holds the step of the induction. */
1758 if (nested_in_vect_loop)
1759 /* iv_loop is nested in the loop to be vectorized. Generate:
1760 vec_step = [S, S, S, S] */
1761 new_name = step_expr;
1762 else
1764 /* iv_loop is the loop to be vectorized. Generate:
1765 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1766 expr = build_int_cst (scalar_type, vf);
1767 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1770 t = NULL_TREE;
1771 for (i = 0; i < nunits; i++)
1772 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1773 gcc_assert (CONSTANT_CLASS_P (new_name));
1774 vec = build_vector (vectype, t);
1775 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1778 /* Create the following def-use cycle:
1779 loop prolog:
1780 vec_init = ...
1781 vec_step = ...
1782 loop:
1783 vec_iv = PHI <vec_init, vec_loop>
1785 STMT
1787 vec_loop = vec_iv + vec_step; */
1789 /* Create the induction-phi that defines the induction-operand. */
1790 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1791 add_referenced_var (vec_dest);
1792 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1793 set_vinfo_for_stmt (induction_phi,
1794 new_stmt_vec_info (induction_phi, loop_vinfo));
1795 induc_def = PHI_RESULT (induction_phi);
1797 /* Create the iv update inside the loop */
1798 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1799 induc_def, vec_step);
1800 vec_def = make_ssa_name (vec_dest, new_stmt);
1801 gimple_assign_set_lhs (new_stmt, vec_def);
1802 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1803 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
1805 /* Set the arguments of the phi node: */
1806 add_phi_arg (induction_phi, vec_init, pe);
1807 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1810 /* In case the vectorization factor (VF) is bigger than the number
1811 of elements that we can fit in a vectype (nunits), we have to generate
1812 more than one vector stmt - i.e - we need to "unroll" the
1813 vector stmt by a factor VF/nunits. For more details see documentation
1814 in vectorizable_operation. */
1816 if (ncopies > 1)
1818 stmt_vec_info prev_stmt_vinfo;
1819 /* FORNOW. This restriction should be relaxed. */
1820 gcc_assert (!nested_in_vect_loop);
1822 /* Create the vector that holds the step of the induction. */
1823 expr = build_int_cst (scalar_type, nunits);
1824 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1825 t = NULL_TREE;
1826 for (i = 0; i < nunits; i++)
1827 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1828 gcc_assert (CONSTANT_CLASS_P (new_name));
1829 vec = build_vector (vectype, t);
1830 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1832 vec_def = induc_def;
1833 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1834 for (i = 1; i < ncopies; i++)
1836 /* vec_i = vec_prev + vec_step */
1837 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
1838 vec_def, vec_step);
1839 vec_def = make_ssa_name (vec_dest, new_stmt);
1840 gimple_assign_set_lhs (new_stmt, vec_def);
1842 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
1843 set_vinfo_for_stmt (new_stmt,
1844 new_stmt_vec_info (new_stmt, loop_vinfo));
1845 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1846 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1850 if (nested_in_vect_loop)
1852 /* Find the loop-closed exit-phi of the induction, and record
1853 the final vector of induction results: */
1854 exit_phi = NULL;
1855 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1857 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
1859 exit_phi = USE_STMT (use_p);
1860 break;
1863 if (exit_phi)
1865 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1866 /* FORNOW. Currently not supporting the case that an inner-loop induction
1867 is not used in the outer-loop (i.e. only outside the outer-loop). */
1868 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1869 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1871 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1872 if (vect_print_dump_info (REPORT_DETAILS))
1874 fprintf (vect_dump, "vector of inductions after inner-loop:");
1875 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
1881 if (vect_print_dump_info (REPORT_DETAILS))
1883 fprintf (vect_dump, "transform induction: created def-use cycle: ");
1884 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
1885 fprintf (vect_dump, "\n");
1886 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
1889 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1890 return induc_def;
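/* For illustration, a sketch of what the code above builds for a simple
   induction with initial value 0 and step 1, assuming VF = nunits = 4
   (so ncopies = 1):

     loop prolog:
       vec_init = {0, 1, 2, 3}
       vec_step = {4, 4, 4, 4}       <-- VF * S in every element
     loop:
       vec_iv   = PHI <vec_init (preheader), vec_loop (latch)>
       ...
       vec_loop = vec_iv + vec_step

   i.e. each vector-loop iteration advances all nunits induction values
   by VF * S at once.  */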
1894 /* Function vect_get_vec_def_for_operand.
1896 OP is an operand in STMT. This function returns a (vector) def that will be
1897 used in the vectorized stmt for STMT.
1899 In the case that OP is an SSA_NAME which is defined in the loop, then
1900 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1902 In case OP is an invariant or constant, a new stmt that creates a vector def
1903 needs to be introduced. */
1905 static tree
1906 vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
1908 tree vec_oprnd;
1909 gimple vec_stmt;
1910 gimple def_stmt;
1911 stmt_vec_info def_stmt_info = NULL;
1912 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1913 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1914 unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1916 tree vec_inv;
1917 tree vec_cst;
1918 tree t = NULL_TREE;
1919 tree def;
1920 int i;
1921 enum vect_def_type dt;
1922 bool is_simple_use;
1923 tree vector_type;
1925 if (vect_print_dump_info (REPORT_DETAILS))
1927 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1928 print_generic_expr (vect_dump, op, TDF_SLIM);
1931 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1932 gcc_assert (is_simple_use);
1933 if (vect_print_dump_info (REPORT_DETAILS))
1935 if (def)
1937 fprintf (vect_dump, "def = ");
1938 print_generic_expr (vect_dump, def, TDF_SLIM);
1940 if (def_stmt)
1942 fprintf (vect_dump, " def_stmt = ");
1943 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1947 switch (dt)
1949 /* Case 1: operand is a constant. */
1950 case vect_constant_def:
1952 if (scalar_def)
1953 *scalar_def = op;
1955 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1956 if (vect_print_dump_info (REPORT_DETAILS))
1957 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1959 for (i = nunits - 1; i >= 0; --i)
1961 t = tree_cons (NULL_TREE, op, t);
1963 vec_cst = build_vector (vectype, t);
1964 return vect_init_vector (stmt, vec_cst, vectype, NULL);
1967 /* Case 2: operand is defined outside the loop - loop invariant. */
1968 case vect_invariant_def:
1970 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1971 gcc_assert (vector_type);
1972 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
1974 if (scalar_def)
1975 *scalar_def = def;
1977 /* Create 'vec_inv = {inv,inv,..,inv}' */
1978 if (vect_print_dump_info (REPORT_DETAILS))
1979 fprintf (vect_dump, "Create vector_inv.");
1981 for (i = nunits - 1; i >= 0; --i)
1983 t = tree_cons (NULL_TREE, def, t);
1986 /* FIXME: use build_constructor directly. */
1987 vec_inv = build_constructor_from_list (vector_type, t);
1988 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1991 /* Case 3: operand is defined inside the loop. */
1992 case vect_loop_def:
1994 if (scalar_def)
1995 *scalar_def = NULL/* FIXME tuples: def_stmt*/;
1997 /* Get the def from the vectorized stmt. */
1998 def_stmt_info = vinfo_for_stmt (def_stmt);
1999 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2000 gcc_assert (vec_stmt);
2001 if (gimple_code (vec_stmt) == GIMPLE_PHI)
2002 vec_oprnd = PHI_RESULT (vec_stmt);
2003 else if (is_gimple_call (vec_stmt))
2004 vec_oprnd = gimple_call_lhs (vec_stmt);
2005 else
2006 vec_oprnd = gimple_assign_lhs (vec_stmt);
2007 return vec_oprnd;
2010 /* Case 4: operand is defined by a loop header phi - reduction */
2011 case vect_reduction_def:
2013 struct loop *loop;
2015 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2016 loop = (gimple_bb (def_stmt))->loop_father;
2018 /* Get the def before the loop */
2019 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
2020 return get_initial_def_for_reduction (stmt, op, scalar_def);
2023 /* Case 5: operand is defined by loop-header phi - induction. */
2024 case vect_induction_def:
2026 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2028 /* Get the def from the vectorized stmt. */
2029 def_stmt_info = vinfo_for_stmt (def_stmt);
2030 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
2031 gcc_assert (vec_stmt && gimple_code (vec_stmt) == GIMPLE_PHI);
2032 vec_oprnd = PHI_RESULT (vec_stmt);
2033 return vec_oprnd;
2036 default:
2037 gcc_unreachable ();
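/* For illustration: for a scalar stmt 'z = x + 3', requesting the vector
   def of the operand '3' falls into case 1 above and creates a splat
   'vect_cst = {3,3,3,3}' (for a 4-unit vectype), whereas requesting the
   vector def of 'x', when 'x' is defined inside the loop by a stmt that
   has already been vectorized into 'vx = ...', falls into case 3 and
   simply returns 'vx'.  */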
2042 /* Function vect_get_vec_def_for_stmt_copy
2044 Return a vector-def for an operand. This function is used when the
2045 vectorized stmt to be created (by the caller to this function) is a "copy"
2046 created in case the vectorized result cannot fit in one vector, and several
2047 copies of the vector-stmt are required. In this case the vector-def is
2048 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
2049 of the stmt that defines VEC_OPRND.
2050 DT is the type of the vector def VEC_OPRND.
2052 Context:
2053 In case the vectorization factor (VF) is bigger than the number
2054 of elements that can fit in a vectype (nunits), we have to generate
2055 more than one vector stmt to vectorize the scalar stmt. This situation
2056 arises when there are multiple data-types operated upon in the loop; the
2057 smallest data-type determines the VF, and as a result, when vectorizing
2058 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
2059 vector stmt (each computing a vector of 'nunits' results, and together
2060 computing 'VF' results in each iteration). This function is called when
2061 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
2062 which VF=16 and nunits=4, so the number of copies required is 4):
2064 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
2066 S1: x = load VS1.0: vx.0 = memref0 VS1.1
2067 VS1.1: vx.1 = memref1 VS1.2
2068 VS1.2: vx.2 = memref2 VS1.3
2069 VS1.3: vx.3 = memref3
2071 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
2072 VSnew.1: vz1 = vx.1 + ... VSnew.2
2073 VSnew.2: vz2 = vx.2 + ... VSnew.3
2074 VSnew.3: vz3 = vx.3 + ...
2076 The vectorization of S1 is explained in vectorizable_load.
2077 The vectorization of S2:
2078 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2079 the function 'vect_get_vec_def_for_operand' is called to
2080 get the relevant vector-def for each operand of S2. For operand x it
2081 returns the vector-def 'vx.0'.
2083 To create the remaining copies of the vector-stmt (VSnew.j), this
2084 function is called to get the relevant vector-def for each operand. It is
2085 obtained from the respective VS1.j stmt, which is recorded in the
2086 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2088 For example, to obtain the vector-def 'vx.1' in order to create the
2089 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2090 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
2091 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2092 and return its def ('vx.1').
2093 Overall, to create the above sequence this function will be called 3 times:
2094 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2095 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2096 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2098 static tree
2099 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2101 gimple vec_stmt_for_operand;
2102 stmt_vec_info def_stmt_info;
2104 /* Do nothing; can reuse same def. */
2105 if (dt == vect_invariant_def || dt == vect_constant_def)
2106 return vec_oprnd;
2108 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2109 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2110 gcc_assert (def_stmt_info);
2111 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2112 gcc_assert (vec_stmt_for_operand);
2113 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2114 if (gimple_code (vec_stmt_for_operand) == GIMPLE_PHI)
2115 vec_oprnd = PHI_RESULT (vec_stmt_for_operand);
2116 else
2117 vec_oprnd = gimple_get_lhs (vec_stmt_for_operand);
2118 return vec_oprnd;
2122 /* Get vectorized definitions for the operands to create a copy of an original
2123 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2125 static void
2126 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2127 VEC(tree,heap) **vec_oprnds0,
2128 VEC(tree,heap) **vec_oprnds1)
2130 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2132 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2133 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2135 if (vec_oprnds1 && *vec_oprnds1)
2137 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2138 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2139 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2144 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2146 static void
2147 vect_get_vec_defs (tree op0, tree op1, gimple stmt,
2148 VEC(tree,heap) **vec_oprnds0, VEC(tree,heap) **vec_oprnds1,
2149 slp_tree slp_node)
2151 if (slp_node)
2152 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2153 else
2155 tree vec_oprnd;
2157 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2158 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2159 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2161 if (op1)
2163 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2164 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2165 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2171 /* Function vect_finish_stmt_generation.
2173 Insert a new stmt. */
2175 static void
2176 vect_finish_stmt_generation (gimple stmt, gimple vec_stmt,
2177 gimple_stmt_iterator *gsi)
2179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2182 gcc_assert (gimple_code (stmt) != GIMPLE_LABEL);
2184 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
2186 set_vinfo_for_stmt (vec_stmt, new_stmt_vec_info (vec_stmt, loop_vinfo));
2188 if (vect_print_dump_info (REPORT_DETAILS))
2190 fprintf (vect_dump, "add new stmt: ");
2191 print_gimple_stmt (vect_dump, vec_stmt, 0, TDF_SLIM);
2194 gimple_set_location (vec_stmt, gimple_location (gsi_stmt (*gsi)));
2198 /* Function get_initial_def_for_reduction
2200 Input:
2201 STMT - a stmt that performs a reduction operation in the loop.
2202 INIT_VAL - the initial value of the reduction variable
2204 Output:
2205 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2206 of the reduction (used for adjusting the epilog - see below).
2207 Return a vector variable, initialized according to the operation that STMT
2208 performs. This vector will be used as the initial value of the
2209 vector of partial results.
2211 Option1 (adjust in epilog): Initialize the vector as follows:
2212 add: [0,0,...,0,0]
2213 mult: [1,1,...,1,1]
2214 min/max: [init_val,init_val,..,init_val,init_val]
2215 bit and/or: [init_val,init_val,..,init_val,init_val]
2216 and when necessary (e.g. add/mult case) let the caller know
2217 that it needs to adjust the result by init_val.
2219 Option2: Initialize the vector as follows:
2220 add: [0,0,...,0,init_val]
2221 mult: [1,1,...,1,init_val]
2222 min/max: [init_val,init_val,...,init_val]
2223 bit and/or: [init_val,init_val,...,init_val]
2224 and no adjustments are needed.
2226 For example, for the following code:
2228 s = init_val;
2229 for (i=0;i<n;i++)
2230 s = s + a[i];
2232 STMT is 's = s + a[i]', and the reduction variable is 's'.
2233 For a vector of 4 units, we want to return either [0,0,0,init_val],
2234 or [0,0,0,0] and let the caller know that it needs to adjust
2235 the result at the end by 'init_val'.
2237 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2238 initialization vector is simpler (same element in all entries).
2239 A cost model should help decide between these two schemes. */
2241 static tree
2242 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2244 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2245 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2246 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2247 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2248 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2249 tree scalar_type = TREE_TYPE (vectype);
2250 enum tree_code code = gimple_assign_rhs_code (stmt);
2251 tree type = TREE_TYPE (init_val);
2252 tree vecdef;
2253 tree def_for_init;
2254 tree init_def;
2255 tree t = NULL_TREE;
2256 int i;
2257 bool nested_in_vect_loop = false;
2259 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2260 if (nested_in_vect_loop_p (loop, stmt))
2261 nested_in_vect_loop = true;
2262 else
2263 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2265 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2267 switch (code)
2269 case WIDEN_SUM_EXPR:
2270 case DOT_PROD_EXPR:
2271 case PLUS_EXPR:
2272 if (nested_in_vect_loop)
2273 *adjustment_def = vecdef;
2274 else
2275 *adjustment_def = init_val;
2276 /* Create a vector of zeros for init_def. */
2277 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2278 def_for_init = build_real (scalar_type, dconst0);
2279 else
2280 def_for_init = build_int_cst (scalar_type, 0);
2282 for (i = nunits - 1; i >= 0; --i)
2283 t = tree_cons (NULL_TREE, def_for_init, t);
2284 init_def = build_vector (vectype, t);
2285 break;
2287 case MIN_EXPR:
2288 case MAX_EXPR:
2289 *adjustment_def = NULL_TREE;
2290 init_def = vecdef;
2291 break;
2293 default:
2294 gcc_unreachable ();
2297 return init_def;
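/* For illustration, for the reduction

     s = 10;
     for (i = 0; i < n; i++)
       s = s + a[i];

   with a 4-unit vectype, the 'adjust in epilog' scheme used above returns
   init_def = {0, 0, 0, 0} and sets ADJUSTMENT_DEF to 10, so the epilog
   adds 10 back after reducing the vector of partial sums.  For MIN/MAX
   the initial value is replicated instead and no adjustment is needed.  */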
2301 /* Function vect_create_epilog_for_reduction
2303 Create code at the loop-epilog to finalize the result of a reduction
2304 computation.
2306 VECT_DEF is a vector of partial results.
2307 REDUC_CODE is the tree-code for the epilog reduction.
2308 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2309 number of elements that we can fit in a vectype (nunits). In this case
2310 we have to generate more than one vector stmt - i.e - we need to "unroll"
2311 the vector stmt by a factor VF/nunits. For more details see documentation
2312 in vectorizable_operation.
2313 STMT is the scalar reduction stmt that is being vectorized.
2314 REDUCTION_PHI is the phi-node that carries the reduction computation.
2316 This function:
2317 1. Creates the reduction def-use cycle: sets the arguments for
2318 REDUCTION_PHI:
2319 The loop-entry argument is the vectorized initial-value of the reduction.
2320 The loop-latch argument is VECT_DEF - the vector of partial sums.
2321 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2322 by applying the operation specified by REDUC_CODE if available, or by
2323 other means (whole-vector shifts or a scalar loop).
2324 The function also creates a new phi node at the loop exit to preserve
2325 loop-closed form, as illustrated below.
2327 The flow at the entry to this function:
2329 loop:
2330 vec_def = phi <null, null> # REDUCTION_PHI
2331 VECT_DEF = vector_stmt # vectorized form of STMT
2332 s_loop = scalar_stmt # (scalar) STMT
2333 loop_exit:
2334 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2335 use <s_out0>
2336 use <s_out0>
2338 The above is transformed by this function into:
2340 loop:
2341 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2342 VECT_DEF = vector_stmt # vectorized form of STMT
2343 s_loop = scalar_stmt # (scalar) STMT
2344 loop_exit:
2345 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2346 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2347 v_out2 = reduce <v_out1>
2348 s_out3 = extract_field <v_out2, 0>
2349 s_out4 = adjust_result <s_out3>
2350 use <s_out4>
2351 use <s_out4>
2354 static void
2355 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2356 int ncopies,
2357 enum tree_code reduc_code,
2358 gimple reduction_phi)
2360 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2361 stmt_vec_info prev_phi_info;
2362 tree vectype;
2363 enum machine_mode mode;
2364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2366 basic_block exit_bb;
2367 tree scalar_dest;
2368 tree scalar_type;
2369 gimple new_phi = NULL, phi;
2370 gimple_stmt_iterator exit_gsi;
2371 tree vec_dest;
2372 tree new_temp = NULL_TREE;
2373 tree new_name;
2374 gimple epilog_stmt = NULL;
2375 tree new_scalar_dest, new_dest;
2376 gimple exit_phi;
2377 tree bitsize, bitpos, bytesize;
2378 enum tree_code code = gimple_assign_rhs_code (stmt);
2379 tree adjustment_def;
2380 tree vec_initial_def, def;
2381 tree orig_name;
2382 imm_use_iterator imm_iter;
2383 use_operand_p use_p;
2384 bool extract_scalar_result = false;
2385 tree reduction_op, expr;
2386 gimple orig_stmt;
2387 gimple use_stmt;
2388 bool nested_in_vect_loop = false;
2389 VEC(gimple,heap) *phis = NULL;
2390 enum vect_def_type dt = vect_unknown_def_type;
2391 int j, i;
2393 if (nested_in_vect_loop_p (loop, stmt))
2395 loop = loop->inner;
2396 nested_in_vect_loop = true;
2399 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2401 case GIMPLE_SINGLE_RHS:
2402 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2403 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2404 break;
2405 case GIMPLE_UNARY_RHS:
2406 reduction_op = gimple_assign_rhs1 (stmt);
2407 break;
2408 case GIMPLE_BINARY_RHS:
2409 reduction_op = gimple_assign_rhs2 (stmt);
2410 break;
2411 default:
2412 gcc_unreachable ();
2415 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2416 gcc_assert (vectype);
2417 mode = TYPE_MODE (vectype);
2419 /*** 1. Create the reduction def-use cycle ***/
2421 /* For the case of reduction, vect_get_vec_def_for_operand returns
2422 the scalar def before the loop, which defines the initial value
2423 of the reduction variable. */
2424 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2425 &adjustment_def);
2427 phi = reduction_phi;
2428 def = vect_def;
2429 for (j = 0; j < ncopies; j++)
2431 /* 1.1 set the loop-entry arg of the reduction-phi: */
2432 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2434 /* 1.2 set the loop-latch arg for the reduction-phi: */
2435 if (j > 0)
2436 def = vect_get_vec_def_for_stmt_copy (dt, def);
2437 add_phi_arg (phi, def, loop_latch_edge (loop));
2439 if (vect_print_dump_info (REPORT_DETAILS))
2441 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2442 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2443 fprintf (vect_dump, "\n");
2444 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2447 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2450 /*** 2. Create epilog code
2451 The reduction epilog code operates across the elements of the vector
2452 of partial results computed by the vectorized loop.
2453 The reduction epilog code consists of:
2454 step 1: compute the scalar result in a vector (v_out2)
2455 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2456 step 3: adjust the scalar result (s_out3) if needed.
2458 Step 1 can be accomplished using one of the following three schemes:
2459 (scheme 1) using reduc_code, if available.
2460 (scheme 2) using whole-vector shifts, if available.
2461 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2462 combined.
2464 The overall epilog code looks like this:
2466 s_out0 = phi <s_loop> # original EXIT_PHI
2467 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2468 v_out2 = reduce <v_out1> # step 1
2469 s_out3 = extract_field <v_out2, 0> # step 2
2470 s_out4 = adjust_result <s_out3> # step 3
2472 (step 3 is optional, and steps 1 and 2 may be combined).
2473 Lastly, the uses of s_out0 are replaced by s_out4.
2475 ***/
2477 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2478 v_out1 = phi <v_loop> */
2480 exit_bb = single_exit (loop)->dest;
2481 def = vect_def;
2482 prev_phi_info = NULL;
2483 for (j = 0; j < ncopies; j++)
2485 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2486 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
2487 if (j == 0)
2488 new_phi = phi;
2489 else
2491 def = vect_get_vec_def_for_stmt_copy (dt, def);
2492 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2494 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2495 prev_phi_info = vinfo_for_stmt (phi);
2497 exit_gsi = gsi_after_labels (exit_bb);
2499 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2500 (i.e. when reduc_code is not available) and in the final adjustment
2501 code (if needed). Also get the original scalar reduction variable as
2502 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2503 represents a reduction pattern), the tree-code and scalar-def are
2504 taken from the original stmt that the pattern-stmt (STMT) replaces.
2505 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2506 are taken from STMT. */
2508 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2509 if (!orig_stmt)
2511 /* Regular reduction */
2512 orig_stmt = stmt;
2514 else
2516 /* Reduction pattern */
2517 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2518 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2519 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2521 code = gimple_assign_rhs_code (orig_stmt);
2522 scalar_dest = gimple_assign_lhs (orig_stmt);
2523 scalar_type = TREE_TYPE (scalar_dest);
2524 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2525 bitsize = TYPE_SIZE (scalar_type);
2526 bytesize = TYPE_SIZE_UNIT (scalar_type);
2529 /* In case this is a reduction in an inner-loop while vectorizing an outer
2530 loop - we don't need to extract a single scalar result at the end of the
2531 inner-loop. The final vector of partial results will be used in the
2532 vectorized outer-loop, or reduced to a scalar result at the end of the
2533 outer-loop. */
2534 if (nested_in_vect_loop)
2535 goto vect_finalize_reduction;
2537 /* FORNOW */
2538 gcc_assert (ncopies == 1);
2540 /* 2.3 Create the reduction code, using one of the three schemes described
2541 above. */
2543 if (reduc_code < NUM_TREE_CODES)
2545 tree tmp;
2547 /*** Case 1: Create:
2548 v_out2 = reduc_expr <v_out1> */
2550 if (vect_print_dump_info (REPORT_DETAILS))
2551 fprintf (vect_dump, "Reduce using direct vector reduction.");
2553 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2554 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2555 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2556 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2557 gimple_assign_set_lhs (epilog_stmt, new_temp);
2558 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2560 extract_scalar_result = true;
2562 else
2564 enum tree_code shift_code = 0;
2565 bool have_whole_vector_shift = true;
2566 int bit_offset;
2567 int element_bitsize = tree_low_cst (bitsize, 1);
2568 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2569 tree vec_temp;
2571 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2572 shift_code = VEC_RSHIFT_EXPR;
2573 else
2574 have_whole_vector_shift = false;
2576 /* Regardless of whether we have a whole vector shift, if we're
2577 emulating the operation via tree-vect-generic, we don't want
2578 to use it. Only the first round of the reduction is likely
2579 to still be profitable via emulation. */
2580 /* ??? It might be better to emit a reduction tree code here, so that
2581 tree-vect-generic can expand the first round via bit tricks. */
2582 if (!VECTOR_MODE_P (mode))
2583 have_whole_vector_shift = false;
2584 else
2586 optab optab = optab_for_tree_code (code, vectype, optab_default);
2587 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2588 have_whole_vector_shift = false;
2591 if (have_whole_vector_shift)
2593 /*** Case 2: Create:
2594 for (offset = VS/2; offset >= element_size; offset/=2)
2596 Create: va' = vec_shift <va, offset>
2597 Create: va = vop <va, va'>
2598 } */
2600 if (vect_print_dump_info (REPORT_DETAILS))
2601 fprintf (vect_dump, "Reduce using vector shifts");
2603 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2604 new_temp = PHI_RESULT (new_phi);
2606 for (bit_offset = vec_size_in_bits/2;
2607 bit_offset >= element_bitsize;
2608 bit_offset /= 2)
2610 tree bitpos = size_int (bit_offset);
2611 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2612 new_temp, bitpos);
2613 new_name = make_ssa_name (vec_dest, epilog_stmt);
2614 gimple_assign_set_lhs (epilog_stmt, new_name);
2615 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2617 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2618 new_name, new_temp);
2619 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2620 gimple_assign_set_lhs (epilog_stmt, new_temp);
2621 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2624 extract_scalar_result = true;
2626 else
2628 tree rhs;
2630 /*** Case 3: Create:
2631 s = extract_field <v_out2, 0>
2632 for (offset = element_size;
2633 offset < vector_size;
2634 offset += element_size;)
2636 Create: s' = extract_field <v_out2, offset>
2637 Create: s = op <s, s'>
2638 } */
2640 if (vect_print_dump_info (REPORT_DETAILS))
2641 fprintf (vect_dump, "Reduce using scalar code. ");
2643 vec_temp = PHI_RESULT (new_phi);
2644 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2645 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2646 bitsize_zero_node);
2647 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2648 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2649 gimple_assign_set_lhs (epilog_stmt, new_temp);
2650 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2652 for (bit_offset = element_bitsize;
2653 bit_offset < vec_size_in_bits;
2654 bit_offset += element_bitsize)
2656 tree bitpos = bitsize_int (bit_offset);
2657 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2658 bitpos);
2660 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2661 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2662 gimple_assign_set_lhs (epilog_stmt, new_name);
2663 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2665 epilog_stmt = gimple_build_assign_with_ops (code,
2666 new_scalar_dest,
2667 new_name, new_temp);
2668 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2669 gimple_assign_set_lhs (epilog_stmt, new_temp);
2670 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2673 extract_scalar_result = false;
2677 /* 2.4 Extract the final scalar result. Create:
2678 s_out3 = extract_field <v_out2, bitpos> */
2680 if (extract_scalar_result)
2682 tree rhs;
2684 gcc_assert (!nested_in_vect_loop);
2685 if (vect_print_dump_info (REPORT_DETAILS))
2686 fprintf (vect_dump, "extract scalar result");
2688 if (BYTES_BIG_ENDIAN)
2689 bitpos = size_binop (MULT_EXPR,
2690 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2691 TYPE_SIZE (scalar_type));
2692 else
2693 bitpos = bitsize_zero_node;
2695 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2696 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2697 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2698 gimple_assign_set_lhs (epilog_stmt, new_temp);
2699 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2702 vect_finalize_reduction:
2704 /* 2.5 Adjust the final result by the initial value of the reduction
2705 variable. (When such an adjustment is not needed,
2706 'adjustment_def' is NULL_TREE). For example, if code is PLUS we create:
2707 new_temp = loop_exit_def + adjustment_def */
2709 if (adjustment_def)
2711 if (nested_in_vect_loop)
2713 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2714 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2715 new_dest = vect_create_destination_var (scalar_dest, vectype);
2717 else
2719 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2720 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2721 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2723 epilog_stmt = gimple_build_assign (new_dest, expr);
2724 new_temp = make_ssa_name (new_dest, epilog_stmt);
2725 gimple_assign_set_lhs (epilog_stmt, new_temp);
2726 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
2727 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2731 /* 2.6 Handle the loop-exit phi */
2733 /* Replace uses of s_out0 with uses of s_out3:
2734 Find the loop-closed-use at the loop exit of the original scalar result.
2735 (The reduction result is expected to have two immediate uses - one at the
2736 latch block, and one at the loop exit). */
2737 phis = VEC_alloc (gimple, heap, 10);
2738 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2740 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2742 exit_phi = USE_STMT (use_p);
2743 VEC_quick_push (gimple, phis, exit_phi);
2746 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2747 gcc_assert (!VEC_empty (gimple, phis));
2749 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
2751 if (nested_in_vect_loop)
2753 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2755 /* FORNOW. Currently not supporting the case that an inner-loop
2756 reduction is not used in the outer-loop (but only outside the
2757 outer-loop). */
2758 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2759 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2761 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2762 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2763 set_vinfo_for_stmt (epilog_stmt,
2764 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2765 if (adjustment_def)
2766 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
2767 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
2768 continue;
2771 /* Replace the uses: */
2772 orig_name = PHI_RESULT (exit_phi);
2773 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2774 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2775 SET_USE (use_p, new_temp);
2777 VEC_free (gimple, heap, phis);
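/* For illustration, a sketch of the whole-vector-shift scheme (case 2
   above) reducing a 4-element vector of partial sums v = {a0,a1,a2,a3}:

     v' = vec_shift <v, 2 elements>;   v = v vop v'
     v' = vec_shift <v, 1 element>;    v = v vop v'
     s  = extract_field <v, bitpos>

   i.e. log2(nunits) shift/vop pairs, after which one element of v holds
   the combined result and a single extraction yields the scalar.  */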
2781 /* Function vectorizable_reduction.
2783 Check if STMT performs a reduction operation that can be vectorized.
2784 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2785 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2786 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2788 This function also handles reduction idioms (patterns) that have been
2789 recognized in advance during vect_pattern_recog. In this case, STMT may be
2790 of this form:
2791 X = pattern_expr (arg0, arg1, ..., X)
2792 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2793 sequence that had been detected and replaced by the pattern-stmt (STMT).
2795 In some cases of reduction patterns, the type of the reduction variable X is
2796 different than the type of the other arguments of STMT.
2797 In such cases, the vectype that is used when transforming STMT into a vector
2798 stmt is different than the vectype that is used to determine the
2799 vectorization factor, because it consists of a different number of elements
2800 than the actual number of elements that are being operated upon in parallel.
2802 For example, consider an accumulation of shorts into an int accumulator.
2803 On some targets it's possible to vectorize this pattern operating on 8
2804 shorts at a time (hence, the vectype for purposes of determining the
2805 vectorization factor should be V8HI); on the other hand, the vectype that
2806 is used to create the vector form is actually V4SI (the type of the result).
2808 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2809 indicates the actual level of parallelism (V8HI in the example), so
2810 that the right vectorization factor is derived. This vectype
2811 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2812 be used to create the vectorized stmt. The right vectype for the vectorized
2813 stmt is obtained from the type of the result X:
2814 get_vectype_for_scalar_type (TREE_TYPE (X))
2816 This means that, contrary to "regular" reductions (or "regular" stmts in
2817 general), the following equation:
2818 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2819 does *NOT* necessarily hold for reduction patterns. */
2821 bool
2822 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
2823 gimple *vec_stmt)
2825 tree vec_dest;
2826 tree scalar_dest;
2827 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2828 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2832 enum tree_code code, orig_code, epilog_reduc_code = 0;
2833 enum machine_mode vec_mode;
2834 int op_type;
2835 optab optab, reduc_optab;
2836 tree new_temp = NULL_TREE;
2837 tree def;
2838 gimple def_stmt;
2839 enum vect_def_type dt;
2840 gimple new_phi = NULL;
2841 tree scalar_type;
2842 bool is_simple_use;
2843 gimple orig_stmt;
2844 stmt_vec_info orig_stmt_info;
2845 tree expr = NULL_TREE;
2846 int i;
2847 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2848 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2849 int epilog_copies;
2850 stmt_vec_info prev_stmt_info, prev_phi_info;
2851 gimple first_phi = NULL;
2852 bool single_defuse_cycle = false;
2853 tree reduc_def;
2854 gimple new_stmt = NULL;
2855 int j;
2856 tree ops[3];
2858 if (nested_in_vect_loop_p (loop, stmt))
2859 loop = loop->inner;
2861 gcc_assert (ncopies >= 1);
2863 /* FORNOW: SLP not supported. */
2864 if (STMT_SLP_TYPE (stmt_info))
2865 return false;
2867 /* 1. Is vectorizable reduction? */
2869 /* Not supportable if the reduction variable is used in the loop. */
2870 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2871 return false;
2873 /* Reductions that are not used even in an enclosing outer-loop
2874 are expected to be "live" (used out of the loop). */
2875 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2876 && !STMT_VINFO_LIVE_P (stmt_info))
2877 return false;
2879 /* Make sure it was already recognized as a reduction computation. */
2880 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2881 return false;
2883 /* 2. Has this been recognized as a reduction pattern?
2885 Check if STMT represents a pattern that has been recognized
2886 in earlier analysis stages. For stmts that represent a pattern,
2887 the STMT_VINFO_RELATED_STMT field records the last stmt in
2888 the original sequence that constitutes the pattern. */
2890 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2891 if (orig_stmt)
2893 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2894 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2895 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2896 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2899 /* 3. Check the operands of the operation. The first operands are defined
2900 inside the loop body. The last operand is the reduction variable,
2901 which is defined by the loop-header-phi. */
2903 gcc_assert (is_gimple_assign (stmt));
2905 /* Flatten RHS */
2906 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2908 case GIMPLE_SINGLE_RHS:
2909 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
2910 if (op_type == ternary_op)
2912 tree rhs = gimple_assign_rhs1 (stmt);
2913 ops[0] = TREE_OPERAND (rhs, 0);
2914 ops[1] = TREE_OPERAND (rhs, 1);
2915 ops[2] = TREE_OPERAND (rhs, 2);
2916 code = TREE_CODE (rhs);
2918 else
2919 return false;
2920 break;
2922 case GIMPLE_BINARY_RHS:
2923 code = gimple_assign_rhs_code (stmt);
2924 op_type = TREE_CODE_LENGTH (code);
2925 gcc_assert (op_type == binary_op);
2926 ops[0] = gimple_assign_rhs1 (stmt);
2927 ops[1] = gimple_assign_rhs2 (stmt);
2928 break;
2930 case GIMPLE_UNARY_RHS:
2931 return false;
2933 default:
2934 gcc_unreachable ();
2937 scalar_dest = gimple_assign_lhs (stmt);
2938 scalar_type = TREE_TYPE (scalar_dest);
2939 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2940 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2941 return false;
2943 /* All uses but the last are expected to be defined in the loop.
2944 The last use is the reduction variable. */
2945 for (i = 0; i < op_type-1; i++)
2947 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt,
2948 &def, &dt);
2949 gcc_assert (is_simple_use);
2950 if (dt != vect_loop_def
2951 && dt != vect_invariant_def
2952 && dt != vect_constant_def
2953 && dt != vect_induction_def)
2954 return false;
2957 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &def, &dt);
2958 gcc_assert (is_simple_use);
2959 gcc_assert (dt == vect_reduction_def);
2960 gcc_assert (gimple_code (def_stmt) == GIMPLE_PHI);
2961 if (orig_stmt)
2962 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2963 else
2964 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2966 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2967 return false;
2969 /* 4. Supportable by target? */
2971 /* 4.1. check support for the operation in the loop */
2972 optab = optab_for_tree_code (code, vectype, optab_default);
2973 if (!optab)
2975 if (vect_print_dump_info (REPORT_DETAILS))
2976 fprintf (vect_dump, "no optab.");
2977 return false;
2979 vec_mode = TYPE_MODE (vectype);
2980 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2982 if (vect_print_dump_info (REPORT_DETAILS))
2983 fprintf (vect_dump, "op not supported by target.");
2984 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2985 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2986 < vect_min_worthwhile_factor (code))
2987 return false;
2988 if (vect_print_dump_info (REPORT_DETAILS))
2989 fprintf (vect_dump, "proceeding using word mode.");
2992 /* Worthwhile without SIMD support? */
2993 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2994 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2995 < vect_min_worthwhile_factor (code))
2997 if (vect_print_dump_info (REPORT_DETAILS))
2998 fprintf (vect_dump, "not worthwhile without SIMD support.");
2999 return false;
3002 /* 4.2. Check support for the epilog operation.
3004 If STMT represents a reduction pattern, then the type of the
3005 reduction variable may be different than the type of the rest
3006 of the arguments. For example, consider the case of accumulation
3007 of shorts into an int accumulator; The original code:
3008 S1: int_a = (int) short_a;
3009 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3011 was replaced with:
3012 STMT: int_acc = widen_sum <short_a, int_acc>
3014 This means that:
3015 1. The tree-code that is used to create the vector operation in the
3016 epilog code (that reduces the partial results) is not the
3017 tree-code of STMT, but is rather the tree-code of the original
3018 stmt from the pattern that STMT is replacing. I.e, in the example
3019 above we want to use 'widen_sum' in the loop, but 'plus' in the
3020 epilog.
3021 2. The type (mode) we use to check available target support
3022 for the vector operation to be created in the *epilog*, is
3023 determined by the type of the reduction variable (in the example
3024 above we'd check this: plus_optab[vect_int_mode]).
3025 However the type (mode) we use to check available target support
3026 for the vector operation to be created *inside the loop*, is
3027 determined by the type of the other arguments to STMT (in the
3028 example we'd check this: widen_sum_optab[vect_short_mode]).
3030 This is contrary to "regular" reductions, in which the types of all
3031 the arguments are the same as the type of the reduction variable.
3032 For "regular" reductions we can therefore use the same vector type
3033 (and also the same tree-code) when generating the epilog code and
3034 when generating the code inside the loop. */
3036 if (orig_stmt)
3038 /* This is a reduction pattern: get the vectype from the type of the
3039 reduction variable, and get the tree-code from orig_stmt. */
3040 orig_code = gimple_assign_rhs_code (orig_stmt);
3041 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3042 if (!vectype)
3044 if (vect_print_dump_info (REPORT_DETAILS))
3046 fprintf (vect_dump, "unsupported data-type ");
3047 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3049 return false;
3052 vec_mode = TYPE_MODE (vectype);
3054 else
3056 /* Regular reduction: the same vectype and tree-code that are used for
3057 the vector code inside the loop can be used for the epilog code. */
3058 orig_code = code;
3061 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3062 return false;
3063 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
3064 if (!reduc_optab)
3066 if (vect_print_dump_info (REPORT_DETAILS))
3067 fprintf (vect_dump, "no optab for reduction.");
3068 epilog_reduc_code = NUM_TREE_CODES;
3070 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3072 if (vect_print_dump_info (REPORT_DETAILS))
3073 fprintf (vect_dump, "reduc op not supported by target.");
3074 epilog_reduc_code = NUM_TREE_CODES;
3077 if (!vec_stmt) /* transformation not required. */
3079 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3080 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3081 return false;
3082 return true;
3085 /** Transform. **/
3087 if (vect_print_dump_info (REPORT_DETAILS))
3088 fprintf (vect_dump, "transform reduction.");
3090 /* Create the destination vector */
3091 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3093 /* In case the vectorization factor (VF) is bigger than the number
3094 of elements that we can fit in a vectype (nunits), we have to generate
3095 more than one vector stmt - i.e - we need to "unroll" the
3096 vector stmt by a factor VF/nunits. For more details see documentation
3097 in vectorizable_operation. */
3099 /* If the reduction is used in an outer loop we need to generate
3100 VF intermediate results, like so (e.g. for ncopies=2):
3101 r0 = phi (init, r0)
3102 r1 = phi (init, r1)
3103 r0 = x0 + r0;
3104 r1 = x1 + r1;
3105 (i.e. we generate VF results in 2 registers).
3106 In this case we have a separate def-use cycle for each copy, and therefore
3107 for each copy we get the vector def for the reduction variable from the
3108 respective phi node created for this copy.
3110 Otherwise (the reduction is unused in the loop nest), we can combine
3111 together intermediate results, like so (e.g. for ncopies=2):
3112 r = phi (init, r)
3113 r = x0 + r;
3114 r = x1 + r;
3115 (i.e. we generate VF/2 results in a single register).
3116 In this case for each copy we get the vector def for the reduction variable
3117 from the vectorized reduction operation generated in the previous iteration.
3120 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop)
3122 single_defuse_cycle = true;
3123 epilog_copies = 1;
3125 else
3126 epilog_copies = ncopies;
3128 prev_stmt_info = NULL;
3129 prev_phi_info = NULL;
3130 for (j = 0; j < ncopies; j++)
3132 if (j == 0 || !single_defuse_cycle)
3134 /* Create the reduction-phi that defines the reduction-operand. */
3135 new_phi = create_phi_node (vec_dest, loop->header);
3136 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo));
3139 /* Handle uses. */
3140 if (j == 0)
3142 loop_vec_def0 = vect_get_vec_def_for_operand (ops[0], stmt, NULL);
3143 if (op_type == ternary_op)
3145 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, NULL);
3148 /* Get the vector def for the reduction variable from the phi node */
3149 reduc_def = PHI_RESULT (new_phi);
3150 first_phi = new_phi;
3152 else
3154 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3155 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3156 if (op_type == ternary_op)
3157 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3159 if (single_defuse_cycle)
3160 reduc_def = gimple_assign_lhs (new_stmt);
3161 else
3162 reduc_def = PHI_RESULT (new_phi);
3164 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3167 /* Arguments are ready. Create the new vector stmt. */
3168 if (op_type == binary_op)
3169 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3170 else
3171 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3172 reduc_def);
3173 new_stmt = gimple_build_assign (vec_dest, expr);
3174 new_temp = make_ssa_name (vec_dest, new_stmt);
3175 gimple_assign_set_lhs (new_stmt, new_temp);
3176 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3178 if (j == 0)
3179 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3180 else
3181 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3182 prev_stmt_info = vinfo_for_stmt (new_stmt);
3183 prev_phi_info = vinfo_for_stmt (new_phi);
3186 /* Finalize the reduction-phi (set its arguments) and create the
3187 epilog reduction code. */
3188 if (!single_defuse_cycle)
3189 new_temp = gimple_assign_lhs (*vec_stmt);
3190 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3191 epilog_reduc_code, first_phi);
3192 return true;
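/* For illustration, a sketch of the transformation above for

     for (i = 0; i < n; i++)
       sum += a[i];

   with VF = nunits = 4 (ncopies = 1):

     loop:
       vsum_0 = PHI <vec_init (preheader), vsum_1 (latch)>
       vx     = <vectorized load of a[i:i+3]>
       vsum_1 = vx + vsum_0;

   The phi arguments and the reduction of vsum_1 to the scalar 'sum' are
   then generated by vect_create_epilog_for_reduction.  */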
3195 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3196 a function declaration if the target has a vectorized version
3197 of the function, or NULL_TREE if the function cannot be vectorized. */
3199 tree
3200 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
3202 tree fndecl = gimple_call_fndecl (call);
3203 enum built_in_function code;
3205 /* We only handle functions that do not read or clobber memory -- i.e.
3206 const or novops ones. */
3207 if (!(gimple_call_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3208 return NULL_TREE;
3210 if (!fndecl
3211 || TREE_CODE (fndecl) != FUNCTION_DECL
3212 || !DECL_BUILT_IN (fndecl))
3213 return NULL_TREE;
3215 code = DECL_FUNCTION_CODE (fndecl);
3216 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3217 vectype_in);
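/* For illustration (hypothetical target): for a call to the sqrt builtin
   with VECTYPE_OUT == VECTYPE_IN == V2DF, the target hook above would
   return the decl of the target's V2DF square-root builtin if it has one,
   and NULL_TREE otherwise, in which case the call is left scalar.  */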
3220 /* Function vectorizable_call.
3222 Check if STMT performs a function call that can be vectorized.
3223 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3224 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3225 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3227 bool
3228 vectorizable_call (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt)
3230 tree vec_dest;
3231 tree scalar_dest;
3232 tree op, type;
3233 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3234 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3235 tree vectype_out, vectype_in;
3236 int nunits_in;
3237 int nunits_out;
3238 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3239 tree fndecl, new_temp, def, rhs_type, lhs_type;
3240 gimple def_stmt;
3241 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3242 gimple new_stmt;
3243 int ncopies, j;
3244 VEC(tree, heap) *vargs = NULL;
3245 enum { NARROW, NONE, WIDEN } modifier;
3246 size_t i, nargs;
3248 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3249 return false;
3251 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3252 return false;
3254 /* FORNOW: SLP not supported. */
3255 if (STMT_SLP_TYPE (stmt_info))
3256 return false;
3258 /* Is STMT a vectorizable call? */
3259 if (!is_gimple_call (stmt))
3260 return false;
3262 if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3263 return false;
3265 /* Process function arguments. */
3266 rhs_type = NULL_TREE;
3267 nargs = gimple_call_num_args (stmt);
3269 /* Bail out if the function has more than two arguments; we
3270 do not have interesting builtin functions to vectorize with
3271 more than two arguments. A call with no arguments is not handled either. */
3272 if (nargs == 0 || nargs > 2)
3273 return false;
3275 for (i = 0; i < nargs; i++)
3277 op = gimple_call_arg (stmt, i);
3279 /* We can only handle calls with arguments of the same type. */
3280 if (rhs_type
3281 && rhs_type != TREE_TYPE (op))
3283 if (vect_print_dump_info (REPORT_DETAILS))
3284 fprintf (vect_dump, "argument types differ.");
3285 return false;
3287 rhs_type = TREE_TYPE (op);
3289 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[i]))
3291 if (vect_print_dump_info (REPORT_DETAILS))
3292 fprintf (vect_dump, "use not simple.");
3293 return false;
3297 vectype_in = get_vectype_for_scalar_type (rhs_type);
3298 if (!vectype_in)
3299 return false;
3300 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3302 lhs_type = TREE_TYPE (gimple_call_lhs (stmt));
3303 vectype_out = get_vectype_for_scalar_type (lhs_type);
3304 if (!vectype_out)
3305 return false;
3306 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3308 /* FORNOW */
3309 if (nunits_in == nunits_out / 2)
3310 modifier = NARROW;
3311 else if (nunits_out == nunits_in)
3312 modifier = NONE;
3313 else if (nunits_out == nunits_in / 2)
3314 modifier = WIDEN;
3315 else
3316 return false;
3318 /* For now, we only vectorize functions if a target specific builtin
3319 is available. TODO -- in some cases, it might be profitable to
3320 insert the calls for pieces of the vector, in order to be able
3321 to vectorize other operations in the loop. */
3322 fndecl = vectorizable_function (stmt, vectype_out, vectype_in);
3323 if (fndecl == NULL_TREE)
3325 if (vect_print_dump_info (REPORT_DETAILS))
3326 fprintf (vect_dump, "function is not vectorizable.");
3328 return false;
3331 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3333 if (modifier == NARROW)
3334 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3335 else
3336 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3338 /* Sanity check: make sure that at least one copy of the vectorized stmt
3339 needs to be generated. */
3340 gcc_assert (ncopies >= 1);
3342 if (!vec_stmt) /* transformation not required. */
3344 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3345 if (vect_print_dump_info (REPORT_DETAILS))
3346 fprintf (vect_dump, "=== vectorizable_call ===");
3347 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3348 return true;
3351 /** Transform. **/
3353 if (vect_print_dump_info (REPORT_DETAILS))
3354 fprintf (vect_dump, "transform operation.");
3356 /* Handle def. */
3357 scalar_dest = gimple_call_lhs (stmt);
3358 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3360 prev_stmt_info = NULL;
3361 switch (modifier)
3363 case NONE:
3364 for (j = 0; j < ncopies; ++j)
3366 /* Build argument list for the vectorized call. */
3367 if (j == 0)
3368 vargs = VEC_alloc (tree, heap, nargs);
3369 else
3370 VEC_truncate (tree, vargs, 0);
3372 for (i = 0; i < nargs; i++)
3374 op = gimple_call_arg (stmt, i);
3375 if (j == 0)
3376 vec_oprnd0
3377 = vect_get_vec_def_for_operand (op, stmt, NULL);
3378 else
3379 vec_oprnd0
3380 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3382 VEC_quick_push (tree, vargs, vec_oprnd0);
3385 new_stmt = gimple_build_call_vec (fndecl, vargs);
3386 new_temp = make_ssa_name (vec_dest, new_stmt);
3387 gimple_call_set_lhs (new_stmt, new_temp);
3389 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3391 if (j == 0)
3392 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3393 else
3394 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3396 prev_stmt_info = vinfo_for_stmt (new_stmt);
3399 break;
3401 case NARROW:
3402 for (j = 0; j < ncopies; ++j)
3404 /* Build argument list for the vectorized call. */
3405 if (j == 0)
3406 vargs = VEC_alloc (tree, heap, nargs * 2);
3407 else
3408 VEC_truncate (tree, vargs, 0);
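/* In the NARROW case one vector result covers two input vectors, so two
   vector defs are collected below for each scalar argument.  */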
3410 for (i = 0; i < nargs; i++)
3412 op = gimple_call_arg (stmt, i);
3413 if (j == 0)
3415 vec_oprnd0
3416 = vect_get_vec_def_for_operand (op, stmt, NULL);
3417 vec_oprnd1
3418 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3420 else
3422 vec_oprnd0
3423 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3424 vec_oprnd1
3425 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3428 VEC_quick_push (tree, vargs, vec_oprnd0);
3429 VEC_quick_push (tree, vargs, vec_oprnd1);
3432 new_stmt = gimple_build_call_vec (fndecl, vargs);
3433 new_temp = make_ssa_name (vec_dest, new_stmt);
3434 gimple_call_set_lhs (new_stmt, new_temp);
3436 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3438 if (j == 0)
3439 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3440 else
3441 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3443 prev_stmt_info = vinfo_for_stmt (new_stmt);
3446 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3448 break;
3450 case WIDEN:
3451 /* No current target implements this case. */
3452 return false;
3455 VEC_free (tree, heap, vargs);
3457 /* The call in STMT might prevent it from being removed in dce.
3458 However, we cannot remove it here, due to the way the ssa name
3459 it defines is mapped to the new definition. So just replace
3460 the rhs of the statement with something harmless. */
3462 type = TREE_TYPE (scalar_dest);
3463 new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
3464 fold_convert (type, integer_zero_node));
3465 set_vinfo_for_stmt (new_stmt, stmt_info);
3466 set_vinfo_for_stmt (stmt, NULL);
3467 STMT_VINFO_STMT (stmt_info) = new_stmt;
3468 gsi_replace (gsi, new_stmt, false);
3469 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3471 return true;
3475 /* Function vect_gen_widened_results_half
3477 Create a vector stmt whose code is CODE, whose number of arguments is
3478 given by OP_TYPE, and whose result variable is VEC_DEST; its arguments
3479 are VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3480 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3481 needs to be created (DECL is a function-decl of a target-builtin).
3482 STMT is the original scalar stmt that we are vectorizing. */
3484 static gimple
3485 vect_gen_widened_results_half (enum tree_code code,
3486 tree decl,
3487 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3488 tree vec_dest, gimple_stmt_iterator *gsi,
3489 gimple stmt)
3491 gimple new_stmt;
3492 tree new_temp;
3493 tree sym;
3494 ssa_op_iter iter;
3496 /* Generate half of the widened result: */
3497 if (code == CALL_EXPR)
3499 /* Target specific support */
3500 if (op_type == binary_op)
3501 new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1);
3502 else
3503 new_stmt = gimple_build_call (decl, 1, vec_oprnd0);
3504 new_temp = make_ssa_name (vec_dest, new_stmt);
3505 gimple_call_set_lhs (new_stmt, new_temp);
3507 else
3509 /* Generic support */
3510 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3511 if (op_type != binary_op)
3512 vec_oprnd1 = NULL;
3513 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
3514 vec_oprnd1);
3515 new_temp = make_ssa_name (vec_dest, new_stmt);
3516 gimple_assign_set_lhs (new_stmt, new_temp);
3518 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3520 if (code == CALL_EXPR)
3522 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3524 if (TREE_CODE (sym) == SSA_NAME)
3525 sym = SSA_NAME_VAR (sym);
3526 mark_sym_for_renaming (sym);
3530 return new_stmt;
3534 /* Check if STMT performs a conversion operation that can be vectorized.
3535 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3536 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3537 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3539 bool
3540 vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
3541 gimple *vec_stmt, slp_tree slp_node)
3543 tree vec_dest;
3544 tree scalar_dest;
3545 tree op0;
3546 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3547 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3548 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3549 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3550 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3551 tree new_temp;
3552 tree def;
3553 gimple def_stmt;
3554 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3555 gimple new_stmt = NULL;
3556 stmt_vec_info prev_stmt_info;
3557 int nunits_in;
3558 int nunits_out;
3559 tree vectype_out, vectype_in;
3560 int ncopies, j;
3561 tree expr;
3562 tree rhs_type, lhs_type;
3563 tree builtin_decl;
3564 enum { NARROW, NONE, WIDEN } modifier;
3565 int i;
3566 VEC(tree,heap) *vec_oprnds0 = NULL;
3567 tree vop0;
3568 tree integral_type;
3569 VEC(tree,heap) *dummy = NULL;
3570 int dummy_int;
3572 /* Is STMT a vectorizable conversion? */
3574 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3575 return false;
3577 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3578 return false;
3580 if (!is_gimple_assign (stmt))
3581 return false;
3583 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3584 return false;
3586 code = gimple_assign_rhs_code (stmt);
3587 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3588 return false;
3590 /* Check types of lhs and rhs. */
3591 op0 = gimple_assign_rhs1 (stmt);
3592 rhs_type = TREE_TYPE (op0);
3593 vectype_in = get_vectype_for_scalar_type (rhs_type);
3594 if (!vectype_in)
3595 return false;
3596 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3598 scalar_dest = gimple_assign_lhs (stmt);
3599 lhs_type = TREE_TYPE (scalar_dest);
3600 vectype_out = get_vectype_for_scalar_type (lhs_type);
3601 if (!vectype_out)
3602 return false;
3603 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3605 /* FORNOW */
3606 if (nunits_in == nunits_out / 2)
3607 modifier = NARROW;
3608 else if (nunits_out == nunits_in)
3609 modifier = NONE;
3610 else if (nunits_out == nunits_in / 2)
3611 modifier = WIDEN;
3612 else
3613 return false;
3615 if (modifier == NONE)
3616 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3618 /* Bail out if the types are both integral or both non-integral. */
3619 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3620 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3621 return false;
3623 integral_type = INTEGRAL_TYPE_P (rhs_type) ? vectype_in : vectype_out;
3625 if (modifier == NARROW)
3626 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3627 else
3628 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3630 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3631 this, so we can safely override NCOPIES with 1 here. */
3632 if (slp_node)
3633 ncopies = 1;
3635 /* Sanity check: make sure that at least one copy of the vectorized stmt
3636 needs to be generated. */
3637 gcc_assert (ncopies >= 1);
3639 /* Check the operands of the operation. */
3640 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3642 if (vect_print_dump_info (REPORT_DETAILS))
3643 fprintf (vect_dump, "use not simple.");
3644 return false;
3647 /* Supportable by target? */
3648 if ((modifier == NONE
3649 && !targetm.vectorize.builtin_conversion (code, integral_type))
3650 || (modifier == WIDEN
3651 && !supportable_widening_operation (code, stmt, vectype_in,
3652 &decl1, &decl2,
3653 &code1, &code2,
3654 &dummy_int, &dummy))
3655 || (modifier == NARROW
3656 && !supportable_narrowing_operation (code, stmt, vectype_in,
3657 &code1, &dummy_int, &dummy)))
3659 if (vect_print_dump_info (REPORT_DETAILS))
3660 fprintf (vect_dump, "conversion not supported by target.");
3661 return false;
3664 if (modifier != NONE)
3666 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3667 /* FORNOW: SLP not supported. */
3668 if (STMT_SLP_TYPE (stmt_info))
3669 return false;
3672 if (!vec_stmt) /* transformation not required. */
3674 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3675 return true;
3678 /** Transform. **/
3679 if (vect_print_dump_info (REPORT_DETAILS))
3680 fprintf (vect_dump, "transform conversion.");
3682 /* Handle def. */
3683 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3685 if (modifier == NONE && !slp_node)
3686 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3688 prev_stmt_info = NULL;
3689 switch (modifier)
3691 case NONE:
3692 for (j = 0; j < ncopies; j++)
3694 tree sym;
3695 ssa_op_iter iter;
3697 if (j == 0)
3698 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3699 else
3700 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3702 builtin_decl =
3703 targetm.vectorize.builtin_conversion (code, integral_type);
3704 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3706 /* Arguments are ready. Create the new vector stmt. */
3707 new_stmt = gimple_build_call (builtin_decl, 1, vop0);
3708 new_temp = make_ssa_name (vec_dest, new_stmt);
3709 gimple_call_set_lhs (new_stmt, new_temp);
3710 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3711 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3712 SSA_OP_ALL_VIRTUALS)
3714 if (TREE_CODE (sym) == SSA_NAME)
3715 sym = SSA_NAME_VAR (sym);
3716 mark_sym_for_renaming (sym);
3718 if (slp_node)
3719 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3722 if (j == 0)
3723 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3724 else
3725 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3726 prev_stmt_info = vinfo_for_stmt (new_stmt);
3728 break;
3730 case WIDEN:
3731 /* In case the vectorization factor (VF) is bigger than the number
3732 of elements that we can fit in a vectype (nunits), we have to
3733 generate more than one vector stmt - i.e - we need to "unroll"
3734 the vector stmt by a factor VF/nunits. */
3735 for (j = 0; j < ncopies; j++)
3737 if (j == 0)
3738 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3739 else
3740 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3742 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3744 /* Generate first half of the widened result: */
3745 new_stmt
3746 = vect_gen_widened_results_half (code1, decl1,
3747 vec_oprnd0, vec_oprnd1,
3748 unary_op, vec_dest, gsi, stmt);
3749 if (j == 0)
3750 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3751 else
3752 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3753 prev_stmt_info = vinfo_for_stmt (new_stmt);
3755 /* Generate second half of the widened result: */
3756 new_stmt
3757 = vect_gen_widened_results_half (code2, decl2,
3758 vec_oprnd0, vec_oprnd1,
3759 unary_op, vec_dest, gsi, stmt);
3760 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3761 prev_stmt_info = vinfo_for_stmt (new_stmt);
3763 break;
3765 case NARROW:
3766 /* In case the vectorization factor (VF) is bigger than the number
3767 of elements that we can fit in a vectype (nunits), we have to
3768 generate more than one vector stmt - i.e - we need to "unroll"
3769 the vector stmt by a factor VF/nunits. */
3770 for (j = 0; j < ncopies; j++)
3772 /* Handle uses. */
3773 if (j == 0)
3775 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3776 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3778 else
3780 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3781 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3784 /* Arguments are ready. Create the new vector stmt. */
3785 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3786 new_stmt = gimple_build_assign_with_ops (code1, vec_dest, vec_oprnd0,
3787 vec_oprnd1);
3788 new_temp = make_ssa_name (vec_dest, new_stmt);
3789 gimple_assign_set_lhs (new_stmt, new_temp);
3790 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3792 if (j == 0)
3793 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3794 else
3795 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3797 prev_stmt_info = vinfo_for_stmt (new_stmt);
3800 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3803 if (vec_oprnds0)
3804 VEC_free (tree, heap, vec_oprnds0);
3806 return true;
3810 /* Function vectorizable_assignment.
3812 Check if STMT performs an assignment (copy) that can be vectorized.
3813 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3814 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3815 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3817 bool
3818 vectorizable_assignment (gimple stmt, gimple_stmt_iterator *gsi,
3819 gimple *vec_stmt, slp_tree slp_node)
3821 tree vec_dest;
3822 tree scalar_dest;
3823 tree op;
3824 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3825 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3827 tree new_temp;
3828 tree def;
3829 gimple def_stmt;
3830 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3831 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 int ncopies;
3833 int i;
3834 VEC(tree,heap) *vec_oprnds = NULL;
3835 tree vop;
3837 /* Multiple types in SLP are handled by creating the appropriate number of
3838 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
3839 case of SLP. */
3840 if (slp_node)
3841 ncopies = 1;
3842 else
3843 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3845 gcc_assert (ncopies >= 1);
3846 if (ncopies > 1)
3847 return false; /* FORNOW */
3849 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3850 return false;
3852 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3853 return false;
3855 /* Is vectorizable assignment? */
3856 if (!is_gimple_assign (stmt))
3857 return false;
3859 scalar_dest = gimple_assign_lhs (stmt);
3860 if (TREE_CODE (scalar_dest) != SSA_NAME)
3861 return false;
3863 if (gimple_assign_single_p (stmt)
3864 || gimple_assign_rhs_code (stmt) == PAREN_EXPR)
3865 op = gimple_assign_rhs1 (stmt);
3866 else
3867 return false;
3869 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3871 if (vect_print_dump_info (REPORT_DETAILS))
3872 fprintf (vect_dump, "use not simple.");
3873 return false;
3876 if (!vec_stmt) /* transformation not required. */
3878 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3879 if (vect_print_dump_info (REPORT_DETAILS))
3880 fprintf (vect_dump, "=== vectorizable_assignment ===");
3881 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3882 return true;
3885 /** Transform. **/
3886 if (vect_print_dump_info (REPORT_DETAILS))
3887 fprintf (vect_dump, "transform assignment.");
3889 /* Handle def. */
3890 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3892 /* Handle use. */
3893 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3895 /* Arguments are ready. Create the new vector stmt. */
3896 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3898 *vec_stmt = gimple_build_assign (vec_dest, vop);
3899 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3900 gimple_assign_set_lhs (*vec_stmt, new_temp);
3901 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
3902 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3904 if (slp_node)
3905 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3908 VEC_free (tree, heap, vec_oprnds);
3909 return true;
3913 /* Function vect_min_worthwhile_factor.
3915 For a loop where we could vectorize the operation indicated by CODE,
3916 return the minimum vectorization factor that makes it worthwhile
3917 to use generic vectors. */
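/* I.e., falling back to word-mode (generic) vectors for PLUS_EXPR,
   MINUS_EXPR or NEGATE_EXPR is only considered worthwhile with a
   vectorization factor of at least four, whereas the bitwise operations
   below already pay off at a factor of two.  */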
3918 static int
3919 vect_min_worthwhile_factor (enum tree_code code)
3921 switch (code)
3923 case PLUS_EXPR:
3924 case MINUS_EXPR:
3925 case NEGATE_EXPR:
3926 return 4;
3928 case BIT_AND_EXPR:
3929 case BIT_IOR_EXPR:
3930 case BIT_XOR_EXPR:
3931 case BIT_NOT_EXPR:
3932 return 2;
3934 default:
3935 return INT_MAX;
3940 /* Function vectorizable_induction
3942 Check if PHI performs an induction computation that can be vectorized.
3943 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3944 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3945 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3947 bool
3948 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3949 gimple *vec_stmt)
3951 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3953 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3954 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3955 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3956 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3957 tree vec_def;
3959 gcc_assert (ncopies >= 1);
3960 /* FORNOW. This restriction should be relaxed. */
3961 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3963 if (vect_print_dump_info (REPORT_DETAILS))
3964 fprintf (vect_dump, "multiple types in nested loop.");
3965 return false;
3968 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3969 return false;
3971 /* FORNOW: SLP not supported. */
3972 if (STMT_SLP_TYPE (stmt_info))
3973 return false;
3975 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3977 if (gimple_code (phi) != GIMPLE_PHI)
3978 return false;
3980 if (!vec_stmt) /* transformation not required. */
3982 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3983 if (vect_print_dump_info (REPORT_DETAILS))
3984 fprintf (vect_dump, "=== vectorizable_induction ===");
3985 vect_model_induction_cost (stmt_info, ncopies);
3986 return true;
3989 /** Transform. **/
3991 if (vect_print_dump_info (REPORT_DETAILS))
3992 fprintf (vect_dump, "transform induction phi.");
3994 vec_def = get_initial_def_for_induction (phi);
3995 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3996 return true;
4000 /* Function vectorizable_operation.
4002 Check if STMT performs a binary or unary operation that can be vectorized.
4003 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4004 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4005 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4007 bool
4008 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
4009 gimple *vec_stmt, slp_tree slp_node)
4011 tree vec_dest;
4012 tree scalar_dest;
4013 tree op0, op1 = NULL;
4014 tree vec_oprnd1 = NULL_TREE;
4015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4016 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4018 enum tree_code code;
4019 enum machine_mode vec_mode;
4020 tree new_temp;
4021 int op_type;
4022 optab optab;
4023 int icode;
4024 enum machine_mode optab_op2_mode;
4025 tree def;
4026 gimple def_stmt;
4027 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4028 gimple new_stmt = NULL;
4029 stmt_vec_info prev_stmt_info;
4030 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
4031 int nunits_out;
4032 tree vectype_out;
4033 int ncopies;
4034 int j, i;
4035 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4036 tree vop0, vop1;
4037 unsigned int k;
4038 bool shift_p = false;
4039 bool scalar_shift_arg = false;
4041 /* Multiple types in SLP are handled by creating the appropriate number of
4042 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4043 case of SLP. */
4044 if (slp_node)
4045 ncopies = 1;
4046 else
4047 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4049 gcc_assert (ncopies >= 1);
4051 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4052 return false;
4054 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4055 return false;
4057 /* Is STMT a vectorizable binary/unary operation? */
4058 if (!is_gimple_assign (stmt))
4059 return false;
4061 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4062 return false;
4064 scalar_dest = gimple_assign_lhs (stmt);
4065 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4066 if (!vectype_out)
4067 return false;
4068 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4069 if (nunits_out != nunits_in)
4070 return false;
4072 code = gimple_assign_rhs_code (stmt);
4074 /* For pointer addition, we should use the normal plus for
4075 the vector addition. */
4076 if (code == POINTER_PLUS_EXPR)
4077 code = PLUS_EXPR;
4079 /* Support only unary or binary operations. */
4080 op_type = TREE_CODE_LENGTH (code);
4081 if (op_type != unary_op && op_type != binary_op)
4083 if (vect_print_dump_info (REPORT_DETAILS))
4084 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
4085 return false;
4088 op0 = gimple_assign_rhs1 (stmt);
4089 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4091 if (vect_print_dump_info (REPORT_DETAILS))
4092 fprintf (vect_dump, "use not simple.");
4093 return false;
4096 if (op_type == binary_op)
4098 op1 = gimple_assign_rhs2 (stmt);
4099 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4101 if (vect_print_dump_info (REPORT_DETAILS))
4102 fprintf (vect_dump, "use not simple.");
4103 return false;
4107 /* If this is a shift/rotate, determine whether the shift amount is a vector
4108 or a scalar. If the shift/rotate amount is a vector, use the vector/vector
4109 shift optabs. */
4110 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
4111 || code == RROTATE_EXPR)
4113 shift_p = true;
4115 /* vector shifted by vector */
4116 if (dt[1] == vect_loop_def)
4118 optab = optab_for_tree_code (code, vectype, optab_vector);
4119 if (vect_print_dump_info (REPORT_DETAILS))
4120 fprintf (vect_dump, "vector/vector shift/rotate found.");
4123 /* See if the machine has a vector shifted by scalar insn and if not
4124 then see if it has a vector shifted by vector insn. */
4125 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
4127 optab = optab_for_tree_code (code, vectype, optab_scalar);
4128 if (optab
4129 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4130 != CODE_FOR_nothing))
4132 scalar_shift_arg = true;
4133 if (vect_print_dump_info (REPORT_DETAILS))
4134 fprintf (vect_dump, "vector/scalar shift/rotate found.");
4136 else
4138 optab = optab_for_tree_code (code, vectype, optab_vector);
4139 if (vect_print_dump_info (REPORT_DETAILS)
4140 && optab
4141 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
4142 != CODE_FOR_nothing))
4143 fprintf (vect_dump, "vector/vector shift/rotate found.");
4147 else
4149 if (vect_print_dump_info (REPORT_DETAILS))
4150 fprintf (vect_dump, "operand mode requires invariant argument.");
4151 return false;
4154 else
4155 optab = optab_for_tree_code (code, vectype, optab_default);
4157 /* Supportable by target? */
4158 if (!optab)
4160 if (vect_print_dump_info (REPORT_DETAILS))
4161 fprintf (vect_dump, "no optab.");
4162 return false;
4164 vec_mode = TYPE_MODE (vectype);
4165 icode = (int) optab_handler (optab, vec_mode)->insn_code;
4166 if (icode == CODE_FOR_nothing)
4168 if (vect_print_dump_info (REPORT_DETAILS))
4169 fprintf (vect_dump, "op not supported by target.");
4170 /* Check only during analysis. */
4171 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4172 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4173 < vect_min_worthwhile_factor (code)
4174 && !vec_stmt))
4175 return false;
4176 if (vect_print_dump_info (REPORT_DETAILS))
4177 fprintf (vect_dump, "proceeding using word mode.");
4180 /* Worthwhile without SIMD support? Check only during analysis. */
4181 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4182 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4183 < vect_min_worthwhile_factor (code)
4184 && !vec_stmt)
4186 if (vect_print_dump_info (REPORT_DETAILS))
4187 fprintf (vect_dump, "not worthwhile without SIMD support.");
4188 return false;
4191 if (!vec_stmt) /* transformation not required. */
4193 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4194 if (vect_print_dump_info (REPORT_DETAILS))
4195 fprintf (vect_dump, "=== vectorizable_operation ===");
4196 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4197 return true;
4200 /** Transform. **/
4202 if (vect_print_dump_info (REPORT_DETAILS))
4203 fprintf (vect_dump, "transform binary/unary operation.");
4205 /* Handle def. */
4206 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4208 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4209 created in the previous stages of the recursion, so no allocation is
4210 needed, except for the case of shift with scalar shift argument. In that
4211 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4212 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4213 In case of loop-based vectorization we allocate VECs of size 1. We
4214 allocate VEC_OPRNDS1 only in case of binary operation. */
4215 if (!slp_node)
4217 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4218 if (op_type == binary_op)
4219 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4221 else if (scalar_shift_arg)
4222 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4224 /* In case the vectorization factor (VF) is bigger than the number
4225 of elements that we can fit in a vectype (nunits), we have to generate
4226 more than one vector stmt - i.e - we need to "unroll" the
4227 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4228 from one copy of the vector stmt to the next, in the field
4229 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4230 stages to find the correct vector defs to be used when vectorizing
4231 stmts that use the defs of the current stmt. The example below illustrates
4232 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4233 4 vectorized stmts):
4235 before vectorization:
4236 RELATED_STMT VEC_STMT
4237 S1: x = memref - -
4238 S2: z = x + 1 - -
4240 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4241 there):
4242 RELATED_STMT VEC_STMT
4243 VS1_0: vx0 = memref0 VS1_1 -
4244 VS1_1: vx1 = memref1 VS1_2 -
4245 VS1_2: vx2 = memref2 VS1_3 -
4246 VS1_3: vx3 = memref3 - -
4247 S1: x = load - VS1_0
4248 S2: z = x + 1 - -
4250 step2: vectorize stmt S2 (done here):
4251 To vectorize stmt S2 we first need to find the relevant vector
4252 def for the first operand 'x'. This is, as usual, obtained from
4253 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4254 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4255 relevant vector def 'vx0'. Having found 'vx0' we can generate
4256 the vector stmt VS2_0, and as usual, record it in the
4257 STMT_VINFO_VEC_STMT of stmt S2.
4258 When creating the second copy (VS2_1), we obtain the relevant vector
4259 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4260 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4261 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4262 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4263 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4264 chain of stmts and pointers:
4265 RELATED_STMT VEC_STMT
4266 VS1_0: vx0 = memref0 VS1_1 -
4267 VS1_1: vx1 = memref1 VS1_2 -
4268 VS1_2: vx2 = memref2 VS1_3 -
4269 VS1_3: vx3 = memref3 - -
4270 S1: x = load - VS1_0
4271 VS2_0: vz0 = vx0 + v1 VS2_1 -
4272 VS2_1: vz1 = vx1 + v1 VS2_2 -
4273 VS2_2: vz2 = vx2 + v1 VS2_3 -
4274 VS2_3: vz3 = vx3 + v1 - -
4275 S2: z = x + 1 - VS2_0 */
4277 prev_stmt_info = NULL;
4278 for (j = 0; j < ncopies; j++)
4280 /* Handle uses. */
4281 if (j == 0)
4283 if (op_type == binary_op && scalar_shift_arg)
4285 /* Vector shl and shr insn patterns can be defined with scalar
4286 operand 2 (shift operand). In this case, use constant or loop
4287 invariant op1 directly, without extending it to vector mode
4288 first. */
4289 optab_op2_mode = insn_data[icode].operand[2].mode;
4290 if (!VECTOR_MODE_P (optab_op2_mode))
4292 if (vect_print_dump_info (REPORT_DETAILS))
4293 fprintf (vect_dump, "operand 1 using scalar mode.");
4294 vec_oprnd1 = op1;
4295 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4296 if (slp_node)
4298 /* Store vec_oprnd1 for every vector stmt to be created
4299 for SLP_NODE. We check during the analysis that all the
4300 shift arguments are the same.
4301 TODO: Allow different constants for different vector
4302 stmts generated for an SLP instance. */
4303 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4304 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4309 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4310 (a special case for certain kinds of vector shifts); otherwise,
4311 operand 1 should be of a vector type (the usual case). */
4312 if (op_type == binary_op && !vec_oprnd1)
4313 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4314 slp_node);
4315 else
4316 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4317 slp_node);
4319 else
4320 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4322 /* Arguments are ready. Create the new vector stmt. */
4323 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4325 vop1 = ((op_type == binary_op)
4326 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4327 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4328 new_temp = make_ssa_name (vec_dest, new_stmt);
4329 gimple_assign_set_lhs (new_stmt, new_temp);
4330 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4331 if (slp_node)
4332 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4335 if (slp_node)
4336 continue;
4338 if (j == 0)
4339 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4340 else
4341 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4342 prev_stmt_info = vinfo_for_stmt (new_stmt);
4345 VEC_free (tree, heap, vec_oprnds0);
4346 if (vec_oprnds1)
4347 VEC_free (tree, heap, vec_oprnds1);
4349 return true;
4353 /* Get vectorized definitions for loop-based vectorization. For the first
4354 operand we call vect_get_vec_def_for_operand() (with OPRND containing
4355 scalar operand), and for the rest we get a copy with
4356 vect_get_vec_def_for_stmt_copy() using the previous vector definition
4357 (stored in OPRND). See vect_get_vec_def_for_stmt_copy() for details.
4358 The vectors are collected into VEC_OPRNDS. */
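/* For example, a call with MULTI_STEP_CVT == 1 pushes two vector defs
   itself and then recurses once more, so four vector defs are collected
   in total.  */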
4360 static void
4361 vect_get_loop_based_defs (tree *oprnd, gimple stmt, enum vect_def_type dt,
4362 VEC (tree, heap) **vec_oprnds, int multi_step_cvt)
4364 tree vec_oprnd;
4366 /* Get first vector operand. */
4367 /* All the vector operands except the very first one (that is the scalar
4368 operand) are stmt copies. */
4369 if (TREE_CODE (TREE_TYPE (*oprnd)) != VECTOR_TYPE)
4370 vec_oprnd = vect_get_vec_def_for_operand (*oprnd, stmt, NULL);
4371 else
4372 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, *oprnd);
4374 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4376 /* Get second vector operand. */
4377 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
4378 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
4380 *oprnd = vec_oprnd;
4382 /* For conversion in multiple steps, continue to get operands
4383 recursively. */
4384 if (multi_step_cvt)
4385 vect_get_loop_based_defs (oprnd, stmt, dt, vec_oprnds, multi_step_cvt - 1);
4389 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
4390 For multi-step conversions store the resulting vectors and call the function
4391 recursively. */
4393 static void
4394 vect_create_vectorized_demotion_stmts (VEC (tree, heap) **vec_oprnds,
4395 int multi_step_cvt, gimple stmt,
4396 VEC (tree, heap) *vec_dsts,
4397 gimple_stmt_iterator *gsi,
4398 slp_tree slp_node, enum tree_code code,
4399 stmt_vec_info *prev_stmt_info)
4401 unsigned int i;
4402 tree vop0, vop1, new_tmp, vec_dest;
4403 gimple new_stmt;
4404 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4406 vec_dest = VEC_pop (tree, vec_dsts);
4408 for (i = 0; i < VEC_length (tree, *vec_oprnds); i += 2)
4410 /* Create demotion operation. */
4411 vop0 = VEC_index (tree, *vec_oprnds, i);
4412 vop1 = VEC_index (tree, *vec_oprnds, i + 1);
4413 new_stmt = gimple_build_assign_with_ops (code, vec_dest, vop0, vop1);
4414 new_tmp = make_ssa_name (vec_dest, new_stmt);
4415 gimple_assign_set_lhs (new_stmt, new_tmp);
4416 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4418 if (multi_step_cvt)
4419 /* Store the resulting vector for next recursive call. */
4420 VEC_replace (tree, *vec_oprnds, i/2, new_tmp);
4421 else
4423 /* This is the last step of the conversion sequence. Store the
4424 vectors in SLP_NODE or in the vector info of the scalar statement
4425 (or in the STMT_VINFO_RELATED_STMT chain). */
4426 if (slp_node)
4427 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4428 else
4430 if (!*prev_stmt_info)
4431 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4432 else
4433 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt;
4435 *prev_stmt_info = vinfo_for_stmt (new_stmt);
4440 /* For multi-step demotion operations we first generate demotion operations
4441 from the source type to the intermediate types, and then combine the
4442 results (stored in VEC_OPRNDS) in a demotion operation to the destination
4443 type. */
4444 if (multi_step_cvt)
4446 /* At each level of recursion we have half of the operands we had at the
4447 previous level. */
4448 VEC_truncate (tree, *vec_oprnds, (i+1)/2);
4449 vect_create_vectorized_demotion_stmts (vec_oprnds, multi_step_cvt - 1,
4450 stmt, vec_dsts, gsi, slp_node,
4451 code, prev_stmt_info);
4456 /* Function vectorizable_type_demotion
4458 Check if STMT performs a binary or unary operation that involves
4459 type demotion, and if it can be vectorized.
4460 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4461 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4462 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4464 bool
4465 vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
4466 gimple *vec_stmt, slp_tree slp_node)
4468 tree vec_dest;
4469 tree scalar_dest;
4470 tree op0;
4471 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4472 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4473 enum tree_code code, code1 = ERROR_MARK;
4474 tree def;
4475 gimple def_stmt;
4476 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4477 stmt_vec_info prev_stmt_info;
4478 int nunits_in;
4479 int nunits_out;
4480 tree vectype_out;
4481 int ncopies;
4482 int j, i;
4483 tree vectype_in;
4484 int multi_step_cvt = 0;
4485 VEC (tree, heap) *vec_oprnds0 = NULL;
4486 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4487 tree last_oprnd, intermediate_type;
4489 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4490 return false;
4492 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4493 return false;
4495 /* Is STMT a vectorizable type-demotion operation? */
4496 if (!is_gimple_assign (stmt))
4497 return false;
4499 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4500 return false;
4502 code = gimple_assign_rhs_code (stmt);
4503 if (!CONVERT_EXPR_CODE_P (code))
4504 return false;
4506 op0 = gimple_assign_rhs1 (stmt);
4507 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4508 if (!vectype_in)
4509 return false;
4510 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4512 scalar_dest = gimple_assign_lhs (stmt);
4513 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4514 if (!vectype_out)
4515 return false;
4516 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4517 if (nunits_in >= nunits_out)
4518 return false;
4520 /* Multiple types in SLP are handled by creating the appropriate number of
4521 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4522 case of SLP. */
4523 if (slp_node)
4524 ncopies = 1;
4525 else
4526 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4528 gcc_assert (ncopies >= 1);
4530 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4531 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4532 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4533 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4534 && CONVERT_EXPR_CODE_P (code))))
4535 return false;
4537 /* Check the operands of the operation. */
4538 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4540 if (vect_print_dump_info (REPORT_DETAILS))
4541 fprintf (vect_dump, "use not simple.");
4542 return false;
4545 /* Supportable by target? */
4546 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1,
4547 &multi_step_cvt, &interm_types))
4548 return false;
4550 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4552 if (!vec_stmt) /* transformation not required. */
4554 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4555 if (vect_print_dump_info (REPORT_DETAILS))
4556 fprintf (vect_dump, "=== vectorizable_demotion ===");
4557 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4558 return true;
4561 /** Transform. **/
4562 if (vect_print_dump_info (REPORT_DETAILS))
4563 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4564 ncopies);
4566 /* In case of multi-step demotion, we first generate demotion operations to
4567 the intermediate types, and then from those types to the final one.
4568 We create vector destinations for the intermediate types (TYPES) received
4569 from supportable_narrowing_operation, and store them in the correct order
4570 for future use in vect_create_vectorized_demotion_stmts(). */
4571 if (multi_step_cvt)
4572 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4573 else
4574 vec_dsts = VEC_alloc (tree, heap, 1);
4576 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4577 VEC_quick_push (tree, vec_dsts, vec_dest);
4579 if (multi_step_cvt)
4581 for (i = VEC_length (tree, interm_types) - 1;
4582 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4584 vec_dest = vect_create_destination_var (scalar_dest,
4585 intermediate_type);
4586 VEC_quick_push (tree, vec_dsts, vec_dest);
4590 /* In case the vectorization factor (VF) is bigger than the number
4591 of elements that we can fit in a vectype (nunits), we have to generate
4592 more than one vector stmt - i.e - we need to "unroll" the
4593 vector stmt by a factor VF/nunits. */
4594 last_oprnd = op0;
4595 prev_stmt_info = NULL;
4596 for (j = 0; j < ncopies; j++)
4598 /* Handle uses. */
4599 if (slp_node)
4600 vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
4601 else
4603 VEC_free (tree, heap, vec_oprnds0);
4604 vec_oprnds0 = VEC_alloc (tree, heap,
4605 (multi_step_cvt ? vect_pow2 (multi_step_cvt) * 2 : 2));
4606 vect_get_loop_based_defs (&last_oprnd, stmt, dt[0], &vec_oprnds0,
4607 vect_pow2 (multi_step_cvt) - 1);
4610 /* Arguments are ready. Create the new vector stmts. */
4611 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4612 vect_create_vectorized_demotion_stmts (&vec_oprnds0,
4613 multi_step_cvt, stmt, tmp_vec_dsts,
4614 gsi, slp_node, code1,
4615 &prev_stmt_info);
4618 VEC_free (tree, heap, vec_oprnds0);
4619 VEC_free (tree, heap, vec_dsts);
4620 VEC_free (tree, heap, tmp_vec_dsts);
4621 VEC_free (tree, heap, interm_types);
4623 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4624 return true;
4628 /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
4629 and VEC_OPRNDS1 (for binary operations). For multi-step conversions store
4630 the resulting vectors and call the function recursively. */
4632 static void
4633 vect_create_vectorized_promotion_stmts (VEC (tree, heap) **vec_oprnds0,
4634 VEC (tree, heap) **vec_oprnds1,
4635 int multi_step_cvt, gimple stmt,
4636 VEC (tree, heap) *vec_dsts,
4637 gimple_stmt_iterator *gsi,
4638 slp_tree slp_node, enum tree_code code1,
4639 enum tree_code code2, tree decl1,
4640 tree decl2, int op_type,
4641 stmt_vec_info *prev_stmt_info)
4643 int i;
4644 tree vop0, vop1, new_tmp1, new_tmp2, vec_dest;
4645 gimple new_stmt1, new_stmt2;
4646 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4647 VEC (tree, heap) *vec_tmp;
4649 vec_dest = VEC_pop (tree, vec_dsts);
4650 vec_tmp = VEC_alloc (tree, heap, VEC_length (tree, *vec_oprnds0) * 2);
4652 for (i = 0; VEC_iterate (tree, *vec_oprnds0, i, vop0); i++)
4654 if (op_type == binary_op)
4655 vop1 = VEC_index (tree, *vec_oprnds1, i);
4656 else
4657 vop1 = NULL_TREE;
4659 /* Generate the two halves of promotion operation. */
4660 new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1,
4661 op_type, vec_dest, gsi, stmt);
4662 new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1,
4663 op_type, vec_dest, gsi, stmt);
4664 if (is_gimple_call (new_stmt1))
4666 new_tmp1 = gimple_call_lhs (new_stmt1);
4667 new_tmp2 = gimple_call_lhs (new_stmt2);
4669 else
4671 new_tmp1 = gimple_assign_lhs (new_stmt1);
4672 new_tmp2 = gimple_assign_lhs (new_stmt2);
4675 if (multi_step_cvt)
4677 /* Store the results for the recursive call. */
4678 VEC_quick_push (tree, vec_tmp, new_tmp1);
4679 VEC_quick_push (tree, vec_tmp, new_tmp2);
4681 else
4683 /* Last step of the promotion sequence - store the results. */
4684 if (slp_node)
4686 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt1);
4687 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt2);
4689 else
4691 if (!*prev_stmt_info)
4692 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt1;
4693 else
4694 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt1;
4696 *prev_stmt_info = vinfo_for_stmt (new_stmt1);
4697 STMT_VINFO_RELATED_STMT (*prev_stmt_info) = new_stmt2;
4698 *prev_stmt_info = vinfo_for_stmt (new_stmt2);
4703 if (multi_step_cvt)
4705 /* For a multi-step promotion operation we call the function
4706 recursively for every stage. We start from the input type,
4707 create promotion operations to the intermediate types, and then
4708 create promotions to the output type. */
4709 *vec_oprnds0 = VEC_copy (tree, heap, vec_tmp);
4710 VEC_free (tree, heap, vec_tmp);
4711 vect_create_vectorized_promotion_stmts (vec_oprnds0, vec_oprnds1,
4712 multi_step_cvt - 1, stmt,
4713 vec_dsts, gsi, slp_node, code1,
4714 code2, decl1, decl2, op_type,
4715 prev_stmt_info);
4720 /* Function vectorizable_type_promotion
4722 Check if STMT performs a binary or unary operation that involves
4723 type promotion, and if it can be vectorized.
4724 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4725 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4726 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4728 bool
4729 vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
4730 gimple *vec_stmt, slp_tree slp_node)
4732 tree vec_dest;
4733 tree scalar_dest;
4734 tree op0, op1 = NULL;
4735 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4736 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4737 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4738 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4739 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4740 int op_type;
4741 tree def;
4742 gimple def_stmt;
4743 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4744 stmt_vec_info prev_stmt_info;
4745 int nunits_in;
4746 int nunits_out;
4747 tree vectype_out;
4748 int ncopies;
4749 int j, i;
4750 tree vectype_in;
4751 tree intermediate_type = NULL_TREE;
4752 int multi_step_cvt = 0;
4753 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
4754 VEC (tree, heap) *vec_dsts = NULL, *interm_types = NULL, *tmp_vec_dsts = NULL;
4756 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4757 return false;
4759 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4760 return false;
4762 /* Is STMT a vectorizable type-promotion operation? */
4763 if (!is_gimple_assign (stmt))
4764 return false;
4766 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4767 return false;
4769 code = gimple_assign_rhs_code (stmt);
4770 if (!CONVERT_EXPR_CODE_P (code)
4771 && code != WIDEN_MULT_EXPR)
4772 return false;
4774 op0 = gimple_assign_rhs1 (stmt);
4775 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4776 if (!vectype_in)
4777 return false;
4778 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4780 scalar_dest = gimple_assign_lhs (stmt);
4781 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4782 if (!vectype_out)
4783 return false;
4784 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4785 if (nunits_in <= nunits_out)
4786 return false;
4788 /* Multiple types in SLP are handled by creating the appropriate number of
4789 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
4790 case of SLP. */
4791 if (slp_node)
4792 ncopies = 1;
4793 else
4794 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4796 gcc_assert (ncopies >= 1);
4798 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4799 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4800 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4801 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4802 && CONVERT_EXPR_CODE_P (code))))
4803 return false;
4805 /* Check the operands of the operation. */
4806 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4808 if (vect_print_dump_info (REPORT_DETAILS))
4809 fprintf (vect_dump, "use not simple.");
4810 return false;
4813 op_type = TREE_CODE_LENGTH (code);
4814 if (op_type == binary_op)
4816 op1 = gimple_assign_rhs2 (stmt);
4817 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4819 if (vect_print_dump_info (REPORT_DETAILS))
4820 fprintf (vect_dump, "use not simple.");
4821 return false;
4825 /* Supportable by target? */
4826 if (!supportable_widening_operation (code, stmt, vectype_in,
4827 &decl1, &decl2, &code1, &code2,
4828 &multi_step_cvt, &interm_types))
4829 return false;
4831 /* Binary widening operation can only be supported directly by the
4832 architecture. */
4833 gcc_assert (!(multi_step_cvt && op_type == binary_op));
4835 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4837 if (!vec_stmt) /* transformation not required. */
4839 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4840 if (vect_print_dump_info (REPORT_DETAILS))
4841 fprintf (vect_dump, "=== vectorizable_promotion ===");
4842 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4843 return true;
4846 /** Transform. **/
4848 if (vect_print_dump_info (REPORT_DETAILS))
4849 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4850 ncopies);
4852 /* Handle def. */
4853 /* In case of multi-step promotion, we first generate promotion operations
4854 to the intermediate types, and then from those types to the final one.
4855 We store the vector destinations in VEC_DSTS in the correct order for
4856 recursive creation of promotion operations in
4857 vect_create_vectorized_promotion_stmts(). Vector destinations are created
4858 according to TYPES received from supportable_widening_operation(). */
4859 if (multi_step_cvt)
4860 vec_dsts = VEC_alloc (tree, heap, multi_step_cvt + 1);
4861 else
4862 vec_dsts = VEC_alloc (tree, heap, 1);
4864 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4865 VEC_quick_push (tree, vec_dsts, vec_dest);
4867 if (multi_step_cvt)
4869 for (i = VEC_length (tree, interm_types) - 1;
4870 VEC_iterate (tree, interm_types, i, intermediate_type); i--)
4872 vec_dest = vect_create_destination_var (scalar_dest,
4873 intermediate_type);
4874 VEC_quick_push (tree, vec_dsts, vec_dest);
4878 if (!slp_node)
4880 vec_oprnds0 = VEC_alloc (tree, heap,
4881 (multi_step_cvt ? vect_pow2 (multi_step_cvt) : 1));
4882 if (op_type == binary_op)
4883 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4886 /* In case the vectorization factor (VF) is bigger than the number
4887 of elements that we can fit in a vectype (nunits), we have to generate
4888 more than one vector stmt - i.e - we need to "unroll" the
4889 vector stmt by a factor VF/nunits. */
4891 prev_stmt_info = NULL;
4892 for (j = 0; j < ncopies; j++)
4894 /* Handle uses. */
4895 if (j == 0)
4897 if (slp_node)
4898 vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
4899 else
4901 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4902 VEC_quick_push (tree, vec_oprnds0, vec_oprnd0);
4903 if (op_type == binary_op)
4905 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4906 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4910 else
4912 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4913 VEC_replace (tree, vec_oprnds0, 0, vec_oprnd0);
4914 if (op_type == binary_op)
4916 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4917 VEC_replace (tree, vec_oprnds1, 0, vec_oprnd1);
4921 /* Arguments are ready. Create the new vector stmts. */
4922 tmp_vec_dsts = VEC_copy (tree, heap, vec_dsts);
4923 vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1,
4924 multi_step_cvt, stmt,
4925 tmp_vec_dsts,
4926 gsi, slp_node, code1, code2,
4927 decl1, decl2, op_type,
4928 &prev_stmt_info);
4931 VEC_free (tree, heap, vec_dsts);
4932 VEC_free (tree, heap, tmp_vec_dsts);
4933 VEC_free (tree, heap, interm_types);
4934 VEC_free (tree, heap, vec_oprnds0);
4935 VEC_free (tree, heap, vec_oprnds1);
4937 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4938 return true;
4942 /* Function vect_strided_store_supported.
4944 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4945 and FALSE otherwise. */
4947 static bool
4948 vect_strided_store_supported (tree vectype)
4950 optab interleave_high_optab, interleave_low_optab;
4951 int mode;
4953 mode = (int) TYPE_MODE (vectype);
4955 /* Check that the operation is supported. */
4956 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4957 vectype, optab_default);
4958 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4959 vectype, optab_default);
4960 if (!interleave_high_optab || !interleave_low_optab)
4962 if (vect_print_dump_info (REPORT_DETAILS))
4963 fprintf (vect_dump, "no optab for interleave.");
4964 return false;
4967 if (optab_handler (interleave_high_optab, mode)->insn_code
4968 == CODE_FOR_nothing
4969 || optab_handler (interleave_low_optab, mode)->insn_code
4970 == CODE_FOR_nothing)
4972 if (vect_print_dump_info (REPORT_DETAILS))
4973 fprintf (vect_dump, "interleave op not supported by target.");
4974 return false;
4977 return true;
4981 /* Function vect_permute_store_chain.
4983 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4984 a power of 2, generate interleave_high/low stmts to reorder the data
4985 correctly for the stores. Return the final references for stores in
4986 RESULT_CHAIN.
4988 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4989 The input is 4 vectors each containing 8 elements. We assign a number to each
4990 element; the input sequence is:
4992 1st vec: 0 1 2 3 4 5 6 7
4993 2nd vec: 8 9 10 11 12 13 14 15
4994 3rd vec: 16 17 18 19 20 21 22 23
4995 4th vec: 24 25 26 27 28 29 30 31
4997 The output sequence should be:
4999 1st vec: 0 8 16 24 1 9 17 25
5000 2nd vec: 2 10 18 26 3 11 19 27
5001 3rd vec: 4 12 20 28 5 13 21 29
5002 4th vec: 6 14 22 30 7 15 23 31
5004 i.e., we interleave the contents of the four vectors in their order.
5006 We use interleave_high/low instructions to create such output. The input of
5007 each interleave_high/low operation is two vectors:
5008 1st vec 2nd vec
5009 0 1 2 3 4 5 6 7
5010 the even elements of the result vector are obtained left-to-right from the
5011 high/low elements of the first vector. The odd elements of the result are
5012 obtained left-to-right from the high/low elements of the second vector.
5013 The output of interleave_high will be: 0 4 1 5
5014 and of interleave_low: 2 6 3 7
5017 The permutation is done in log LENGTH stages. In each stage interleave_high
5018 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5019 where the first argument is taken from the first half of DR_CHAIN and the
5020 second argument from its second half.
5021 In our example,
5023 I1: interleave_high (1st vec, 3rd vec)
5024 I2: interleave_low (1st vec, 3rd vec)
5025 I3: interleave_high (2nd vec, 4th vec)
5026 I4: interleave_low (2nd vec, 4th vec)
5028 The output for the first stage is:
5030 I1: 0 16 1 17 2 18 3 19
5031 I2: 4 20 5 21 6 22 7 23
5032 I3: 8 24 9 25 10 26 11 27
5033 I4: 12 28 13 29 14 30 15 31
5035 The output of the second stage, i.e. the final result is:
5037 I1: 0 8 16 24 1 9 17 25
5038 I2: 2 10 18 26 3 11 19 27
5039 I3: 4 12 20 28 5 13 21 29
5040 I4: 6 14 22 30 7 15 23 31. */
5042 static bool
5043 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
5044 unsigned int length,
5045 gimple stmt,
5046 gimple_stmt_iterator *gsi,
5047 VEC(tree,heap) **result_chain)
5049 tree perm_dest, vect1, vect2, high, low;
5050 gimple perm_stmt;
5051 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5052 tree scalar_dest;
5053 int i;
5054 unsigned int j;
5055 enum tree_code high_code, low_code;
5057 scalar_dest = gimple_assign_lhs (stmt);
5059 /* Check that the operation is supported. */
5060 if (!vect_strided_store_supported (vectype))
5061 return false;
5063 *result_chain = VEC_copy (tree, heap, dr_chain);
5065 for (i = 0; i < exact_log2 (length); i++)
5067 for (j = 0; j < length/2; j++)
5069 vect1 = VEC_index (tree, dr_chain, j);
5070 vect2 = VEC_index (tree, dr_chain, j+length/2);
5072 /* Create interleaving stmt:
5073 in the case of big endian:
5074 high = interleave_high (vect1, vect2)
5075 and in the case of little endian:
5076 high = interleave_low (vect1, vect2). */
5077 perm_dest = create_tmp_var (vectype, "vect_inter_high");
5078 DECL_GIMPLE_REG_P (perm_dest) = 1;
5079 add_referenced_var (perm_dest);
5080 if (BYTES_BIG_ENDIAN)
5082 high_code = VEC_INTERLEAVE_HIGH_EXPR;
5083 low_code = VEC_INTERLEAVE_LOW_EXPR;
5085 else
5087 low_code = VEC_INTERLEAVE_HIGH_EXPR;
5088 high_code = VEC_INTERLEAVE_LOW_EXPR;
5090 perm_stmt = gimple_build_assign_with_ops (high_code, perm_dest,
5091 vect1, vect2);
5092 high = make_ssa_name (perm_dest, perm_stmt);
5093 gimple_assign_set_lhs (perm_stmt, high);
5094 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5095 VEC_replace (tree, *result_chain, 2*j, high);
5097 /* Create interleaving stmt:
5098 in the case of big endian:
5099 low = interleave_low (vect1, vect2)
5100 and in the case of little endian:
5101 low = interleave_high (vect1, vect2). */
5102 perm_dest = create_tmp_var (vectype, "vect_inter_low");
5103 DECL_GIMPLE_REG_P (perm_dest) = 1;
5104 add_referenced_var (perm_dest);
5105 perm_stmt = gimple_build_assign_with_ops (low_code, perm_dest,
5106 vect1, vect2);
5107 low = make_ssa_name (perm_dest, perm_stmt);
5108 gimple_assign_set_lhs (perm_stmt, low);
5109 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5110 VEC_replace (tree, *result_chain, 2*j+1, low);
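/* The vectors produced in this stage become the input of the next stage.  */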
5112 dr_chain = VEC_copy (tree, heap, *result_chain);
5114 return true;
5118 /* Function vectorizable_store.
5120 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
5121 can be vectorized.
5122 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5123 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5124 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5126 bool
5127 vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
5128 slp_tree slp_node)
5130 tree scalar_dest;
5131 tree data_ref;
5132 tree op;
5133 tree vec_oprnd = NULL_TREE;
5134 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5135 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
5136 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5137 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5138 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5139 enum machine_mode vec_mode;
5140 tree dummy;
5141 enum dr_alignment_support alignment_support_scheme;
5142 tree def;
5143 gimple def_stmt;
5144 enum vect_def_type dt;
5145 stmt_vec_info prev_stmt_info = NULL;
5146 tree dataref_ptr = NULL_TREE;
5147 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5148 int ncopies;
5149 int j;
5150 gimple next_stmt, first_stmt = NULL;
5151 bool strided_store = false;
5152 unsigned int group_size, i;
5153 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
5154 bool inv_p;
5155 VEC(tree,heap) *vec_oprnds = NULL;
5156 bool slp = (slp_node != NULL);
5157 stmt_vec_info first_stmt_vinfo;
5158 unsigned int vec_num;
5160 /* Multiple types in SLP are handled by creating the appropriate number of
5161 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
5162 case of SLP. */
5163 if (slp)
5164 ncopies = 1;
5165 else
5166 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5168 gcc_assert (ncopies >= 1);
5170 /* FORNOW. This restriction should be relaxed. */
5171 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
5173 if (vect_print_dump_info (REPORT_DETAILS))
5174 fprintf (vect_dump, "multiple types in nested loop.");
5175 return false;
5178 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5179 return false;
5181 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5182 return false;
5184 /* Is vectorizable store? */
5186 if (!is_gimple_assign (stmt))
5187 return false;
5189 scalar_dest = gimple_assign_lhs (stmt);
5190 if (TREE_CODE (scalar_dest) != ARRAY_REF
5191 && TREE_CODE (scalar_dest) != INDIRECT_REF
5192 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5193 return false;
5195 gcc_assert (gimple_assign_single_p (stmt));
5196 op = gimple_assign_rhs1 (stmt);
5197 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5199 if (vect_print_dump_info (REPORT_DETAILS))
5200 fprintf (vect_dump, "use not simple.");
5201 return false;
5204 /* The scalar rhs type needs to be trivially convertible to the vector
5205 component type. This should always be the case. */
5206 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op)))
5208 if (vect_print_dump_info (REPORT_DETAILS))
5209 fprintf (vect_dump, "??? operands of different types");
5210 return false;
5213 vec_mode = TYPE_MODE (vectype);
5214 /* FORNOW. In some cases can vectorize even if data-type not supported
5215 (e.g. - array initialization with 0). */
5216 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
5217 return false;
5219 if (!STMT_VINFO_DATA_REF (stmt_info))
5220 return false;
5222 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5224 strided_store = true;
5225 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5226 if (!vect_strided_store_supported (vectype)
5227 && !PURE_SLP_STMT (stmt_info) && !slp)
5228 return false;
5230 if (first_stmt == stmt)
5232 /* STMT is the leader of the group. Check the operands of all the
5233 stmts of the group. */
5234 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
5235 while (next_stmt)
5237 gcc_assert (gimple_assign_single_p (next_stmt));
5238 op = gimple_assign_rhs1 (next_stmt);
5239 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5241 if (vect_print_dump_info (REPORT_DETAILS))
5242 fprintf (vect_dump, "use not simple.");
5243 return false;
5245 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5250 if (!vec_stmt) /* transformation not required. */
5252 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
5253 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
5254 return true;
5257 /** Transform. **/
5259 if (strided_store)
5261 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5262 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5264 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
5266 /* FORNOW */
5267 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5269 /* We vectorize all the stmts of the interleaving group when we
5270 reach the last stmt in the group. */
5271 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
5272 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
5273 && !slp)
5275 *vec_stmt = NULL;
5276 return true;
5279 if (slp)
5280 strided_store = false;
5282 /* VEC_NUM is the number of vect stmts to be created for this group. */
5283 if (slp)
5284 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5285 else
5286 vec_num = group_size;
5288 else
5290 first_stmt = stmt;
5291 first_dr = dr;
5292 group_size = vec_num = 1;
5293 first_stmt_vinfo = stmt_info;
5296 if (vect_print_dump_info (REPORT_DETAILS))
5297 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
5299 dr_chain = VEC_alloc (tree, heap, group_size);
5300 oprnds = VEC_alloc (tree, heap, group_size);
5302 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5303 gcc_assert (alignment_support_scheme);
5304 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
5306 /* In case the vectorization factor (VF) is bigger than the number
5307 of elements that we can fit in a vectype (nunits), we have to generate
5308 more than one vector stmt - i.e - we need to "unroll" the
5309 vector stmt by a factor VF/nunits. For more details see documentation in
5310 vect_get_vec_def_for_copy_stmt. */
5312 /* In case of interleaving (non-unit strided access):
5314 S1: &base + 2 = x2
5315 S2: &base = x0
5316 S3: &base + 1 = x1
5317 S4: &base + 3 = x3
5319 We create vectorized stores starting from base address (the access of the
5320 first stmt in the chain (S2 in the above example), when the last store stmt
5321 of the chain (S4) is reached:
5323 VS1: &base = vx2
5324 VS2: &base + vec_size*1 = vx0
5325 VS3: &base + vec_size*2 = vx1
5326 VS4: &base + vec_size*3 = vx3
5328 Then permutation statements are generated:
5330 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
5331 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
5334 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5335 (the order of the data-refs in the output of vect_permute_store_chain
5336 corresponds to the order of scalar stmts in the interleaving chain - see
5337 the documentation of vect_permute_store_chain()).
5339 In case of both multiple types and interleaving, the above vector stores and
5340 permutation stmts are created for every copy. The result vector stmts are
5341 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5342 STMT_VINFO_RELATED_STMT for the next copies.
5345 prev_stmt_info = NULL;
5346 for (j = 0; j < ncopies; j++)
5348 gimple new_stmt;
5349 gimple ptr_incr;
5351 if (j == 0)
5353 if (slp)
5355 /* Get vectorized arguments for SLP_NODE. */
5356 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
5358 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
5360 else
5362 /* For interleaved stores we collect vectorized defs for all the
5363 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
5364 used as an input to vect_permute_store_chain(), and OPRNDS as
5365 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
5367 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5368 OPRNDS are of size 1. */
5369 next_stmt = first_stmt;
5370 for (i = 0; i < group_size; i++)
5372 /* Since gaps are not supported for interleaved stores,
5373 GROUP_SIZE is the exact number of stmts in the chain.
5374 Therefore, NEXT_STMT can't be NULL_TREE. In case that
5375 there is no interleaving, GROUP_SIZE is 1, and only one
5376 iteration of the loop will be executed. */
5377 gcc_assert (next_stmt
5378 && gimple_assign_single_p (next_stmt));
5379 op = gimple_assign_rhs1 (next_stmt);
5381 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
5382 NULL);
5383 VEC_quick_push(tree, dr_chain, vec_oprnd);
5384 VEC_quick_push(tree, oprnds, vec_oprnd);
5385 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5389 /* We should have caught mismatched types earlier. */
5390 gcc_assert (useless_type_conversion_p (vectype,
5391 TREE_TYPE (vec_oprnd)));
5392 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
5393 &dummy, &ptr_incr, false,
5394 &inv_p, NULL);
5395 gcc_assert (!inv_p);
5397 else
5399 /* For interleaved stores we created vectorized defs for all the
5400 defs stored in OPRNDS in the previous iteration (previous copy).
5401 DR_CHAIN is then used as an input to vect_permute_store_chain(),
5402 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
5403 next copy.
5404 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
5405 OPRNDS are of size 1. */
5406 for (i = 0; i < group_size; i++)
5408 op = VEC_index (tree, oprnds, i);
5409 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
5410 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
5411 VEC_replace(tree, dr_chain, i, vec_oprnd);
5412 VEC_replace(tree, oprnds, i, vec_oprnd);
5414 dataref_ptr =
5415 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
5418 if (strided_store)
5420 result_chain = VEC_alloc (tree, heap, group_size);
5421 /* Permute. */
5422 if (!vect_permute_store_chain (dr_chain, group_size, stmt, gsi,
5423 &result_chain))
5424 return false;
5427 next_stmt = first_stmt;
5428 for (i = 0; i < vec_num; i++)
5430 if (i > 0)
5431 /* Bump the vector pointer. */
5432 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
5433 NULL_TREE);
5435 if (slp)
5436 vec_oprnd = VEC_index (tree, vec_oprnds, i);
5437 else if (strided_store)
5438 /* For strided stores vectorized defs are interleaved in
5439 vect_permute_store_chain(). */
5440 vec_oprnd = VEC_index (tree, result_chain, i);
5442 data_ref = build_fold_indirect_ref (dataref_ptr);
5443 /* If accesses through a pointer to vectype do not alias the original
5444 memory reference we have a problem. This should never happen. */
5445 gcc_assert (get_alias_set (data_ref) == get_alias_set (gimple_assign_lhs (stmt))
5446 || alias_set_subset_of (get_alias_set (data_ref),
5447 get_alias_set (gimple_assign_lhs (stmt))));
5449 /* Arguments are ready. Create the new vector stmt. */
5450 new_stmt = gimple_build_assign (data_ref, vec_oprnd);
5451 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5452 mark_symbols_for_renaming (new_stmt);
5454 if (slp)
5455 continue;
5457 if (j == 0)
5458 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5459 else
5460 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5462 prev_stmt_info = vinfo_for_stmt (new_stmt);
5463 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5464 if (!next_stmt)
5465 break;
5469 VEC_free (tree, heap, dr_chain);
5470 VEC_free (tree, heap, oprnds);
5471 if (result_chain)
5472 VEC_free (tree, heap, result_chain);
5474 return true;
5478 /* Function vect_setup_realignment
5480 This function is called when vectorizing an unaligned load using
5481 the dr_explicit_realign[_optimized] scheme.
5482 This function generates the following code at the loop prolog:
5484 p = initial_addr;
5485 x msq_init = *(floor(p)); # prolog load
5486 realignment_token = call target_builtin;
5487 loop:
5488 x msq = phi (msq_init, ---)
5490 The stmts marked with x are generated only for the case of
5491 dr_explicit_realign_optimized.
5493 The code above sets up a new (vector) pointer, pointing to the first
5494 location accessed by STMT, and a "floor-aligned" load using that pointer.
5495 It also generates code to compute the "realignment-token" (if the relevant
5496 target hook was defined), and creates a phi-node at the loop-header bb
5497 whose arguments are the result of the prolog-load (created by this
5498 function) and the result of a load that takes place in the loop (to be
5499 created by the caller to this function).
5501 For the case of dr_explicit_realign_optimized:
5502 The caller to this function uses the phi-result (msq) to create the
5503 realignment code inside the loop, and sets up the missing phi argument,
5504 as follows:
5505 loop:
5506 msq = phi (msq_init, lsq)
5507 lsq = *(floor(p')); # load in loop
5508 result = realign_load (msq, lsq, realignment_token);
5510 For the case of dr_explicit_realign:
5511 loop:
5512 msq = *(floor(p)); # load in loop
5513 p' = p + (VS-1);
5514 lsq = *(floor(p')); # load in loop
5515 result = realign_load (msq, lsq, realignment_token);
5517 Input:
5518 STMT - (scalar) load stmt to be vectorized. This load accesses
5519 a memory location that may be unaligned.
5520 BSI - place where new code is to be inserted.
5521 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5522 is used.
5524 Output:
5525 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5526 target hook, if defined.
5527 Return value - the result of the loop-header phi node. */
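/* Illustrative sketch (not part of the vectorizer): a scalar model of how
   REALIGN_LOAD combines the two aligned loads set up above.  The actual
   element selection is target-defined and driven by REALIGNMENT_TOKEN
   (e.g. a permute mask); this sketch assumes the token reduces to the
   element misalignment OFS of the original access.  */
#if 0
static void
realign_load_model (const int *msq, const int *lsq, int ofs, int vs,
                    int *result)
{
  int i;

  /* Take the tail of the first aligned vector, then the head of the
     second, starting at the misaligned element.  */
  for (i = 0; i < vs; i++)
    result[i] = (ofs + i < vs) ? msq[ofs + i] : lsq[ofs + i - vs];
}
#endif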
5529 static tree
5530 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
5531 tree *realignment_token,
5532 enum dr_alignment_support alignment_support_scheme,
5533 tree init_addr,
5534 struct loop **at_loop)
5536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5537 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5538 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5539 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5540 edge pe;
5541 tree scalar_dest = gimple_assign_lhs (stmt);
5542 tree vec_dest;
5543 gimple inc;
5544 tree ptr;
5545 tree data_ref;
5546 gimple new_stmt;
5547 basic_block new_bb;
5548 tree msq_init = NULL_TREE;
5549 tree new_temp;
5550 gimple phi_stmt;
5551 tree msq = NULL_TREE;
5552 gimple_seq stmts = NULL;
5553 bool inv_p;
5554 bool compute_in_loop = false;
5555 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5556 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5557 struct loop *loop_for_initial_load;
5559 gcc_assert (alignment_support_scheme == dr_explicit_realign
5560 || alignment_support_scheme == dr_explicit_realign_optimized);
5562 /* We need to generate three things:
5563 1. the misalignment computation
5564 2. the extra vector load (for the optimized realignment scheme).
5565 3. the phi node for the two vectors from which the realignment is
5566 done (for the optimized realignment scheme).
5569 /* 1. Determine where to generate the misalignment computation.
5571 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5572 calculation will be generated by this function, outside the loop (in the
5573 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5574 caller, inside the loop.
5576 Background: If the misalignment remains fixed throughout the iterations of
5577 the loop, then both realignment schemes are applicable, and also the
5578 misalignment computation can be done outside LOOP. This is because we are
5579 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5580 are a multiple of VS (the Vector Size), and therefore the misalignment in
5581 different vectorized LOOP iterations is always the same.
5582 The problem arises only if the memory access is in an inner-loop nested
5583 inside LOOP, which is now being vectorized using outer-loop vectorization.
5584 This is the only case when the misalignment of the memory access may not
5585 remain fixed throughout the iterations of the inner-loop (as explained in
5586 detail in vect_supportable_dr_alignment). In this case, not only is the
5587 optimized realignment scheme not applicable, but also the misalignment
5588 computation (and generation of the realignment token that is passed to
5589 REALIGN_LOAD) have to be done inside the loop.
5591 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5592 or not, which in turn determines if the misalignment is computed inside
5593 the inner-loop, or outside LOOP. */
5595 if (init_addr != NULL_TREE)
5597 compute_in_loop = true;
5598 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5602 /* 2. Determine where to generate the extra vector load.
5604 For the optimized realignment scheme, instead of generating two vector
5605 loads in each iteration, we generate a single extra vector load in the
5606 preheader of the loop, and in each iteration reuse the result of the
5607 vector load from the previous iteration. In case the memory access is in
5608 an inner-loop nested inside LOOP, which is now being vectorized using
5609 outer-loop vectorization, we need to determine whether this initial vector
5610 load should be generated at the preheader of the inner-loop, or can be
5611 generated at the preheader of LOOP. If the memory access has no evolution
5612 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5613 to be generated inside LOOP (in the preheader of the inner-loop). */
5615 if (nested_in_vect_loop)
5617 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5618 bool invariant_in_outerloop =
5619 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5620 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5622 else
5623 loop_for_initial_load = loop;
5624 if (at_loop)
5625 *at_loop = loop_for_initial_load;
5627 /* 3. For the case of the optimized realignment, create the first vector
5628 load at the loop preheader. */
5630 if (alignment_support_scheme == dr_explicit_realign_optimized)
5632 /* Create msq_init = *(floor(p1)) in the loop preheader */
5634 gcc_assert (!compute_in_loop);
5635 pe = loop_preheader_edge (loop_for_initial_load);
5636 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5637 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5638 &init_addr, &inc, true, &inv_p, NULL_TREE);
5639 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5640 new_stmt = gimple_build_assign (vec_dest, data_ref);
5641 new_temp = make_ssa_name (vec_dest, new_stmt);
5642 gimple_assign_set_lhs (new_stmt, new_temp);
5643 mark_symbols_for_renaming (new_stmt);
5644 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5645 gcc_assert (!new_bb);
5646 msq_init = gimple_assign_lhs (new_stmt);
5649 /* 4. Create realignment token using a target builtin, if available.
5650 It is done either inside the containing loop, or before LOOP (as
5651 determined above). */
5653 if (targetm.vectorize.builtin_mask_for_load)
5655 tree builtin_decl;
5657 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5658 if (compute_in_loop)
5659 gcc_assert (init_addr); /* already computed by the caller. */
5660 else
5662 /* Generate the INIT_ADDR computation outside LOOP. */
5663 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5664 NULL_TREE, loop);
5665 pe = loop_preheader_edge (loop);
5666 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5667 gcc_assert (!new_bb);
5670 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5671 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5672 vec_dest =
5673 vect_create_destination_var (scalar_dest,
5674 gimple_call_return_type (new_stmt));
5675 new_temp = make_ssa_name (vec_dest, new_stmt);
5676 gimple_call_set_lhs (new_stmt, new_temp);
5678 if (compute_in_loop)
5679 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5680 else
5682 /* Generate the misalignment computation outside LOOP. */
5683 pe = loop_preheader_edge (loop);
5684 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5685 gcc_assert (!new_bb);
5688 *realignment_token = gimple_call_lhs (new_stmt);
5690 /* The result of the CALL_EXPR to this builtin is determined from
5691 the value of the parameter and no global variables are touched
5692 which makes the builtin a "const" function. Requiring the
5693 builtin to have the "const" attribute makes it unnecessary
5694 to call mark_call_clobbered. */
5695 gcc_assert (TREE_READONLY (builtin_decl));
5698 if (alignment_support_scheme == dr_explicit_realign)
5699 return msq;
5701 gcc_assert (!compute_in_loop);
5702 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5705 /* 5. Create msq = phi <msq_init, lsq> in loop */
5707 pe = loop_preheader_edge (containing_loop);
5708 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5709 msq = make_ssa_name (vec_dest, NULL);
5710 phi_stmt = create_phi_node (msq, containing_loop->header);
5711 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5712 add_phi_arg (phi_stmt, msq_init, pe);
5714 return msq;
5718 /* Function vect_strided_load_supported.
5720 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5721 and FALSE otherwise. */
5723 static bool
5724 vect_strided_load_supported (tree vectype)
5726 optab perm_even_optab, perm_odd_optab;
5727 int mode;
5729 mode = (int) TYPE_MODE (vectype);
5731 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5732 optab_default);
5733 if (!perm_even_optab)
5735 if (vect_print_dump_info (REPORT_DETAILS))
5736 fprintf (vect_dump, "no optab for perm_even.");
5737 return false;
5740 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5742 if (vect_print_dump_info (REPORT_DETAILS))
5743 fprintf (vect_dump, "perm_even op not supported by target.");
5744 return false;
5747 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5748 optab_default);
5749 if (!perm_odd_optab)
5751 if (vect_print_dump_info (REPORT_DETAILS))
5752 fprintf (vect_dump, "no optab for perm_odd.");
5753 return false;
5756 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5758 if (vect_print_dump_info (REPORT_DETAILS))
5759 fprintf (vect_dump, "perm_odd op not supported by target.");
5760 return false;
5762 return true;
5766 /* Function vect_permute_load_chain.
5768 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5769 a power of 2, generate extract_even/odd stmts to reorder the input data
5770 correctly. Return the final references for loads in RESULT_CHAIN.
5772 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5773 The input is 4 vectors each containing 8 elements. We assign a number to each
5774 element; the input sequence is:
5776 1st vec: 0 1 2 3 4 5 6 7
5777 2nd vec: 8 9 10 11 12 13 14 15
5778 3rd vec: 16 17 18 19 20 21 22 23
5779 4th vec: 24 25 26 27 28 29 30 31
5781 The output sequence should be:
5783 1st vec: 0 4 8 12 16 20 24 28
5784 2nd vec: 1 5 9 13 17 21 25 29
5785 3rd vec: 2 6 10 14 18 22 26 30
5786 4th vec: 3 7 11 15 19 23 27 31
5788 i.e., the first output vector should contain the first elements of each
5789 interleaving group, etc.
5791 We use extract_even/odd instructions to create such output. The input of each
5792 extract_even/odd operation is two vectors
5793 1st vec 2nd vec
5794 0 1 2 3 4 5 6 7
5796 and the output is the vector of extracted even/odd elements. The output of
5797 extract_even will be: 0 2 4 6
5798 and of extract_odd: 1 3 5 7
5801 The permutation is done in log2(LENGTH) stages. In each stage extract_even and
5802 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5803 order. In our example,
5805 E1: extract_even (1st vec, 2nd vec)
5806 E2: extract_odd (1st vec, 2nd vec)
5807 E3: extract_even (3rd vec, 4th vec)
5808 E4: extract_odd (3rd vec, 4th vec)
5810 The output for the first stage will be:
5812 E1: 0 2 4 6 8 10 12 14
5813 E2: 1 3 5 7 9 11 13 15
5814 E3: 16 18 20 22 24 26 28 30
5815 E4: 17 19 21 23 25 27 29 31
5817 In order to proceed and create the correct sequence for the next stage (or
5818 for the correct output, if the second stage is the last one, as in our
5819 example), we first put the output of extract_even operation and then the
5820 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5821 The input for the second stage is:
5823 1st vec (E1): 0 2 4 6 8 10 12 14
5824 2nd vec (E3): 16 18 20 22 24 26 28 30
5825 3rd vec (E2): 1 3 5 7 9 11 13 15
5826 4th vec (E4): 17 19 21 23 25 27 29 31
5828 The output of the second stage:
5830 E1: 0 4 8 12 16 20 24 28
5831 E2: 2 6 10 14 18 22 26 30
5832 E3: 1 5 9 13 17 21 25 29
5833 E4: 3 7 11 15 19 23 27 31
5835 And RESULT_CHAIN after reordering:
5837 1st vec (E1): 0 4 8 12 16 20 24 28
5838 2nd vec (E3): 1 5 9 13 17 21 25 29
5839 3rd vec (E2): 2 6 10 14 18 22 26 30
5840 4th vec (E4): 3 7 11 15 19 23 27 31. */
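/* Illustrative sketch (not part of the vectorizer): the staged
   extract_even/extract_odd reordering described above, modeled on plain
   arrays.  CHAIN holds LENGTH vectors of NUNITS elements each, stored
   consecutively; LENGTH is assumed to be a power of two.  Running this on
   the 4x8 example above yields exactly the RESULT_CHAIN shown.  */
#if 0
#include <string.h>

static void
permute_load_chain_model (int *chain, int length, int nunits)
{
  int stage, j, k;
  int tmp[length * nunits];

  for (stage = 0; (1 << stage) < length; stage++)
    {
      for (j = 0; j < length; j += 2)
        {
          const int *first = chain + j * nunits;
          const int *second = chain + (j + 1) * nunits;
          int *even = tmp + (j / 2) * nunits;
          int *odd = tmp + (j / 2 + length / 2) * nunits;

          /* extract_even/extract_odd of the concatenated pair.  */
          for (k = 0; k < nunits / 2; k++)
            {
              even[k] = first[2 * k];
              even[nunits / 2 + k] = second[2 * k];
              odd[k] = first[2 * k + 1];
              odd[nunits / 2 + k] = second[2 * k + 1];
            }
        }
      memcpy (chain, tmp, sizeof (tmp));
    }
}
#endif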
5842 static bool
5843 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5844 unsigned int length,
5845 gimple stmt,
5846 gimple_stmt_iterator *gsi,
5847 VEC(tree,heap) **result_chain)
5849 tree perm_dest, data_ref, first_vect, second_vect;
5850 gimple perm_stmt;
5851 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5852 int i;
5853 unsigned int j;
5855 /* Check that the operation is supported. */
5856 if (!vect_strided_load_supported (vectype))
5857 return false;
5859 *result_chain = VEC_copy (tree, heap, dr_chain);
5860 for (i = 0; i < exact_log2 (length); i++)
5862 for (j = 0; j < length; j +=2)
5864 first_vect = VEC_index (tree, dr_chain, j);
5865 second_vect = VEC_index (tree, dr_chain, j+1);
5867 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5868 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5869 DECL_GIMPLE_REG_P (perm_dest) = 1;
5870 add_referenced_var (perm_dest);
5872 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_EVEN_EXPR,
5873 perm_dest, first_vect,
5874 second_vect);
5876 data_ref = make_ssa_name (perm_dest, perm_stmt);
5877 gimple_assign_set_lhs (perm_stmt, data_ref);
5878 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5879 mark_symbols_for_renaming (perm_stmt);
5881 VEC_replace (tree, *result_chain, j/2, data_ref);
5883 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5884 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5885 DECL_GIMPLE_REG_P (perm_dest) = 1;
5886 add_referenced_var (perm_dest);
5888 perm_stmt = gimple_build_assign_with_ops (VEC_EXTRACT_ODD_EXPR,
5889 perm_dest, first_vect,
5890 second_vect);
5891 data_ref = make_ssa_name (perm_dest, perm_stmt);
5892 gimple_assign_set_lhs (perm_stmt, data_ref);
5893 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5894 mark_symbols_for_renaming (perm_stmt);
5896 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5898 dr_chain = VEC_copy (tree, heap, *result_chain);
5900 return true;
5904 /* Function vect_transform_strided_load.
5906 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5907 to perform their permutation and record the resulting vectorized statements
5908 in the scalar statements.
5911 static bool
5912 vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
5913 gimple_stmt_iterator *gsi)
5915 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5916 gimple first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5917 gimple next_stmt, new_stmt;
5918 VEC(tree,heap) *result_chain = NULL;
5919 unsigned int i, gap_count;
5920 tree tmp_data_ref;
5922 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5923 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5924 vectors, that are ready for vector computation. */
5925 result_chain = VEC_alloc (tree, heap, size);
5926 /* Permute. */
5927 if (!vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain))
5928 return false;
5930 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5931 Since we scan the chain starting from its first node, their order
5932 corresponds to the order of data-refs in RESULT_CHAIN. */
5933 next_stmt = first_stmt;
5934 gap_count = 1;
5935 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5937 if (!next_stmt)
5938 break;
5940 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5941 code elimination pass later. No need to check for the first stmt in
5942 the group, since it always exists.
5943 DR_GROUP_GAP is the number of steps in elements from the previous
5944 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5945 correspond to the gaps.
5947 if (next_stmt != first_stmt
5948 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5950 gap_count++;
5951 continue;
5954 while (next_stmt)
5956 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5957 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5958 copies, and we put the new vector statement in the first available
5959 RELATED_STMT. */
5960 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5961 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5962 else
5964 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5966 gimple prev_stmt =
5967 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5968 gimple rel_stmt =
5969 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5970 while (rel_stmt)
5972 prev_stmt = rel_stmt;
5973 rel_stmt =
5974 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5977 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5978 new_stmt;
5982 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5983 gap_count = 1;
5984 /* If NEXT_STMT accesses the same DR as the previous statement,
5985 put the same TMP_DATA_REF as its vectorized statement; otherwise
5986 get the next data-ref from RESULT_CHAIN. */
5987 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5988 break;
5992 VEC_free (tree, heap, result_chain);
5993 return true;
5997 /* Create NCOPIES permutation statements using the mask MASK_ARRAY (by
5998 building a vector of type MASK_TYPE from it) and two input vectors placed in
5999 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
6000 shifting by STRIDE elements of DR_CHAIN for every copy.
6001 (STRIDE is the number of vectorized stmts for NODE divided by the number of
6002 copies).
6003 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
6004 the created stmts must be inserted. */
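/* Illustrative sketch (not part of the vectorizer): the generic effect of
   the two-input permute generated below, modeled with element indices.
   Each MASK value selects an element of the concatenation of the two
   inputs (0 .. 2*NUNITS-1); the target builtin may expect the mask in a
   different representation, e.g. byte indices on Altivec.  */
#if 0
static void
vec_perm_model (const int *first, const int *second, const int *mask,
                int nunits, int *result)
{
  int i;

  for (i = 0; i < nunits; i++)
    result[i] = (mask[i] < nunits
                 ? first[mask[i]]
                 : second[mask[i] - nunits]);
}
#endif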
6006 static inline void
6007 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
6008 int *mask_array, int mask_nunits,
6009 tree mask_element_type, tree mask_type,
6010 int first_vec_indx, int second_vec_indx,
6011 gimple_stmt_iterator *gsi, slp_tree node,
6012 tree builtin_decl, tree vectype,
6013 VEC(tree,heap) *dr_chain,
6014 int ncopies, int vect_stmts_counter)
6016 tree t = NULL_TREE, mask_vec, mask, perm_dest;
6017 gimple perm_stmt = NULL;
6018 stmt_vec_info next_stmt_info;
6019 int i, group_size, stride, dr_chain_size;
6020 tree first_vec, second_vec, data_ref;
6021 tree sym;
6022 ssa_op_iter iter;
6023 VEC (tree, heap) *params = NULL;
6025 /* Create a vector mask. */
6026 for (i = mask_nunits - 1; i >= 0; --i)
6027 t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
6029 mask_vec = build_vector (mask_type, t);
6030 mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
6032 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
6033 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
6034 dr_chain_size = VEC_length (tree, dr_chain);
6036 /* Initialize the vect stmts of NODE to properly insert the generated
6037 stmts later. */
6038 for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
6039 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6040 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
6042 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
6043 for (i = 0; i < ncopies; i++)
6045 first_vec = VEC_index (tree, dr_chain, first_vec_indx);
6046 second_vec = VEC_index (tree, dr_chain, second_vec_indx);
6048 /* Build argument list for the vectorized call. */
6049 VEC_free (tree, heap, params);
6050 params = VEC_alloc (tree, heap, 3);
6051 VEC_quick_push (tree, params, first_vec);
6052 VEC_quick_push (tree, params, second_vec);
6053 VEC_quick_push (tree, params, mask);
6055 /* Generate the permute statement. */
6056 perm_stmt = gimple_build_call_vec (builtin_decl, params);
6057 data_ref = make_ssa_name (perm_dest, perm_stmt);
6058 gimple_call_set_lhs (perm_stmt, data_ref);
6059 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6060 FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
6062 if (TREE_CODE (sym) == SSA_NAME)
6063 sym = SSA_NAME_VAR (sym);
6064 mark_sym_for_renaming (sym);
6067 /* Store the vector statement in NODE. */
6068 VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
6069 stride * i + vect_stmts_counter, perm_stmt);
6071 first_vec_indx += stride;
6072 second_vec_indx += stride;
6075 /* Mark the scalar stmt as vectorized. */
6076 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
6077 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
6081 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
6082 return in CURRENT_MASK_ELEMENT its equivalent in target specific
6083 representation. Check that the mask is valid and return FALSE if not.
6084 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
6085 the next vector, i.e., the current first vector is not needed. */
6087 static bool
6088 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
6089 int mask_nunits, bool only_one_vec, int index,
6090 int *mask, int *current_mask_element,
6091 bool *need_next_vector)
6093 int i;
6094 static int number_of_mask_fixes = 1;
6095 static bool mask_fixed = false;
6096 static bool needs_first_vector = false;
6098 /* Convert to target specific representation. */
6099 *current_mask_element = first_mask_element + m;
6100 /* Adjust the value in case it's a mask for second and third vectors. */
6101 *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
6103 if (*current_mask_element < mask_nunits)
6104 needs_first_vector = true;
6106 /* We have only one input vector to permute but the mask accesses values in
6107 the next vector as well. */
6108 if (only_one_vec && *current_mask_element >= mask_nunits)
6110 if (vect_print_dump_info (REPORT_DETAILS))
6112 fprintf (vect_dump, "permutation requires at least two vectors ");
6113 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6116 return false;
6119 /* The mask requires the next vector. */
6120 if (*current_mask_element >= mask_nunits * 2)
6122 if (needs_first_vector || mask_fixed)
6124 /* We either need the first vector too or have already moved to the
6125 next vector. In both cases, this permutation needs three
6126 vectors. */
6127 if (vect_print_dump_info (REPORT_DETAILS))
6129 fprintf (vect_dump, "permutation requires at "
6130 "least three vectors ");
6131 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6134 return false;
6137 /* We move to the next vector, dropping the first one and working with
6138 the second and the third - we need to adjust the values of the mask
6139 accordingly. */
6140 *current_mask_element -= mask_nunits * number_of_mask_fixes;
6142 for (i = 0; i < index; i++)
6143 mask[i] -= mask_nunits * number_of_mask_fixes;
6145 (number_of_mask_fixes)++;
6146 mask_fixed = true;
6149 *need_next_vector = mask_fixed;
6151 /* This was the last element of this mask. Start a new one. */
6152 if (index == mask_nunits - 1)
6154 number_of_mask_fixes = 1;
6155 mask_fixed = false;
6156 needs_first_vector = false;
6159 return true;
6163 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6164 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6165 permute statements for SLP_NODE_INSTANCE. */
6166 bool
6167 vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
6168 gimple_stmt_iterator *gsi, int vf,
6169 slp_instance slp_node_instance, bool analyze_only)
6171 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6172 tree mask_element_type = NULL_TREE, mask_type;
6173 int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
6174 slp_tree node;
6175 tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
6176 gimple next_scalar_stmt;
6177 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6178 int first_mask_element;
6179 int index, unroll_factor, *mask, current_mask_element, ncopies;
6180 bool only_one_vec = false, need_next_vector = false;
6181 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
6183 if (!targetm.vectorize.builtin_vec_perm)
6185 if (vect_print_dump_info (REPORT_DETAILS))
6187 fprintf (vect_dump, "no builtin for vect permute for ");
6188 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6191 return false;
6194 builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
6195 &mask_element_type);
6196 if (!builtin_decl || !mask_element_type)
6198 if (vect_print_dump_info (REPORT_DETAILS))
6200 fprintf (vect_dump, "no builtin for vect permute for ");
6201 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
6204 return false;
6207 mask_type = get_vectype_for_scalar_type (mask_element_type);
6208 mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
6209 mask = (int *) xmalloc (sizeof (int) * mask_nunits);
6210 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6211 scale = mask_nunits / nunits;
6212 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6214 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
6215 unrolling factor. */
6216 orig_vec_stmts_num = group_size *
6217 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
6218 if (orig_vec_stmts_num == 1)
6219 only_one_vec = true;
6221 /* The number of copies is determined by the final vectorization factor
6222 relative to the SLP_NODE_INSTANCE unrolling factor. */
6223 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
6225 /* Generate permutation masks for every NODE. Number of masks for each NODE
6226 is equal to GROUP_SIZE.
6227 E.g., we have a group of three nodes with three loads from the same
6228 location in each node, and the vector size is 4. I.e., we have an
6229 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6230 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6231 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6234 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in the target
6235 specific type, e.g., in bytes for Altivec).
6236 The last mask is illegal since we assume two operands for permute
6237 operation, and the mask element values can't be outside that range. Hence,
6238 the last mask must be converted into {2,5,5,5}.
6239 For the first two permutations we need the first and the second input
6240 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6241 we need the second and the third vectors: {b1,c1,a2,b2} and
6242 {c2,a3,b3,c3}. */
6244 for (i = 0;
6245 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
6246 i, node);
6247 i++)
6249 scalar_index = 0;
6250 index = 0;
6251 vect_stmts_counter = 0;
6252 vec_index = 0;
6253 first_vec_index = vec_index++;
6254 if (only_one_vec)
6255 second_vec_index = first_vec_index;
6256 else
6257 second_vec_index = vec_index++;
6259 for (j = 0; j < unroll_factor; j++)
6261 for (k = 0; k < group_size; k++)
6263 first_mask_element = (i + j * group_size) * scale;
6264 for (m = 0; m < scale; m++)
6266 if (!vect_get_mask_element (stmt, first_mask_element, m,
6267 mask_nunits, only_one_vec, index, mask,
6268 &current_mask_element, &need_next_vector))
6269 return false;
6271 mask[index++] = current_mask_element;
6274 if (index == mask_nunits)
6276 index = 0;
6277 if (!analyze_only)
6279 if (need_next_vector)
6281 first_vec_index = second_vec_index;
6282 second_vec_index = vec_index;
6285 next_scalar_stmt = VEC_index (gimple,
6286 SLP_TREE_SCALAR_STMTS (node), scalar_index++);
6288 vect_create_mask_and_perm (stmt, next_scalar_stmt,
6289 mask, mask_nunits, mask_element_type, mask_type,
6290 first_vec_index, second_vec_index, gsi, node,
6291 builtin_decl, vectype, dr_chain, ncopies,
6292 vect_stmts_counter++);
6299 free (mask);
6300 return true;
6303 /* vectorizable_load.
6305 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
6306 can be vectorized.
6307 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6308 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
6309 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6311 bool
6312 vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
6313 slp_tree slp_node, slp_instance slp_node_instance)
6315 tree scalar_dest;
6316 tree vec_dest = NULL;
6317 tree data_ref = NULL;
6318 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6319 stmt_vec_info prev_stmt_info;
6320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6322 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
6323 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
6324 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
6325 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6326 tree new_temp;
6327 int mode;
6328 gimple new_stmt = NULL;
6329 tree dummy;
6330 enum dr_alignment_support alignment_support_scheme;
6331 tree dataref_ptr = NULL_TREE;
6332 gimple ptr_incr;
6333 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6334 int ncopies;
6335 int i, j, group_size;
6336 tree msq = NULL_TREE, lsq;
6337 tree offset = NULL_TREE;
6338 tree realignment_token = NULL_TREE;
6339 gimple phi = NULL;
6340 VEC(tree,heap) *dr_chain = NULL;
6341 bool strided_load = false;
6342 gimple first_stmt;
6343 tree scalar_type;
6344 bool inv_p;
6345 bool compute_in_loop = false;
6346 struct loop *at_loop;
6347 int vec_num;
6348 bool slp = (slp_node != NULL);
6349 bool slp_perm = false;
6350 enum tree_code code;
6352 /* Multiple types in SLP are handled by creating the appropriate number of
6353 vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in
6354 case of SLP. */
6355 if (slp)
6356 ncopies = 1;
6357 else
6358 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6360 gcc_assert (ncopies >= 1);
6362 /* FORNOW. This restriction should be relaxed. */
6363 if (nested_in_vect_loop && ncopies > 1)
6365 if (vect_print_dump_info (REPORT_DETAILS))
6366 fprintf (vect_dump, "multiple types in nested loop.");
6367 return false;
6370 if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
6371 slp_perm = true;
6373 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6374 return false;
6376 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6377 return false;
6379 /* Is vectorizable load? */
6380 if (!is_gimple_assign (stmt))
6381 return false;
6383 scalar_dest = gimple_assign_lhs (stmt);
6384 if (TREE_CODE (scalar_dest) != SSA_NAME)
6385 return false;
6387 code = gimple_assign_rhs_code (stmt);
6388 if (code != ARRAY_REF
6389 && code != INDIRECT_REF
6390 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
6391 return false;
6393 if (!STMT_VINFO_DATA_REF (stmt_info))
6394 return false;
6396 scalar_type = TREE_TYPE (DR_REF (dr));
6397 mode = (int) TYPE_MODE (vectype);
6399 /* FORNOW. In some cases can vectorize even if data-type not supported
6400 (e.g. - data copies). */
6401 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
6403 if (vect_print_dump_info (REPORT_DETAILS))
6404 fprintf (vect_dump, "Aligned load, but unsupported type.");
6405 return false;
6408 /* The vector component type needs to be trivially convertible to the
6409 scalar lhs. This should always be the case. */
6410 if (!useless_type_conversion_p (TREE_TYPE (scalar_dest), TREE_TYPE (vectype)))
6412 if (vect_print_dump_info (REPORT_DETAILS))
6413 fprintf (vect_dump, "??? operands of different types");
6414 return false;
6417 /* Check if the load is a part of an interleaving chain. */
6418 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6420 strided_load = true;
6421 /* FORNOW */
6422 gcc_assert (! nested_in_vect_loop);
6424 /* Check if interleaving is supported. */
6425 if (!vect_strided_load_supported (vectype)
6426 && !PURE_SLP_STMT (stmt_info) && !slp)
6427 return false;
6430 if (!vec_stmt) /* transformation not required. */
6432 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
6433 vect_model_load_cost (stmt_info, ncopies, NULL);
6434 return true;
6437 if (vect_print_dump_info (REPORT_DETAILS))
6438 fprintf (vect_dump, "transform load.");
6440 /** Transform. **/
6442 if (strided_load)
6444 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
6445 /* Check if the chain of loads is already vectorized. */
6446 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
6448 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6449 return true;
6451 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
6452 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
6454 /* VEC_NUM is the number of vect stmts to be created for this group. */
6455 if (slp)
6457 strided_load = false;
6458 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6460 else
6461 vec_num = group_size;
6463 dr_chain = VEC_alloc (tree, heap, vec_num);
6465 else
6467 first_stmt = stmt;
6468 first_dr = dr;
6469 group_size = vec_num = 1;
6472 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
6473 gcc_assert (alignment_support_scheme);
6475 /* In case the vectorization factor (VF) is bigger than the number
6476 of elements that we can fit in a vectype (nunits), we have to generate
6477 more than one vector stmt - i.e - we need to "unroll" the
6478 vector stmt by a factor VF/nunits. In doing so, we record a pointer
6479 from one copy of the vector stmt to the next, in the field
6480 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
6481 stages to find the correct vector defs to be used when vectorizing
6482 stmts that use the defs of the current stmt. The example below illustrates
6483 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
6484 4 vectorized stmts):
6486 before vectorization:
6487 RELATED_STMT VEC_STMT
6488 S1: x = memref - -
6489 S2: z = x + 1 - -
6491 step 1: vectorize stmt S1:
6492 We first create the vector stmt VS1_0, and, as usual, record a
6493 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
6494 Next, we create the vector stmt VS1_1, and record a pointer to
6495 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
6496 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
6497 stmts and pointers:
6498 RELATED_STMT VEC_STMT
6499 VS1_0: vx0 = memref0 VS1_1 -
6500 VS1_1: vx1 = memref1 VS1_2 -
6501 VS1_2: vx2 = memref2 VS1_3 -
6502 VS1_3: vx3 = memref3 - -
6503 S1: x = load - VS1_0
6504 S2: z = x + 1 - -
6506 See in documentation in vect_get_vec_def_for_stmt_copy for how the
6507 information we recorded in RELATED_STMT field is used to vectorize
6508 stmt S2. */
6510 /* In case of interleaving (non-unit strided access):
6512 S1: x2 = &base + 2
6513 S2: x0 = &base
6514 S3: x1 = &base + 1
6515 S4: x3 = &base + 3
6517 Vectorized loads are created in the order of memory accesses
6518 starting from the access of the first stmt of the chain:
6520 VS1: vx0 = &base
6521 VS2: vx1 = &base + vec_size*1
6522 VS3: vx3 = &base + vec_size*2
6523 VS4: vx4 = &base + vec_size*3
6525 Then permutation statements are generated:
6527 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
6528 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
6531 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
6532 (the order of the data-refs in the output of vect_permute_load_chain
6533 corresponds to the order of scalar stmts in the interleaving chain - see
6534 the documentation of vect_permute_load_chain()).
6535 The generation of permutation stmts and recording them in
6536 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
6538 In case of both multiple types and interleaving, the vector loads and
6539 permutation stmts above are created for every copy. The result vector stmts
6540 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
6541 STMT_VINFO_RELATED_STMT for the next copies. */
6543 /* If the data reference is aligned (dr_aligned) or potentially unaligned
6544 on a target that supports unaligned accesses (dr_unaligned_supported)
6545 we generate the following code:
6546 p = initial_addr;
6547 indx = 0;
6548 loop {
6549 p = p + indx * vectype_size;
6550 vec_dest = *(p);
6551 indx = indx + 1;
6554 Otherwise, the data reference is potentially unaligned on a target that
6555 does not support unaligned accesses (dr_explicit_realign_optimized) -
6556 then generate the following code, in which the data in each iteration is
6557 obtained by two vector loads, one from the previous iteration, and one
6558 from the current iteration:
6559 p1 = initial_addr;
6560 msq_init = *(floor(p1))
6561 p2 = initial_addr + VS - 1;
6562 realignment_token = call target_builtin;
6563 indx = 0;
6564 loop {
6565 p2 = p2 + indx * vectype_size
6566 lsq = *(floor(p2))
6567 vec_dest = realign_load (msq, lsq, realignment_token)
6568 indx = indx + 1;
6569 msq = lsq;
6570 } */
6572 /* If the misalignment remains the same throughout the execution of the
6573 loop, we can create the init_addr and permutation mask at the loop
6574 preheader. Otherwise, it needs to be created inside the loop.
6575 This can only occur when vectorizing memory accesses in the inner-loop
6576 nested within an outer-loop that is being vectorized. */
6578 if (nested_in_vect_loop_p (loop, stmt)
6579 && (TREE_INT_CST_LOW (DR_STEP (dr))
6580 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
6582 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
6583 compute_in_loop = true;
6586 if ((alignment_support_scheme == dr_explicit_realign_optimized
6587 || alignment_support_scheme == dr_explicit_realign)
6588 && !compute_in_loop)
6590 msq = vect_setup_realignment (first_stmt, gsi, &realignment_token,
6591 alignment_support_scheme, NULL_TREE,
6592 &at_loop);
6593 if (alignment_support_scheme == dr_explicit_realign_optimized)
6595 phi = SSA_NAME_DEF_STMT (msq);
6596 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6599 else
6600 at_loop = loop;
6602 prev_stmt_info = NULL;
6603 for (j = 0; j < ncopies; j++)
6605 /* 1. Create the vector pointer update chain. */
6606 if (j == 0)
6607 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
6608 at_loop, offset,
6609 &dummy, &ptr_incr, false,
6610 &inv_p, NULL_TREE);
6611 else
6612 dataref_ptr =
6613 bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);
6615 for (i = 0; i < vec_num; i++)
6617 if (i > 0)
6618 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
6619 NULL_TREE);
6621 /* 2. Create the vector-load in the loop. */
6622 switch (alignment_support_scheme)
6624 case dr_aligned:
6625 gcc_assert (aligned_access_p (first_dr));
6626 data_ref = build_fold_indirect_ref (dataref_ptr);
6627 break;
6628 case dr_unaligned_supported:
6630 int mis = DR_MISALIGNMENT (first_dr);
6631 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
6633 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
6634 data_ref =
6635 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
6636 break;
6638 case dr_explicit_realign:
6640 tree ptr, bump;
6641 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
6643 if (compute_in_loop)
6644 msq = vect_setup_realignment (first_stmt, gsi,
6645 &realignment_token,
6646 dr_explicit_realign,
6647 dataref_ptr, NULL);
6649 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6650 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6651 new_stmt = gimple_build_assign (vec_dest, data_ref);
6652 new_temp = make_ssa_name (vec_dest, new_stmt);
6653 gimple_assign_set_lhs (new_stmt, new_temp);
6654 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6655 copy_virtual_operands (new_stmt, stmt);
6656 mark_symbols_for_renaming (new_stmt);
6657 msq = new_temp;
6659 bump = size_binop (MULT_EXPR, vs_minus_1,
6660 TYPE_SIZE_UNIT (scalar_type));
6661 ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump);
6662 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
6663 break;
6665 case dr_explicit_realign_optimized:
6666 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
6667 break;
6668 default:
6669 gcc_unreachable ();
6671 /* If accesses through a pointer to vectype do not alias the original
6672 memory reference we have a problem. This should never happen. */
6673 gcc_assert (get_alias_set (data_ref) == get_alias_set (gimple_assign_rhs1 (stmt))
6674 || alias_set_subset_of (get_alias_set (data_ref),
6675 get_alias_set (gimple_assign_rhs1 (stmt))));
6676 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6677 new_stmt = gimple_build_assign (vec_dest, data_ref);
6678 new_temp = make_ssa_name (vec_dest, new_stmt);
6679 gimple_assign_set_lhs (new_stmt, new_temp);
6680 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6681 mark_symbols_for_renaming (new_stmt);
6683 /* 3. Handle explicit realignment if necessary/supported. Create in
6684 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
6685 if (alignment_support_scheme == dr_explicit_realign_optimized
6686 || alignment_support_scheme == dr_explicit_realign)
6688 tree tmp;
6690 lsq = gimple_assign_lhs (new_stmt);
6691 if (!realignment_token)
6692 realignment_token = dataref_ptr;
6693 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6694 tmp = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
6695 realignment_token);
6696 new_stmt = gimple_build_assign (vec_dest, tmp);
6697 new_temp = make_ssa_name (vec_dest, new_stmt);
6698 gimple_assign_set_lhs (new_stmt, new_temp);
6699 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6701 if (alignment_support_scheme == dr_explicit_realign_optimized)
6703 gcc_assert (phi);
6704 if (i == vec_num - 1 && j == ncopies - 1)
6705 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
6706 msq = lsq;
6710 /* 4. Handle invariant-load. */
6711 if (inv_p)
6713 gcc_assert (!strided_load);
6714 gcc_assert (nested_in_vect_loop_p (loop, stmt));
6715 if (j == 0)
6717 int k;
6718 tree t = NULL_TREE;
6719 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
6721 /* CHECKME: bitpos depends on endianness? */
6722 bitpos = bitsize_zero_node;
6723 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6724 bitsize, bitpos);
6725 vec_dest =
6726 vect_create_destination_var (scalar_dest, NULL_TREE);
6727 new_stmt = gimple_build_assign (vec_dest, vec_inv);
6728 new_temp = make_ssa_name (vec_dest, new_stmt);
6729 gimple_assign_set_lhs (new_stmt, new_temp);
6730 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6732 for (k = nunits - 1; k >= 0; --k)
6733 t = tree_cons (NULL_TREE, new_temp, t);
6734 /* FIXME: use build_constructor directly. */
6735 vec_inv = build_constructor_from_list (vectype, t);
6736 new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
6737 new_stmt = SSA_NAME_DEF_STMT (new_temp);
6739 else
6740 gcc_unreachable (); /* FORNOW. */
6743 /* Collect vector loads and later create their permutation in
6744 vect_transform_strided_load (). */
6745 if (strided_load || slp_perm)
6746 VEC_quick_push (tree, dr_chain, new_temp);
6748 /* Store vector loads in the corresponding SLP_NODE. */
6749 if (slp && !slp_perm)
6750 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
6753 if (slp && !slp_perm)
6754 continue;
6756 if (slp_perm)
6758 if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
6759 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
6760 slp_node_instance, false))
6762 VEC_free (tree, heap, dr_chain);
6763 return false;
6766 else
6768 if (strided_load)
6770 if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
6771 return false;
6773 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
6774 VEC_free (tree, heap, dr_chain);
6775 dr_chain = VEC_alloc (tree, heap, group_size);
6777 else
6779 if (j == 0)
6780 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6781 else
6782 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6783 prev_stmt_info = vinfo_for_stmt (new_stmt);
6788 if (dr_chain)
6789 VEC_free (tree, heap, dr_chain);
6791 return true;
6795 /* Function vectorizable_live_operation.
6797 STMT computes a value that is used outside the loop. Check if
6798 it can be supported. */
6800 bool
6801 vectorizable_live_operation (gimple stmt,
6802 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6803 gimple *vec_stmt ATTRIBUTE_UNUSED)
6805 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6806 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6807 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6808 int i;
6809 int op_type;
6810 tree op;
6811 tree def;
6812 gimple def_stmt;
6813 enum vect_def_type dt;
6814 enum tree_code code;
6815 enum gimple_rhs_class rhs_class;
6817 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6819 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6820 return false;
6822 if (!is_gimple_assign (stmt))
6823 return false;
6825 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6826 return false;
6828 /* FORNOW. CHECKME. */
6829 if (nested_in_vect_loop_p (loop, stmt))
6830 return false;
6832 code = gimple_assign_rhs_code (stmt);
6833 op_type = TREE_CODE_LENGTH (code);
6834 rhs_class = get_gimple_rhs_class (code);
6835 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
6836 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
6838 /* FORNOW: support only if all uses are invariant. This means
6839 that the scalar operations can remain in place, unvectorized.
6840 The original last scalar value that they compute will be used. */
6842 for (i = 0; i < op_type; i++)
6844 if (rhs_class == GIMPLE_SINGLE_RHS)
6845 op = TREE_OPERAND (gimple_op (stmt, 1), i);
6846 else
6847 op = gimple_op (stmt, i + 1);
6848 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6850 if (vect_print_dump_info (REPORT_DETAILS))
6851 fprintf (vect_dump, "use not simple.");
6852 return false;
6855 if (dt != vect_invariant_def && dt != vect_constant_def)
6856 return false;
6859 /* No transformation is required for the cases we currently support. */
6860 return true;
6864 /* Function vect_is_simple_cond.
6866 Input:
6867 LOOP - the loop that is being vectorized.
6868 COND - Condition that is checked for simple use.
6870 Returns whether a COND can be vectorized. Checks whether
6871 condition operands are supportable using vect_is_simple_use. */
6873 static bool
6874 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6876 tree lhs, rhs;
6877 tree def;
6878 enum vect_def_type dt;
6880 if (!COMPARISON_CLASS_P (cond))
6881 return false;
6883 lhs = TREE_OPERAND (cond, 0);
6884 rhs = TREE_OPERAND (cond, 1);
6886 if (TREE_CODE (lhs) == SSA_NAME)
6888 gimple lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6889 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6890 return false;
6892 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6893 && TREE_CODE (lhs) != FIXED_CST)
6894 return false;
6896 if (TREE_CODE (rhs) == SSA_NAME)
6898 gimple rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6899 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6900 return false;
6902 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6903 && TREE_CODE (rhs) != FIXED_CST)
6904 return false;
6906 return true;
6909 /* Function vectorizable_condition.
6911 Check if STMT is a conditional modify expression that can be vectorized.
6912 If VEC_STMT is also passed, vectorize STMT: create a vectorized
6913 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6914 at GSI.
6916 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6918 bool
6919 vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
6920 gimple *vec_stmt)
6922 tree scalar_dest = NULL_TREE;
6923 tree vec_dest = NULL_TREE;
6924 tree op = NULL_TREE;
6925 tree cond_expr, then_clause, else_clause;
6926 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6927 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6928 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6929 tree vec_compare, vec_cond_expr;
6930 tree new_temp;
6931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6932 enum machine_mode vec_mode;
6933 tree def;
6934 enum vect_def_type dt;
6935 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6936 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6937 enum tree_code code;
6939 gcc_assert (ncopies >= 1);
6940 if (ncopies > 1)
6941 return false; /* FORNOW */
6943 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6944 return false;
6946 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6947 return false;
6949 /* FORNOW: SLP not supported. */
6950 if (STMT_SLP_TYPE (stmt_info))
6951 return false;
6953 /* FORNOW: not yet supported. */
6954 if (STMT_VINFO_LIVE_P (stmt_info))
6956 if (vect_print_dump_info (REPORT_DETAILS))
6957 fprintf (vect_dump, "value used after loop.");
6958 return false;
6961 /* Is this a vectorizable conditional operation? */
6962 if (!is_gimple_assign (stmt))
6963 return false;
6965 code = gimple_assign_rhs_code (stmt);
6967 if (code != COND_EXPR)
6968 return false;
6970 gcc_assert (gimple_assign_single_p (stmt));
6971 op = gimple_assign_rhs1 (stmt);
6972 cond_expr = TREE_OPERAND (op, 0);
6973 then_clause = TREE_OPERAND (op, 1);
6974 else_clause = TREE_OPERAND (op, 2);
6976 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6977 return false;
6979 /* We do not handle two different vector types for the condition
6980 and the values. */
6981 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6982 return false;
6984 if (TREE_CODE (then_clause) == SSA_NAME)
6986 gimple then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6987 if (!vect_is_simple_use (then_clause, loop_vinfo,
6988 &then_def_stmt, &def, &dt))
6989 return false;
6991 else if (TREE_CODE (then_clause) != INTEGER_CST
6992 && TREE_CODE (then_clause) != REAL_CST
6993 && TREE_CODE (then_clause) != FIXED_CST)
6994 return false;
6996 if (TREE_CODE (else_clause) == SSA_NAME)
6998 gimple else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6999 if (!vect_is_simple_use (else_clause, loop_vinfo,
7000 &else_def_stmt, &def, &dt))
7001 return false;
7003 else if (TREE_CODE (else_clause) != INTEGER_CST
7004 && TREE_CODE (else_clause) != REAL_CST
7005 && TREE_CODE (else_clause) != FIXED_CST)
7006 return false;
7009 vec_mode = TYPE_MODE (vectype);
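/* Analysis phase only: record the stmt type and check that the target can expand a VEC_COND_EXPR in this vector mode. */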
7011 if (!vec_stmt)
7013 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
7014 return expand_vec_cond_expr_p (op, vec_mode);
7017 /* Transform */
7019 /* Handle def. */
7020 scalar_dest = gimple_assign_lhs (stmt);
7021 vec_dest = vect_create_destination_var (scalar_dest, vectype);
7023 /* Handle cond expr. */
7024 vec_cond_lhs =
7025 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
7026 vec_cond_rhs =
7027 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
7028 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
7029 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
7031 /* Arguments are ready. Create the new vector stmt. */
7032 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
7033 vec_cond_lhs, vec_cond_rhs);
7034 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
7035 vec_compare, vec_then_clause, vec_else_clause);
7037 *vec_stmt = gimple_build_assign (vec_dest, vec_cond_expr);
7038 new_temp = make_ssa_name (vec_dest, *vec_stmt);
7039 gimple_assign_set_lhs (*vec_stmt, new_temp);
7040 vect_finish_stmt_generation (stmt, *vec_stmt, gsi);
7042 return true;
7046 /* Function vect_transform_stmt.
7048 Create a vectorized stmt to replace STMT, and insert it at GSI. */
7050 static bool
7051 vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
7052 bool *strided_store, slp_tree slp_node,
7053 slp_instance slp_node_instance)
7055 bool is_store = false;
7056 gimple vec_stmt = NULL;
7057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7058 gimple orig_stmt_in_pattern;
7059 bool done;
7061 switch (STMT_VINFO_TYPE (stmt_info))
7063 case type_demotion_vec_info_type:
7064 done = vectorizable_type_demotion (stmt, gsi, &vec_stmt, slp_node);
7065 gcc_assert (done);
7066 break;
7068 case type_promotion_vec_info_type:
7069 done = vectorizable_type_promotion (stmt, gsi, &vec_stmt, slp_node);
7070 gcc_assert (done);
7071 break;
7073 case type_conversion_vec_info_type:
7074 done = vectorizable_conversion (stmt, gsi, &vec_stmt, slp_node);
7075 gcc_assert (done);
7076 break;
7078 case induc_vec_info_type:
7079 gcc_assert (!slp_node);
7080 done = vectorizable_induction (stmt, gsi, &vec_stmt);
7081 gcc_assert (done);
7082 break;
7084 case op_vec_info_type:
7085 done = vectorizable_operation (stmt, gsi, &vec_stmt, slp_node);
7086 gcc_assert (done);
7087 break;
7089 case assignment_vec_info_type:
7090 done = vectorizable_assignment (stmt, gsi, &vec_stmt, slp_node);
7091 gcc_assert (done);
7092 break;
7094 case load_vec_info_type:
7095 done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
7096 slp_node_instance);
7097 gcc_assert (done);
7098 break;
7100 case store_vec_info_type:
7101 done = vectorizable_store (stmt, gsi, &vec_stmt, slp_node);
7102 gcc_assert (done);
7103 if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && !slp_node)
7105 /* In case of interleaving, the whole chain is vectorized when the
7106 last store in the chain is reached. Store stmts before the last
7107 one are skipped, and their vec_stmt_info shouldn't be freed
7108 meanwhile. */
7109 *strided_store = true;
7110 if (STMT_VINFO_VEC_STMT (stmt_info))
7111 is_store = true;
7113 else
7114 is_store = true;
7115 break;
7117 case condition_vec_info_type:
7118 gcc_assert (!slp_node);
7119 done = vectorizable_condition (stmt, gsi, &vec_stmt);
7120 gcc_assert (done);
7121 break;
7123 case call_vec_info_type:
7124 gcc_assert (!slp_node);
7125 done = vectorizable_call (stmt, gsi, &vec_stmt);
7126 break;
7128 case reduc_vec_info_type:
7129 gcc_assert (!slp_node);
7130 done = vectorizable_reduction (stmt, gsi, &vec_stmt);
7131 gcc_assert (done);
7132 break;
7134 default:
7135 if (!STMT_VINFO_LIVE_P (stmt_info))
7137 if (vect_print_dump_info (REPORT_DETAILS))
7138 fprintf (vect_dump, "stmt not supported.");
7139 gcc_unreachable ();
7143 if (STMT_VINFO_LIVE_P (stmt_info)
7144 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
7146 done = vectorizable_live_operation (stmt, gsi, &vec_stmt);
7147 gcc_assert (done);
7150 if (vec_stmt)
7152 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
7153 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
7154 if (orig_stmt_in_pattern)
7156 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
7157 /* STMT was inserted by the vectorizer to replace a computation idiom.
7158 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
7159 computed this idiom. We need to record a pointer to VEC_STMT in
7160 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
7161 documentation of vect_pattern_recog. */
7162 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
7164 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
7165 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
7170 return is_store;
7174 /* This function builds ni_name = the number of iterations the loop
7175 executes, on the loop preheader edge. */
7177 static tree
7178 vect_build_loop_niters (loop_vec_info loop_vinfo)
7180 tree ni_name, var;
7181 gimple_seq stmts = NULL;
7182 edge pe;
7183 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7184 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
7186 var = create_tmp_var (TREE_TYPE (ni), "niters");
7187 add_referenced_var (var);
7188 ni_name = force_gimple_operand (ni, &stmts, false, var);
7190 pe = loop_preheader_edge (loop);
7191 if (stmts)
7193 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7194 gcc_assert (!new_bb);
7197 return ni_name;
7201 /* This function generates the following statements:
7203 ni_name = number of iterations the loop executes
7204 ratio = ni_name / vf
7205 ratio_mult_vf_name = ratio * vf
7207 and places them at the loop preheader edge. */
7209 static void
7210 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
7211 tree *ni_name_ptr,
7212 tree *ratio_mult_vf_name_ptr,
7213 tree *ratio_name_ptr)
7216 edge pe;
7217 basic_block new_bb;
7218 gimple_seq stmts;
7219 tree ni_name;
7220 tree var;
7221 tree ratio_name;
7222 tree ratio_mult_vf_name;
7223 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7224 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
7225 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7226 tree log_vf;
7228 pe = loop_preheader_edge (loop);
7230 /* Generate a temporary variable that contains
7231 the number of iterations the loop executes. */
7233 ni_name = vect_build_loop_niters (loop_vinfo);
7234 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
7236 /* Create: ratio = ni >> log2(vf) */
7238 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
7239 if (!is_gimple_val (ratio_name))
7241 var = create_tmp_var (TREE_TYPE (ni), "bnd");
7242 add_referenced_var (var);
7244 stmts = NULL;
7245 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
7246 pe = loop_preheader_edge (loop);
7247 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7248 gcc_assert (!new_bb);
7251 /* Create: ratio_mult_vf = ratio << log2 (vf). */
7253 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
7254 ratio_name, log_vf);
7255 if (!is_gimple_val (ratio_mult_vf_name))
7257 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
7258 add_referenced_var (var);
7260 stmts = NULL;
7261 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
7262 true, var);
7263 pe = loop_preheader_edge (loop);
7264 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7265 gcc_assert (!new_bb);
7268 *ni_name_ptr = ni_name;
7269 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
7270 *ratio_name_ptr = ratio_name;
7272 return;
7276 /* Function vect_update_ivs_after_vectorizer.
7278 "Advance" the induction variables of LOOP to the value they should take
7279 after the execution of LOOP. This is currently necessary because the
7280 vectorizer does not handle induction variables that are used after the
7281 loop. Such a situation occurs when the last iterations of LOOP are
7282 peeled, because:
7283 1. We introduced new uses after LOOP for IVs that were not originally used
7284 after LOOP: the IVs of LOOP are now used by an epilog loop.
7285 2. LOOP is going to be vectorized; this means that it will iterate N/VF
7286 times, whereas the loop IVs should be bumped N times.
7288 Input:
7289 - LOOP - a loop that is going to be vectorized. The last few iterations
7290 of LOOP were peeled.
7291 - NITERS - the number of iterations that LOOP executes (before it is
7292 vectorized), i.e., the number of times the ivs should be bumped.
7293 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
7294 coming out from LOOP on which there are uses of the LOOP ivs
7295 (this is the path from LOOP->exit to epilog_loop->preheader).
7297 The new definitions of the ivs are placed in LOOP->exit.
7298 The phi args associated with the edge UPDATE_E in the bb
7299 UPDATE_E->dest are updated accordingly.
7301 Assumption 1: Like the rest of the vectorizer, this function assumes
7302 a single loop exit that has a single predecessor.
7304 Assumption 2: The phi nodes in the LOOP header and in update_bb are
7305 organized in the same order.
7307 Assumption 3: The access function of the ivs is simple enough (see
7308 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
7310 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
7311 coming out of LOOP on which the ivs of LOOP are used (this is the path
7312 that leads to the epilog loop; other paths skip the epilog loop). This
7313 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
7314 needs to have its phis updated.
7317 static void
7318 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
7319 edge update_e)
7321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7322 basic_block exit_bb = single_exit (loop)->dest;
7323 gimple phi, phi1;
7324 gimple_stmt_iterator gsi, gsi1;
7325 basic_block update_bb = update_e->dest;
7327 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
7329 /* Make sure there exists a single-predecessor exit bb: */
7330 gcc_assert (single_pred_p (exit_bb));
7332 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
7333 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
7334 gsi_next (&gsi), gsi_next (&gsi1))
7336 tree access_fn = NULL;
7337 tree evolution_part;
7338 tree init_expr;
7339 tree step_expr;
7340 tree var, ni, ni_name;
7341 gimple_stmt_iterator last_gsi;
7343 phi = gsi_stmt (gsi);
7344 phi1 = gsi_stmt (gsi1);
7345 if (vect_print_dump_info (REPORT_DETAILS))
7347 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
7348 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
7351 /* Skip virtual phi's. */
7352 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
7354 if (vect_print_dump_info (REPORT_DETAILS))
7355 fprintf (vect_dump, "virtual phi. skip.");
7356 continue;
7359 /* Skip reduction phis. */
7360 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
7362 if (vect_print_dump_info (REPORT_DETAILS))
7363 fprintf (vect_dump, "reduc phi. skip.");
7364 continue;
7367 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
7368 gcc_assert (access_fn);
7369 STRIP_NOPS (access_fn);
7370 evolution_part =
7371 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
7372 gcc_assert (evolution_part != NULL_TREE);
7374 /* FORNOW: We do not support IVs whose evolution function is a polynomial
7375 of degree >= 2 or exponential. */
7376 gcc_assert (!tree_is_chrec (evolution_part));
7378 step_expr = evolution_part;
7379 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
7380 loop->num));
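/* Compute the value the IV reaches after NITERS iterations: ni = init_expr + niters * step_expr (POINTER_PLUS_EXPR is used when the IV is a pointer). */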
7382 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
7383 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
7384 init_expr,
7385 fold_convert (sizetype,
7386 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
7387 niters, step_expr)));
7388 else
7389 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
7390 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
7391 fold_convert (TREE_TYPE (init_expr),
7392 niters),
7393 step_expr),
7394 init_expr);
7398 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
7399 add_referenced_var (var);
7401 last_gsi = gsi_last_bb (exit_bb);
7402 ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
7403 true, GSI_SAME_STMT);
7405 /* Fix phi expressions in the successor bb. */
7406 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
7410 /* Return the more conservative threshold between the
7411 min_profitable_iters returned by the cost model and the
7412 user-specified threshold, if provided. */
7414 static unsigned int
7415 conservative_cost_threshold (loop_vec_info loop_vinfo,
7416 int min_profitable_iters)
7418 unsigned int th;
7419 int min_scalar_loop_bound;
7421 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
7422 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
7424 /* Use the cost model only if it is more conservative than the
7425 user-specified threshold. */
7426 th = (unsigned) min_scalar_loop_bound;
7427 if (min_profitable_iters
7428 && (!min_scalar_loop_bound
7429 || min_profitable_iters > min_scalar_loop_bound))
7430 th = (unsigned) min_profitable_iters;
7432 if (th && vect_print_dump_info (REPORT_COST))
7433 fprintf (vect_dump, "Vectorization may not be profitable.");
7435 return th;
7438 /* Function vect_do_peeling_for_loop_bound
7440 Peel the last iterations of the loop represented by LOOP_VINFO.
7441 The peeled iterations form a new epilog loop. Given that the loop now
7442 iterates NITERS times, the new epilog loop iterates
7443 NITERS % VECTORIZATION_FACTOR times.
7445 The original loop will later be made to iterate
7446 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
7448 static void
7449 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
7451 tree ni_name, ratio_mult_vf_name;
7452 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7453 struct loop *new_loop;
7454 edge update_e;
7455 basic_block preheader;
7456 int loop_num;
7457 bool check_profitability = false;
7458 unsigned int th = 0;
7459 int min_profitable_iters;
7461 if (vect_print_dump_info (REPORT_DETAILS))
7462 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
7464 initialize_original_copy_tables ();
7466 /* Generate the following variables on the preheader of the original loop:
7468 ni_name = number of iterations the original loop executes
7469 ratio = ni_name / vf
7470 ratio_mult_vf_name = ratio * vf */
7471 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
7472 &ratio_mult_vf_name, ratio);
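/* Remember the loop number; slpeel_tree_peel_loop_to_edge below must not change it (verified by the assert after the call). */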
7474 loop_num = loop->num;
7476 /* If the cost model check was not done during versioning or peeling
7477 for alignment, add it here. */
7478 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7479 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
7480 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7482 check_profitability = true;
7484 /* Get profitability threshold for vectorized loop. */
7485 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7487 th = conservative_cost_threshold (loop_vinfo,
7488 min_profitable_iters);
7491 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
7492 ratio_mult_vf_name, ni_name, false,
7493 th, check_profitability);
7494 gcc_assert (new_loop);
7495 gcc_assert (loop_num == loop->num);
7496 #ifdef ENABLE_CHECKING
7497 slpeel_verify_cfg_after_peeling (loop, new_loop);
7498 #endif
7500 /* A guard that controls whether the new_loop is to be executed or skipped
7501 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
7502 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
7503 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
7504 is on the path where the LOOP IVs are used and need to be updated. */
7506 preheader = loop_preheader_edge (new_loop)->src;
7507 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
7508 update_e = EDGE_PRED (preheader, 0);
7509 else
7510 update_e = EDGE_PRED (preheader, 1);
7512 /* Update IVs of original loop as if they were advanced
7513 by ratio_mult_vf_name steps. */
7514 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
7516 /* After peeling we have to reset the scalar evolution analyzer. */
7517 scev_reset ();
7519 free_original_copy_tables ();
7523 /* Function vect_gen_niters_for_prolog_loop
7525 Set the number of iterations for the loop represented by LOOP_VINFO
7526 to the minimum between LOOP_NITERS (the original iteration count of the loop)
7527 and the misalignment of DR - the data reference recorded in
7528 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
7529 this loop, the data reference DR will refer to an aligned location.
7531 The following computation is generated:
7533 If the misalignment of DR is known at compile time:
7534 addr_mis = DR_MISALIGNMENT (dr);
7535 Else, compute the address misalignment in bytes:
7536 addr_mis = addr & (vectype_size - 1)
7538 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
7540 (elem_size = element type size; an element is the scalar element whose type
7541 is the inner type of the vectype)
7543 When the step of the data-ref in the loop is not 1 (as in interleaved data
7544 and SLP), the number of iterations of the prolog must be divided by the step
7545 (which is equal to the size of the interleaved group).
7547 The above formulas assume that VF == number of elements in the vector. This
7548 may not hold when there are multiple types in the loop.
7549 In this case, for some data-references in the loop the VF does not represent
7550 the number of elements that fit in the vector. Therefore, instead of VF we
7551 use TYPE_VECTOR_SUBPARTS. */
7553 static tree
7554 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
7556 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
7557 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7558 tree var;
7559 gimple_seq stmts;
7560 tree iters, iters_name;
7561 edge pe;
7562 basic_block new_bb;
7563 gimple dr_stmt = DR_STMT (dr);
7564 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
7565 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7566 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
7567 tree niters_type = TREE_TYPE (loop_niters);
7568 int step = 1;
7569 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
7570 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
7572 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7573 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
7575 pe = loop_preheader_edge (loop);
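/* If the misalignment of the unaligned data-ref is known at compile time, the number of prolog iterations is a constant; otherwise generate code that computes it from the runtime address of the access. */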
7577 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
7579 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
7580 int elem_misalign = byte_misalign / element_size;
7582 if (vect_print_dump_info (REPORT_DETAILS))
7583 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
7585 iters = build_int_cst (niters_type,
7586 (((nelements - elem_misalign) & (nelements - 1)) / step));
7588 else
7590 gimple_seq new_stmts = NULL;
7591 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
7592 &new_stmts, NULL_TREE, loop);
7593 tree ptr_type = TREE_TYPE (start_addr);
7594 tree size = TYPE_SIZE (ptr_type);
7595 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
7596 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
7597 tree elem_size_log =
7598 build_int_cst (type, exact_log2 (vectype_align/nelements));
7599 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
7600 tree nelements_tree = build_int_cst (type, nelements);
7601 tree byte_misalign;
7602 tree elem_misalign;
7604 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
7605 gcc_assert (!new_bb);
7607 /* Create: byte_misalign = addr & (vectype_size - 1) */
7608 byte_misalign =
7609 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
7611 /* Create: elem_misalign = byte_misalign / element_size */
7612 elem_misalign =
7613 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
7615 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
7616 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
7617 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
7618 iters = fold_convert (niters_type, iters);
7621 /* Create: prolog_loop_niters = min (iters, loop_niters) */
7622 /* If the loop bound is known at compile time we already verified that it is
7623 greater than vf; since the misalignment ('iters') is at most vf, there's
7624 no need to generate the MIN_EXPR in this case. */
7625 if (TREE_CODE (loop_niters) != INTEGER_CST)
7626 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
7628 if (vect_print_dump_info (REPORT_DETAILS))
7630 fprintf (vect_dump, "niters for prolog loop: ");
7631 print_generic_expr (vect_dump, iters, TDF_SLIM);
7634 var = create_tmp_var (niters_type, "prolog_loop_niters");
7635 add_referenced_var (var);
7636 stmts = NULL;
7637 iters_name = force_gimple_operand (iters, &stmts, false, var);
7639 /* Insert stmt on loop preheader edge. */
7640 if (stmts)
7642 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7643 gcc_assert (!new_bb);
7646 return iters_name;
7650 /* Function vect_update_init_of_dr
7652 NITERS iterations were peeled from LOOP. DR represents a data reference
7653 in LOOP. This function updates the information recorded in DR to
7654 account for the fact that the first NITERS iterations had already been
7655 executed. Specifically, it updates the OFFSET field of DR. */
7657 static void
7658 vect_update_init_of_dr (struct data_reference *dr, tree niters)
7660 tree offset = DR_OFFSET (dr);
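/* Advance the recorded offset by NITERS iterations: offset += niters * DR_STEP. */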
7662 niters = fold_build2 (MULT_EXPR, sizetype,
7663 fold_convert (sizetype, niters),
7664 fold_convert (sizetype, DR_STEP (dr)));
7665 offset = fold_build2 (PLUS_EXPR, sizetype, offset, niters);
7666 DR_OFFSET (dr) = offset;
7670 /* Function vect_update_inits_of_drs
7672 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
7673 This function updates the information recorded for the data references in
7674 the loop to account for the fact that the first NITERS iterations had
7675 already been executed. Specifically, it updates the OFFSET field of
7676 each data_reference in the loop (see vect_update_init_of_dr). */
7678 static void
7679 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
7681 unsigned int i;
7682 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
7683 struct data_reference *dr;
7685 if (vect_print_dump_info (REPORT_DETAILS))
7686 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
7688 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
7689 vect_update_init_of_dr (dr, niters);
7693 /* Function vect_do_peeling_for_alignment
7695 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
7696 'niters' is set to the misalignment of one of the data references in the
7697 loop, thereby forcing it to refer to an aligned location at the beginning
7698 of the execution of this loop. The data reference for which we are
7699 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
7701 static void
7702 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
7704 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7705 tree niters_of_prolog_loop, ni_name;
7706 tree n_iters;
7707 struct loop *new_loop;
7708 bool check_profitability = false;
7709 unsigned int th = 0;
7710 int min_profitable_iters;
7712 if (vect_print_dump_info (REPORT_DETAILS))
7713 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
7715 initialize_original_copy_tables ();
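/* Compute the loop iteration count and, from the misalignment of LOOP_VINFO_UNALIGNED_DR, the number of iterations the prolog loop must execute to make the access aligned. */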
7717 ni_name = vect_build_loop_niters (loop_vinfo);
7718 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
7721 /* If the cost model check was not done during versioning, add it here. */
7722 if (!VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7723 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7725 check_profitability = true;
7727 /* Get profitability threshold for vectorized loop. */
7728 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7730 th = conservative_cost_threshold (loop_vinfo,
7731 min_profitable_iters);
7734 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
7735 new_loop =
7736 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
7737 niters_of_prolog_loop, ni_name, true,
7738 th, check_profitability);
7740 gcc_assert (new_loop);
7741 #ifdef ENABLE_CHECKING
7742 slpeel_verify_cfg_after_peeling (new_loop, loop);
7743 #endif
7745 /* Update the number of times the loop executes. */
7746 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
7747 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
7748 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
7750 /* Update the init conditions of the access functions of all data refs. */
7751 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
7753 /* After peeling we have to reset the scalar evolution analyzer. */
7754 scev_reset ();
7756 free_original_copy_tables ();
7760 /* Function vect_create_cond_for_align_checks.
7762 Create a conditional expression that represents the alignment checks for
7763 all of the data references (array element references) whose alignment must
7764 be checked at runtime.
7766 Input:
7767 COND_EXPR - input conditional expression. New conditions will be chained
7768 with logical AND operation.
7769 LOOP_VINFO - two fields of the loop information are used.
7770 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
7771 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
7773 Output:
7774 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7775 expression.
7776 The returned value is the conditional expression to be used in the if
7777 statement that controls which version of the loop gets executed at runtime.
7779 The algorithm makes two assumptions:
7780 1) The number of bytes "n" in a vector is a power of 2.
7781 2) An address "a" is aligned if a%n is zero, and this
7782 test can be done as a&(n-1) == 0. For example, for 16-byte
7783 vectors the test is a&0xf == 0. */
7785 static void
7786 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
7787 tree *cond_expr,
7788 gimple_seq *cond_expr_stmt_list)
7790 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7791 VEC(gimple,heap) *may_misalign_stmts
7792 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7793 gimple ref_stmt;
7794 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
7795 tree mask_cst;
7796 unsigned int i;
7797 tree psize;
7798 tree int_ptrsize_type;
7799 char tmp_name[20];
7800 tree or_tmp_name = NULL_TREE;
7801 tree and_tmp, and_tmp_name;
7802 gimple and_stmt;
7803 tree ptrsize_zero;
7804 tree part_cond_expr;
7806 /* Check that mask is one less than a power of 2, i.e., mask is
7807 all zeros followed by all ones. */
7808 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
7810 /* CHECKME: what is the best integer or unsigned type to use to hold a
7811 cast from a pointer value? */
7812 psize = TYPE_SIZE (ptr_type_node);
7813 int_ptrsize_type
7814 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7816 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
7817 of the first vector of the i'th data reference. */
7819 for (i = 0; VEC_iterate (gimple, may_misalign_stmts, i, ref_stmt); i++)
7821 gimple_seq new_stmt_list = NULL;
7822 tree addr_base;
7823 tree addr_tmp, addr_tmp_name;
7824 tree or_tmp, new_or_tmp_name;
7825 gimple addr_stmt, or_stmt;
7827 /* create: addr_tmp = (int)(address_of_first_vector) */
7828 addr_base =
7829 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
7830 NULL_TREE, loop);
7831 if (new_stmt_list != NULL)
7832 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
7834 sprintf (tmp_name, "%s%d", "addr2int", i);
7835 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7836 add_referenced_var (addr_tmp);
7837 addr_tmp_name = make_ssa_name (addr_tmp, NULL);
7838 addr_stmt = gimple_build_assign_with_ops (NOP_EXPR, addr_tmp_name,
7839 addr_base, NULL_TREE);
7840 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7841 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
7843 /* The addresses are ORed together. */
7845 if (or_tmp_name != NULL_TREE)
7847 /* create: or_tmp = or_tmp | addr_tmp */
7848 sprintf (tmp_name, "%s%d", "orptrs", i);
7849 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7850 add_referenced_var (or_tmp);
7851 new_or_tmp_name = make_ssa_name (or_tmp, NULL);
7852 or_stmt = gimple_build_assign_with_ops (BIT_IOR_EXPR,
7853 new_or_tmp_name,
7854 or_tmp_name, addr_tmp_name);
7855 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7856 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
7857 or_tmp_name = new_or_tmp_name;
7859 else
7860 or_tmp_name = addr_tmp_name;
7862 } /* end for i */
7864 mask_cst = build_int_cst (int_ptrsize_type, mask);
7866 /* create: and_tmp = or_tmp & mask */
7867 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7868 add_referenced_var (and_tmp);
7869 and_tmp_name = make_ssa_name (and_tmp, NULL);
7871 and_stmt = gimple_build_assign_with_ops (BIT_AND_EXPR, and_tmp_name,
7872 or_tmp_name, mask_cst);
7873 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7874 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
7876 /* Make and_tmp the left operand of the conditional test against zero.
7877 If and_tmp has a nonzero bit, then some address is unaligned. */
7878 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7879 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7880 and_tmp_name, ptrsize_zero);
7881 if (*cond_expr)
7882 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7883 *cond_expr, part_cond_expr);
7884 else
7885 *cond_expr = part_cond_expr;
7888 /* Function vect_vfa_segment_size.
7890 Create an expression that computes the size of the segment
7891 that will be accessed for a data reference. The function takes into
7892 account that realignment loads may access one more vector.
7894 Input:
7895 DR: The data reference.
7896 VECT_FACTOR: vectorization factor.
7898 Return an expression whose value is the size of segment which will be
7899 accessed by DR. */
7901 static tree
7902 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7904 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7905 DR_STEP (dr), vect_factor);
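/* An optimized realignment scheme may load one extra vector past the end of the segment, so account for it in the segment length. */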
7907 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7909 tree vector_size = TYPE_SIZE_UNIT
7910 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7912 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7913 segment_length, vector_size);
7915 return fold_convert (sizetype, segment_length);
7918 /* Function vect_create_cond_for_alias_checks.
7920 Create a conditional expression that represents the run-time checks for
7921 overlapping of the address ranges represented by a list of data
7922 reference relations passed as input.
7924 Input:
7925 COND_EXPR - input conditional expression. New conditions will be chained
7926 with logical AND operation.
7927 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7928 to be checked.
7930 Output:
7931 COND_EXPR - conditional expression.
7932 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7933 expression.
7936 The returned value is the conditional expression to be used in the if
7937 statement that controls which version of the loop gets executed at runtime.
7940 static void
7941 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7942 tree * cond_expr,
7943 gimple_seq * cond_expr_stmt_list)
7945 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7946 VEC (ddr_p, heap) * may_alias_ddrs =
7947 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7948 tree vect_factor =
7949 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7951 ddr_p ddr;
7952 unsigned int i;
7953 tree part_cond_expr;
7955 /* Create expression
7956 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7957 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7961 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7962 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7964 if (VEC_empty (ddr_p, may_alias_ddrs))
7965 return;
7967 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7969 struct data_reference *dr_a, *dr_b;
7970 gimple dr_group_first_a, dr_group_first_b;
7971 tree addr_base_a, addr_base_b;
7972 tree segment_length_a, segment_length_b;
7973 gimple stmt_a, stmt_b;
7975 dr_a = DDR_A (ddr);
7976 stmt_a = DR_STMT (DDR_A (ddr));
7977 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7978 if (dr_group_first_a)
7980 stmt_a = dr_group_first_a;
7981 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7984 dr_b = DDR_B (ddr);
7985 stmt_b = DR_STMT (DDR_B (ddr));
7986 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7987 if (dr_group_first_b)
7989 stmt_b = dr_group_first_b;
7990 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7993 addr_base_a =
7994 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7995 NULL_TREE, loop);
7996 addr_base_b =
7997 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7998 NULL_TREE, loop);
8000 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
8001 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
8003 if (vect_print_dump_info (REPORT_DR_DETAILS))
8005 fprintf (vect_dump,
8006 "create runtime check for data references ");
8007 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
8008 fprintf (vect_dump, " and ");
8009 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
8013 part_cond_expr =
8014 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
8015 fold_build2 (LT_EXPR, boolean_type_node,
8016 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
8017 addr_base_a,
8018 segment_length_a),
8019 addr_base_b),
8020 fold_build2 (LT_EXPR, boolean_type_node,
8021 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
8022 addr_base_b,
8023 segment_length_b),
8024 addr_base_a));
8026 if (*cond_expr)
8027 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
8028 *cond_expr, part_cond_expr);
8029 else
8030 *cond_expr = part_cond_expr;
8032 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8033 fprintf (vect_dump, "created %u versioning for alias checks.\n",
8034 VEC_length (ddr_p, may_alias_ddrs));
8038 /* Function vect_loop_versioning.
8040 If the loop has data references that may or may not be aligned and/or
8041 has data reference relations whose independence was not proven, then
8042 two versions of the loop need to be generated, one which is vectorized
8043 and one which isn't. A test is then generated to control which of the
8044 loops is executed. The test checks for the alignment of all of the
8045 data references that may or may not be aligned. An additional
8046 sequence of runtime tests is generated for each pair of DDRs whose
8047 independence was not proven. The vectorized version of the loop is
8048 executed only if both the alias and the alignment tests pass.
8050 The test generated to check which version of the loop is executed
8051 is also modified to check for profitability, as indicated by the
8052 cost model. */
8054 static void
8055 vect_loop_versioning (loop_vec_info loop_vinfo)
8057 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8058 struct loop *nloop;
8059 tree cond_expr = NULL_TREE;
8060 gimple_seq cond_expr_stmt_list = NULL;
8061 basic_block condition_bb;
8062 gimple_stmt_iterator gsi, cond_exp_gsi;
8063 basic_block merge_bb;
8064 basic_block new_exit_bb;
8065 edge new_exit_e, e;
8066 gimple orig_phi, new_phi;
8067 tree arg;
8068 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
8069 gimple_seq gimplify_stmt_list = NULL;
8070 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
8071 int min_profitable_iters = 0;
8072 unsigned int th;
8074 /* Get profitability threshold for vectorized loop. */
8075 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
8077 th = conservative_cost_threshold (loop_vinfo,
8078 min_profitable_iters);
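/* Begin the versioning condition with the profitability check: the scalar iteration count must exceed the threshold TH. */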
8080 cond_expr =
8081 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
8082 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
8084 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
8085 false, NULL_TREE);
8087 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
8088 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
8089 &cond_expr_stmt_list);
8091 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8092 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
8093 &cond_expr_stmt_list);
8095 cond_expr =
8096 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
8097 cond_expr =
8098 force_gimple_operand (cond_expr, &gimplify_stmt_list, true, NULL_TREE);
8099 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
8101 initialize_original_copy_tables ();
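/* Version the loop: when COND_EXPR holds, the original LOOP (which will be vectorized) is executed; otherwise the scalar copy NLOOP is executed. */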
8102 nloop = loop_version (loop, cond_expr, &condition_bb,
8103 prob, prob, REG_BR_PROB_BASE - prob, true);
8104 free_original_copy_tables();
8106 /* Loop versioning violates an assumption we try to maintain during
8107 vectorization - that the loop exit block has a single predecessor.
8108 After versioning, the exit block of both loop versions is the same
8109 basic block (i.e. it has two predecessors). In order to simplify the
8110 following transformations in the vectorizer, we fix this situation
8111 here by adding a new (empty) block on the exit-edge of the loop,
8112 with the proper loop-exit phis to maintain loop-closed-form. */
8114 merge_bb = single_exit (loop)->dest;
8115 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
8116 new_exit_bb = split_edge (single_exit (loop));
8117 new_exit_e = single_exit (loop);
8118 e = EDGE_SUCC (new_exit_bb, 0);
8120 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8122 orig_phi = gsi_stmt (gsi);
8123 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
8124 new_exit_bb);
8125 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
8126 add_phi_arg (new_phi, arg, new_exit_e);
8127 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
8130 /* End loop-exit-fixes after versioning. */
8132 update_ssa (TODO_update_ssa);
8133 if (cond_expr_stmt_list)
8135 cond_exp_gsi = gsi_last_bb (condition_bb);
8136 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, GSI_SAME_STMT);
8140 /* Remove a group of stores (for SLP or interleaving), free their
8141 stmt_vec_info. */
8143 static void
8144 vect_remove_stores (gimple first_stmt)
8146 gimple next = first_stmt;
8147 gimple tmp;
8148 gimple_stmt_iterator next_si;
8150 while (next)
8152 /* Free the attached stmt_vec_info and remove the stmt. */
8153 next_si = gsi_for_stmt (next);
8154 gsi_remove (&next_si, true);
8155 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
8156 free_stmt_vec_info (next);
8157 next = tmp;
8162 /* Vectorize the SLP instance tree in postorder. */
8164 static bool
8165 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
8166 unsigned int vectorization_factor)
8168 gimple stmt;
8169 bool strided_store, is_store;
8170 gimple_stmt_iterator si;
8171 stmt_vec_info stmt_info;
8172 unsigned int vec_stmts_size, nunits, group_size;
8173 tree vectype;
8174 int i;
8175 slp_tree loads_node;
8177 if (!node)
8178 return false;
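/* Recursively schedule the left and right subtrees first (postorder). */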
8180 vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance,
8181 vectorization_factor);
8182 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance,
8183 vectorization_factor);
8185 stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
8186 stmt_info = vinfo_for_stmt (stmt);
8188 /* VECTYPE is the type of the destination. */
8189 vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt)));
8190 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
8191 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
8193 /* For each SLP instance calculate the number of vector stmts to be created
8194 for the scalar stmts in each node of the SLP tree. The number of vector
8195 elements in one vector iteration is the number of scalar elements in
8196 one scalar iteration (GROUP_SIZE) multiplied by VF and divided by the
8197 vector size. */
8198 vec_stmts_size = (vectorization_factor * group_size) / nunits;
8200 /* In case of load permutation we have to allocate vectorized statements for
8201 all the nodes that participate in that permutation. */
8202 if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
8204 for (i = 0;
8205 VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
8206 i++)
8208 if (!SLP_TREE_VEC_STMTS (loads_node))
8210 SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
8211 vec_stmts_size);
8212 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
8217 if (!SLP_TREE_VEC_STMTS (node))
8219 SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
8220 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
8223 if (vect_print_dump_info (REPORT_DETAILS))
8225 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
8226 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8229 /* Loads should be inserted before the first load. */
8230 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
8231 && STMT_VINFO_STRIDED_ACCESS (stmt_info)
8232 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
8233 si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
8234 else
8235 si = gsi_for_stmt (stmt);
8237 is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
8238 if (is_store)
8240 if (DR_GROUP_FIRST_DR (stmt_info))
8241 /* If IS_STORE is TRUE, the vectorization of the
8242 interleaving chain was completed - free all the stores in
8243 the chain. */
8244 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8245 else
8246 /* FORNOW: SLP originates only from strided stores. */
8247 gcc_unreachable ();
8249 return true;
8252 /* FORNOW: SLP originates only from strided stores. */
8253 return false;
8257 static bool
8258 vect_schedule_slp (loop_vec_info loop_vinfo)
8260 VEC (slp_instance, heap) *slp_instances =
8261 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8262 slp_instance instance;
8263 unsigned int i;
8264 bool is_store = false;
8266 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
8268 /* Schedule the tree of INSTANCE. */
8269 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
8270 instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
8272 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
8273 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
8274 fprintf (vect_dump, "vectorizing stmts using SLP.");
8277 return is_store;
8280 /* Function vect_transform_loop.
8282 The analysis phase has determined that the loop is vectorizable.
8283 Vectorize the loop - create vectorized stmts to replace the scalar
8284 stmts in the loop, and update the loop exit condition. */
8286 void
8287 vect_transform_loop (loop_vec_info loop_vinfo)
8289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8291 int nbbs = loop->num_nodes;
8292 gimple_stmt_iterator si;
8293 int i;
8294 tree ratio = NULL;
8295 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8296 bool strided_store;
8297 bool slp_scheduled = false;
8298 unsigned int nunits;
8300 if (vect_print_dump_info (REPORT_DETAILS))
8301 fprintf (vect_dump, "=== vect_transform_loop ===");
8303 if (VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
8304 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
8305 vect_loop_versioning (loop_vinfo);
8307 /* CHECKME: we wouldn't need this if we called update_ssa once
8308 for all loops. */
8309 bitmap_zero (vect_memsyms_to_rename);
8311 /* Peel the loop if there are data refs with unknown alignment.
8312 Only one data ref with unknown alignment is allowed. */
8314 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
8315 vect_do_peeling_for_alignment (loop_vinfo);
8317 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
8318 compile-time constant), or it is a constant that is not divisible by the
8319 vectorization factor, then an epilog loop needs to be created.
8320 We therefore duplicate the loop: the original loop will be vectorized,
8321 and will compute the first (n/VF) iterations. The second copy of the loop
8322 will remain scalar and will compute the remaining (n%VF) iterations.
8323 (VF is the vectorization factor). */
8325 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8326 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8327 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
8328 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
8329 else
8330 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8331 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
8333 /* 1) Make sure the loop header has exactly two entries
8334 2) Make sure we have a preheader basic block. */
8336 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8338 split_edge (loop_preheader_edge (loop));
8340 /* FORNOW: the vectorizer supports only loops whose body consists
8341 of one basic block (header + empty latch). When the vectorizer
8342 supports more involved loop forms, the order in which the BBs are
8343 traversed will need to be reconsidered. */
8345 for (i = 0; i < nbbs; i++)
8347 basic_block bb = bbs[i];
8348 stmt_vec_info stmt_info;
8349 gimple phi;
8351 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
8353 phi = gsi_stmt (si);
8354 if (vect_print_dump_info (REPORT_DETAILS))
8356 fprintf (vect_dump, "------>vectorizing phi: ");
8357 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
8359 stmt_info = vinfo_for_stmt (phi);
8360 if (!stmt_info)
8361 continue;
8363 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8364 && !STMT_VINFO_LIVE_P (stmt_info))
8365 continue;
8367 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
8368 != (unsigned HOST_WIDE_INT) vectorization_factor)
8369 && vect_print_dump_info (REPORT_DETAILS))
8370 fprintf (vect_dump, "multiple-types.");
8372 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
8374 if (vect_print_dump_info (REPORT_DETAILS))
8375 fprintf (vect_dump, "transform phi.");
8376 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8380 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
8382 gimple stmt = gsi_stmt (si);
8383 bool is_store;
8385 if (vect_print_dump_info (REPORT_DETAILS))
8387 fprintf (vect_dump, "------>vectorizing statement: ");
8388 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
8391 stmt_info = vinfo_for_stmt (stmt);
8393 /* Vector stmts created in the outer loop during vectorization of
8394 stmts in an inner loop may not have a stmt_info, and do not
8395 need to be vectorized. */
8396 if (!stmt_info)
8398 gsi_next (&si);
8399 continue;
8402 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8403 && !STMT_VINFO_LIVE_P (stmt_info))
8405 gsi_next (&si);
8406 continue;
8409 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
8410 nunits =
8411 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8412 if (!STMT_SLP_TYPE (stmt_info)
8413 && nunits != (unsigned int) vectorization_factor
8414 && vect_print_dump_info (REPORT_DETAILS))
8415 /* For SLP, VF is set according to the unrolling factor, and not to
8416 the vector size; hence this message is not valid for SLP. */
8417 fprintf (vect_dump, "multiple-types.");
8419 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8420 reached. */
8421 if (STMT_SLP_TYPE (stmt_info))
8423 if (!slp_scheduled)
8425 slp_scheduled = true;
8427 if (vect_print_dump_info (REPORT_DETAILS))
8428 fprintf (vect_dump, "=== scheduling SLP instances ===");
8430 is_store = vect_schedule_slp (loop_vinfo);
8432 /* IS_STORE is true if STMT is a store. Stores cannot be of
8433 hybrid SLP type. They are removed in
8434 vect_schedule_slp_instance and their vinfo is destroyed. */
8435 if (is_store)
8437 gsi_next (&si);
8438 continue;
8442 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8443 if (PURE_SLP_STMT (stmt_info))
8445 gsi_next (&si);
8446 continue;
8450 /* -------- vectorize statement ------------ */
8451 if (vect_print_dump_info (REPORT_DETAILS))
8452 fprintf (vect_dump, "transform statement.");
8454 strided_store = false;
8455 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
8456 if (is_store)
8458 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
8460 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8461 interleaving chain was completed - free all the stores in
8462 the chain. */
8463 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
8464 gsi_remove (&si, true);
8465 continue;
8467 else
8469 /* Free the attached stmt_vec_info and remove the stmt. */
8470 free_stmt_vec_info (stmt);
8471 gsi_remove (&si, true);
8472 continue;
8475 gsi_next (&si);
8476 } /* stmts in BB */
8477 } /* BBs in loop */
8479 slpeel_make_loop_iterate_ntimes (loop, ratio);
8481 mark_set_for_renaming (vect_memsyms_to_rename);
8483 /* The memory tags and pointers in vectorized statements need to
8484 have their SSA forms updated. FIXME, why can't this be delayed
8485 until all the loops have been transformed? */
8486 update_ssa (TODO_update_ssa);
8488 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8489 fprintf (vect_dump, "LOOP VECTORIZED.");
8490 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
8491 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");