/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software
   Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */
  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in the prologue and
     epilogue loops.

     FORNOW: If we don't know the value of peel_iters for the prologue or
     epilogue at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for the prologue
     and epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, the loop bound of the main
         loop becomes unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown .");

      /* If peeled iterations are unknown, count a taken branch and a not
         taken branch per peeled loop.  Even if scalar loop iterations are
         known, vector iterations are not known since peeled prologue
         iterations are not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);
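          /* As an illustration (with made-up numbers, not taken from any
             particular target): for a V4SI access (nelements == 4,
             element_size == 4) whose misalignment is known to be 8 bytes,
             the computation below peels 4 - (8 / 4) == 2 prologue
             iterations to reach an aligned address.  */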
          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown .");

          /* If peeled iterations are known but the number of scalar loop
             iterations is unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;
  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDed with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }
  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run-time cost model check.  */
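  /* A worked example with purely illustrative costs: SIC = 4, VIC = 6,
     SOC = 0, VOC = 10, VF = 4 and no peeling.  The quotient below gives
     (10 * 4) / (4 * 4 - 6) == 4, and since 4 * 4 * 4 <= 6 * 4 + 10 * 4
     the correction bumps min_profitable_iters to 5, the first value for
     which 4 * niters > 6 * (niters / 4) + 10 holds.  */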
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }
  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;
  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;
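  /* E.g. (hypothetical numbers): if the computation above yielded
     min_profitable_iters = 5 with vf = 4, the clamp keeps 5 and the
     decrement returns a threshold of 4, so the vector loop is entered
     only when niters > 4.  */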
  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
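  /* To illustrate with made-up numbers: reducing a V8HI (nelements == 8)
     via whole-vector shifts costs log2(8) * 2 == 6 vector stmts plus one
     vec-to-scalar extract, whereas the extract-based fallback costs
     8 + 7 == 15 vector stmts.  */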
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}


/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmt.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}


/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;
  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }
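  /* For example (illustrative only): interleaving a group of 4 stores
     requires log2(4) == 2 stages of 4 interleave stmts each, i.e.
     8 permute stmts per copy.  */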
  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses an even and odd extract operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }
  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2 * TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2 * TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }
    default:
      gcc_unreachable ();
    }
  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_get_new_vect_var.

   Create and return a new temporary variable.  The current naming scheme
   prepends the prefix "vect_", "stmp_" or "vect_p" (depending on the value
   of VAR_KIND) to NAME, if NAME is provided; otherwise the prefix alone is
   used.  For example, a vect_simple_var for scalar name "x" is named
   "vect_x".  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:  Specify relative to which loop-nest the address should be computed.
          For example, when the dataref is in an inner-loop nested in an
          outer-loop that is now being vectorized, LOOP can be either the
          outer-loop, or the inner-loop.  The first memory location accessed
          by the following dataref ('in' points to short):

          for (i=0; i<N; i++)
            for (j=0; j<M; j++)
              s += in[i+j]

          is as follows:
          if LOOP=i_loop:  &in            (relative to i_loop)
          if LOOP=j_loop:  &in+i*2B       (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must
      insert these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree new_base_stmt;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
                                        true, data_ref_base_var);
  append_to_statement_list_force (new_base_stmt, new_stmt_list);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

static tree
vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  block_stmt_iterator incr_bsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  tree incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }
  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: one relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also
      the inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
        inner:     vp3 = phi(vp1,vp4)
                   vp4 = vp3 + inner_step
                   if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP   */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);
  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop
          vectorization).  **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_bsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR_INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration
              of the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (POINTER_PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}


/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type,
                  block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  if (bsi)
    vect_finish_stmt_generation (stmt, init_stmt, bsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the
   scalar stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  tree stmt = VEC_index (tree, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop, operation;
  int group_size = VEC_length (tree, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
        {
          operation = GIMPLE_STMT_OPERAND (stmt, 1);
          if (is_store)
            op = operation;
          else
            op = TREE_OPERAND (operation, op_num);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              constant_p = true;
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);
  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
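  /* E.g. (illustrative): if only one vector {s1, s2, s1, s2} was created
     above but four vector stmts are needed, the loop below pushes the
     same vector three more times.  */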
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  tree vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them
   and call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized
   stmts must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain
   scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  tree operation, first_stmt;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
  if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
/* Function get_initial_def_for_induction

   Input:
   IV_PHI - the original scalar induction phi in the loop.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with initial value X and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  tree stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  tree exit_phi;
  edge latch_e;
  tree loop_arg;
  block_stmt_iterator si;
  basic_block bb = bb_for_stmt (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);
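  /* E.g. (illustrative): with VF == 8 and a V4SI induction (nunits == 4),
     ncopies == 2 and the induction must be "unrolled" into two vector
     stmts per iteration; see the ncopies > 1 handling below.  */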
  /* Find the first insertion point in the BB.  */
  si = bsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = bsi_insert_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          tree tmp;

          /* Create: new_name_i = new_name + step_expr  */
          tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
          init_stmt = build_gimple_modify_stmt (new_var, tmp);
          new_name = make_ssa_name (new_var, init_stmt);
          GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

          new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }

  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1700 /* Create the following def-use cycle:
1701 loop prolog:
1702 vec_init = ...
1703 vec_step = ...
1704 loop:
1705 vec_iv = PHI <vec_init, vec_loop>
1707 STMT
1709 vec_loop = vec_iv + vec_step; */
1711 /* Create the induction-phi that defines the induction-operand. */
1712 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1713 add_referenced_var (vec_dest);
1714 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1715 set_stmt_info (get_stmt_ann (induction_phi),
1716 new_stmt_vec_info (induction_phi, loop_vinfo));
1717 induc_def = PHI_RESULT (induction_phi);
1719 /* Create the iv update inside the loop */
1720 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1721 build2 (PLUS_EXPR, vectype,
1722 induc_def, vec_step));
1723 vec_def = make_ssa_name (vec_dest, new_stmt);
1724 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1725 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1726 set_stmt_info (get_stmt_ann (new_stmt),
1727 new_stmt_vec_info (new_stmt, loop_vinfo));
1729 /* Set the arguments of the phi node: */
1730 add_phi_arg (induction_phi, vec_init, pe);
1731 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1734 /* In case the vectorization factor (VF) is bigger than the number
1735 of elements that we can fit in a vectype (nunits), we have to generate
1736 more than one vector stmt - i.e., we need to "unroll" the
1737 vector stmt by a factor VF/nunits. For more details see the documentation
1738 in vectorizable_operation. */
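/* For illustration (hypothetical values): with VF = 8, nunits = 4 and
   step S, ncopies = 2. The second copy is obtained from the first by
   adding the rebuilt step vector [nunits*S, ...]:
   vec_iv   = PHI <vec_init, vec_loop>
   vec_iv.1 = vec_iv + [4*S, 4*S, 4*S, 4*S]  */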
1740 if (ncopies > 1)
1742 stmt_vec_info prev_stmt_vinfo;
1743 /* FORNOW. This restriction should be relaxed. */
1744 gcc_assert (!nested_in_vect_loop);
1746 /* Create the vector that holds the step of the induction. */
1747 expr = build_int_cst (scalar_type, nunits);
1748 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1749 t = NULL_TREE;
1750 for (i = 0; i < nunits; i++)
1751 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1752 gcc_assert (CONSTANT_CLASS_P (new_name));
1753 vec = build_vector (vectype, t);
1754 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1756 vec_def = induc_def;
1757 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1758 for (i = 1; i < ncopies; i++)
1760 tree tmp;
1762 /* vec_i = vec_prev + vec_step */
1763 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1764 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1765 vec_def = make_ssa_name (vec_dest, new_stmt);
1766 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1767 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1768 set_stmt_info (get_stmt_ann (new_stmt),
1769 new_stmt_vec_info (new_stmt, loop_vinfo));
1770 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1771 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1775 if (nested_in_vect_loop)
1777 /* Find the loop-closed exit-phi of the induction, and record
1778 the final vector of induction results: */
1779 exit_phi = NULL;
1780 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1782 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1784 exit_phi = USE_STMT (use_p);
1785 break;
1788 if (exit_phi)
1790 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1791 /* FORNOW. We do not yet support the case in which an inner-loop induction
1792 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
1793 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1794 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1796 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1797 if (vect_print_dump_info (REPORT_DETAILS))
1799 fprintf (vect_dump, "vector of inductions after inner-loop:");
1800 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1806 if (vect_print_dump_info (REPORT_DETAILS))
1808 fprintf (vect_dump, "transform induction: created def-use cycle:");
1809 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1810 fprintf (vect_dump, "\n");
1811 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1814 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1815 return induc_def;
1819 /* Function vect_get_vec_def_for_operand.
1821 OP is an operand in STMT. This function returns a (vector) def that will be
1822 used in the vectorized stmt for STMT.
1824 In the case that OP is an SSA_NAME which is defined in the loop, then
1825 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1827 In case OP is an invariant or constant, a new stmt that creates a vector def
1828 needs to be introduced. */
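/* For illustration (hypothetical values): for a constant operand '5'
   used by a stmt with vectype V4SI, this materializes
   'vect_cst_ = {5,5,5,5}' once, outside the loop, and returns its
   ssa name; for a loop-defined operand it simply returns the def
   recorded in the STMT_VINFO_VEC_STMT of the defining stmt.  */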
1830 static tree
1831 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1833 tree vec_oprnd;
1834 tree vec_stmt;
1835 tree def_stmt;
1836 stmt_vec_info def_stmt_info = NULL;
1837 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1838 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1839 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1840 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1841 tree vec_inv;
1842 tree vec_cst;
1843 tree t = NULL_TREE;
1844 tree def;
1845 int i;
1846 enum vect_def_type dt;
1847 bool is_simple_use;
1848 tree vector_type;
1850 if (vect_print_dump_info (REPORT_DETAILS))
1852 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1853 print_generic_expr (vect_dump, op, TDF_SLIM);
1856 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1857 gcc_assert (is_simple_use);
1858 if (vect_print_dump_info (REPORT_DETAILS))
1860 if (def)
1862 fprintf (vect_dump, "def = ");
1863 print_generic_expr (vect_dump, def, TDF_SLIM);
1865 if (def_stmt)
1867 fprintf (vect_dump, " def_stmt = ");
1868 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1872 switch (dt)
1874 /* Case 1: operand is a constant. */
1875 case vect_constant_def:
1877 if (scalar_def)
1878 *scalar_def = op;
1880 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1881 if (vect_print_dump_info (REPORT_DETAILS))
1882 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1884 for (i = nunits - 1; i >= 0; --i)
1886 t = tree_cons (NULL_TREE, op, t);
1888 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1889 gcc_assert (vector_type);
1890 vec_cst = build_vector (vector_type, t);
1892 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1895 /* Case 2: operand is defined outside the loop - loop invariant. */
1896 case vect_invariant_def:
1898 if (scalar_def)
1899 *scalar_def = def;
1901 /* Create 'vec_inv = {inv,inv,..,inv}' */
1902 if (vect_print_dump_info (REPORT_DETAILS))
1903 fprintf (vect_dump, "Create vector_inv.");
1905 for (i = nunits - 1; i >= 0; --i)
1907 t = tree_cons (NULL_TREE, def, t);
1910 /* FIXME: use build_constructor directly. */
1911 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1912 gcc_assert (vector_type);
1913 vec_inv = build_constructor_from_list (vector_type, t);
1914 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1917 /* Case 3: operand is defined inside the loop. */
1918 case vect_loop_def:
1920 if (scalar_def)
1921 *scalar_def = def_stmt;
1923 /* Get the def from the vectorized stmt. */
1924 def_stmt_info = vinfo_for_stmt (def_stmt);
1925 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1926 gcc_assert (vec_stmt);
1927 if (TREE_CODE (vec_stmt) == PHI_NODE)
1928 vec_oprnd = PHI_RESULT (vec_stmt);
1929 else
1930 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1931 return vec_oprnd;
1934 /* Case 4: operand is defined by a loop-header phi - reduction. */
1935 case vect_reduction_def:
1937 struct loop *loop;
1939 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1940 loop = (bb_for_stmt (def_stmt))->loop_father;
1942 /* Get the def before the loop */
1943 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1944 return get_initial_def_for_reduction (stmt, op, scalar_def);
1947 /* Case 5: operand is defined by loop-header phi - induction. */
1948 case vect_induction_def:
1950 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1952 /* Get the def from the vectorized stmt. */
1953 def_stmt_info = vinfo_for_stmt (def_stmt);
1954 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1955 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1956 vec_oprnd = PHI_RESULT (vec_stmt);
1957 return vec_oprnd;
1960 default:
1961 gcc_unreachable ();
1966 /* Function vect_get_vec_def_for_stmt_copy
1968 Return a vector-def for an operand. This function is used when the
1969 vectorized stmt to be created (by the caller to this function) is a "copy"
1970 created in case the vectorized result cannot fit in one vector, and several
1971 copies of the vector-stmt are required. In this case the vector-def is
1972 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1973 of the stmt that defines VEC_OPRND.
1974 DT is the type of the vector def VEC_OPRND.
1976 Context:
1977 In case the vectorization factor (VF) is bigger than the number
1978 of elements that can fit in a vectype (nunits), we have to generate
1979 more than one vector stmt to vectorize the scalar stmt. This situation
1980 arises when there are multiple data-types operated upon in the loop; the
1981 smallest data-type determines the VF, and as a result, when vectorizing
1982 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1983 vector stmt (each computing a vector of 'nunits' results, and together
1984 computing 'VF' results in each iteration). This function is called when
1985 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1986 which VF=16 and nunits=4, so the number of copies required is 4):
1988 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1990 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1991 VS1.1: vx.1 = memref1 VS1.2
1992 VS1.2: vx.2 = memref2 VS1.3
1993 VS1.3: vx.3 = memref3
1995 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1996 VSnew.1: vz1 = vx.1 + ... VSnew.2
1997 VSnew.2: vz2 = vx.2 + ... VSnew.3
1998 VSnew.3: vz3 = vx.3 + ...
2000 The vectorization of S1 is explained in vectorizable_load.
2001 The vectorization of S2:
2002 To create the first vector-stmt out of the 4 copies - VSnew.0 -
2003 the function 'vect_get_vec_def_for_operand' is called to
2004 get the relevant vector-def for each operand of S2. For operand x it
2005 returns the vector-def 'vx.0'.
2007 To create the remaining copies of the vector-stmt (VSnew.j), this
2008 function is called to get the relevant vector-def for each operand. It is
2009 obtained from the respective VS1.j stmt, which is recorded in the
2010 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2012 For example, to obtain the vector-def 'vx.1' in order to create the
2013 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2014 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2015 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2016 and return its def ('vx.1').
2017 Overall, to create the above sequence this function will be called 3 times:
2018 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2019 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2020 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2022 static tree
2023 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2025 tree vec_stmt_for_operand;
2026 stmt_vec_info def_stmt_info;
2028 /* Do nothing; can reuse same def. */
2029 if (dt == vect_invariant_def || dt == vect_constant_def )
2030 return vec_oprnd;
2032 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2033 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2034 gcc_assert (def_stmt_info);
2035 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2036 gcc_assert (vec_stmt_for_operand);
2037 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2038 return vec_oprnd;
2042 /* Get vectorized definitions for the operands to create a copy of an original
2043 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2045 static void
2046 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2047 VEC(tree,heap) **vec_oprnds0,
2048 VEC(tree,heap) **vec_oprnds1)
2050 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2052 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2053 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2055 if (vec_oprnds1 && *vec_oprnds1)
2057 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2058 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2059 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2064 /* Get vectorized definitions for OP0 and OP1, or from SLP_NODE if it is not NULL. */
2066 static void
2067 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2068 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2070 if (slp_node)
2071 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2072 else
2074 tree vec_oprnd;
2076 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2077 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2078 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2080 if (op1)
2082 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2083 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2084 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2090 /* Function vect_finish_stmt_generation.
2092 Insert a new stmt. */
2094 static void
2095 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2096 block_stmt_iterator *bsi)
2098 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2099 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2101 gcc_assert (stmt == bsi_stmt (*bsi));
2102 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2104 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2106 set_stmt_info (get_stmt_ann (vec_stmt),
2107 new_stmt_vec_info (vec_stmt, loop_vinfo));
2109 if (vect_print_dump_info (REPORT_DETAILS))
2111 fprintf (vect_dump, "add new stmt: ");
2112 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2115 /* Make sure bsi points to the stmt that is being vectorized. */
2116 gcc_assert (stmt == bsi_stmt (*bsi));
2118 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2122 /* Function get_initial_def_for_reduction
2124 Input:
2125 STMT - a stmt that performs a reduction operation in the loop.
2126 INIT_VAL - the initial value of the reduction variable
2128 Output:
2129 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2130 of the reduction (used for adjusting the epilog - see below).
2131 Return a vector variable, initialized according to the operation that STMT
2132 performs. This vector will be used as the initial value of the
2133 vector of partial results.
2135 Option1 (adjust in epilog): Initialize the vector as follows:
2136 add: [0,0,...,0,0]
2137 mult: [1,1,...,1,1]
2138 min/max: [init_val,init_val,..,init_val,init_val]
2139 bit and/or: [init_val,init_val,..,init_val,init_val]
2140 and when necessary (e.g. add/mult case) let the caller know
2141 that it needs to adjust the result by init_val.
2143 Option2: Initialize the vector as follows:
2144 add: [0,0,...,0,init_val]
2145 mult: [1,1,...,1,init_val]
2146 min/max: [init_val,init_val,...,init_val]
2147 bit and/or: [init_val,init_val,...,init_val]
2148 and no adjustments are needed.
2150 For example, for the following code:
2152 s = init_val;
2153 for (i=0;i<n;i++)
2154 s = s + a[i];
2156 STMT is 's = s + a[i]', and the reduction variable is 's'.
2157 For a vector of 4 units, we want to return either [0,0,0,init_val],
2158 or [0,0,0,0] and let the caller know that it needs to adjust
2159 the result at the end by 'init_val'.
2161 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2162 initialization vector is simpler (same element in all entries).
2163 A cost model should help decide between these two schemes. */
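/* For illustration (hypothetical values): for the sum above with
   init_val = 7 and vectype V4SI, the 'adjust in epilog' scheme
   returns [0,0,0,0] and sets ADJUSTMENT_DEF to 7, so that the epilog
   computes (p0+p1+p2+p3) + 7 from the partial sums p0..p3; option2
   would instead start from [0,0,0,7] and need no adjustment.  */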
2165 static tree
2166 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2168 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2169 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2170 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2171 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2172 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2173 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2174 tree type = TREE_TYPE (init_val);
2175 tree vecdef;
2176 tree def_for_init;
2177 tree init_def;
2178 tree t = NULL_TREE;
2179 int i;
2180 tree vector_type;
2181 bool nested_in_vect_loop = false;
2183 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2184 if (nested_in_vect_loop_p (loop, stmt))
2185 nested_in_vect_loop = true;
2186 else
2187 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2189 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2191 switch (code)
2193 case WIDEN_SUM_EXPR:
2194 case DOT_PROD_EXPR:
2195 case PLUS_EXPR:
2196 if (nested_in_vect_loop)
2197 *adjustment_def = vecdef;
2198 else
2199 *adjustment_def = init_val;
2200 /* Create a vector of zeros for init_def. */
2201 if (SCALAR_FLOAT_TYPE_P (type))
2202 def_for_init = build_real (type, dconst0);
2203 else
2204 def_for_init = build_int_cst (type, 0);
2205 for (i = nunits - 1; i >= 0; --i)
2206 t = tree_cons (NULL_TREE, def_for_init, t);
2207 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2208 gcc_assert (vector_type);
2209 init_def = build_vector (vector_type, t);
2210 break;
2212 case MIN_EXPR:
2213 case MAX_EXPR:
2214 *adjustment_def = NULL_TREE;
2215 init_def = vecdef;
2216 break;
2218 default:
2219 gcc_unreachable ();
2222 return init_def;
2226 /* Function vect_create_epilog_for_reduction
2228 Create code at the loop-epilog to finalize the result of a reduction
2229 computation.
2231 VECT_DEF is a vector of partial results.
2232 REDUC_CODE is the tree-code for the epilog reduction.
2233 STMT is the scalar reduction stmt that is being vectorized.
2234 REDUCTION_PHI is the phi-node that carries the reduction computation.
2236 This function:
2237 1. Creates the reduction def-use cycle: sets the arguments for
2238 REDUCTION_PHI:
2239 The loop-entry argument is the vectorized initial-value of the reduction.
2240 The loop-latch argument is VECT_DEF - the vector of partial sums.
2241 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2242 by applying the operation specified by REDUC_CODE if available, or by
2243 other means (whole-vector shifts or a scalar loop).
2244 The function also creates a new phi node at the loop exit to preserve
2245 loop-closed form, as illustrated below.
2247 The flow at the entry to this function:
2249 loop:
2250 vec_def = phi <null, null> # REDUCTION_PHI
2251 VECT_DEF = vector_stmt # vectorized form of STMT
2252 s_loop = scalar_stmt # (scalar) STMT
2253 loop_exit:
2254 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2255 use <s_out0>
2256 use <s_out0>
2258 The above is transformed by this function into:
2260 loop:
2261 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2262 VECT_DEF = vector_stmt # vectorized form of STMT
2263 s_loop = scalar_stmt # (scalar) STMT
2264 loop_exit:
2265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2267 v_out2 = reduce <v_out1>
2268 s_out3 = extract_field <v_out2, 0>
2269 s_out4 = adjust_result <s_out3>
2270 use <s_out4>
2271 use <s_out4>
2274 static void
2275 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2276 enum tree_code reduc_code, tree reduction_phi)
2278 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2279 tree vectype;
2280 enum machine_mode mode;
2281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2283 basic_block exit_bb;
2284 tree scalar_dest;
2285 tree scalar_type;
2286 tree new_phi;
2287 block_stmt_iterator exit_bsi;
2288 tree vec_dest;
2289 tree new_temp = NULL_TREE;
2290 tree new_name;
2291 tree epilog_stmt = NULL_TREE;
2292 tree new_scalar_dest, exit_phi, new_dest;
2293 tree bitsize, bitpos, bytesize;
2294 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2295 tree adjustment_def;
2296 tree vec_initial_def;
2297 tree orig_name;
2298 imm_use_iterator imm_iter;
2299 use_operand_p use_p;
2300 bool extract_scalar_result = false;
2301 tree reduction_op, expr;
2302 tree orig_stmt;
2303 tree use_stmt;
2304 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2305 bool nested_in_vect_loop = false;
2306 int op_type;
2307 VEC(tree,heap) *phis = NULL;
2308 int i;
2310 if (nested_in_vect_loop_p (loop, stmt))
2312 loop = loop->inner;
2313 nested_in_vect_loop = true;
2316 op_type = TREE_OPERAND_LENGTH (operation);
2317 reduction_op = TREE_OPERAND (operation, op_type-1);
2318 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2319 gcc_assert (vectype);
2320 mode = TYPE_MODE (vectype);
2322 /*** 1. Create the reduction def-use cycle ***/
2324 /* 1.1 set the loop-entry arg of the reduction-phi: */
2325 /* For the case of reduction, vect_get_vec_def_for_operand returns
2326 the scalar def before the loop that defines the initial value
2327 of the reduction variable. */
2328 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2329 &adjustment_def);
2330 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2332 /* 1.2 set the loop-latch arg for the reduction-phi: */
2333 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2335 if (vect_print_dump_info (REPORT_DETAILS))
2337 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2338 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2339 fprintf (vect_dump, "\n");
2340 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2344 /*** 2. Create epilog code
2345 The reduction epilog code operates across the elements of the vector
2346 of partial results computed by the vectorized loop.
2347 The reduction epilog code consists of:
2348 step 1: compute the scalar result in a vector (v_out2)
2349 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2350 step 3: adjust the scalar result (s_out3) if needed.
2352 Step 1 can be accomplished using one of the following three schemes:
2353 (scheme 1) using reduc_code, if available.
2354 (scheme 2) using whole-vector shifts, if available.
2355 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2356 combined.
2358 The overall epilog code looks like this:
2360 s_out0 = phi <s_loop> # original EXIT_PHI
2361 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2362 v_out2 = reduce <v_out1> # step 1
2363 s_out3 = extract_field <v_out2, 0> # step 2
2364 s_out4 = adjust_result <s_out3> # step 3
2366 (step 3 is optional, and steps 1 and 2 may be combined).
2367 Lastly, the uses of s_out0 are replaced by s_out4.
2369 ***/
2371 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2372 v_out1 = phi <v_loop> */
2374 exit_bb = single_exit (loop)->dest;
2375 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2376 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2377 exit_bsi = bsi_after_labels (exit_bb);
2379 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2380 (i.e. when reduc_code is not available) and in the final adjustment
2381 code (if needed). Also get the original scalar reduction variable as
2382 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2383 represents a reduction pattern), the tree-code and scalar-def are
2384 taken from the original stmt that the pattern-stmt (STMT) replaces.
2385 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2386 are taken from STMT. */
2388 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2389 if (!orig_stmt)
2391 /* Regular reduction */
2392 orig_stmt = stmt;
2394 else
2396 /* Reduction pattern */
2397 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2398 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2399 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2401 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2402 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2403 scalar_type = TREE_TYPE (scalar_dest);
2404 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2405 bitsize = TYPE_SIZE (scalar_type);
2406 bytesize = TYPE_SIZE_UNIT (scalar_type);
2409 /* In case this is a reduction in an inner-loop while vectorizing an outer
2410 loop, we don't need to extract a single scalar result at the end of the
2411 inner-loop. The final vector of partial results will be used in the
2412 vectorized outer-loop, or reduced to a scalar result at the end of the
2413 outer-loop. */
2414 if (nested_in_vect_loop)
2415 goto vect_finalize_reduction;
2417 /* 2.3 Create the reduction code, using one of the three schemes described
2418 above. */
2420 if (reduc_code < NUM_TREE_CODES)
2422 tree tmp;
2424 /*** Case 1: Create:
2425 v_out2 = reduc_expr <v_out1> */
2427 if (vect_print_dump_info (REPORT_DETAILS))
2428 fprintf (vect_dump, "Reduce using direct vector reduction.");
2430 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2431 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2432 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2433 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2434 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2435 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2437 extract_scalar_result = true;
2439 else
2441 enum tree_code shift_code = 0;
2442 bool have_whole_vector_shift = true;
2443 int bit_offset;
2444 int element_bitsize = tree_low_cst (bitsize, 1);
2445 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2446 tree vec_temp;
2448 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2449 shift_code = VEC_RSHIFT_EXPR;
2450 else
2451 have_whole_vector_shift = false;
2453 /* Regardless of whether we have a whole vector shift, if we're
2454 emulating the operation via tree-vect-generic, we don't want
2455 to use it. Only the first round of the reduction is likely
2456 to still be profitable via emulation. */
2457 /* ??? It might be better to emit a reduction tree code here, so that
2458 tree-vect-generic can expand the first round via bit tricks. */
2459 if (!VECTOR_MODE_P (mode))
2460 have_whole_vector_shift = false;
2461 else
2463 optab optab = optab_for_tree_code (code, vectype);
2464 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2465 have_whole_vector_shift = false;
2468 if (have_whole_vector_shift)
2470 /*** Case 2: Create:
2471 for (offset = VS/2; offset >= element_size; offset/=2)
2473 Create: va' = vec_shift <va, offset>
2474 Create: va = vop <va, va'>
2475 } */
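/* For illustration (hypothetical values): a V4SI sum with 32-bit
   elements and a 128-bit vector takes two rounds:
   round 1 (offset 64): va' = vec_shift <va, 64>;  va = va + va'
   round 2 (offset 32): va' = vec_shift <va, 32>;  va = va + va'
   leaving one element of va holding a0+a1+a2+a3, which step 2.4
   below extracts.  */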
2477 if (vect_print_dump_info (REPORT_DETAILS))
2478 fprintf (vect_dump, "Reduce using vector shifts");
2480 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2481 new_temp = PHI_RESULT (new_phi);
2483 for (bit_offset = vec_size_in_bits/2;
2484 bit_offset >= element_bitsize;
2485 bit_offset /= 2)
2487 tree bitpos = size_int (bit_offset);
2488 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2489 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2490 new_name = make_ssa_name (vec_dest, epilog_stmt);
2491 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2492 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2494 tmp = build2 (code, vectype, new_name, new_temp);
2495 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2496 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2497 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2498 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2501 extract_scalar_result = true;
2503 else
2505 tree rhs;
2507 /*** Case 3: Create:
2508 s = extract_field <v_out2, 0>
2509 for (offset = element_size;
2510 offset < vector_size;
2511 offset += element_size;)
2513 Create: s' = extract_field <v_out2, offset>
2514 Create: s = op <s, s'>
2515 } */
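/* For illustration (hypothetical values): for V4SI this emits
   s = BIT_FIELD_REF <v, 32, 0>;
   s = op (s, BIT_FIELD_REF <v, 32, 32>);
   s = op (s, BIT_FIELD_REF <v, 32, 64>);
   s = op (s, BIT_FIELD_REF <v, 32, 96>);
   combining steps 1 and 2 of the epilog in a single scalar chain.  */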
2517 if (vect_print_dump_info (REPORT_DETAILS))
2518 fprintf (vect_dump, "Reduce using scalar code. ");
2520 vec_temp = PHI_RESULT (new_phi);
2521 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2522 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2523 bitsize_zero_node);
2524 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2525 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2526 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2527 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2529 for (bit_offset = element_bitsize;
2530 bit_offset < vec_size_in_bits;
2531 bit_offset += element_bitsize)
2533 tree tmp;
2534 tree bitpos = bitsize_int (bit_offset);
2535 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2536 bitpos);
2538 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2539 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2540 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2541 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2543 tmp = build2 (code, scalar_type, new_name, new_temp);
2544 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2545 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2546 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2547 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2550 extract_scalar_result = false;
2554 /* 2.4 Extract the final scalar result. Create:
2555 s_out3 = extract_field <v_out2, bitpos> */
2557 if (extract_scalar_result)
2559 tree rhs;
2561 gcc_assert (!nested_in_vect_loop);
2562 if (vect_print_dump_info (REPORT_DETAILS))
2563 fprintf (vect_dump, "extract scalar result");
2565 if (BYTES_BIG_ENDIAN)
2566 bitpos = size_binop (MULT_EXPR,
2567 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2568 TYPE_SIZE (scalar_type));
2569 else
2570 bitpos = bitsize_zero_node;
2572 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2573 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2574 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2575 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2576 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2579 vect_finalize_reduction:
2581 /* 2.5 Adjust the final result by the initial value of the reduction
2582 variable. (When such adjustment is not needed, then
2583 'adjustment_def' is zero). For example, if code is PLUS we create:
2584 new_temp = loop_exit_def + adjustment_def */
2586 if (adjustment_def)
2588 if (nested_in_vect_loop)
2590 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2591 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2592 new_dest = vect_create_destination_var (scalar_dest, vectype);
2594 else
2596 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2597 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2598 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2600 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2601 new_temp = make_ssa_name (new_dest, epilog_stmt);
2602 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2603 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2607 /* 2.6 Handle the loop-exit phi */
2609 /* Replace uses of s_out0 with uses of s_out3:
2610 Find the loop-closed-use at the loop exit of the original scalar result.
2611 (The reduction result is expected to have two immediate uses - one at the
2612 latch block, and one at the loop exit). */
2613 phis = VEC_alloc (tree, heap, 10);
2614 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2616 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2618 exit_phi = USE_STMT (use_p);
2619 VEC_quick_push (tree, phis, exit_phi);
2622 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2623 gcc_assert (!VEC_empty (tree, phis));
2625 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2627 if (nested_in_vect_loop)
2629 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2631 /* FORNOW. We do not yet support the case in which an inner-loop reduction
2632 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
2633 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2634 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2636 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2637 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2638 set_stmt_info (get_stmt_ann (epilog_stmt),
2639 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2640 continue;
2643 /* Replace the uses: */
2644 orig_name = PHI_RESULT (exit_phi);
2645 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2646 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2647 SET_USE (use_p, new_temp);
2649 VEC_free (tree, heap, phis);
2653 /* Function vectorizable_reduction.
2655 Check if STMT performs a reduction operation that can be vectorized.
2656 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2657 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2658 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2660 This function also handles reduction idioms (patterns) that have been
2661 recognized in advance during vect_pattern_recog. In this case, STMT may be
2662 of this form:
2663 X = pattern_expr (arg0, arg1, ..., X)
2664 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2665 sequence that had been detected and replaced by the pattern-stmt (STMT).
2667 In some cases of reduction patterns, the type of the reduction variable X is
2668 different from the type of the other arguments of STMT.
2669 In such cases, the vectype that is used when transforming STMT into a vector
2670 stmt is different from the vectype that is used to determine the
2671 vectorization factor, because it consists of a different number of elements
2672 than the actual number of elements that are being operated upon in parallel.
2674 For example, consider an accumulation of shorts into an int accumulator.
2675 On some targets it's possible to vectorize this pattern operating on 8
2676 shorts at a time (hence, the vectype for purposes of determining the
2677 vectorization factor should be V8HI); on the other hand, the vectype that
2678 is used to create the vector form is actually V4SI (the type of the result).
2680 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2681 indicates the actual level of parallelism (V8HI in the example), so
2682 that the right vectorization factor can be derived. This vectype
2683 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2684 be used to create the vectorized stmt. The right vectype for the vectorized
2685 stmt is obtained from the type of the result X:
2686 get_vectype_for_scalar_type (TREE_TYPE (X))
2688 This means that, contrary to "regular" reductions (or "regular" stmts in
2689 general), the following equation:
2690 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2691 does *NOT* necessarily hold for reduction patterns. */
2693 bool
2694 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2696 tree vec_dest;
2697 tree scalar_dest;
2698 tree op;
2699 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2700 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2701 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2703 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2704 tree operation;
2705 enum tree_code code, orig_code, epilog_reduc_code = 0;
2706 enum machine_mode vec_mode;
2707 int op_type;
2708 optab optab, reduc_optab;
2709 tree new_temp = NULL_TREE;
2710 tree def, def_stmt;
2711 enum vect_def_type dt;
2712 tree new_phi;
2713 tree scalar_type;
2714 bool is_simple_use;
2715 tree orig_stmt;
2716 stmt_vec_info orig_stmt_info;
2717 tree expr = NULL_TREE;
2718 int i;
2719 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2720 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2721 stmt_vec_info prev_stmt_info;
2722 tree reduc_def;
2723 tree new_stmt = NULL_TREE;
2724 int j;
2726 if (nested_in_vect_loop_p (loop, stmt))
2728 loop = loop->inner;
2729 /* FORNOW. This restriction should be relaxed. */
2730 if (ncopies > 1)
2732 if (vect_print_dump_info (REPORT_DETAILS))
2733 fprintf (vect_dump, "multiple types in nested loop.");
2734 return false;
2738 gcc_assert (ncopies >= 1);
2740 /* FORNOW: SLP not supported. */
2741 if (STMT_SLP_TYPE (stmt_info))
2742 return false;
2744 /* 1. Is vectorizable reduction? */
2746 /* Not supportable if the reduction variable is used in the loop. */
2747 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2748 return false;
2750 /* Reductions that are not used even in an enclosing outer-loop
2751 are expected to be "live" (used out of the loop). */
2752 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2753 && !STMT_VINFO_LIVE_P (stmt_info))
2754 return false;
2756 /* Make sure it was already recognized as a reduction computation. */
2757 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2758 return false;
2760 /* 2. Has this been recognized as a reduction pattern?
2762 Check if STMT represents a pattern that has been recognized
2763 in earlier analysis stages. For stmts that represent a pattern,
2764 the STMT_VINFO_RELATED_STMT field records the last stmt in
2765 the original sequence that constitutes the pattern. */
2767 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2768 if (orig_stmt)
2770 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2771 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2772 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2773 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2776 /* 3. Check the operands of the operation. The first operands are defined
2777 inside the loop body. The last operand is the reduction variable,
2778 which is defined by the loop-header-phi. */
2780 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2782 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2783 code = TREE_CODE (operation);
2784 op_type = TREE_OPERAND_LENGTH (operation);
2785 if (op_type != binary_op && op_type != ternary_op)
2786 return false;
2787 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2788 scalar_type = TREE_TYPE (scalar_dest);
2789 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2790 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2791 return false;
2793 /* All uses but the last are expected to be defined in the loop.
2794 The last use is the reduction variable. */
2795 for (i = 0; i < op_type-1; i++)
2797 op = TREE_OPERAND (operation, i);
2798 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2799 gcc_assert (is_simple_use);
2800 if (dt != vect_loop_def
2801 && dt != vect_invariant_def
2802 && dt != vect_constant_def
2803 && dt != vect_induction_def)
2804 return false;
2807 op = TREE_OPERAND (operation, i);
2808 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2809 gcc_assert (is_simple_use);
2810 gcc_assert (dt == vect_reduction_def);
2811 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2812 if (orig_stmt)
2813 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2814 else
2815 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2817 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2818 return false;
2820 /* 4. Supportable by target? */
2822 /* 4.1. Check support for the operation in the loop. */
2823 optab = optab_for_tree_code (code, vectype);
2824 if (!optab)
2826 if (vect_print_dump_info (REPORT_DETAILS))
2827 fprintf (vect_dump, "no optab.");
2828 return false;
2830 vec_mode = TYPE_MODE (vectype);
2831 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2833 if (vect_print_dump_info (REPORT_DETAILS))
2834 fprintf (vect_dump, "op not supported by target.");
2835 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2836 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2837 < vect_min_worthwhile_factor (code))
2838 return false;
2839 if (vect_print_dump_info (REPORT_DETAILS))
2840 fprintf (vect_dump, "proceeding using word mode.");
2843 /* Worthwhile without SIMD support? */
2844 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2845 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2846 < vect_min_worthwhile_factor (code))
2848 if (vect_print_dump_info (REPORT_DETAILS))
2849 fprintf (vect_dump, "not worthwhile without SIMD support.");
2850 return false;
2853 /* 4.2. Check support for the epilog operation.
2855 If STMT represents a reduction pattern, then the type of the
2856 reduction variable may be different than the type of the rest
2857 of the arguments. For example, consider the case of accumulation
2858 of shorts into an int accumulator; the original code:
2859 S1: int_a = (int) short_a;
2860 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2862 was replaced with:
2863 STMT: int_acc = widen_sum <short_a, int_acc>
2865 This means that:
2866 1. The tree-code that is used to create the vector operation in the
2867 epilog code (that reduces the partial results) is not the
2868 tree-code of STMT, but is rather the tree-code of the original
2869 stmt from the pattern that STMT is replacing. I.e, in the example
2870 above we want to use 'widen_sum' in the loop, but 'plus' in the
2871 epilog.
2872 2. The type (mode) we use to check available target support
2873 for the vector operation to be created in the *epilog*, is
2874 determined by the type of the reduction variable (in the example
2875 above we'd check this: plus_optab[vect_int_mode]).
2876 However the type (mode) we use to check available target support
2877 for the vector operation to be created *inside the loop*, is
2878 determined by the type of the other arguments to STMT (in the
2879 example we'd check this: widen_sum_optab[vect_short_mode]).
2881 This is contrary to "regular" reductions, in which the types of all
2882 the arguments are the same as the type of the reduction variable.
2883 For "regular" reductions we can therefore use the same vector type
2884 (and also the same tree-code) when generating the epilog code and
2885 when generating the code inside the loop. */
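/* For illustration, continuing the widen_sum example above: the
   in-loop check consults widen_sum_optab with the short vector mode
   (widen_sum_optab[vect_short_mode]), while the epilog check below
   consults the reduction optab derived from PLUS_EXPR with the int
   vector mode (plus_optab[vect_int_mode]).  */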
2887 if (orig_stmt)
2889 /* This is a reduction pattern: get the vectype from the type of the
2890 reduction variable, and get the tree-code from orig_stmt. */
2891 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2892 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2893 if (!vectype)
2895 if (vect_print_dump_info (REPORT_DETAILS))
2897 fprintf (vect_dump, "unsupported data-type ");
2898 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2900 return false;
2903 vec_mode = TYPE_MODE (vectype);
2905 else
2907 /* Regular reduction: the vectype and tree-code that are used for the
2908 vector code inside the loop can also be used for the epilog code. */
2909 orig_code = code;
2912 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2913 return false;
2914 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2915 if (!reduc_optab)
2917 if (vect_print_dump_info (REPORT_DETAILS))
2918 fprintf (vect_dump, "no optab for reduction.");
2919 epilog_reduc_code = NUM_TREE_CODES;
2921 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2923 if (vect_print_dump_info (REPORT_DETAILS))
2924 fprintf (vect_dump, "reduc op not supported by target.");
2925 epilog_reduc_code = NUM_TREE_CODES;
2928 if (!vec_stmt) /* transformation not required. */
2930 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2931 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2932 return false;
2933 return true;
2936 /** Transform. **/
2938 if (vect_print_dump_info (REPORT_DETAILS))
2939 fprintf (vect_dump, "transform reduction.");
2941 /* Create the destination vector */
2942 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2944 /* Create the reduction-phi that defines the reduction-operand. */
2945 new_phi = create_phi_node (vec_dest, loop->header);
2947 /* In case the vectorization factor (VF) is bigger than the number
2948 of elements that we can fit in a vectype (nunits), we have to generate
2949 more than one vector stmt - i.e., we need to "unroll" the
2950 vector stmt by a factor VF/nunits. For more details see documentation
2951 in vectorizable_operation. */
2953 prev_stmt_info = NULL;
2954 for (j = 0; j < ncopies; j++)
2956 /* Handle uses. */
2957 if (j == 0)
2959 op = TREE_OPERAND (operation, 0);
2960 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2961 if (op_type == ternary_op)
2963 op = TREE_OPERAND (operation, 1);
2964 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2967 /* Get the vector def for the reduction variable from the phi node */
2968 reduc_def = PHI_RESULT (new_phi);
2970 else
2972 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2973 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2974 if (op_type == ternary_op)
2975 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2977 /* Get the vector def for the reduction variable from the vectorized
2978 reduction operation generated in the previous iteration (j-1) */
2979 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2982 /* Arguments are ready. Create the new vector stmt. */
2983 if (op_type == binary_op)
2984 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2985 else
2986 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2987 reduc_def);
2988 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2989 new_temp = make_ssa_name (vec_dest, new_stmt);
2990 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2991 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2993 if (j == 0)
2994 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2995 else
2996 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2997 prev_stmt_info = vinfo_for_stmt (new_stmt);
3000 /* Finalize the reduction-phi (set its arguments) and create the
3001 epilog reduction code. */
3002 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
3003 return true;
3006 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3007 a function declaration if the target has a vectorized version
3008 of the function, or NULL_TREE if the function cannot be vectorized. */
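/* For illustration (hypothetical values): for a call 'y = sqrtf (x)'
   in a loop vectorized with V4SF, a target that advertises a vector
   sqrt builtin would have its decl returned here; a target without
   one makes this return NULL_TREE, and the call is then rejected by
   vectorizable_call below.  */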
3010 tree
3011 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3013 tree fndecl = get_callee_fndecl (call);
3014 enum built_in_function code;
3016 /* We only handle functions that do not read or clobber memory -- i.e.
3017 const or novops ones. */
3018 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3019 return NULL_TREE;
3021 if (!fndecl
3022 || TREE_CODE (fndecl) != FUNCTION_DECL
3023 || !DECL_BUILT_IN (fndecl))
3024 return NULL_TREE;
3026 code = DECL_FUNCTION_CODE (fndecl);
3027 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3028 vectype_in);
3031 /* Function vectorizable_call.
3033 Check if STMT performs a function call that can be vectorized.
3034 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3035 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3036 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3038 bool
3039 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3041 tree vec_dest;
3042 tree scalar_dest;
3043 tree operation;
3044 tree op, type;
3045 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3047 tree vectype_out, vectype_in;
3048 int nunits_in;
3049 int nunits_out;
3050 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3051 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3052 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3053 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3054 tree new_stmt;
3055 int ncopies, j, nargs;
3056 call_expr_arg_iterator iter;
3057 tree vargs;
3058 enum { NARROW, NONE, WIDEN } modifier;
3060 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3061 return false;
3063 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3064 return false;
3066 /* FORNOW: SLP not supported. */
3067 if (STMT_SLP_TYPE (stmt_info))
3068 return false;
3070 /* Is STMT a vectorizable call? */
3071 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3072 return false;
3074 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3075 return false;
3077 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3078 if (TREE_CODE (operation) != CALL_EXPR)
3079 return false;
3081 /* Process function arguments. */
3082 rhs_type = NULL_TREE;
3083 nargs = 0;
3084 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3086 /* Bail out if the function has more than two arguments; we
3087 do not have interesting builtin functions to vectorize with
3088 more than two arguments. */
3089 if (nargs >= 2)
3090 return false;
3092 /* We can only handle calls with arguments of the same type. */
3093 if (rhs_type
3094 && rhs_type != TREE_TYPE (op))
3096 if (vect_print_dump_info (REPORT_DETAILS))
3097 fprintf (vect_dump, "argument types differ.");
3098 return false;
3100 rhs_type = TREE_TYPE (op);
3102 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3104 if (vect_print_dump_info (REPORT_DETAILS))
3105 fprintf (vect_dump, "use not simple.");
3106 return false;
3109 ++nargs;
3112 /* No arguments is also not good. */
3113 if (nargs == 0)
3114 return false;
3116 vectype_in = get_vectype_for_scalar_type (rhs_type);
3117 if (!vectype_in)
3118 return false;
3119 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3121 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3122 vectype_out = get_vectype_for_scalar_type (lhs_type);
3123 if (!vectype_out)
3124 return false;
3125 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3127 /* FORNOW */
3128 if (nunits_in == nunits_out / 2)
3129 modifier = NARROW;
3130 else if (nunits_out == nunits_in)
3131 modifier = NONE;
3132 else if (nunits_out == nunits_in / 2)
3133 modifier = WIDEN;
3134 else
3135 return false;
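/* For illustration (hypothetical values): V2DF arguments with a V4SF
   result give nunits_in = 2 == nunits_out / 2, i.e. NARROW; V4SF
   arguments with a V2DF result give nunits_out == nunits_in / 2,
   i.e. WIDEN; equal unit counts give NONE.  */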
3137 /* For now, we only vectorize functions if a target specific builtin
3138 is available. TODO -- in some cases, it might be profitable to
3139 insert the calls for pieces of the vector, in order to be able
3140 to vectorize other operations in the loop. */
3141 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3142 if (fndecl == NULL_TREE)
3144 if (vect_print_dump_info (REPORT_DETAILS))
3145 fprintf (vect_dump, "function is not vectorizable.");
3147 return false;
3150 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3152 if (modifier == NARROW)
3153 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3154 else
3155 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3157 /* Sanity check: make sure that at least one copy of the vectorized stmt
3158 needs to be generated. */
3159 gcc_assert (ncopies >= 1);
3161 /* FORNOW. This restriction should be relaxed. */
3162 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3164 if (vect_print_dump_info (REPORT_DETAILS))
3165 fprintf (vect_dump, "multiple types in nested loop.");
3166 return false;
3169 if (!vec_stmt) /* transformation not required. */
3171 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3172 if (vect_print_dump_info (REPORT_DETAILS))
3173 fprintf (vect_dump, "=== vectorizable_call ===");
3174 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3175 return true;
3178 /** Transform. **/
3180 if (vect_print_dump_info (REPORT_DETAILS))
3181 fprintf (vect_dump, "transform operation.");
3183 /* FORNOW. This restriction should be relaxed. */
3184 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3186 if (vect_print_dump_info (REPORT_DETAILS))
3187 fprintf (vect_dump, "multiple types in nested loop.");
3188 return false;
3191 /* Handle def. */
3192 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3193 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3195 prev_stmt_info = NULL;
3196 switch (modifier)
3198 case NONE:
3199 for (j = 0; j < ncopies; ++j)
3201 /* Build argument list for the vectorized call. */
3202 /* FIXME: Rewrite this so that it doesn't
3203 construct a temporary list. */
3204 vargs = NULL_TREE;
3205 nargs = 0;
3206 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3208 if (j == 0)
3209 vec_oprnd0
3210 = vect_get_vec_def_for_operand (op, stmt, NULL);
3211 else
3212 vec_oprnd0
3213 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3215 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3217 ++nargs;
3219 vargs = nreverse (vargs);
3221 rhs = build_function_call_expr (fndecl, vargs);
3222 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3223 new_temp = make_ssa_name (vec_dest, new_stmt);
3224 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3226 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3228 if (j == 0)
3229 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3230 else
3231 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3233 prev_stmt_info = vinfo_for_stmt (new_stmt);
3236 break;
3238 case NARROW:
3239 for (j = 0; j < ncopies; ++j)
3241 /* Build argument list for the vectorized call. */
3242 /* FIXME: Rewrite this so that it doesn't
3243 construct a temporary list. */
3244 vargs = NULL_TREE;
3245 nargs = 0;
3246 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3248 if (j == 0)
3250 vec_oprnd0
3251 = vect_get_vec_def_for_operand (op, stmt, NULL);
3252 vec_oprnd1
3253 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3255 else
3257 vec_oprnd0
3258 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3259 vec_oprnd1
3260 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3263 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3264 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3266 ++nargs;
3268 vargs = nreverse (vargs);
3270 rhs = build_function_call_expr (fndecl, vargs);
3271 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3272 new_temp = make_ssa_name (vec_dest, new_stmt);
3273 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3275 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3277 if (j == 0)
3278 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3279 else
3280 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3282 prev_stmt_info = vinfo_for_stmt (new_stmt);
3285 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3287 break;
3289 case WIDEN:
3290 /* No current target implements this case. */
3291 return false;
3294 /* The call in STMT might prevent it from being removed in dce.
3295 However, we cannot remove it here, due to the way the ssa name
3296 it defines is mapped to the new definition. So just replace the
3297 rhs of the statement with something harmless. */
3298 type = TREE_TYPE (scalar_dest);
3299 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3300 update_stmt (stmt);
3302 return true;
3306 /* Function vect_gen_widened_results_half
3308 Create a vector stmt whose code, type, number of arguments, and result
3309 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3310 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3311 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3312 needs to be created (DECL is a function-decl of a target-builtin).
3313 STMT is the original scalar stmt that we are vectorizing. */
3315 static tree
3316 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3317 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3318 tree vec_dest, block_stmt_iterator *bsi,
3319 tree stmt)
3321 tree expr;
3322 tree new_stmt;
3323 tree new_temp;
3324 tree sym;
3325 ssa_op_iter iter;
3327 /* Generate half of the widened result: */
3328 if (code == CALL_EXPR)
3330 /* Target specific support */
3331 if (op_type == binary_op)
3332 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3333 else
3334 expr = build_call_expr (decl, 1, vec_oprnd0);
3336 else
3338 /* Generic support */
3339 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3340 if (op_type == binary_op)
3341 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3342 else
3343 expr = build1 (code, vectype, vec_oprnd0);
3345 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3346 new_temp = make_ssa_name (vec_dest, new_stmt);
3347 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3348 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3350 if (code == CALL_EXPR)
3352 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3354 if (TREE_CODE (sym) == SSA_NAME)
3355 sym = SSA_NAME_VAR (sym);
3356 mark_sym_for_renaming (sym);
3360 return new_stmt;
3364 /* Check if STMT performs a conversion operation, that can be vectorized.
3365 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3366 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3367 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3369 bool
3370 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3371 tree *vec_stmt, slp_tree slp_node)
3373 tree vec_dest;
3374 tree scalar_dest;
3375 tree operation;
3376 tree op0;
3377 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3378 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3379 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3380 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3381 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3382 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3383 tree new_temp;
3384 tree def, def_stmt;
3385 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3386 tree new_stmt = NULL_TREE;
3387 stmt_vec_info prev_stmt_info;
3388 int nunits_in;
3389 int nunits_out;
3390 tree vectype_out, vectype_in;
3391 int ncopies, j;
3392 tree expr;
3393 tree rhs_type, lhs_type;
3394 tree builtin_decl;
3395 enum { NARROW, NONE, WIDEN } modifier;
3396 int i;
3397 VEC(tree,heap) *vec_oprnds0 = NULL;
3398 tree vop0;
3400 /* Is STMT a vectorizable conversion? */
3402 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3403 return false;
3405 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3406 return false;
3408 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3409 return false;
3411 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3412 return false;
3414 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3415 code = TREE_CODE (operation);
3416 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3417 return false;
3419 /* Check types of lhs and rhs. */
3420 op0 = TREE_OPERAND (operation, 0);
3421 rhs_type = TREE_TYPE (op0);
3422 vectype_in = get_vectype_for_scalar_type (rhs_type);
3423 if (!vectype_in)
3424 return false;
3425 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3427 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3428 lhs_type = TREE_TYPE (scalar_dest);
3429 vectype_out = get_vectype_for_scalar_type (lhs_type);
3430 if (!vectype_out)
3431 return false;
3432 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3434 /* FORNOW */
3435 if (nunits_in == nunits_out / 2)
3436 modifier = NARROW;
3437 else if (nunits_out == nunits_in)
3438 modifier = NONE;
3439 else if (nunits_out == nunits_in / 2)
3440 modifier = WIDEN;
3441 else
3442 return false;
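/* Example (assuming 128-bit vectors): int -> double gives V4SI in and
   V2DF out, so nunits_out == nunits_in / 2 and the conversion WIDENs;
   double -> int is the mirror NARROW case; int -> float keeps
   nunits_out == nunits_in and gets the NONE modifier. */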
3444 if (modifier == NONE)
3445 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3447 /* Bail out if both types are integral or both are non-integral: a FLOAT_EXPR or FIX_TRUNC_EXPR must convert between an integer type and a floating-point type. */
3448 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3449 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3450 return false;
3452 if (modifier == NARROW)
3453 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3454 else
3455 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3457 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3458 this, so we can safely override NCOPIES with 1 here. */
3459 if (slp_node)
3460 ncopies = 1;
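/* E.g., with a vectorization factor of 8 and 4 elements per input
   vector, ncopies == 2. For a NARROW conversion ncopies is derived from
   nunits_out instead, since each copy produces a single output vector
   out of two input vectors. */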
3462 /* Sanity check: make sure that at least one copy of the vectorized stmt
3463 needs to be generated. */
3464 gcc_assert (ncopies >= 1);
3466 /* FORNOW. This restriction should be relaxed. */
3467 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3469 if (vect_print_dump_info (REPORT_DETAILS))
3470 fprintf (vect_dump, "multiple types in nested loop.");
3471 return false;
3474 /* Check the operands of the operation. */
3475 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3477 if (vect_print_dump_info (REPORT_DETAILS))
3478 fprintf (vect_dump, "use not simple.");
3479 return false;
3482 /* Supportable by target? */
3483 if ((modifier == NONE
3484 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3485 || (modifier == WIDEN
3486 && !supportable_widening_operation (code, stmt, vectype_in,
3487 &decl1, &decl2,
3488 &code1, &code2))
3489 || (modifier == NARROW
3490 && !supportable_narrowing_operation (code, stmt, vectype_in,
3491 &code1)))
3493 if (vect_print_dump_info (REPORT_DETAILS))
3494 fprintf (vect_dump, "op not supported by target.");
3495 return false;
3498 if (modifier != NONE)
3500 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3501 /* FORNOW: SLP not supported. */
3502 if (STMT_SLP_TYPE (stmt_info))
3503 return false;
3506 if (!vec_stmt) /* transformation not required. */
3508 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3509 return true;
3512 /** Transform. **/
3513 if (vect_print_dump_info (REPORT_DETAILS))
3514 fprintf (vect_dump, "transform conversion.");
3516 /* Handle def. */
3517 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3519 if (modifier == NONE && !slp_node)
3520 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3522 prev_stmt_info = NULL;
3523 switch (modifier)
3525 case NONE:
3526 for (j = 0; j < ncopies; j++)
3528 tree sym;
3529 ssa_op_iter iter;
3531 if (j == 0)
3532 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3533 else
3534 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3536 builtin_decl =
3537 targetm.vectorize.builtin_conversion (code, vectype_in);
3538 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3540 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3542 /* Arguments are ready. Create the new vector stmt. */
3543 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3544 new_temp = make_ssa_name (vec_dest, new_stmt);
3545 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3546 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3547 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3548 SSA_OP_ALL_VIRTUALS)
3550 if (TREE_CODE (sym) == SSA_NAME)
3551 sym = SSA_NAME_VAR (sym);
3552 mark_sym_for_renaming (sym);
3554 if (slp_node)
3555 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3558 if (j == 0)
3559 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3560 else
3561 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3562 prev_stmt_info = vinfo_for_stmt (new_stmt);
3564 break;
3566 case WIDEN:
3567 /* In case the vectorization factor (VF) is bigger than the number
3568 of elements that we can fit in a vectype (nunits), we have to
3569 generate more than one vector stmt - i.e - we need to "unroll"
3570 the vector stmt by a factor VF/nunits. */
3571 for (j = 0; j < ncopies; j++)
3573 if (j == 0)
3574 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3575 else
3576 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3578 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3580 /* Generate first half of the widened result: */
3581 new_stmt
3582 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3583 vec_oprnd0, vec_oprnd1,
3584 unary_op, vec_dest, bsi, stmt);
3585 if (j == 0)
3586 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3587 else
3588 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3589 prev_stmt_info = vinfo_for_stmt (new_stmt);
3591 /* Generate second half of the widened result: */
3592 new_stmt
3593 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3594 vec_oprnd0, vec_oprnd1,
3595 unary_op, vec_dest, bsi, stmt);
3596 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3597 prev_stmt_info = vinfo_for_stmt (new_stmt);
3599 break;
3601 case NARROW:
3602 /* In case the vectorization factor (VF) is bigger than the number
3603 of elements that we can fit in a vectype (nunits), we have to
3604 generate more than one vector stmt - i.e - we need to "unroll"
3605 the vector stmt by a factor VF/nunits. */
3606 for (j = 0; j < ncopies; j++)
3608 /* Handle uses. */
3609 if (j == 0)
3611 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3612 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3614 else
3616 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3617 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
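/* Each narrowed output vector packs the elements of two input vectors,
   so every copy consumes two defs; VEC_OPRND1 is remembered across
   iterations so that the chain of defs advances by two per copy. */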
3620 /* Arguments are ready. Create the new vector stmt. */
3621 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3622 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3623 new_temp = make_ssa_name (vec_dest, new_stmt);
3624 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3625 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3627 if (j == 0)
3628 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3629 else
3630 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3632 prev_stmt_info = vinfo_for_stmt (new_stmt);
3635 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3638 if (vec_oprnds0)
3639 VEC_free (tree, heap, vec_oprnds0);
3641 return true;
3645 /* Function vectorizable_assignment.
3647 Check if STMT performs an assignment (copy) that can be vectorized.
3648 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3649 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3650 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3652 bool
3653 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3654 slp_tree slp_node)
3656 tree vec_dest;
3657 tree scalar_dest;
3658 tree op;
3659 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3660 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3661 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3662 tree new_temp;
3663 tree def, def_stmt;
3664 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3665 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3666 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3667 int i;
3668 VEC(tree,heap) *vec_oprnds = NULL;
3669 tree vop;
3671 gcc_assert (ncopies >= 1);
3672 if (ncopies > 1)
3673 return false; /* FORNOW */
3675 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3676 return false;
3678 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3679 return false;
3681 /* Is vectorizable assignment? */
3682 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3683 return false;
3685 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3686 if (TREE_CODE (scalar_dest) != SSA_NAME)
3687 return false;
3689 op = GIMPLE_STMT_OPERAND (stmt, 1);
3690 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3692 if (vect_print_dump_info (REPORT_DETAILS))
3693 fprintf (vect_dump, "use not simple.");
3694 return false;
3697 if (!vec_stmt) /* transformation not required. */
3699 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3700 if (vect_print_dump_info (REPORT_DETAILS))
3701 fprintf (vect_dump, "=== vectorizable_assignment ===");
3702 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3703 return true;
3706 /** Transform. **/
3707 if (vect_print_dump_info (REPORT_DETAILS))
3708 fprintf (vect_dump, "transform assignment.");
3710 /* Handle def. */
3711 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3713 /* Handle use. */
3714 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3716 /* Arguments are ready. Create the new vector stmt. */
3717 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3719 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3720 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3721 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3722 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3723 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3725 if (slp_node)
3726 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3729 VEC_free (tree, heap, vec_oprnds);
3730 return true;
3734 /* Function vect_min_worthwhile_factor.
3736 For a loop where we could vectorize the operation indicated by CODE,
3737 return the minimum vectorization factor that makes it worthwhile
3738 to use generic vectors. */
3739 static int
3740 vect_min_worthwhile_factor (enum tree_code code)
3742 switch (code)
3744 case PLUS_EXPR:
3745 case MINUS_EXPR:
3746 case NEGATE_EXPR:
3747 return 4;
3749 case BIT_AND_EXPR:
3750 case BIT_IOR_EXPR:
3751 case BIT_XOR_EXPR:
3752 case BIT_NOT_EXPR:
3753 return 2;
3755 default:
3756 return INT_MAX;
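/* The thresholds above reflect that bitwise operations map directly onto
   full-word operations (worthwhile already at VF == 2), whereas emulating
   additive operations on word-sized chunks is assumed to pay off only
   from VF == 4 on; INT_MAX means "never worthwhile without real SIMD". */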
3761 /* Function vectorizable_induction
3763 Check if PHI performs an induction computation that can be vectorized.
3764 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3765 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3766 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3768 bool
3769 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3770 tree *vec_stmt)
3772 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3775 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3776 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3777 tree vec_def;
3779 gcc_assert (ncopies >= 1);
3781 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3782 return false;
3784 /* FORNOW: SLP not supported. */
3785 if (STMT_SLP_TYPE (stmt_info))
3786 return false;
3788 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3790 if (TREE_CODE (phi) != PHI_NODE)
3791 return false;
3793 if (!vec_stmt) /* transformation not required. */
3795 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3796 if (vect_print_dump_info (REPORT_DETAILS))
3797 fprintf (vect_dump, "=== vectorizable_induction ===");
3798 vect_model_induction_cost (stmt_info, ncopies);
3799 return true;
3802 /** Transform. **/
3804 if (vect_print_dump_info (REPORT_DETAILS))
3805 fprintf (vect_dump, "transform induction phi.");
3807 vec_def = get_initial_def_for_induction (phi);
3808 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3809 return true;
3813 /* Function vectorizable_operation.
3815 Check if STMT performs a binary or unary operation that can be vectorized.
3816 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3817 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3818 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3820 bool
3821 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3822 slp_tree slp_node)
3824 tree vec_dest;
3825 tree scalar_dest;
3826 tree operation;
3827 tree op0, op1 = NULL;
3828 tree vec_oprnd1 = NULL_TREE;
3829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3831 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3832 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3833 enum tree_code code;
3834 enum machine_mode vec_mode;
3835 tree new_temp;
3836 int op_type;
3837 optab optab;
3838 int icode;
3839 enum machine_mode optab_op2_mode;
3840 tree def, def_stmt;
3841 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3842 tree new_stmt = NULL_TREE;
3843 stmt_vec_info prev_stmt_info;
3844 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3845 int nunits_out;
3846 tree vectype_out;
3847 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3848 int j, i;
3849 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3850 tree vop0, vop1;
3851 unsigned int k;
3852 bool scalar_shift_arg = false;
3854 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3855 this, so we can safely override NCOPIES with 1 here. */
3856 if (slp_node)
3857 ncopies = 1;
3858 gcc_assert (ncopies >= 1);
3859 /* FORNOW. This restriction should be relaxed. */
3860 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3862 if (vect_print_dump_info (REPORT_DETAILS))
3863 fprintf (vect_dump, "multiple types in nested loop.");
3864 return false;
3867 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3868 return false;
3870 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3871 return false;
3873 /* Is STMT a vectorizable binary/unary operation? */
3874 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3875 return false;
3877 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3878 return false;
3880 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3881 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3882 if (!vectype_out)
3883 return false;
3884 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3885 if (nunits_out != nunits_in)
3886 return false;
3888 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3889 code = TREE_CODE (operation);
3891 /* For pointer addition, we should use the normal plus for
3892 the vector addition. */
3893 if (code == POINTER_PLUS_EXPR)
3894 code = PLUS_EXPR;
3896 optab = optab_for_tree_code (code, vectype);
3898 /* Support only unary or binary operations. */
3899 op_type = TREE_OPERAND_LENGTH (operation);
3900 if (op_type != unary_op && op_type != binary_op)
3902 if (vect_print_dump_info (REPORT_DETAILS))
3903 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3904 return false;
3907 op0 = TREE_OPERAND (operation, 0);
3908 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3910 if (vect_print_dump_info (REPORT_DETAILS))
3911 fprintf (vect_dump, "use not simple.");
3912 return false;
3915 if (op_type == binary_op)
3917 op1 = TREE_OPERAND (operation, 1);
3918 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3920 if (vect_print_dump_info (REPORT_DETAILS))
3921 fprintf (vect_dump, "use not simple.");
3922 return false;
3926 /* Supportable by target? */
3927 if (!optab)
3929 if (vect_print_dump_info (REPORT_DETAILS))
3930 fprintf (vect_dump, "no optab.");
3931 return false;
3933 vec_mode = TYPE_MODE (vectype);
3934 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3935 if (icode == CODE_FOR_nothing)
3937 if (vect_print_dump_info (REPORT_DETAILS))
3938 fprintf (vect_dump, "op not supported by target.");
3939 /* Check only during analysis. */
3940 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3941 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3942 < vect_min_worthwhile_factor (code)
3943 && !vec_stmt))
3944 return false;
3945 if (vect_print_dump_info (REPORT_DETAILS))
3946 fprintf (vect_dump, "proceeding using word mode.");
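/* Reaching here means the operation is not supported directly in the
   vector mode, but the vector fits in a machine word, so the operation
   can still be carried out bitwise-parallel in word mode (provided the
   vectorization factor makes that worthwhile, as checked above). */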
3949 /* Worthwhile without SIMD support? Check only during analysis. */
3950 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3951 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3952 < vect_min_worthwhile_factor (code)
3953 && !vec_stmt)
3955 if (vect_print_dump_info (REPORT_DETAILS))
3956 fprintf (vect_dump, "not worthwhile without SIMD support.");
3957 return false;
3960 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3962 /* FORNOW: not yet supported. */
3963 if (!VECTOR_MODE_P (vec_mode))
3964 return false;
3966 /* Invariant argument is needed for a vector shift
3967 by a scalar shift operand. */
3968 optab_op2_mode = insn_data[icode].operand[2].mode;
3969 if (!VECTOR_MODE_P (optab_op2_mode))
3971 if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3973 if (vect_print_dump_info (REPORT_DETAILS))
3974 fprintf (vect_dump, "operand mode requires invariant"
3975 " argument.");
3976 return false;
3979 scalar_shift_arg = true;
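/* The shift amount stays a scalar: for a stmt like a[i] = b[i] << c with
   loop-invariant c, the target's shift pattern takes operand 2 in a
   scalar mode, so c is used directly for every copy instead of being
   broadcast into a vector (see the handling of VEC_OPRNDS1 below). */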
3983 if (!vec_stmt) /* transformation not required. */
3985 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3986 if (vect_print_dump_info (REPORT_DETAILS))
3987 fprintf (vect_dump, "=== vectorizable_operation ===");
3988 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3989 return true;
3992 /** Transform. **/
3994 if (vect_print_dump_info (REPORT_DETAILS))
3995 fprintf (vect_dump, "transform binary/unary operation.");
3997 /* Handle def. */
3998 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4000 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4001 created in the previous stages of the recursion, so no allocation is
4002 needed, except for the case of shift with scalar shift argument. In that
4003 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4004 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4005 In case of loop-based vectorization we allocate VECs of size 1. We
4006 allocate VEC_OPRNDS1 only in case of binary operation. */
4007 if (!slp_node)
4009 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4010 if (op_type == binary_op)
4011 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4013 else if (scalar_shift_arg)
4014 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4016 /* In case the vectorization factor (VF) is bigger than the number
4017 of elements that we can fit in a vectype (nunits), we have to generate
4018 more than one vector stmt - i.e - we need to "unroll" the
4019 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4020 from one copy of the vector stmt to the next, in the field
4021 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4022 stages to find the correct vector defs to be used when vectorizing
4023 stmts that use the defs of the current stmt. The example below illustrates
4024 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4025 4 vectorized stmts):
4027 before vectorization:
4028 RELATED_STMT VEC_STMT
4029 S1: x = memref - -
4030 S2: z = x + 1 - -
4032 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4033 there):
4034 RELATED_STMT VEC_STMT
4035 VS1_0: vx0 = memref0 VS1_1 -
4036 VS1_1: vx1 = memref1 VS1_2 -
4037 VS1_2: vx2 = memref2 VS1_3 -
4038 VS1_3: vx3 = memref3 - -
4039 S1: x = load - VS1_0
4040 S2: z = x + 1 - -
4042 step2: vectorize stmt S2 (done here):
4043 To vectorize stmt S2 we first need to find the relevant vector
4044 def for the first operand 'x'. This is, as usual, obtained from
4045 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4046 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4047 relevant vector def 'vx0'. Having found 'vx0' we can generate
4048 the vector stmt VS2_0, and as usual, record it in the
4049 STMT_VINFO_VEC_STMT of stmt S2.
4050 When creating the second copy (VS2_1), we obtain the relevant vector
4051 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4052 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4053 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4054 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4055 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4056 chain of stmts and pointers:
4057 RELATED_STMT VEC_STMT
4058 VS1_0: vx0 = memref0 VS1_1 -
4059 VS1_1: vx1 = memref1 VS1_2 -
4060 VS1_2: vx2 = memref2 VS1_3 -
4061 VS1_3: vx3 = memref3 - -
4062 S1: x = load - VS1_0
4063 VS2_0: vz0 = vx0 + v1 VS2_1 -
4064 VS2_1: vz1 = vx1 + v1 VS2_2 -
4065 VS2_2: vz2 = vx2 + v1 VS2_3 -
4066 VS2_3: vz3 = vx3 + v1 - -
4067 S2: z = x + 1 - VS2_0 */
4069 prev_stmt_info = NULL;
4070 for (j = 0; j < ncopies; j++)
4072 /* Handle uses. */
4073 if (j == 0)
4075 if (op_type == binary_op
4076 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4078 /* Vector shl and shr insn patterns can be defined with scalar
4079 operand 2 (shift operand). In this case, use constant or loop
4080 invariant op1 directly, without extending it to vector mode
4081 first. */
4082 optab_op2_mode = insn_data[icode].operand[2].mode;
4083 if (!VECTOR_MODE_P (optab_op2_mode))
4085 if (vect_print_dump_info (REPORT_DETAILS))
4086 fprintf (vect_dump, "operand 1 using scalar mode.");
4087 vec_oprnd1 = op1;
4088 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4089 if (slp_node)
4091 /* Store vec_oprnd1 for every vector stmt to be created
4092 for SLP_NODE. We check during the analysis that all the
4093 shift arguments are the same.
4094 TODO: Allow different constants for different vector
4095 stmts generated for an SLP instance. */
4096 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4097 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4102 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4103 (a special case for certain kinds of vector shifts); otherwise,
4104 operand 1 should be of a vector type (the usual case). */
4105 if (op_type == binary_op && !vec_oprnd1)
4106 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4107 slp_node);
4108 else
4109 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4110 slp_node);
4112 else
4113 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4115 /* Arguments are ready. Create the new vector stmt. */
4116 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4118 if (op_type == binary_op)
4120 vop1 = VEC_index (tree, vec_oprnds1, i);
4121 new_stmt = build_gimple_modify_stmt (vec_dest,
4122 build2 (code, vectype, vop0, vop1));
4124 else
4125 new_stmt = build_gimple_modify_stmt (vec_dest,
4126 build1 (code, vectype, vop0));
4128 new_temp = make_ssa_name (vec_dest, new_stmt);
4129 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4130 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4131 if (slp_node)
4132 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4135 if (j == 0)
4136 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4137 else
4138 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4139 prev_stmt_info = vinfo_for_stmt (new_stmt);
4142 VEC_free (tree, heap, vec_oprnds0);
4143 if (vec_oprnds1)
4144 VEC_free (tree, heap, vec_oprnds1);
4146 return true;
4150 /* Function vectorizable_type_demotion
4152 Check if STMT performs a binary or unary operation that involves
4153 type demotion, and if it can be vectorized.
4154 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4155 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4156 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
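/* For instance, a stmt like `sa[i] = (short) ia[i]' is a NOP_EXPR
   demotion: assuming 128-bit vectors, two V4SI inputs are packed into
   one V8HI output, which is why nunits_in must equal nunits_out / 2
   below (FORNOW). */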
4158 bool
4159 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4160 tree *vec_stmt)
4162 tree vec_dest;
4163 tree scalar_dest;
4164 tree operation;
4165 tree op0;
4166 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4167 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4168 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4169 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4170 enum tree_code code, code1 = ERROR_MARK;
4171 tree new_temp;
4172 tree def, def_stmt;
4173 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4174 tree new_stmt;
4175 stmt_vec_info prev_stmt_info;
4176 int nunits_in;
4177 int nunits_out;
4178 tree vectype_out;
4179 int ncopies;
4180 int j;
4181 tree expr;
4182 tree vectype_in;
4184 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4185 return false;
4187 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4188 return false;
4190 /* Is STMT a vectorizable type-demotion operation? */
4191 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4192 return false;
4194 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4195 return false;
4197 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4198 code = TREE_CODE (operation);
4199 if (code != NOP_EXPR && code != CONVERT_EXPR)
4200 return false;
4202 op0 = TREE_OPERAND (operation, 0);
4203 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4204 if (!vectype_in)
4205 return false;
4206 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4208 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4209 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4210 if (!vectype_out)
4211 return false;
4212 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4213 if (nunits_in != nunits_out / 2) /* FORNOW */
4214 return false;
4216 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4217 gcc_assert (ncopies >= 1);
4218 /* FORNOW. This restriction should be relaxed. */
4219 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4221 if (vect_print_dump_info (REPORT_DETAILS))
4222 fprintf (vect_dump, "multiple types in nested loop.");
4223 return false;
4226 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4227 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4228 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4229 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4230 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4231 return false;
4233 /* Check the operands of the operation. */
4234 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4236 if (vect_print_dump_info (REPORT_DETAILS))
4237 fprintf (vect_dump, "use not simple.");
4238 return false;
4241 /* Supportable by target? */
4242 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4243 return false;
4245 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4247 if (!vec_stmt) /* transformation not required. */
4249 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4250 if (vect_print_dump_info (REPORT_DETAILS))
4251 fprintf (vect_dump, "=== vectorizable_demotion ===");
4252 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4253 return true;
4256 /** Transform. **/
4257 if (vect_print_dump_info (REPORT_DETAILS))
4258 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4259 ncopies);
4261 /* Handle def. */
4262 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4264 /* In case the vectorization factor (VF) is bigger than the number
4265 of elements that we can fit in a vectype (nunits), we have to generate
4266 more than one vector stmt - i.e - we need to "unroll" the
4267 vector stmt by a factor VF/nunits. */
4268 prev_stmt_info = NULL;
4269 for (j = 0; j < ncopies; j++)
4271 /* Handle uses. */
4272 if (j == 0)
4274 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4275 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4277 else
4279 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4280 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4283 /* Arguments are ready. Create the new vector stmt. */
4284 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4285 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4286 new_temp = make_ssa_name (vec_dest, new_stmt);
4287 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4288 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4290 if (j == 0)
4291 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4292 else
4293 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4295 prev_stmt_info = vinfo_for_stmt (new_stmt);
4298 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4299 return true;
4303 /* Function vectorizable_type_promotion
4305 Check if STMT performs a binary or unary operation that involves
4306 type promotion, and if it can be vectorized.
4307 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4308 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4309 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
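/* For instance, `ia[i] = (int) sa[i]' is a NOP_EXPR promotion: assuming
   128-bit vectors, one V8HI input is widened into two V4SI outputs
   (hence nunits_out == nunits_in / 2 below), and each input vector
   yields a high half and a low half of the widened result. */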
4311 bool
4312 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4313 tree *vec_stmt)
4315 tree vec_dest;
4316 tree scalar_dest;
4317 tree operation;
4318 tree op0, op1 = NULL;
4319 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4320 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4321 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4322 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4323 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4324 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4325 int op_type;
4326 tree def, def_stmt;
4327 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4328 tree new_stmt;
4329 stmt_vec_info prev_stmt_info;
4330 int nunits_in;
4331 int nunits_out;
4332 tree vectype_out;
4333 int ncopies;
4334 int j;
4335 tree vectype_in;
4337 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4338 return false;
4340 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4341 return false;
4343 /* Is STMT a vectorizable type-promotion operation? */
4344 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4345 return false;
4347 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4348 return false;
4350 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4351 code = TREE_CODE (operation);
4352 if (code != NOP_EXPR && code != CONVERT_EXPR
4353 && code != WIDEN_MULT_EXPR)
4354 return false;
4356 op0 = TREE_OPERAND (operation, 0);
4357 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4358 if (!vectype_in)
4359 return false;
4360 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4362 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4363 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4364 if (!vectype_out)
4365 return false;
4366 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4367 if (nunits_out != nunits_in / 2) /* FORNOW */
4368 return false;
4370 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4371 gcc_assert (ncopies >= 1);
4372 /* FORNOW. This restriction should be relaxed. */
4373 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4375 if (vect_print_dump_info (REPORT_DETAILS))
4376 fprintf (vect_dump, "multiple types in nested loop.");
4377 return false;
4380 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4381 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4382 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4383 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4384 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4385 return false;
4387 /* Check the operands of the operation. */
4388 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4390 if (vect_print_dump_info (REPORT_DETAILS))
4391 fprintf (vect_dump, "use not simple.");
4392 return false;
4395 op_type = TREE_CODE_LENGTH (code);
4396 if (op_type == binary_op)
4398 op1 = TREE_OPERAND (operation, 1);
4399 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4401 if (vect_print_dump_info (REPORT_DETAILS))
4402 fprintf (vect_dump, "use not simple.");
4403 return false;
4407 /* Supportable by target? */
4408 if (!supportable_widening_operation (code, stmt, vectype_in,
4409 &decl1, &decl2, &code1, &code2))
4410 return false;
4412 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4414 if (!vec_stmt) /* transformation not required. */
4416 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4417 if (vect_print_dump_info (REPORT_DETAILS))
4418 fprintf (vect_dump, "=== vectorizable_promotion ===");
4419 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4420 return true;
4423 /** Transform. **/
4425 if (vect_print_dump_info (REPORT_DETAILS))
4426 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4427 ncopies);
4429 /* Handle def. */
4430 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4432 /* In case the vectorization factor (VF) is bigger than the number
4433 of elements that we can fit in a vectype (nunits), we have to generate
4434 more than one vector stmt - i.e - we need to "unroll" the
4435 vector stmt by a factor VF/nunits. */
4437 prev_stmt_info = NULL;
4438 for (j = 0; j < ncopies; j++)
4440 /* Handle uses. */
4441 if (j == 0)
4443 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4444 if (op_type == binary_op)
4445 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4447 else
4449 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4450 if (op_type == binary_op)
4451 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4454 /* Arguments are ready. Create the new vector stmt. We are creating
4455 two vector defs because the widened result does not fit in one vector.
4456 The vectorized stmt can be expressed as a call to a target builtin,
4457 or by using a tree-code. */
4458 /* Generate first half of the widened result: */
4459 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4460 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4461 if (j == 0)
4462 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4463 else
4464 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4465 prev_stmt_info = vinfo_for_stmt (new_stmt);
4467 /* Generate second half of the widened result: */
4468 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4469 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4470 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4471 prev_stmt_info = vinfo_for_stmt (new_stmt);
4475 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4476 return true;
4480 /* Function vect_strided_store_supported.
4482 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4483 and FALSE otherwise. */
4485 static bool
4486 vect_strided_store_supported (tree vectype)
4488 optab interleave_high_optab, interleave_low_optab;
4489 int mode;
4491 mode = (int) TYPE_MODE (vectype);
4493 /* Check that the operation is supported. */
4494 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4495 vectype);
4496 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4497 vectype);
4498 if (!interleave_high_optab || !interleave_low_optab)
4500 if (vect_print_dump_info (REPORT_DETAILS))
4501 fprintf (vect_dump, "no optab for interleave.");
4502 return false;
4505 if (optab_handler (interleave_high_optab, mode)->insn_code
4506 == CODE_FOR_nothing
4507 || optab_handler (interleave_low_optab, mode)->insn_code
4508 == CODE_FOR_nothing)
4510 if (vect_print_dump_info (REPORT_DETAILS))
4511 fprintf (vect_dump, "interleave op not supported by target.");
4512 return false;
4515 return true;
4519 /* Function vect_permute_store_chain.
4521 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4522 a power of 2, generate interleave_high/low stmts to reorder the data
4523 correctly for the stores. Return the final references for stores in
4524 RESULT_CHAIN.
4526 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4527 The input is 4 vectors each containing 8 elements. We assign a number to each
4528 element, the input sequence is:
4530 1st vec: 0 1 2 3 4 5 6 7
4531 2nd vec: 8 9 10 11 12 13 14 15
4532 3rd vec: 16 17 18 19 20 21 22 23
4533 4th vec: 24 25 26 27 28 29 30 31
4535 The output sequence should be:
4537 1st vec: 0 8 16 24 1 9 17 25
4538 2nd vec: 2 10 18 26 3 11 19 27
4539 3rd vec: 4 12 20 28 5 13 21 29
4540 4th vec: 6 14 22 30 7 15 23 31
4542 i.e., we interleave the contents of the four vectors in their order.
4544 We use interleave_high/low instructions to create such output. The input of
4545 each interleave_high/low operation is two vectors:
4546 1st vec 2nd vec
4547 0 1 2 3 4 5 6 7
4548 the even elements of the result vector are obtained left-to-right from the
4549 high/low elements of the first vector. The odd elements of the result are
4550 obtained left-to-right from the high/low elements of the second vector.
4551 The output of interleave_high will be: 0 4 1 5
4552 and of interleave_low: 2 6 3 7
4555 The permutation is done in log LENGTH stages. In each stage interleave_high
4556 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4557 where the first argument is taken from the first half of DR_CHAIN and the
4558 second argument from its second half.
4559 In our example,
4561 I1: interleave_high (1st vec, 3rd vec)
4562 I2: interleave_low (1st vec, 3rd vec)
4563 I3: interleave_high (2nd vec, 4th vec)
4564 I4: interleave_low (2nd vec, 4th vec)
4566 The output for the first stage is:
4568 I1: 0 16 1 17 2 18 3 19
4569 I2: 4 20 5 21 6 22 7 23
4570 I3: 8 24 9 25 10 26 11 27
4571 I4: 12 28 13 29 14 30 15 31
4573 The output of the second stage, i.e. the final result is:
4575 I1: 0 8 16 24 1 9 17 25
4576 I2: 2 10 18 26 3 11 19 27
4577 I3: 4 12 20 28 5 13 21 29
4578 I4: 6 14 22 30 7 15 23 31. */
4580 static bool
4581 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4582 unsigned int length,
4583 tree stmt,
4584 block_stmt_iterator *bsi,
4585 VEC(tree,heap) **result_chain)
4587 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4588 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4589 tree scalar_dest, tmp;
4590 int i;
4591 unsigned int j;
4593 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4595 /* Check that the operation is supported. */
4596 if (!vect_strided_store_supported (vectype))
4597 return false;
4599 *result_chain = VEC_copy (tree, heap, dr_chain);
4601 for (i = 0; i < exact_log2 (length); i++)
4603 for (j = 0; j < length/2; j++)
4605 vect1 = VEC_index (tree, dr_chain, j);
4606 vect2 = VEC_index (tree, dr_chain, j+length/2);
4608 /* Create interleaving stmt:
4609 in the case of big endian:
4610 high = interleave_high (vect1, vect2)
4611 and in the case of little endian:
4612 high = interleave_low (vect1, vect2). */
4613 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4614 DECL_GIMPLE_REG_P (perm_dest) = 1;
4615 add_referenced_var (perm_dest);
4616 if (BYTES_BIG_ENDIAN)
4617 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4618 else
4619 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4620 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4621 high = make_ssa_name (perm_dest, perm_stmt);
4622 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4623 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4624 VEC_replace (tree, *result_chain, 2*j, high);
4626 /* Create interleaving stmt:
4627 in the case of big endian:
4628 low = interleave_low (vect1, vect2)
4629 and in the case of little endian:
4630 low = interleave_high (vect1, vect2). */
4631 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4632 DECL_GIMPLE_REG_P (perm_dest) = 1;
4633 add_referenced_var (perm_dest);
4634 if (BYTES_BIG_ENDIAN)
4635 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4636 else
4637 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4638 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4639 low = make_ssa_name (perm_dest, perm_stmt);
4640 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4641 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4642 VEC_replace (tree, *result_chain, 2*j+1, low);
4644 dr_chain = VEC_copy (tree, heap, *result_chain);
4646 return true;
4650 /* Function vectorizable_store.
4652 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4653 can be vectorized.
4654 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4655 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4656 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4658 bool
4659 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4660 slp_tree slp_node)
4662 tree scalar_dest;
4663 tree data_ref;
4664 tree op;
4665 tree vec_oprnd = NULL_TREE;
4666 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4667 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4668 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4669 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4670 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4671 enum machine_mode vec_mode;
4672 tree dummy;
4673 enum dr_alignment_support alignment_support_scheme;
4674 tree def, def_stmt;
4675 enum vect_def_type dt;
4676 stmt_vec_info prev_stmt_info = NULL;
4677 tree dataref_ptr = NULL_TREE;
4678 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4679 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4680 int j;
4681 tree next_stmt, first_stmt = NULL_TREE;
4682 bool strided_store = false;
4683 unsigned int group_size, i;
4684 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4685 bool inv_p;
4686 VEC(tree,heap) *vec_oprnds = NULL;
4687 bool slp = (slp_node != NULL);
4688 stmt_vec_info first_stmt_vinfo;
4689 unsigned int vec_num;
4691 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4692 this, so we can safely override NCOPIES with 1 here. */
4693 if (slp)
4694 ncopies = 1;
4696 gcc_assert (ncopies >= 1);
4698 /* FORNOW. This restriction should be relaxed. */
4699 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4701 if (vect_print_dump_info (REPORT_DETAILS))
4702 fprintf (vect_dump, "multiple types in nested loop.");
4703 return false;
4706 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4707 return false;
4709 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4710 return false;
4712 /* Is vectorizable store? */
4714 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4715 return false;
4717 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4718 if (TREE_CODE (scalar_dest) != ARRAY_REF
4719 && TREE_CODE (scalar_dest) != INDIRECT_REF
4720 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4721 return false;
4723 op = GIMPLE_STMT_OPERAND (stmt, 1);
4724 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4726 if (vect_print_dump_info (REPORT_DETAILS))
4727 fprintf (vect_dump, "use not simple.");
4728 return false;
4731 vec_mode = TYPE_MODE (vectype);
4732 /* FORNOW. In some cases can vectorize even if data-type not supported
4733 (e.g. - array initialization with 0). */
4734 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4735 return false;
4737 if (!STMT_VINFO_DATA_REF (stmt_info))
4738 return false;
4740 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4742 strided_store = true;
4743 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4744 if (!vect_strided_store_supported (vectype)
4745 && !PURE_SLP_STMT (stmt_info) && !slp)
4746 return false;
4748 if (first_stmt == stmt)
4750 /* STMT is the leader of the group. Check the operands of all the
4751 stmts of the group. */
4752 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4753 while (next_stmt)
4755 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4756 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4758 if (vect_print_dump_info (REPORT_DETAILS))
4759 fprintf (vect_dump, "use not simple.");
4760 return false;
4762 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4767 if (!vec_stmt) /* transformation not required. */
4769 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4770 if (!PURE_SLP_STMT (stmt_info))
4771 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4772 return true;
4775 /** Transform. **/
4777 if (strided_store)
4779 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4780 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4782 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4784 /* FORNOW */
4785 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4787 /* We vectorize all the stmts of the interleaving group when we
4788 reach the last stmt in the group. */
4789 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4790 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4791 && !slp)
4793 *vec_stmt = NULL_TREE;
4794 return true;
4797 if (slp)
4798 strided_store = false;
4800 /* VEC_NUM is the number of vect stmts to be created for this group. */
4801 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4802 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4803 else
4804 vec_num = group_size;
4806 else
4808 first_stmt = stmt;
4809 first_dr = dr;
4810 group_size = vec_num = 1;
4811 first_stmt_vinfo = stmt_info;
4814 if (vect_print_dump_info (REPORT_DETAILS))
4815 fprintf (vect_dump, "transform store. ncopies = %d", ncopies);
4817 dr_chain = VEC_alloc (tree, heap, group_size);
4818 oprnds = VEC_alloc (tree, heap, group_size);
4820 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4821 gcc_assert (alignment_support_scheme);
4822 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4824 /* In case the vectorization factor (VF) is bigger than the number
4825 of elements that we can fit in a vectype (nunits), we have to generate
4826 more than one vector stmt - i.e - we need to "unroll" the
4827 vector stmt by a factor VF/nunits. For more details see documentation in
4828 vect_get_vec_def_for_copy_stmt. */
4830 /* In case of interleaving (non-unit strided access):
4832 S1: &base + 2 = x2
4833 S2: &base = x0
4834 S3: &base + 1 = x1
4835 S4: &base + 3 = x3
4837 We create vectorized stores starting from the base address (the access of
4838 the first stmt in the chain, S2 in the above example) when the last store
4839 stmt of the chain (S4) is reached:
4841 VS1: &base = vx2
4842 VS2: &base + vec_size*1 = vx0
4843 VS3: &base + vec_size*2 = vx1
4844 VS4: &base + vec_size*3 = vx3
4846 Then permutation statements are generated:
4848 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4849 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4852 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4853 (the order of the data-refs in the output of vect_permute_store_chain
4854 corresponds to the order of scalar stmts in the interleaving chain - see
4855 the documentation of vect_permute_store_chain()).
4857 In case of both multiple types and interleaving, the above vector stores and
4858 permutation stmts are created for every copy. The result vector stmts are
4859 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4860 STMT_VINFO_RELATED_STMT for the next copies.
4863 prev_stmt_info = NULL;
4864 for (j = 0; j < ncopies; j++)
4866 tree new_stmt;
4867 tree ptr_incr;
4869 if (j == 0)
4871 if (slp)
4873 /* Get vectorized arguments for SLP_NODE. */
4874 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4876 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4878 else
4880 /* For interleaved stores we collect vectorized defs for all the
4881 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4882 used as an input to vect_permute_store_chain(), and OPRNDS as
4883 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4885 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4886 OPRNDS are of size 1. */
4887 next_stmt = first_stmt;
4888 for (i = 0; i < group_size; i++)
4890 /* Since gaps are not supported for interleaved stores,
4891 GROUP_SIZE is the exact number of stmts in the chain.
4892 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4893 there is no interleaving, GROUP_SIZE is 1, and only one
4894 iteration of the loop will be executed. */
4895 gcc_assert (next_stmt);
4896 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4898 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4899 NULL);
4900 VEC_quick_push(tree, dr_chain, vec_oprnd);
4901 VEC_quick_push(tree, oprnds, vec_oprnd);
4902 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4905 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4906 &dummy, &ptr_incr, false,
4907 TREE_TYPE (vec_oprnd), &inv_p);
4908 gcc_assert (!inv_p);
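/* INV_P would indicate a loop-invariant store address, which is not
   expected for the (possibly strided) store data-refs handled here. */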
4910 else
4912 /* FORNOW SLP doesn't work for multiple types. */
4913 gcc_assert (!slp);
4915 /* For interleaved stores we created vectorized defs for all the
4916 defs stored in OPRNDS in the previous iteration (previous copy).
4917 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4918 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4919 next copy.
4920 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4921 OPRNDS are of size 1. */
4922 for (i = 0; i < group_size; i++)
4924 op = VEC_index (tree, oprnds, i);
4925 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4926 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4927 VEC_replace(tree, dr_chain, i, vec_oprnd);
4928 VEC_replace(tree, oprnds, i, vec_oprnd);
4930 dataref_ptr =
4931 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4934 if (strided_store)
4936 result_chain = VEC_alloc (tree, heap, group_size);
4937 /* Permute. */
4938 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4939 &result_chain))
4940 return false;
4943 next_stmt = first_stmt;
4944 for (i = 0; i < vec_num; i++)
4946 if (i > 0)
4947 /* Bump the vector pointer. */
4948 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4949 NULL_TREE);
4951 if (slp)
4952 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4953 else if (strided_store)
4954 /* For strided stores vectorized defs are interleaved in
4955 vect_permute_store_chain(). */
4956 vec_oprnd = VEC_index (tree, result_chain, i);
4958 data_ref = build_fold_indirect_ref (dataref_ptr);
4959 /* Arguments are ready. Create the new vector stmt. */
4960 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4961 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4962 mark_symbols_for_renaming (new_stmt);
4964 if (j == 0)
4965 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4966 else
4967 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4969 prev_stmt_info = vinfo_for_stmt (new_stmt);
4970 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4971 if (!next_stmt)
4972 break;
4976 VEC_free (tree, heap, dr_chain);
4977 VEC_free (tree, heap, oprnds);
4978 if (result_chain)
4979 VEC_free (tree, heap, result_chain);
4981 return true;
4985 /* Function vect_setup_realignment
4987 This function is called when vectorizing an unaligned load using
4988 the dr_explicit_realign[_optimized] scheme.
4989 This function generates the following code at the loop prolog:
4991 p = initial_addr;
4992 x msq_init = *(floor(p)); # prolog load
4993 realignment_token = call target_builtin;
4994 loop:
4995 x msq = phi (msq_init, ---)
4997 The stmts marked with x are generated only for the case of
4998 dr_explicit_realign_optimized.
5000 The code above sets up a new (vector) pointer, pointing to the first
5001 location accessed by STMT, and a "floor-aligned" load using that pointer.
5002 It also generates code to compute the "realignment-token" (if the relevant
5003 target hook was defined), and creates a phi-node at the loop-header bb
5004 whose arguments are the result of the prolog-load (created by this
5005 function) and the result of a load that takes place in the loop (to be
5006 created by the caller to this function).
5008 For the case of dr_explicit_realign_optimized:
5009 The caller to this function uses the phi-result (msq) to create the
5010 realignment code inside the loop, and sets up the missing phi argument,
5011 as follows:
5012 loop:
5013 msq = phi (msq_init, lsq)
5014 lsq = *(floor(p')); # load in loop
5015 result = realign_load (msq, lsq, realignment_token);
5017 For the case of dr_explicit_realign:
5018 loop:
5019 msq = *(floor(p)); # load in loop
5020 p' = p + (VS-1);
5021 lsq = *(floor(p')); # load in loop
5022 result = realign_load (msq, lsq, realignment_token);
5024 Input:
5025 STMT - (scalar) load stmt to be vectorized. This load accesses
5026 a memory location that may be unaligned.
5027 BSI - place where new code is to be inserted.
5028 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5029 is used.
5031 Output:
5032 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5033 target hook, if defined.
5034 Return value - the result of the loop-header phi node. */
5036 static tree
5037 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5038 tree *realignment_token,
5039 enum dr_alignment_support alignment_support_scheme,
5040 tree init_addr,
5041 struct loop **at_loop)
5043 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5044 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5045 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5047 edge pe;
5048 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5049 tree vec_dest;
5050 tree inc;
5051 tree ptr;
5052 tree data_ref;
5053 tree new_stmt;
5054 basic_block new_bb;
5055 tree msq_init = NULL_TREE;
5056 tree new_temp;
5057 tree phi_stmt;
5058 tree msq = NULL_TREE;
5059 tree stmts = NULL_TREE;
5060 bool inv_p;
5061 bool compute_in_loop = false;
5062 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5063 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5064 struct loop *loop_for_initial_load;
5066 gcc_assert (alignment_support_scheme == dr_explicit_realign
5067 || alignment_support_scheme == dr_explicit_realign_optimized);
5069 /* We need to generate three things:
5070 1. the misalignment computation
5071 2. the extra vector load (for the optimized realignment scheme).
5072 3. the phi node for the two vectors from which the realignment is
5073 done (for the optimized realignment scheme).
5076 /* 1. Determine where to generate the misalignment computation.
5078 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5079 calculation will be generated by this function, outside the loop (in the
5080 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5081 caller, inside the loop.
5083 Background: If the misalignment remains fixed throughout the iterations of
5084 the loop, then both realignment schemes are applicable, and also the
5085 misalignment computation can be done outside LOOP. This is because we are
5086 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5087 are a multiple of VS (the Vector Size), and therefore the misalignment in
5088 different vectorized LOOP iterations is always the same.
5089 The problem arises only if the memory access is in an inner-loop nested
5090 inside LOOP, which is now being vectorized using outer-loop vectorization.
5091 This is the only case when the misalignment of the memory access may not
5092 remain fixed throughout the iterations of the inner-loop (as explained in
5093 detail in vect_supportable_dr_alignment). In this case, not only is the
5094 optimized realignment scheme not applicable, but also the misalignment
5095 computation (and generation of the realignment token that is passed to
5096 REALIGN_LOAD) has to be done inside the loop.
5098 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5099 or not, which in turn determines if the misalignment is computed inside
5100 the inner-loop, or outside LOOP. */
5102 if (init_addr != NULL_TREE)
5104 compute_in_loop = true;
5105 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5109 /* 2. Determine where to generate the extra vector load.
5111 For the optimized realignment scheme, instead of generating two vector
5112 loads in each iteration, we generate a single extra vector load in the
5113 preheader of the loop, and in each iteration reuse the result of the
5114 vector load from the previous iteration. In case the memory access is in
5115 an inner-loop nested inside LOOP, which is now being vectorized using
5116 outer-loop vectorization, we need to determine whether this initial vector
5117 load should be generated at the preheader of the inner-loop, or can be
5118 generated at the preheader of LOOP. If the memory access has no evolution
5119 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5120 to be generated inside LOOP (in the preheader of the inner-loop). */
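/* E.g. (hypothetical code): when vectorizing the outer loop of

   for (i = 0; i < n; i++)    <-- LOOP
     for (j = 0; j < m; j++)
       ... = a[j];

   the access a[j] has no evolution in LOOP, so the initial vector load can
   be generated in the preheader of LOOP; an access like a[i+j] would
   instead require it in the preheader of the inner-loop. */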
5122 if (nested_in_vect_loop)
5124 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5125 bool invariant_in_outerloop =
5126 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5127 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5129 else
5130 loop_for_initial_load = loop;
5131 if (at_loop)
5132 *at_loop = loop_for_initial_load;
5134 /* 3. For the case of the optimized realignment, create the first vector
5135 load at the loop preheader. */
5137 if (alignment_support_scheme == dr_explicit_realign_optimized)
5139 /* Create msq_init = *(floor(p1)) in the loop preheader */
5141 gcc_assert (!compute_in_loop);
5142 pe = loop_preheader_edge (loop_for_initial_load);
5143 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5144 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5145 &init_addr, &inc, true, NULL_TREE, &inv_p);
5146 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5147 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5148 new_temp = make_ssa_name (vec_dest, new_stmt);
5149 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5150 mark_symbols_for_renaming (new_stmt);
5151 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5152 gcc_assert (!new_bb);
5153 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5156 /* 4. Create realignment token using a target builtin, if available.
5157 It is done either inside the containing loop, or before LOOP (as
5158 determined above). */
5160 if (targetm.vectorize.builtin_mask_for_load)
5162 tree builtin_decl;
5164 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5165 if (compute_in_loop)
5166 gcc_assert (init_addr); /* already computed by the caller. */
5167 else
5169 /* Generate the INIT_ADDR computation outside LOOP. */
5170 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5171 NULL_TREE, loop);
5172 pe = loop_preheader_edge (loop);
5173 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5174 gcc_assert (!new_bb);
5177 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5178 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5179 vec_dest = vect_create_destination_var (scalar_dest,
5180 TREE_TYPE (new_stmt));
5181 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5182 new_temp = make_ssa_name (vec_dest, new_stmt);
5183 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5185 if (compute_in_loop)
5186 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5187 else
5189 /* Generate the misalignment computation outside LOOP. */
5190 pe = loop_preheader_edge (loop);
5191 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5192 gcc_assert (!new_bb);
5195 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5197 /* The result of the CALL_EXPR to this builtin is determined from
5198 the value of the parameter, and no global variables are touched,
5199 which makes the builtin a "const" function. Requiring the
5200 builtin to have the "const" attribute makes it unnecessary
5201 to call mark_call_clobbered. */
5202 gcc_assert (TREE_READONLY (builtin_decl));
5205 if (alignment_support_scheme == dr_explicit_realign)
5206 return msq;
5208 gcc_assert (!compute_in_loop);
5209 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5212 /* 5. Create msq = phi <msq_init, lsq> in loop */
5214 pe = loop_preheader_edge (containing_loop);
5215 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5216 msq = make_ssa_name (vec_dest, NULL_TREE);
5217 phi_stmt = create_phi_node (msq, containing_loop->header);
5218 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5219 add_phi_arg (phi_stmt, msq_init, pe);
5221 return msq;
5225 /* Function vect_strided_load_supported.
5227 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5228 and FALSE otherwise. */
5230 static bool
5231 vect_strided_load_supported (tree vectype)
5233 optab perm_even_optab, perm_odd_optab;
5234 int mode;
5236 mode = (int) TYPE_MODE (vectype);
5238 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5239 if (!perm_even_optab)
5241 if (vect_print_dump_info (REPORT_DETAILS))
5242 fprintf (vect_dump, "no optab for perm_even.");
5243 return false;
5246 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5248 if (vect_print_dump_info (REPORT_DETAILS))
5249 fprintf (vect_dump, "perm_even op not supported by target.");
5250 return false;
5253 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5254 if (!perm_odd_optab)
5256 if (vect_print_dump_info (REPORT_DETAILS))
5257 fprintf (vect_dump, "no optab for perm_odd.");
5258 return false;
5261 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5263 if (vect_print_dump_info (REPORT_DETAILS))
5264 fprintf (vect_dump, "perm_odd op not supported by target.");
5265 return false;
5267 return true;
5271 /* Function vect_permute_load_chain.
5273 Given a chain of interleaved loads in DR_CHAIN, whose LENGTH must be
5274 a power of 2, generate extract_even/odd stmts to reorder the input data
5275 correctly. Return the final references for loads in RESULT_CHAIN.
5277 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5278 The input is 4 vectors each containing 8 elements. We assign a number to each
5279 element; the input sequence is:
5281 1st vec: 0 1 2 3 4 5 6 7
5282 2nd vec: 8 9 10 11 12 13 14 15
5283 3rd vec: 16 17 18 19 20 21 22 23
5284 4th vec: 24 25 26 27 28 29 30 31
5286 The output sequence should be:
5288 1st vec: 0 4 8 12 16 20 24 28
5289 2nd vec: 1 5 9 13 17 21 25 29
5290 3rd vec: 2 6 10 14 18 22 26 30
5291 4th vec: 3 7 11 15 19 23 27 31
5293 i.e., the first output vector should contain the first elements of each
5294 interleaving group, etc.
5296 We use extract_even/odd instructions to create such output. The input of each
5297 extract_even/odd operation is two vectors
5298 1st vec 2nd vec
5299 0 1 2 3 4 5 6 7
5301 and the output is the vector of extracted even/odd elements. The output of
5302 extract_even will be: 0 2 4 6
5303 and of extract_odd: 1 3 5 7
5306 The permutation is done in log LENGTH stages. In each stage extract_even and
5307 extract_odd stmts are created for each pair of vectors in DR_CHAIN, in
5308 order. In our example,
5310 E1: extract_even (1st vec, 2nd vec)
5311 E2: extract_odd (1st vec, 2nd vec)
5312 E3: extract_even (3rd vec, 4th vec)
5313 E4: extract_odd (3rd vec, 4th vec)
5315 The output for the first stage will be:
5317 E1: 0 2 4 6 8 10 12 14
5318 E2: 1 3 5 7 9 11 13 15
5319 E3: 16 18 20 22 24 26 28 30
5320 E4: 17 19 21 23 25 27 29 31
5322 In order to proceed and create the correct sequence for the next stage (or
5323 for the correct output, if the second stage is the last one, as in our
5324 example), we first put the output of extract_even operation and then the
5325 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5326 The input for the second stage is:
5328 1st vec (E1): 0 2 4 6 8 10 12 14
5329 2nd vec (E3): 16 18 20 22 24 26 28 30
5330 3rd vec (E2): 1 3 5 7 9 11 13 15
5331 4th vec (E4): 17 19 21 23 25 27 29 31
5333 The output of the second stage:
5335 E1: 0 4 8 12 16 20 24 28
5336 E2: 2 6 10 14 18 22 26 30
5337 E3: 1 5 9 13 17 21 25 29
5338 E4: 3 7 11 15 19 23 27 31
5340 And RESULT_CHAIN after reordering:
5342 1st vec (E1): 0 4 8 12 16 20 24 28
5343 2nd vec (E3): 1 5 9 13 17 21 25 29
5344 3rd vec (E2): 2 6 10 14 18 22 26 30
5345 4th vec (E4): 3 7 11 15 19 23 27 31. */
5347 static bool
5348 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5349 unsigned int length,
5350 tree stmt,
5351 block_stmt_iterator *bsi,
5352 VEC(tree,heap) **result_chain)
5354 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5355 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5356 tree tmp;
5357 int i;
5358 unsigned int j;
5360 /* Check that the operation is supported. */
5361 if (!vect_strided_load_supported (vectype))
5362 return false;
5364 *result_chain = VEC_copy (tree, heap, dr_chain);
5365 for (i = 0; i < exact_log2 (length); i++)
5367 for (j = 0; j < length; j += 2)
5369 first_vect = VEC_index (tree, dr_chain, j);
5370 second_vect = VEC_index (tree, dr_chain, j+1);
5372 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5373 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5374 DECL_GIMPLE_REG_P (perm_dest) = 1;
5375 add_referenced_var (perm_dest);
5377 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5378 first_vect, second_vect);
5379 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5381 data_ref = make_ssa_name (perm_dest, perm_stmt);
5382 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5383 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5384 mark_symbols_for_renaming (perm_stmt);
5386 VEC_replace (tree, *result_chain, j/2, data_ref);
5388 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5389 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5390 DECL_GIMPLE_REG_P (perm_dest) = 1;
5391 add_referenced_var (perm_dest);
5393 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5394 first_vect, second_vect);
5395 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5396 data_ref = make_ssa_name (perm_dest, perm_stmt);
5397 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5398 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5399 mark_symbols_for_renaming (perm_stmt);
5401 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5403 dr_chain = VEC_copy (tree, heap, *result_chain);
5405 return true;
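#if 0
/* Illustrative sketch only -- not part of the vectorizer. It models one
   extract_even/extract_odd step on plain integer arrays, where each
   "vector" holds NUNITS elements (NUNITS here is a hypothetical value).
   EVEN[k] and ODD[k] receive elements 2k and 2k+1 of the concatenation A|B,
   matching the VEC_EXTRACT_EVEN_EXPR/VEC_EXTRACT_ODD_EXPR semantics used
   above. In each stage, vect_permute_load_chain stores the even result at
   index j/2 and the odd result at index j/2 + length/2 of RESULT_CHAIN. */
#define NUNITS 8

static void
model_extract_even_odd (const int *a, const int *b, int *even, int *odd)
{
  int k;
  for (k = 0; k < NUNITS; k++)
    {
      int e = 2 * k, o = 2 * k + 1;
      even[k] = (e < NUNITS) ? a[e] : b[e - NUNITS];
      odd[k] = (o < NUNITS) ? a[o] : b[o - NUNITS];
    }
}
#endif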
5409 /* Function vect_transform_strided_load.
5411 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5412 to perform their permutation, and record the resulting vectorized statements
5413 in the corresponding scalar statements.
5414 */
5416 static bool
5417 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5418 block_stmt_iterator *bsi)
5420 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5421 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5422 tree next_stmt, new_stmt;
5423 VEC(tree,heap) *result_chain = NULL;
5424 unsigned int i, gap_count;
5425 tree tmp_data_ref;
5427 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5428 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5429 vectors, that are ready for vector computation. */
5430 result_chain = VEC_alloc (tree, heap, size);
5431 /* Permute. */
5432 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5433 return false;
5435 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5436 Since we scan the chain starting from its first node, their order
5437 corresponds to the order of data-refs in RESULT_CHAIN. */
5438 next_stmt = first_stmt;
5439 gap_count = 1;
5440 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5442 if (!next_stmt)
5443 break;
5445 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5446 code elimination pass later.
5447 DR_GROUP_GAP is the number of steps in elements from the previous
5448 access (if there is no gap, DR_GROUP_GAP is 1). We skip loads that
5449 correspond to the gaps.
5450 */
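/* E.g. (hypothetical): for a group accessing a[4i] and a[4i+3], the second
   stmt has DR_GROUP_GAP 3, so the two loads generated for the unused
   elements between them are skipped here and later removed as dead code. */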
5451 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5453 gap_count++;
5454 continue;
5457 while (next_stmt)
5459 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5460 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5461 copies, and we put the new vector statement in the first available
5462 RELATED_STMT. */
5463 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5464 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5465 else
5467 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5468 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5469 vinfo_for_stmt (prev_stmt));
5470 while (rel_stmt)
5472 prev_stmt = rel_stmt;
5473 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5475 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5477 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5478 gap_count = 1;
5479 /* If NEXT_STMT accesses the same DR as the previous statement,
5480 put the same TMP_DATA_REF as its vectorized statement; otherwise
5481 get the next data-ref from RESULT_CHAIN. */
5482 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5483 break;
5487 VEC_free (tree, heap, result_chain);
5488 return true;
5492 /* vectorizable_load.
5494 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5495 can be vectorized.
5496 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5497 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5498 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5500 bool
5501 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5502 slp_tree slp_node)
5504 tree scalar_dest;
5505 tree vec_dest = NULL;
5506 tree data_ref = NULL;
5507 tree op;
5508 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5509 stmt_vec_info prev_stmt_info;
5510 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5511 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5512 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5513 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5514 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5515 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5516 tree new_temp;
5517 int mode;
5518 tree new_stmt = NULL_TREE;
5519 tree dummy;
5520 enum dr_alignment_support alignment_support_scheme;
5521 tree dataref_ptr = NULL_TREE;
5522 tree ptr_incr;
5523 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5524 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5525 int i, j, group_size;
5526 tree msq = NULL_TREE, lsq;
5527 tree offset = NULL_TREE;
5528 tree realignment_token = NULL_TREE;
5529 tree phi = NULL_TREE;
5530 VEC(tree,heap) *dr_chain = NULL;
5531 bool strided_load = false;
5532 tree first_stmt;
5533 tree scalar_type;
5534 bool inv_p;
5535 bool compute_in_loop = false;
5536 struct loop *at_loop;
5537 int vec_num;
5538 bool slp = (slp_node != NULL);
5540 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5541 this, so we can safely override NCOPIES with 1 here. */
5542 if (slp)
5543 ncopies = 1;
5545 gcc_assert (ncopies >= 1);
5547 /* FORNOW. This restriction should be relaxed. */
5548 if (nested_in_vect_loop && ncopies > 1)
5550 if (vect_print_dump_info (REPORT_DETAILS))
5551 fprintf (vect_dump, "multiple types in nested loop.");
5552 return false;
5555 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5556 return false;
5558 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5559 return false;
5561 /* Is vectorizable load? */
5562 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5563 return false;
5565 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5566 if (TREE_CODE (scalar_dest) != SSA_NAME)
5567 return false;
5569 op = GIMPLE_STMT_OPERAND (stmt, 1);
5570 if (TREE_CODE (op) != ARRAY_REF
5571 && TREE_CODE (op) != INDIRECT_REF
5572 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5573 return false;
5575 if (!STMT_VINFO_DATA_REF (stmt_info))
5576 return false;
5578 scalar_type = TREE_TYPE (DR_REF (dr));
5579 mode = (int) TYPE_MODE (vectype);
5581 /* FORNOW. In some cases we can vectorize even if the data type is not
5582 supported (e.g., data copies). */
5583 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5585 if (vect_print_dump_info (REPORT_DETAILS))
5586 fprintf (vect_dump, "Aligned load, but unsupported type.");
5587 return false;
5590 /* Check if the load is a part of an interleaving chain. */
5591 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5593 strided_load = true;
5594 /* FORNOW */
5595 gcc_assert (! nested_in_vect_loop);
5597 /* Check if interleaving is supported. */
5598 if (!vect_strided_load_supported (vectype)
5599 && !PURE_SLP_STMT (stmt_info) && !slp)
5600 return false;
5603 if (!vec_stmt) /* transformation not required. */
5605 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5606 vect_model_load_cost (stmt_info, ncopies, NULL);
5607 return true;
5610 if (vect_print_dump_info (REPORT_DETAILS))
5611 fprintf (vect_dump, "transform load.");
5613 /** Transform. **/
5615 if (strided_load)
5617 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5618 /* Check if the chain of loads is already vectorized. */
5619 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5621 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5622 return true;
5624 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5625 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5626 dr_chain = VEC_alloc (tree, heap, group_size);
5628 /* VEC_NUM is the number of vect stmts to be created for this group. */
5629 if (slp)
5631 strided_load = false;
5632 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5634 else
5635 vec_num = group_size;
5637 else
5639 first_stmt = stmt;
5640 first_dr = dr;
5641 group_size = vec_num = 1;
5644 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5645 gcc_assert (alignment_support_scheme);
5647 /* In case the vectorization factor (VF) is bigger than the number
5648 of elements that we can fit in a vectype (nunits), we have to generate
5649 more than one vector stmt - i.e., we need to "unroll" the
5650 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5651 from one copy of the vector stmt to the next, in the field
5652 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5653 stages to find the correct vector defs to be used when vectorizing
5654 stmts that use the defs of the current stmt. The example below illustrates
5655 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
5656 4 vectorized stmts):
5658 before vectorization:
5659 RELATED_STMT VEC_STMT
5660 S1: x = memref - -
5661 S2: z = x + 1 - -
5663 step 1: vectorize stmt S1:
5664 We first create the vector stmt VS1_0, and, as usual, record a
5665 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5666 Next, we create the vector stmt VS1_1, and record a pointer to
5667 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5668 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5669 stmts and pointers:
5670 RELATED_STMT VEC_STMT
5671 VS1_0: vx0 = memref0 VS1_1 -
5672 VS1_1: vx1 = memref1 VS1_2 -
5673 VS1_2: vx2 = memref2 VS1_3 -
5674 VS1_3: vx3 = memref3 - -
5675 S1: x = load - VS1_0
5676 S2: z = x + 1 - -
5678 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5679 information we recorded in RELATED_STMT field is used to vectorize
5680 stmt S2. */
5682 /* In case of interleaving (non-unit strided access):
5684 S1: x2 = &base + 2
5685 S2: x0 = &base
5686 S3: x1 = &base + 1
5687 S4: x3 = &base + 3
5689 Vectorized loads are created in the order of memory accesses
5690 starting from the access of the first stmt of the chain:
5692 VS1: vx0 = &base
5693 VS2: vx1 = &base + vec_size*1
5694 VS3: vx2 = &base + vec_size*2
5695 VS4: vx3 = &base + vec_size*3
5697 Then permutation statements are generated:
5699 VS5: vx4 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5700 VS6: vx5 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5703 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5704 (the order of the data-refs in the output of vect_permute_load_chain
5705 corresponds to the order of scalar stmts in the interleaving chain - see
5706 the documentation of vect_permute_load_chain()).
5707 The generation of permutation stmts and recording them in
5708 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5710 In case of both multiple types and interleaving, the vector loads and
5711 permutation stmts above are created for every copy. The result vector stmts
5712 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5713 STMT_VINFO_RELATED_STMT for the next copies. */
5715 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5716 on a target that supports unaligned accesses (dr_unaligned_supported)
5717 we generate the following code:
5718 p = initial_addr;
5719 indx = 0;
5720 loop {
5721 p = p + indx * vectype_size;
5722 vec_dest = *(p);
5723 indx = indx + 1;
5724 }
5726 Otherwise, the data reference is potentially unaligned on a target that
5727 does not support unaligned accesses (dr_explicit_realign_optimized);
5728 we then generate the following code, in which the data in each iteration is
5729 obtained by two vector loads, one from the previous iteration, and one
5730 from the current iteration:
5731 p1 = initial_addr;
5732 msq_init = *(floor(p1))
5733 p2 = initial_addr + VS - 1;
5734 realignment_token = call target_builtin;
5735 indx = 0;
5736 loop {
5737 p2 = p2 + indx * vectype_size
5738 lsq = *(floor(p2))
5739 vec_dest = realign_load (msq, lsq, realignment_token)
5740 indx = indx + 1;
5741 msq = lsq;
5742 } */
5744 /* If the misalignment remains the same throughout the execution of the
5745 loop, we can create the init_addr and permutation mask at the loop
5746 preheader. Otherwise, it needs to be created inside the loop.
5747 This can only occur when vectorizing memory accesses in the inner-loop
5748 nested within an outer-loop that is being vectorized. */
5750 if (nested_in_vect_loop_p (loop, stmt)
5751 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5753 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5754 compute_in_loop = true;
5757 if ((alignment_support_scheme == dr_explicit_realign_optimized
5758 || alignment_support_scheme == dr_explicit_realign)
5759 && !compute_in_loop)
5761 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5762 alignment_support_scheme, NULL_TREE,
5763 &at_loop);
5764 if (alignment_support_scheme == dr_explicit_realign_optimized)
5766 phi = SSA_NAME_DEF_STMT (msq);
5767 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5770 else
5771 at_loop = loop;
5773 prev_stmt_info = NULL;
5774 for (j = 0; j < ncopies; j++)
5776 /* 1. Create the vector pointer update chain. */
5777 if (j == 0)
5778 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5779 at_loop, offset,
5780 &dummy, &ptr_incr, false,
5781 NULL_TREE, &inv_p);
5782 else
5783 dataref_ptr =
5784 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5786 for (i = 0; i < vec_num; i++)
5788 if (i > 0)
5789 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5790 NULL_TREE);
5792 /* 2. Create the vector-load in the loop. */
5793 switch (alignment_support_scheme)
5795 case dr_aligned:
5796 gcc_assert (aligned_access_p (first_dr));
5797 data_ref = build_fold_indirect_ref (dataref_ptr);
5798 break;
5799 case dr_unaligned_supported:
5801 int mis = DR_MISALIGNMENT (first_dr);
5802 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5804 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5805 data_ref =
5806 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5807 break;
5809 case dr_explicit_realign:
5811 tree ptr, bump;
5812 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5814 if (compute_in_loop)
5815 msq = vect_setup_realignment (first_stmt, bsi,
5816 &realignment_token,
5817 dr_explicit_realign,
5818 dataref_ptr, NULL);
5820 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5821 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5822 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5823 new_temp = make_ssa_name (vec_dest, new_stmt);
5824 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5825 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5826 copy_virtual_operands (new_stmt, stmt);
5827 mark_symbols_for_renaming (new_stmt);
5828 msq = new_temp;
5830 bump = size_binop (MULT_EXPR, vs_minus_1,
5831 TYPE_SIZE_UNIT (scalar_type));
5832 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5833 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5834 break;
5836 case dr_explicit_realign_optimized:
5837 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5838 break;
5839 default:
5840 gcc_unreachable ();
5842 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5843 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5844 new_temp = make_ssa_name (vec_dest, new_stmt);
5845 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5846 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5847 mark_symbols_for_renaming (new_stmt);
5849 /* 3. Handle explicit realignment if necessary/supported. Create in
5850 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5851 if (alignment_support_scheme == dr_explicit_realign_optimized
5852 || alignment_support_scheme == dr_explicit_realign)
5854 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5855 if (!realignment_token)
5856 realignment_token = dataref_ptr;
5857 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5858 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5859 realignment_token);
5860 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5861 new_temp = make_ssa_name (vec_dest, new_stmt);
5862 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5863 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5865 if (alignment_support_scheme == dr_explicit_realign_optimized)
5867 if (i == vec_num - 1 && j == ncopies - 1)
5868 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5869 msq = lsq;
5873 /* 4. Handle invariant-load. */
5874 if (inv_p)
5876 gcc_assert (!strided_load);
5877 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5878 if (j == 0)
5880 int k;
5881 tree t = NULL_TREE;
5882 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5884 /* CHECKME: bitpos depends on endianness? */
5885 bitpos = bitsize_zero_node;
5886 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5887 bitsize, bitpos);
5888 vec_dest =
5889 vect_create_destination_var (scalar_dest, NULL_TREE);
5890 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5891 new_temp = make_ssa_name (vec_dest, new_stmt);
5892 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5893 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5895 for (k = nunits - 1; k >= 0; --k)
5896 t = tree_cons (NULL_TREE, new_temp, t);
5897 /* FIXME: use build_constructor directly. */
5898 vec_inv = build_constructor_from_list (vectype, t);
5899 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5900 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5902 else
5903 gcc_unreachable (); /* FORNOW. */
5906 /* Collect vector loads and later create their permutation in
5907 vect_transform_strided_load (). */
5908 if (strided_load)
5909 VEC_quick_push (tree, dr_chain, new_temp);
5911 /* Store vector loads in the corresponding SLP_NODE. */
5912 if (slp)
5913 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5916 /* FORNOW: SLP with multiple types is unsupported. */
5917 if (slp)
5918 return true;
5920 if (strided_load)
5922 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5923 return false;
5924 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5925 VEC_free (tree, heap, dr_chain);
5926 dr_chain = VEC_alloc (tree, heap, group_size);
5928 else
5930 if (j == 0)
5931 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5932 else
5933 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5934 prev_stmt_info = vinfo_for_stmt (new_stmt);
5938 if (dr_chain)
5939 VEC_free (tree, heap, dr_chain);
5941 return true;
5945 /* Function vectorizable_live_operation.
5947 STMT computes a value that is used outside the loop. Check if
5948 it can be supported. */
5950 bool
5951 vectorizable_live_operation (tree stmt,
5952 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5953 tree *vec_stmt ATTRIBUTE_UNUSED)
5955 tree operation;
5956 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5957 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5958 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5959 int i;
5960 int op_type;
5961 tree op;
5962 tree def, def_stmt;
5963 enum vect_def_type dt;
5965 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5967 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5968 return false;
5970 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5971 return false;
5973 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5974 return false;
5976 /* FORNOW. CHECKME. */
5977 if (nested_in_vect_loop_p (loop, stmt))
5978 return false;
5980 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5981 op_type = TREE_OPERAND_LENGTH (operation);
5983 /* FORNOW: support only if all uses are invariant. This means
5984 that the scalar operations can remain in place, unvectorized.
5985 The original last scalar value that they compute will be used. */
5987 for (i = 0; i < op_type; i++)
5989 op = TREE_OPERAND (operation, i);
5990 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5992 if (vect_print_dump_info (REPORT_DETAILS))
5993 fprintf (vect_dump, "use not simple.");
5994 return false;
5997 if (dt != vect_invariant_def && dt != vect_constant_def)
5998 return false;
6001 /* No transformation is required for the cases we currently support. */
6002 return true;
6006 /* Function vect_is_simple_cond.
6008 Input:
6009 LOOP - the loop that is being vectorized.
6010 COND - Condition that is checked for simple use.
6012 Returns whether a COND can be vectorized. Checks whether
6013 condition operands are supportable using vect_is_simple_use. */
6015 static bool
6016 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6018 tree lhs, rhs;
6019 tree def;
6020 enum vect_def_type dt;
6022 if (!COMPARISON_CLASS_P (cond))
6023 return false;
6025 lhs = TREE_OPERAND (cond, 0);
6026 rhs = TREE_OPERAND (cond, 1);
6028 if (TREE_CODE (lhs) == SSA_NAME)
6030 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6031 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6032 return false;
6034 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6035 && TREE_CODE (lhs) != FIXED_CST)
6036 return false;
6038 if (TREE_CODE (rhs) == SSA_NAME)
6040 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6041 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6042 return false;
6044 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6045 && TREE_CODE (rhs) != FIXED_CST)
6046 return false;
6048 return true;
6051 /* vectorizable_condition.
6053 Check if STMT is conditional modify expression that can be vectorized.
6054 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6055 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6056 at BSI.
6058 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
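/* E.g. (illustrative): the scalar statement

   x = (a < b) ? c : d

   is vectorized below as

   vx = VEC_COND_EXPR <va < vb, vc, vd>

   where va, vb, vc and vd are the vector defs of the operands. */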
6060 bool
6061 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6063 tree scalar_dest = NULL_TREE;
6064 tree vec_dest = NULL_TREE;
6065 tree op = NULL_TREE;
6066 tree cond_expr, then_clause, else_clause;
6067 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6068 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6069 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6070 tree vec_compare, vec_cond_expr;
6071 tree new_temp;
6072 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6073 enum machine_mode vec_mode;
6074 tree def;
6075 enum vect_def_type dt;
6076 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6077 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6079 gcc_assert (ncopies >= 1);
6080 if (ncopies > 1)
6081 return false; /* FORNOW */
6083 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6084 return false;
6086 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6087 return false;
6089 /* FORNOW: SLP not supported. */
6090 if (STMT_SLP_TYPE (stmt_info))
6091 return false;
6093 /* FORNOW: not yet supported. */
6094 if (STMT_VINFO_LIVE_P (stmt_info))
6096 if (vect_print_dump_info (REPORT_DETAILS))
6097 fprintf (vect_dump, "value used after loop.");
6098 return false;
6101 /* Is vectorizable conditional operation? */
6102 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6103 return false;
6105 op = GIMPLE_STMT_OPERAND (stmt, 1);
6107 if (TREE_CODE (op) != COND_EXPR)
6108 return false;
6110 cond_expr = TREE_OPERAND (op, 0);
6111 then_clause = TREE_OPERAND (op, 1);
6112 else_clause = TREE_OPERAND (op, 2);
6114 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6115 return false;
6117 /* We do not handle two different vector types for the condition
6118 and the values. */
6119 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6120 return false;
6122 if (TREE_CODE (then_clause) == SSA_NAME)
6124 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6125 if (!vect_is_simple_use (then_clause, loop_vinfo,
6126 &then_def_stmt, &def, &dt))
6127 return false;
6129 else if (TREE_CODE (then_clause) != INTEGER_CST
6130 && TREE_CODE (then_clause) != REAL_CST
6131 && TREE_CODE (then_clause) != FIXED_CST)
6132 return false;
6134 if (TREE_CODE (else_clause) == SSA_NAME)
6136 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6137 if (!vect_is_simple_use (else_clause, loop_vinfo,
6138 &else_def_stmt, &def, &dt))
6139 return false;
6141 else if (TREE_CODE (else_clause) != INTEGER_CST
6142 && TREE_CODE (else_clause) != REAL_CST
6143 && TREE_CODE (else_clause) != FIXED_CST)
6144 return false;
6147 vec_mode = TYPE_MODE (vectype);
6149 if (!vec_stmt)
6151 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6152 return expand_vec_cond_expr_p (op, vec_mode);
6155 /* Transform */
6157 /* Handle def. */
6158 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6159 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6161 /* Handle cond expr. */
6162 vec_cond_lhs =
6163 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6164 vec_cond_rhs =
6165 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6166 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6167 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6169 /* Arguments are ready. Create the new vector stmt. */
6170 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6171 vec_cond_lhs, vec_cond_rhs);
6172 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6173 vec_compare, vec_then_clause, vec_else_clause);
6175 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6176 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6177 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6178 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6180 return true;
6184 /* Function vect_transform_stmt.
6186 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6188 static bool
6189 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6190 slp_tree slp_node)
6192 bool is_store = false;
6193 tree vec_stmt = NULL_TREE;
6194 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6195 tree orig_stmt_in_pattern;
6196 bool done;
6198 switch (STMT_VINFO_TYPE (stmt_info))
6200 case type_demotion_vec_info_type:
6201 gcc_assert (!slp_node);
6202 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6203 gcc_assert (done);
6204 break;
6206 case type_promotion_vec_info_type:
6207 gcc_assert (!slp_node);
6208 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6209 gcc_assert (done);
6210 break;
6212 case type_conversion_vec_info_type:
6213 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6214 gcc_assert (done);
6215 break;
6217 case induc_vec_info_type:
6218 gcc_assert (!slp_node);
6219 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6220 gcc_assert (done);
6221 break;
6223 case op_vec_info_type:
6224 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6225 gcc_assert (done);
6226 break;
6228 case assignment_vec_info_type:
6229 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6230 gcc_assert (done);
6231 break;
6233 case load_vec_info_type:
6234 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6235 gcc_assert (done);
6236 break;
6238 case store_vec_info_type:
6239 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6240 gcc_assert (done);
6241 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6243 /* In case of interleaving, the whole chain is vectorized when the
6244 last store in the chain is reached. Store stmts before the last
6245 one are skipped, and their stmt_vec_info shouldn't be freed
6246 meanwhile. */
6247 *strided_store = true;
6248 if (STMT_VINFO_VEC_STMT (stmt_info))
6249 is_store = true;
6251 else
6252 is_store = true;
6253 break;
6255 case condition_vec_info_type:
6256 gcc_assert (!slp_node);
6257 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6258 gcc_assert (done);
6259 break;
6261 case call_vec_info_type:
6262 gcc_assert (!slp_node);
6263 done = vectorizable_call (stmt, bsi, &vec_stmt);
6264 break;
6266 case reduc_vec_info_type:
6267 gcc_assert (!slp_node);
6268 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6269 gcc_assert (done);
6270 break;
6272 default:
6273 if (!STMT_VINFO_LIVE_P (stmt_info))
6275 if (vect_print_dump_info (REPORT_DETAILS))
6276 fprintf (vect_dump, "stmt not supported.");
6277 gcc_unreachable ();
6281 if (STMT_VINFO_LIVE_P (stmt_info)
6282 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6284 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6285 gcc_assert (done);
6288 if (vec_stmt)
6290 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6291 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6292 if (orig_stmt_in_pattern)
6294 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6295 /* STMT was inserted by the vectorizer to replace a computation idiom.
6296 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6297 computed this idiom. We need to record a pointer to VEC_STMT in
6298 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6299 documentation of vect_pattern_recog. */
6300 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6302 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6303 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6308 return is_store;
6312 /* This function builds ni_name = the number of iterations the loop
6313 executes, on the loop preheader. */
6315 static tree
6316 vect_build_loop_niters (loop_vec_info loop_vinfo)
6318 tree ni_name, stmt, var;
6319 edge pe;
6320 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6321 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6323 var = create_tmp_var (TREE_TYPE (ni), "niters");
6324 add_referenced_var (var);
6325 ni_name = force_gimple_operand (ni, &stmt, false, var);
6327 pe = loop_preheader_edge (loop);
6328 if (stmt)
6330 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6331 gcc_assert (!new_bb);
6334 return ni_name;
6338 /* This function generates the following statements:
6340 ni_name = number of iterations loop executes
6341 ratio = ni_name / vf
6342 ratio_mult_vf_name = ratio * vf
6344 and places them at the loop preheader edge. */
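/* E.g. (hypothetical numbers): for ni_name = 103 and vf = 4 this emits
   ratio = 103 >> 2 = 25 and ratio_mult_vf_name = 25 << 2 = 100; shifts can
   be used because the vectorization factor is a power of two (see the use
   of exact_log2 below). */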
6346 static void
6347 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6348 tree *ni_name_ptr,
6349 tree *ratio_mult_vf_name_ptr,
6350 tree *ratio_name_ptr)
6353 edge pe;
6354 basic_block new_bb;
6355 tree stmt, ni_name;
6356 tree var;
6357 tree ratio_name;
6358 tree ratio_mult_vf_name;
6359 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6360 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6361 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6362 tree log_vf;
6364 pe = loop_preheader_edge (loop);
6366 /* Generate temporary variable that contains
6367 number of iterations loop executes. */
6369 ni_name = vect_build_loop_niters (loop_vinfo);
6370 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6372 /* Create: ratio = ni >> log2(vf) */
6374 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6375 if (!is_gimple_val (ratio_name))
6377 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6378 add_referenced_var (var);
6380 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6381 pe = loop_preheader_edge (loop);
6382 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6383 gcc_assert (!new_bb);
6386 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6388 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6389 ratio_name, log_vf);
6390 if (!is_gimple_val (ratio_mult_vf_name))
6392 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6393 add_referenced_var (var);
6395 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6396 true, var);
6397 pe = loop_preheader_edge (loop);
6398 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6399 gcc_assert (!new_bb);
6402 *ni_name_ptr = ni_name;
6403 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6404 *ratio_name_ptr = ratio_name;
6406 return;
6410 /* Function vect_update_ivs_after_vectorizer.
6412 "Advance" the induction variables of LOOP to the value they should take
6413 after the execution of LOOP. This is currently necessary because the
6414 vectorizer does not handle induction variables that are used after the
6415 loop. Such a situation occurs when the last iterations of LOOP are
6416 peeled, because:
6417 1. We introduced new uses after LOOP for IVs that were not originally used
6418 after LOOP: the IVs of LOOP are now used by an epilog loop.
6419 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6420 times, whereas the loop IVs should be bumped N times.
6422 Input:
6423 - LOOP - a loop that is going to be vectorized. The last few iterations
6424 of LOOP were peeled.
6425 - NITERS - the number of iterations that LOOP executes (before it is
6426 vectorized), i.e., the number of times the ivs should be bumped.
6427 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6428 coming out from LOOP on which there are uses of the LOOP ivs
6429 (this is the path from LOOP->exit to epilog_loop->preheader).
6431 The new definitions of the ivs are placed in LOOP->exit.
6432 The phi args associated with the edge UPDATE_E in the bb
6433 UPDATE_E->dest are updated accordingly.
6435 Assumption 1: Like the rest of the vectorizer, this function assumes
6436 a single loop exit that has a single predecessor.
6438 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6439 organized in the same order.
6441 Assumption 3: The access function of the ivs is simple enough (see
6442 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6444 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6445 coming out of LOOP on which the ivs of LOOP are used (this is the path
6446 that leads to the epilog loop; other paths skip the epilog loop). This
6447 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6448 needs to have its phis updated. */
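/* E.g. (hypothetical): for an iv with initial value 5 and step 3, and
   NITERS = 100, the code below emits ni = 5 + 100*3 = 305 in the exit bb
   and makes the corresponding phi in update_bb start from that value. */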
6451 static void
6452 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6453 edge update_e)
6455 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6456 basic_block exit_bb = single_exit (loop)->dest;
6457 tree phi, phi1;
6458 basic_block update_bb = update_e->dest;
6460 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6462 /* Make sure there exists a single-predecessor exit bb: */
6463 gcc_assert (single_pred_p (exit_bb));
6465 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6466 phi && phi1;
6467 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6469 tree access_fn = NULL;
6470 tree evolution_part;
6471 tree init_expr;
6472 tree step_expr;
6473 tree var, ni, ni_name;
6474 block_stmt_iterator last_bsi;
6476 if (vect_print_dump_info (REPORT_DETAILS))
6478 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6479 print_generic_expr (vect_dump, phi, TDF_SLIM);
6482 /* Skip virtual phis. */
6483 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6485 if (vect_print_dump_info (REPORT_DETAILS))
6486 fprintf (vect_dump, "virtual phi. skip.");
6487 continue;
6490 /* Skip reduction phis. */
6491 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6493 if (vect_print_dump_info (REPORT_DETAILS))
6494 fprintf (vect_dump, "reduc phi. skip.");
6495 continue;
6498 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6499 gcc_assert (access_fn);
6500 evolution_part =
6501 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6502 gcc_assert (evolution_part != NULL_TREE);
6504 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6505 of degree >= 2 or exponential. */
6506 gcc_assert (!tree_is_chrec (evolution_part));
6508 step_expr = evolution_part;
6509 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6510 loop->num));
6512 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6513 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6514 init_expr,
6515 fold_convert (sizetype,
6516 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6517 niters, step_expr)));
6518 else
6519 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6520 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6521 fold_convert (TREE_TYPE (init_expr),
6522 niters),
6523 step_expr),
6524 init_expr);
6528 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6529 add_referenced_var (var);
6531 last_bsi = bsi_last (exit_bb);
6532 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6533 true, BSI_SAME_STMT);
6535 /* Fix phi expressions in the successor bb. */
6536 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6540 /* Return the more conservative threshold between the
6541 min_profitable_iters returned by the cost model and the user
6542 specified threshold, if provided. */
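/* E.g. (hypothetical numbers): with PARAM_MIN_VECT_LOOP_BOUND = 8 and
   VF = 4, min_scalar_loop_bound is 8*4-1 = 31; a cost-model estimate of 20
   iterations is then raised to 31, whereas an estimate of 40 is kept. */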
6544 static unsigned int
6545 conservative_cost_threshold (loop_vec_info loop_vinfo,
6546 int min_profitable_iters)
6548 unsigned int th;
6549 int min_scalar_loop_bound;
6551 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6552 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6554 /* Use the cost model only if it is more conservative than user specified
6555 threshold. */
6556 th = (unsigned) min_scalar_loop_bound;
6557 if (min_profitable_iters
6558 && (!min_scalar_loop_bound
6559 || min_profitable_iters > min_scalar_loop_bound))
6560 th = (unsigned) min_profitable_iters;
6562 if (th && vect_print_dump_info (REPORT_COST))
6563 fprintf (vect_dump, "Vectorization may not be profitable.");
6565 return th;
6568 /* Function vect_do_peeling_for_loop_bound
6570 Peel the last iterations of the loop represented by LOOP_VINFO.
6571 The peeled iterations form a new epilog loop. Given that the loop now
6572 iterates NITERS times, the new epilog loop iterates
6573 NITERS % VECTORIZATION_FACTOR times.
6575 The original loop will later be made to iterate
6576 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
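/* E.g. (hypothetical numbers): for NITERS = 103 and VECTORIZATION_FACTOR = 4,
   the vectorized loop iterates RATIO = 25 times and the epilog loop executes
   the remaining 103 % 4 = 3 iterations. */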
6578 static void
6579 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6581 tree ni_name, ratio_mult_vf_name;
6582 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6583 struct loop *new_loop;
6584 edge update_e;
6585 basic_block preheader;
6586 int loop_num;
6587 bool check_profitability = false;
6588 unsigned int th = 0;
6589 int min_profitable_iters;
6591 if (vect_print_dump_info (REPORT_DETAILS))
6592 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6594 initialize_original_copy_tables ();
6596 /* Generate the following variables on the preheader of original loop:
6598 ni_name = number of iterations the original loop executes
6599 ratio = ni_name / vf
6600 ratio_mult_vf_name = ratio * vf */
6601 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6602 &ratio_mult_vf_name, ratio);
6604 loop_num = loop->num;
6606 /* If the cost-model check was done neither during versioning nor
6607 during peeling for alignment. */
6608 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6609 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6610 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6612 check_profitability = true;
6614 /* Get profitability threshold for vectorized loop. */
6615 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6617 th = conservative_cost_threshold (loop_vinfo,
6618 min_profitable_iters);
6621 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6622 ratio_mult_vf_name, ni_name, false,
6623 th, check_profitability);
6624 gcc_assert (new_loop);
6625 gcc_assert (loop_num == loop->num);
6626 #ifdef ENABLE_CHECKING
6627 slpeel_verify_cfg_after_peeling (loop, new_loop);
6628 #endif
6630 /* A guard that controls whether the new_loop is to be executed or skipped
6631 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6632 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6633 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6634 is on the path where the LOOP IVs are used and need to be updated. */
6636 preheader = loop_preheader_edge (new_loop)->src;
6637 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6638 update_e = EDGE_PRED (preheader, 0);
6639 else
6640 update_e = EDGE_PRED (preheader, 1);
6642 /* Update IVs of original loop as if they were advanced
6643 by ratio_mult_vf_name steps. */
6644 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6646 /* After peeling we have to reset scalar evolution analyzer. */
6647 scev_reset ();
6649 free_original_copy_tables ();
6653 /* Function vect_gen_niters_for_prolog_loop
6655 Set the number of iterations for the loop represented by LOOP_VINFO
6656 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6657 and the misalignment of DR - the data reference recorded in
6658 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6659 this loop, the data reference DR will refer to an aligned location.
6661 The following computation is generated:
6663 If the misalignment of DR is known at compile time:
6664 addr_mis = int mis = DR_MISALIGNMENT (dr);
6665 Else, compute address misalignment in bytes:
6666 addr_mis = addr & (vectype_size - 1)
6668 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6670 (elem_size = element type size; an element is the scalar element
6671 whose type is the inner type of the vectype)
6673 For interleaving,
6675 prolog_niters = min ( LOOP_NITERS ,
6676 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6677 where group_size is the size of the interleaved group.
6679 The above formulas assume that VF == number of elements in the vector. This
6680 may not hold when there are multiple-types in the loop.
6681 In this case, for some data-references in the loop the VF does not represent
6682 the number of elements that fit in the vector. Therefore, instead of VF we
6683 use TYPE_VECTOR_SUBPARTS. */
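/* E.g. (hypothetical numbers): for a V4SI access (VF = 4, elem_size = 4,
   16-byte vectors) with addr_mis = 8 bytes, the misalignment is 2 elements,
   so prolog_niters = (4 - 8/4) & (4-1) = 2; after 2 peeled iterations the
   access is 16-byte aligned. */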
6685 static tree
6686 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6688 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6689 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6690 tree var, stmt;
6691 tree iters, iters_name;
6692 edge pe;
6693 basic_block new_bb;
6694 tree dr_stmt = DR_STMT (dr);
6695 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6696 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6697 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6698 tree niters_type = TREE_TYPE (loop_niters);
6699 int group_size = 1;
6700 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6701 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6703 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6705 /* For interleaved access, the element size must be multiplied by the
6706 size of the interleaved group. */
6707 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6708 DR_GROUP_FIRST_DR (stmt_info)));
6709 element_size *= group_size;
6712 pe = loop_preheader_edge (loop);
6714 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6716 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6717 int elem_misalign = byte_misalign / element_size;
6719 if (vect_print_dump_info (REPORT_DETAILS))
6720 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6721 iters = build_int_cst (niters_type,
6722 (nelements - elem_misalign)&(nelements/group_size-1));
6724 else
6726 tree new_stmts = NULL_TREE;
6727 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6728 &new_stmts, NULL_TREE, loop);
6729 tree ptr_type = TREE_TYPE (start_addr);
6730 tree size = TYPE_SIZE (ptr_type);
6731 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6732 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6733 tree elem_size_log =
6734 build_int_cst (type, exact_log2 (vectype_align/nelements));
6735 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6736 tree nelements_tree = build_int_cst (type, nelements);
6737 tree byte_misalign;
6738 tree elem_misalign;
6740 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6741 gcc_assert (!new_bb);
6743 /* Create: byte_misalign = addr & (vectype_size - 1) */
6744 byte_misalign =
6745 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6747 /* Create: elem_misalign = byte_misalign / element_size */
6748 elem_misalign =
6749 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6751 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6752 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6753 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6754 iters = fold_convert (niters_type, iters);
6757 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6758 /* If the loop bound is known at compile time we already verified that it is
6759 greater than vf; since the misalignment ('iters') is at most vf, there's
6760 no need to generate the MIN_EXPR in this case. */
6761 if (TREE_CODE (loop_niters) != INTEGER_CST)
6762 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6764 if (vect_print_dump_info (REPORT_DETAILS))
6766 fprintf (vect_dump, "niters for prolog loop: ");
6767 print_generic_expr (vect_dump, iters, TDF_SLIM);
6770 var = create_tmp_var (niters_type, "prolog_loop_niters");
6771 add_referenced_var (var);
6772 iters_name = force_gimple_operand (iters, &stmt, false, var);
6774 /* Insert stmt on loop preheader edge. */
6775 if (stmt)
6777 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6778 gcc_assert (!new_bb);
6781 return iters_name;
6785 /* Function vect_update_init_of_dr
6787 NITERS iterations were peeled from LOOP. DR represents a data reference
6788 in LOOP. This function updates the information recorded in DR to
6789 account for the fact that the first NITERS iterations had already been
6790 executed. Specifically, it updates the OFFSET field of DR. */
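/* E.g. (hypothetical): if NITERS = 3 iterations were peeled and DR_STEP is
   4 bytes, OFFSET is advanced by 12 below, so DR describes the first access
   executed by the unpeeled loop. */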
6792 static void
6793 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6795 tree offset = DR_OFFSET (dr);
6797 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6798 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6799 DR_OFFSET (dr) = offset;
6803 /* Function vect_update_inits_of_drs
6805 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6806 This function updates the information recorded for the data references in
6807 the loop to account for the fact that the first NITERS iterations had
6808 already been executed. Specifically, it updates the initial_condition of
6809 the access_function of all the data_references in the loop. */
6811 static void
6812 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6814 unsigned int i;
6815 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6816 struct data_reference *dr;
6818 if (vect_print_dump_info (REPORT_DETAILS))
6819 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6821 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6822 vect_update_init_of_dr (dr, niters);
6826 /* Function vect_do_peeling_for_alignment
6828 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6829 'niters' is set to the misalignment of one of the data references in the
6830 loop, thereby forcing it to refer to an aligned location at the beginning
6831 of the execution of this loop. The data reference for which we are
6832 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6834 static void
6835 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6837 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6838 tree niters_of_prolog_loop, ni_name;
6839 tree n_iters;
6840 struct loop *new_loop;
6841 bool check_profitability = false;
6842 unsigned int th = 0;
6843 int min_profitable_iters;
6845 if (vect_print_dump_info (REPORT_DETAILS))
6846 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6848 initialize_original_copy_tables ();
6850 ni_name = vect_build_loop_niters (loop_vinfo);
6851 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6854 /* If the cost-model check was not done during versioning. */
6855 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6856 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6858 check_profitability = true;
6860 /* Get profitability threshold for vectorized loop. */
6861 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6863 th = conservative_cost_threshold (loop_vinfo,
6864 min_profitable_iters);
6867 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
6868 new_loop =
6869 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6870 niters_of_prolog_loop, ni_name, true,
6871 th, check_profitability);
6873 gcc_assert (new_loop);
6874 #ifdef ENABLE_CHECKING
6875 slpeel_verify_cfg_after_peeling (new_loop, loop);
6876 #endif
6878 /* Update the number of times the loop executes. */
6879 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6880 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6881 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6883 /* Update the init conditions of the access functions of all data refs. */
6884 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6886 /* After peeling we have to reset scalar evolution analyzer. */
6887 scev_reset ();
6889 free_original_copy_tables ();
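/* A rough source-level sketch (assumed example, not GCC output) of the
   loop structure this peeling produces: a scalar prolog of 'peel'
   iterations followed by the loop to be vectorized, which now starts on
   an aligned element.  'saxpy_peeled' is a hypothetical name.  */
static void
saxpy_peeled (float *x, float a, int n, int peel)
{
  int i = 0;
  for (; i < peel && i < n; i++)   /* scalar prolog: misaligned head */
    x[i] *= a;
  for (; i < n; i++)               /* aligned body, vectorized later */
    x[i] *= a;
}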
6893 /* Function vect_create_cond_for_align_checks.
6895 Create a conditional expression that represents the alignment checks for
6896 all of the data references (array element references) whose alignment must be
6897 checked at runtime.
6899 Input:
6900 COND_EXPR - input conditional expression. New conditions will be chained
6901 with logical AND operation.
6902 LOOP_VINFO - two fields of the loop information are used.
6903 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6904 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6906 Output:
6907 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6908 expression.
6909 The returned value is the conditional expression to be used in the if
6910 statement that controls which version of the loop gets executed at runtime.
6912 The algorithm makes two assumptions:
6913 1) The number of bytes "n" in a vector is a power of 2.
6914 2) An address "a" is aligned if a%n is zero, so the
6915 test can be done as a&(n-1) == 0. For example, for 16
6916 byte vectors the test is a&0xf == 0. */
6918 static void
6919 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6920 tree *cond_expr,
6921 tree *cond_expr_stmt_list)
6923 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6924 VEC(tree,heap) *may_misalign_stmts
6925 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6926 tree ref_stmt, tmp;
6927 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6928 tree mask_cst;
6929 unsigned int i;
6930 tree psize;
6931 tree int_ptrsize_type;
6932 char tmp_name[20];
6933 tree or_tmp_name = NULL_TREE;
6934 tree and_tmp, and_tmp_name, and_stmt;
6935 tree ptrsize_zero;
6936 tree part_cond_expr;
6938 /* Check that mask is one less than a power of 2, i.e., mask is
6939 all zeros followed by all ones. */
6940 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6942 /* CHECKME: what is the best integer or unsigned type to use to hold a
6943 cast from a pointer value? */
6944 psize = TYPE_SIZE (ptr_type_node);
6945 int_ptrsize_type
6946 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6948 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6949 of the first vector of the i'th data reference. */
6951 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6953 tree new_stmt_list = NULL_TREE;
6954 tree addr_base;
6955 tree addr_tmp, addr_tmp_name, addr_stmt;
6956 tree or_tmp, new_or_tmp_name, or_stmt;
6958 /* create: addr_tmp = (int)(address_of_first_vector) */
6959 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6960 &new_stmt_list, NULL_TREE, loop);
6962 if (new_stmt_list != NULL_TREE)
6963 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6965 sprintf (tmp_name, "%s%d", "addr2int", i);
6966 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6967 add_referenced_var (addr_tmp);
6968 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6969 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6970 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6971 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6972 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6974 /* The addresses are ORed together. */
6976 if (or_tmp_name != NULL_TREE)
6978 /* create: or_tmp = or_tmp | addr_tmp */
6979 sprintf (tmp_name, "%s%d", "orptrs", i);
6980 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6981 add_referenced_var (or_tmp);
6982 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6983 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6984 or_tmp_name, addr_tmp_name);
6985 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6986 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6987 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6988 or_tmp_name = new_or_tmp_name;
6990 else
6991 or_tmp_name = addr_tmp_name;
6993 } /* end for i */
6995 mask_cst = build_int_cst (int_ptrsize_type, mask);
6997 /* create: and_tmp = or_tmp & mask */
6998 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
6999 add_referenced_var (and_tmp);
7000 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
7002 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
7003 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
7004 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7005 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
7007 /* Make and_tmp the left operand of the conditional test against zero.
7008 If and_tmp has a nonzero bit then some address is unaligned. */
7009 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7010 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7011 and_tmp_name, ptrsize_zero);
7012 if (*cond_expr)
7013 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7014 *cond_expr, part_cond_expr);
7015 else
7016 *cond_expr = part_cond_expr;
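/* A self-contained model (illustrative, not GCC code) of the condition
   built above: OR all candidate addresses together, then AND the result
   with mask = n - 1; a zero result proves every address is n-byte
   aligned.  The 16-byte vector size is an assumed example.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  float a[8], b[8];
  uintptr_t mask = 16 - 1;                          /* n = 16 bytes */
  uintptr_t ored = (uintptr_t) a | (uintptr_t) b;   /* the or_tmp chain */
  printf ("all aligned: %s\n", (ored & mask) == 0 ? "yes" : "no");
  return 0;
}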
7019 /* Function vect_vfa_segment_size.
7021 Create an expression that computes the size of the segment
7022 that will be accessed for a data reference. The function takes into
7023 account that realignment loads may access one more vector.
7025 Input:
7026 DR: The data reference.
7027 VECT_FACTOR: vectorization factor.
7029 Return an expression whose value is the size of the segment which will be
7030 accessed by DR. */
7032 static tree
7033 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7035 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7036 DR_STEP (dr), vect_factor);
7038 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7040 tree vector_size = TYPE_SIZE_UNIT
7041 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7043 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7044 segment_length, vector_size);
7046 return fold_convert (sizetype, segment_length);
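/* Illustrative arithmetic only (assumed example values): the segment a
   data reference touches during one vectorized loop execution is
   DR_STEP * VF bytes, plus one extra vector when the optimized
   realignment scheme may read one vector beyond it.  */
#include <assert.h>

int
main (void)
{
  int step = 4, vf = 4, vector_size = 16;
  int segment = step * vf;                       /* 16 bytes */
  int segment_realign = segment + vector_size;   /* dr_explicit_realign_optimized */
  assert (segment == 16 && segment_realign == 32);
  return 0;
}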
7049 /* Function vect_create_cond_for_alias_checks.
7051 Create a conditional expression that represents the run-time checks for
7052 overlap of the address ranges represented by a list of data
7053 dependence relations passed as input.
7055 Input:
7056 COND_EXPR - input conditional expression. New conditions will be chained
7057 with logical AND operation.
7058 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
7059 to be checked.
7061 Output:
7062 COND_EXPR - conditional expression.
7063 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7064 expression.
7067 The returned value is the conditional expression to be used in the if
7068 statement that controls which version of the loop gets executed at runtime.
7071 static void
7072 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7073 tree * cond_expr,
7074 tree * cond_expr_stmt_list)
7076 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7077 VEC (ddr_p, heap) * may_alias_ddrs =
7078 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7079 tree vect_factor =
7080 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7082 ddr_p ddr;
7083 unsigned int i;
7084 tree part_cond_expr;
7086 /* Create expression
7087 (((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7088 || ((load_ptr_0 + load_segment_length_0) < store_ptr_0))
7092 && ... && (((store_ptr_n + store_segment_length_n) < load_ptr_n)
7093 || ((load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7095 if (VEC_empty (ddr_p, may_alias_ddrs))
7096 return;
7098 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7100 struct data_reference *dr_a, *dr_b;
7101 tree dr_group_first_a, dr_group_first_b;
7102 tree addr_base_a, addr_base_b;
7103 tree segment_length_a, segment_length_b;
7104 tree stmt_a, stmt_b;
7106 dr_a = DDR_A (ddr);
7107 stmt_a = DR_STMT (DDR_A (ddr));
7108 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7109 if (dr_group_first_a)
7111 stmt_a = dr_group_first_a;
7112 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7115 dr_b = DDR_B (ddr);
7116 stmt_b = DR_STMT (DDR_B (ddr));
7117 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7118 if (dr_group_first_b)
7120 stmt_b = dr_group_first_b;
7121 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7124 addr_base_a =
7125 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7126 NULL_TREE, loop);
7127 addr_base_b =
7128 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7129 NULL_TREE, loop);
7131 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7132 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7134 if (vect_print_dump_info (REPORT_DR_DETAILS))
7136 fprintf (vect_dump,
7137 "create runtime check for data references ");
7138 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7139 fprintf (vect_dump, " and ");
7140 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7144 part_cond_expr =
7145 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7146 fold_build2 (LT_EXPR, boolean_type_node,
7147 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7148 addr_base_a,
7149 segment_length_a),
7150 addr_base_b),
7151 fold_build2 (LT_EXPR, boolean_type_node,
7152 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7153 addr_base_b,
7154 segment_length_b),
7155 addr_base_a));
7157 if (*cond_expr)
7158 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7159 *cond_expr, part_cond_expr);
7160 else
7161 *cond_expr = part_cond_expr;
7163 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7164 fprintf (vect_dump, "created %u versioning-for-alias checks.\n",
7165 VEC_length (ddr_p, may_alias_ddrs));
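/* A standalone sketch (illustrative, not GCC code) of one generated
   alias check: two segments cannot overlap if either one ends before the
   other begins.  The '<' comparisons mirror the LT_EXPRs built above;
   'no_overlap' and the segment lengths are assumed for the example.  */
#include <stdio.h>

static int
no_overlap (const char *a, long seg_a, const char *b, long seg_b)
{
  return a + seg_a < b || b + seg_b < a;   /* the TRUTH_OR_EXPR */
}

int
main (void)
{
  char buf[64];
  printf ("%d\n", no_overlap (buf, 16, buf + 32, 16));   /* 1: disjoint */
  printf ("%d\n", no_overlap (buf, 16, buf + 8, 16));    /* 0: may overlap */
  return 0;
}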
7169 /* Function vect_loop_versioning.
7171 If the loop has data references that may or may not be aligned and/or
7172 has data reference relations whose independence was not proven, then
7173 two versions of the loop need to be generated, one which is vectorized
7174 and one which isn't. A test is then generated to control which of the
7175 loops is executed. The test checks for the alignment of all of the
7176 data references that may or may not be aligned. An additional
7177 sequence of runtime tests is generated for each pair of DDRs whose
7178 independence was not proven. The vectorized version of the loop is
7179 executed only if both alias and alignment tests are passed.
7181 The test generated to check which version of the loop is executed
7182 is extended to also check the profitability threshold indicated
7183 by the cost model. */
7185 static void
7186 vect_loop_versioning (loop_vec_info loop_vinfo)
7188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7189 struct loop *nloop;
7190 tree cond_expr = NULL_TREE;
7191 tree cond_expr_stmt_list = NULL_TREE;
7192 basic_block condition_bb;
7193 block_stmt_iterator cond_exp_bsi;
7194 basic_block merge_bb;
7195 basic_block new_exit_bb;
7196 edge new_exit_e, e;
7197 tree orig_phi, new_phi, arg;
7198 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7199 tree gimplify_stmt_list;
7200 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7201 int min_profitable_iters = 0;
7202 unsigned int th;
7204 /* Get profitability threshold for vectorized loop. */
7205 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7207 th = conservative_cost_threshold (loop_vinfo,
7208 min_profitable_iters);
7210 cond_expr =
7211 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7212 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7214 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7215 false, NULL_TREE);
7217 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7218 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7219 &cond_expr_stmt_list);
7221 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7222 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7223 &cond_expr_stmt_list);
7225 cond_expr =
7226 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7227 cond_expr =
7228 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7229 NULL_TREE);
7230 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7232 initialize_original_copy_tables ();
7233 nloop = loop_version (loop, cond_expr, &condition_bb,
7234 prob, prob, REG_BR_PROB_BASE - prob, true);
7235 free_original_copy_tables();
7237 /* Loop versioning violates an assumption we try to maintain during
7238 vectorization - that the loop exit block has a single predecessor.
7239 After versioning, the exit block of both loop versions is the same
7240 basic block (i.e. it has two predecessors). Just to simplify the
7241 following transformations in the vectorizer, we fix this situation
7242 here by adding a new (empty) block on the exit-edge of the loop,
7243 with the proper loop-exit phis to maintain loop-closed-form. */
7245 merge_bb = single_exit (loop)->dest;
7246 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7247 new_exit_bb = split_edge (single_exit (loop));
7248 new_exit_e = single_exit (loop);
7249 e = EDGE_SUCC (new_exit_bb, 0);
7251 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7252 orig_phi = PHI_CHAIN (orig_phi))
7254 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7255 new_exit_bb);
7256 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7257 add_phi_arg (new_phi, arg, new_exit_e);
7258 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7261 /* End loop-exit-fixes after versioning. */
7263 update_ssa (TODO_update_ssa);
7264 if (cond_expr_stmt_list)
7266 cond_exp_bsi = bsi_last (condition_bb);
7267 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
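/* A rough source-level picture (assumed example, not GCC output) of what
   loop versioning produces: one runtime test guards the copy that will
   be vectorized, with the original scalar loop as the fallback.  'cond'
   stands for the combined profitability, alignment and alias checks
   built above; 'scale_versioned' is a hypothetical name.  */
static void
scale_versioned (float *x, float a, int n, int cond)
{
  int i;
  if (cond)
    for (i = 0; i < n; i++)   /* version that will be vectorized */
      x[i] *= a;
  else
    for (i = 0; i < n; i++)   /* scalar fallback version */
      x[i] *= a;
}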
7271 /* Remove a group of stores (for SLP or interleaving) and free their
7272 stmt_vec_info. */
7274 static void
7275 vect_remove_stores (tree first_stmt)
7277 tree next = first_stmt;
7278 tree tmp;
7279 block_stmt_iterator next_si;
7281 while (next)
7283 /* Free the attached stmt_vec_info and remove the stmt. */
7284 next_si = bsi_for_stmt (next);
7285 bsi_remove (&next_si, true);
7286 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
7287 free_stmt_vec_info (next);
7288 next = tmp;
7293 /* Vectorize SLP instance tree in postorder. */
7295 static bool
7296 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7298 tree stmt;
7299 bool strided_store, is_store;
7300 block_stmt_iterator si;
7301 stmt_vec_info stmt_info;
7303 if (!node)
7304 return false;
7306 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7307 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7309 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7310 stmt_info = vinfo_for_stmt (stmt);
7311 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7312 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7314 if (vect_print_dump_info (REPORT_DETAILS))
7316 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7317 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7320 si = bsi_for_stmt (stmt);
7321 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7322 if (is_store)
7324 if (DR_GROUP_FIRST_DR (stmt_info))
7325 /* If IS_STORE is TRUE, the vectorization of the
7326 interleaving chain was completed - free all the stores in
7327 the chain. */
7328 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7329 else
7330 /* FORNOW: SLP originates only from strided stores. */
7331 gcc_unreachable ();
7333 return true;
7336 /* FORNOW: SLP originates only from strided stores. */
7337 return false;
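/* A minimal sketch (illustrative, not GCC code) of the postorder shape
   used above: both children of an SLP node are scheduled before the node
   itself, so vector defs exist before their uses.  'slp_node_sketch' and
   'schedule_sketch' are hypothetical names.  */
struct slp_node_sketch { struct slp_node_sketch *left, *right; };

static void
schedule_sketch (struct slp_node_sketch *node)
{
  if (!node)
    return;
  schedule_sketch (node->left);    /* children first (postorder) */
  schedule_sketch (node->right);
  /* ... emit the vector stmts for NODE itself here ... */
}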
7341 static bool
7342 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7344 VEC (slp_instance, heap) *slp_instances =
7345 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7346 slp_instance instance;
7347 unsigned int vec_stmts_size;
7348 unsigned int group_size, i;
7349 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7350 bool is_store = false;
7352 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7354 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7355 /* For each SLP instance calculate the number of vector stmts to be
7356 created for the scalar stmts in each node of the SLP tree. The number
7357 of vector elements in one vector iteration is the number of scalar
7358 elements in one scalar iteration (GROUP_SIZE) multiplied by VF and
7359 divided by the vector size (NUNITS). */
7360 vec_stmts_size = vectorization_factor * group_size / nunits;
7362 /* Schedule the tree of INSTANCE. */
7363 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7364 vec_stmts_size);
7366 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7367 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7368 fprintf (vect_dump, "vectorizing stmts using SLP.");
7371 return is_store;
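/* Worked example (assumed numbers) of the vec_stmts_size formula above:
   with GROUP_SIZE = 2 scalar stores per iteration, VF = 8 and
   NUNITS = 4 elements per vector, 2 * 8 / 4 = 4 vector stmts are
   created per SLP node.  */
#include <assert.h>

int
main (void)
{
  unsigned group_size = 2, vf = 8, nunits = 4;
  assert (vf * group_size / nunits == 4);
  return 0;
}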
7374 /* Function vect_transform_loop.
7376 The analysis phase has determined that the loop is vectorizable.
7377 Vectorize the loop - create vectorized stmts to replace the scalar
7378 stmts in the loop, and update the loop exit condition. */
7380 void
7381 vect_transform_loop (loop_vec_info loop_vinfo)
7383 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7384 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7385 int nbbs = loop->num_nodes;
7386 block_stmt_iterator si;
7387 int i;
7388 tree ratio = NULL;
7389 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7390 bool strided_store;
7391 bool slp_scheduled = false;
7392 unsigned int nunits;
7394 if (vect_print_dump_info (REPORT_DETAILS))
7395 fprintf (vect_dump, "=== vect_transform_loop ===");
7397 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7398 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7399 vect_loop_versioning (loop_vinfo);
7401 /* CHECKME: we wouldn't need this if we called update_ssa once
7402 for all loops. */
7403 bitmap_zero (vect_memsyms_to_rename);
7405 /* Peel the loop if there are data refs with unknown alignment.
7406 Only one data ref with unknown alignment is handled (LOOP_VINFO_UNALIGNED_DR). */
7408 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7409 vect_do_peeling_for_alignment (loop_vinfo);
7411 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7412 compile-time constant), or it is a constant that is not divisible by the
7413 vectorization factor, then an epilog loop needs to be created.
7414 We therefore duplicate the loop: the original loop will be vectorized,
7415 and will compute the first (n/VF) iterations. The second copy of the loop
7416 will remain scalar and will compute the remaining (n%VF) iterations.
7417 (VF is the vectorization factor). */
7419 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7420 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7421 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7422 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7423 else
7424 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7425 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
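/* Illustrative arithmetic (assumed n and VF) for the split described
   above: the vectorized loop executes n / VF iterations and the scalar
   epilog the remaining n % VF.  */
#include <assert.h>

int
main (void)
{
  int n = 1003, vf = 4;
  assert (n / vf == 250);   /* vector iterations: the 'ratio' above */
  assert (n % vf == 3);     /* scalar epilog iterations */
  return 0;
}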
7427 /* 1) Make sure the loop header has exactly two entries
7428 2) Make sure we have a preheader basic block. */
7430 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7432 split_edge (loop_preheader_edge (loop));
7434 /* FORNOW: the vectorizer supports only loops whose body consists
7435 of one basic block (header + empty latch). When the vectorizer
7436 supports more involved loop forms, the order in which the BBs are
7437 traversed will need to be reconsidered. */
7439 for (i = 0; i < nbbs; i++)
7441 basic_block bb = bbs[i];
7442 stmt_vec_info stmt_info;
7443 tree phi;
7445 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7447 if (vect_print_dump_info (REPORT_DETAILS))
7449 fprintf (vect_dump, "------>vectorizing phi: ");
7450 print_generic_expr (vect_dump, phi, TDF_SLIM);
7452 stmt_info = vinfo_for_stmt (phi);
7453 if (!stmt_info)
7454 continue;
7456 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7457 && !STMT_VINFO_LIVE_P (stmt_info))
7458 continue;
7460 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7461 != (unsigned HOST_WIDE_INT) vectorization_factor)
7462 && vect_print_dump_info (REPORT_DETAILS))
7463 fprintf (vect_dump, "multiple-types.");
7465 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7467 if (vect_print_dump_info (REPORT_DETAILS))
7468 fprintf (vect_dump, "transform phi.");
7469 vect_transform_stmt (phi, NULL, NULL, NULL);
7473 for (si = bsi_start (bb); !bsi_end_p (si);)
7475 tree stmt = bsi_stmt (si);
7476 bool is_store;
7478 if (vect_print_dump_info (REPORT_DETAILS))
7480 fprintf (vect_dump, "------>vectorizing statement: ");
7481 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7484 stmt_info = vinfo_for_stmt (stmt);
7486 /* Vector stmts created in the outer-loop during vectorization of
7487 stmts in an inner-loop may not have a stmt_info, and do not
7488 need to be vectorized. */
7489 if (!stmt_info)
7491 bsi_next (&si);
7492 continue;
7495 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7496 && !STMT_VINFO_LIVE_P (stmt_info))
7498 bsi_next (&si);
7499 continue;
7502 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7503 nunits =
7504 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7505 if (!STMT_SLP_TYPE (stmt_info)
7506 && nunits != (unsigned int) vectorization_factor
7507 && vect_print_dump_info (REPORT_DETAILS))
7508 /* For SLP, VF is set according to the unrolling factor, not the
7509 vector size, hence for SLP this diagnostic is not valid. */
7510 fprintf (vect_dump, "multiple-types.");
7512 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7513 reached. */
7514 if (STMT_SLP_TYPE (stmt_info))
7516 if (!slp_scheduled)
7518 slp_scheduled = true;
7520 if (vect_print_dump_info (REPORT_DETAILS))
7521 fprintf (vect_dump, "=== scheduling SLP instances ===");
7523 is_store = vect_schedule_slp (loop_vinfo, nunits);
7525 /* IS_STORE is true if STMT is a store. Stores cannot be of
7526 hybrid SLP type. They are removed in
7527 vect_schedule_slp_instance and their vinfo is destroyed. */
7528 if (is_store)
7530 bsi_next (&si);
7531 continue;
7535 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7536 if (PURE_SLP_STMT (stmt_info))
7538 bsi_next (&si);
7539 continue;
7543 /* -------- vectorize statement ------------ */
7544 if (vect_print_dump_info (REPORT_DETAILS))
7545 fprintf (vect_dump, "transform statement.");
7547 strided_store = false;
7548 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7549 if (is_store)
7551 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7553 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7554 interleaving chain was completed - free all the stores in
7555 the chain. */
7556 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7557 bsi_remove (&si, true);
7558 continue;
7560 else
7562 /* Free the attached stmt_vec_info and remove the stmt. */
7563 free_stmt_vec_info (stmt);
7564 bsi_remove (&si, true);
7565 continue;
7568 bsi_next (&si);
7569 } /* stmts in BB */
7570 } /* BBs in loop */
7572 slpeel_make_loop_iterate_ntimes (loop, ratio);
7574 mark_set_for_renaming (vect_memsyms_to_rename);
7576 /* The memory tags and pointers in vectorized statements need to
7577 have their SSA forms updated. FIXME, why can't this be delayed
7578 until all the loops have been transformed? */
7579 update_ssa (TODO_update_ssa);
7581 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7582 fprintf (vect_dump, "LOOP VECTORIZED.");
7583 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7584 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");