/* gcc/tree-vect-transform.c */
/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"

/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);

static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}

/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */
int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue
                                ? niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }

  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
                                   + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
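
  /* A worked example of the condition above, with made-up costs (these are
     illustrative numbers, not data from any target): with SIC = 4, VIC = 6,
     VOC = 14, SOC = 0, VF = 4 and no peeling (PL_ITERS = EP_ITERS = 0),
     solving  4 * niters > (6/4) * niters + 14  gives  niters > 5.6, so the
     code below computes min_profitable_iters = 56/10 = 5 and then bumps it
     to 6 (before the vf clamp and the "niters <= th" adjustment further
     down).  */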

  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
    min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
     then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}

/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}
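
/* A worked example of the epilogue costs above (illustrative only, not
   derived from any particular target's cost macros): reducing a V8HI
   vector (nelements = 8) with a whole-vector shift available costs
   exact_log2 (8) * 2 = 6 vector stmts plus one vec-to-scalar extract,
   whereas the extract-based fallback path costs 8 + 7 = 15 vector-stmt
   units (N extracts and N-1 reduction ops).  */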

/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}

/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}

/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
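
/* For illustration: for an interleaved group of four stores
   a[4i], a[4i+1], a[4i+2], a[4i+3], this returns 4 when called on the
   first store of the group and 1 for each of the other three, so the
   group-wide permute overhead below is charged exactly once.  */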

/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
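
/* A worked example of the permute term above (illustrative numbers): with
   ncopies = 1 and an interleaved group of group_size = 4 stores, the first
   store of the group is charged  exact_log2 (4) * 4 = 8  interleave
   (high/low) stmts, on top of the TARG_VEC_STORE_COST charged per vector
   store.  */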

/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses an even and odd extract operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2 * TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2 * TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}

/* Function vect_get_new_vect_var.

   Returns a new variable.  The current naming scheme prepends a prefix
   determined by VAR_KIND ("vect_", "stmp_" or "vect_p") to NAME, if
   provided, and uses the result as the name of the new variable.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
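
/* For illustration: vect_get_new_vect_var (vectype, vect_pointer_var, "a")
   creates a pointer temporary named "vect_pa" (create_tmp_var then makes
   the name unique in the function, e.g. "vect_pa.18"); with NAME == NULL
   the bare prefix is used.  */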

/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:   Specify relative to which loop-nest should the address be computed.
           For example, when the dataref is in an inner-loop nested in an
           outer-loop that is now being vectorized, LOOP can be either the
           outer-loop, or the inner-loop.  The first memory location accessed
           by the following dataref ('in' points to short):

               for (i=0; i<N; i++)
                 for (j=0; j<M; j++)
                   s += in[i+j]

           is as follows:
           if LOOP=i_loop: &in         (relative to i_loop)
           if LOOP=j_loop: &in+i*2B    (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree new_base_stmt;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base.  */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
                                        true, data_ref_base_var);
  append_to_statement_list_force (new_base_stmt, new_stmt_list);

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}

/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */

static tree
vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  block_stmt_iterator incr_bsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  tree incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: one relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:  vp1 = phi(vp0,vp2)
               ...
               vp2 = vp1 + step
               goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:  vp1 = phi(vp0,vp2)
               ...
        inner:   vp3 = phi(vp1,vp4)
                 vp4 = vp3 + inner_step
                 if () goto inner
               ...
               vp2 = vp1 + step
               if () goto LOOP   */

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop
          vectorization).  **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_bsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}

/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR_INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (POINTER_PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
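
/* For illustration: for a V4SI vector pointer vp_1 and no BUMP argument,
   the stmt generated above is  vp_2 = vp_1 + 16  (TYPE_SIZE_UNIT of the
   vector type), and the use of vp_1 in PTR_INCR, if given, is redirected
   to vp_2 as shown in the diagram in the header comment.  */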

/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}

/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type,
                  block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  if (bsi)
    vect_finish_stmt_generation (stmt, init_stmt, bsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}

/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  tree stmt = VEC_index (tree, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop, operation;
  int group_size = VEC_length (tree, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
        {
          operation = GIMPLE_STMT_OPERAND (stmt, 1);
          if (is_store)
            op = operation;
          else
            op = TREE_OPERAND (operation, op_num);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              constant_p = true;
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}

/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  tree vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}

/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  tree operation, first_stmt;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
  if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
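
/* For illustration: for an SLP group of two stmts  x1 = y1 + c1  and
   x2 = y2 + c2  with NUNITS == 2, VEC_OPRNDS0 receives the vectorized defs
   of {y1, y2} from the left node, while the constants c1 and c2 have no
   def-stmts, so VEC_OPRNDS1 is built by vect_get_constant_vectors as a
   vector constant holding both of them.  */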
1541 /* Function get_initial_def_for_induction
1543 Input:
1544 STMT - a stmt that performs an induction operation in the loop.
1545 IV_PHI - the initial value of the induction variable
1547 Output:
1548 Return a vector variable, initialized with the first VF values of
1549 the induction variable. E.g., for an iv with IV_PHI='X' and
1550 evolution S, for a vector of 4 units, we want to return:
1551 [X, X + S, X + 2*S, X + 3*S]. */
1553 static tree
1554 get_initial_def_for_induction (tree iv_phi)
1556 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1558 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1559 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1560 tree vectype;
1561 int nunits;
1562 edge pe = loop_preheader_edge (loop);
1563 struct loop *iv_loop;
1564 basic_block new_bb;
1565 tree vec, vec_init, vec_step, t;
1566 tree access_fn;
1567 tree new_var;
1568 tree new_name;
1569 tree init_stmt;
1570 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1571 tree init_expr, step_expr;
1572 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1573 int i;
1574 bool ok;
1575 int ncopies;
1576 tree expr;
1577 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1578 bool nested_in_vect_loop = false;
1579 tree stmts;
1580 imm_use_iterator imm_iter;
1581 use_operand_p use_p;
1582 tree exit_phi;
1583 edge latch_e;
1584 tree loop_arg;
1585 block_stmt_iterator si;
1586 basic_block bb = bb_for_stmt (iv_phi);
1588 vectype = get_vectype_for_scalar_type (scalar_type);
1589 gcc_assert (vectype);
1590 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1591 ncopies = vf / nunits;
1593 gcc_assert (phi_info);
1594 gcc_assert (ncopies >= 1);
1596 /* Find the first insertion point in the BB. */
1597 si = bsi_after_labels (bb);
1599 if (INTEGRAL_TYPE_P (scalar_type))
1600 step_expr = build_int_cst (scalar_type, 0);
1601 else
1602 step_expr = build_real (scalar_type, dconst0);
1604 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1605 if (nested_in_vect_loop_p (loop, iv_phi))
1607 nested_in_vect_loop = true;
1608 iv_loop = loop->inner;
1610 else
1611 iv_loop = loop;
1612 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1614 latch_e = loop_latch_edge (iv_loop);
1615 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1617 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1618 gcc_assert (access_fn);
1619 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1620 &init_expr, &step_expr);
1621 gcc_assert (ok);
1622 pe = loop_preheader_edge (iv_loop);
1624 /* Create the vector that holds the initial_value of the induction. */
1625 if (nested_in_vect_loop)
1627 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1628 been created during vectorization of previous stmts; We obtain it from
1629 the STMT_VINFO_VEC_STMT of the defining stmt. */
1630 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1631 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1633 else
1635 /* iv_loop is the loop to be vectorized. Create:
1636 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1637 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1638 add_referenced_var (new_var);
1640 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1641 if (stmts)
1643 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1644 gcc_assert (!new_bb);
1647 t = NULL_TREE;
1648 t = tree_cons (NULL_TREE, init_expr, t);
1649 for (i = 1; i < nunits; i++)
1651 tree tmp;
1653 /* Create: new_name_i = new_name + step_expr */
1654 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1655 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1656 new_name = make_ssa_name (new_var, init_stmt);
1657 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1659 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1660 gcc_assert (!new_bb);
1662 if (vect_print_dump_info (REPORT_DETAILS))
1664 fprintf (vect_dump, "created new init_stmt: ");
1665 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1667 t = tree_cons (NULL_TREE, new_name, t);
1669 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1670 vec = build_constructor_from_list (vectype, nreverse (t));
1671 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1675 /* Create the vector that holds the step of the induction. */
1676 if (nested_in_vect_loop)
1677 /* iv_loop is nested in the loop to be vectorized. Generate:
1678 vec_step = [S, S, S, S] */
1679 new_name = step_expr;
1680 else
1682 /* iv_loop is the loop to be vectorized. Generate:
1683 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1684 expr = build_int_cst (scalar_type, vf);
1685 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1688 t = NULL_TREE;
1689 for (i = 0; i < nunits; i++)
1690 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1691 gcc_assert (CONSTANT_CLASS_P (new_name));
1692 vec = build_vector (vectype, t);
1693 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1696 /* Create the following def-use cycle:
1697 loop prolog:
1698 vec_init = ...
1699 vec_step = ...
1700 loop:
1701 vec_iv = PHI <vec_init, vec_loop>
1703 STMT
1705 vec_loop = vec_iv + vec_step; */
1707 /* Create the induction-phi that defines the induction-operand. */
1708 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1709 add_referenced_var (vec_dest);
1710 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1711 set_stmt_info (get_stmt_ann (induction_phi),
1712 new_stmt_vec_info (induction_phi, loop_vinfo));
1713 induc_def = PHI_RESULT (induction_phi);
1715 /* Create the iv update inside the loop */
1716 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1717 build2 (PLUS_EXPR, vectype,
1718 induc_def, vec_step));
1719 vec_def = make_ssa_name (vec_dest, new_stmt);
1720 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1721 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1722 set_stmt_info (get_stmt_ann (new_stmt),
1723 new_stmt_vec_info (new_stmt, loop_vinfo));
1725 /* Set the arguments of the phi node: */
1726 add_phi_arg (induction_phi, vec_init, pe);
1727 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1730 /* In case the vectorization factor (VF) is bigger than the number
1731 of elements that we can fit in a vectype (nunits), we have to generate
1732 more than one vector stmt - i.e., we need to "unroll" the
1733 vector stmt by a factor VF/nunits. For more details see documentation
1734 in vectorizable_operation. */
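/* For example (assumed values for illustration): with VF = 8 and
   nunits = 4 we get ncopies = 2. The phi advances by [8*S,8*S,8*S,8*S]
   once per vector iteration, and the second copy is derived from the
   first by adding [4*S,4*S,4*S,4*S]:

   loop:
   vec_iv = PHI <vec_init, vec_loop>
   vec_iv.1 = vec_iv + [4*S,4*S,4*S,4*S]
   ...
   vec_loop = vec_iv + [8*S,8*S,8*S,8*S]
*/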
1736 if (ncopies > 1)
1738 stmt_vec_info prev_stmt_vinfo;
1739 /* FORNOW. This restriction should be relaxed. */
1740 gcc_assert (!nested_in_vect_loop);
1742 /* Create the vector that holds the step of the induction. */
1743 expr = build_int_cst (scalar_type, nunits);
1744 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1745 t = NULL_TREE;
1746 for (i = 0; i < nunits; i++)
1747 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1748 gcc_assert (CONSTANT_CLASS_P (new_name));
1749 vec = build_vector (vectype, t);
1750 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1752 vec_def = induc_def;
1753 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1754 for (i = 1; i < ncopies; i++)
1756 tree tmp;
1758 /* vec_i = vec_prev + vec_step */
1759 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1760 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1761 vec_def = make_ssa_name (vec_dest, new_stmt);
1762 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1763 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1764 set_stmt_info (get_stmt_ann (new_stmt),
1765 new_stmt_vec_info (new_stmt, loop_vinfo));
1766 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1767 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1771 if (nested_in_vect_loop)
1773 /* Find the loop-closed exit-phi of the induction, and record
1774 the final vector of induction results: */
1775 exit_phi = NULL;
1776 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1778 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1780 exit_phi = USE_STMT (use_p);
1781 break;
1784 if (exit_phi)
1786 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1787 /* FORNOW. We do not yet support the case in which an inner-loop induction
1788 is used only outside the outer-loop (and not in the outer-loop itself). */
1789 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1790 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1792 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1793 if (vect_print_dump_info (REPORT_DETAILS))
1795 fprintf (vect_dump, "vector of inductions after inner-loop:");
1796 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1802 if (vect_print_dump_info (REPORT_DETAILS))
1804 fprintf (vect_dump, "transform induction: created def-use cycle:");
1805 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1806 fprintf (vect_dump, "\n");
1807 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1810 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1811 return induc_def;
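/* A self-contained scalar model of the induction transformation above
   (an illustrative sketch assuming VF = nunits = 4; not part of GCC).
   Each lane of the vector IV starts at X + lane*S and advances by
   VF*S per vector iteration:

   void
   iv_model (int *out, int n, int x, int s)
   {
     int vec_iv[4];
     int i, lane;

     for (lane = 0; lane < 4; lane++)
       vec_iv[lane] = x + lane * s;

     for (i = 0; i + 4 <= n; i += 4)
       {
         for (lane = 0; lane < 4; lane++)
           out[i + lane] = vec_iv[lane];
         for (lane = 0; lane < 4; lane++)
           vec_iv[lane] += 4 * s;
       }
   }
*/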
1815 /* Function vect_get_vec_def_for_operand.
1817 OP is an operand in STMT. This function returns a (vector) def that will be
1818 used in the vectorized stmt for STMT.
1820 In the case that OP is an SSA_NAME which is defined in the loop, then
1821 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1823 In case OP is an invariant or constant, a new stmt that creates a vector def
1824 needs to be introduced. */
1826 static tree
1827 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1829 tree vec_oprnd;
1830 tree vec_stmt;
1831 tree def_stmt;
1832 stmt_vec_info def_stmt_info = NULL;
1833 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1834 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1835 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1836 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1837 tree vec_inv;
1838 tree vec_cst;
1839 tree t = NULL_TREE;
1840 tree def;
1841 int i;
1842 enum vect_def_type dt;
1843 bool is_simple_use;
1844 tree vector_type;
1846 if (vect_print_dump_info (REPORT_DETAILS))
1848 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1849 print_generic_expr (vect_dump, op, TDF_SLIM);
1852 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1853 gcc_assert (is_simple_use);
1854 if (vect_print_dump_info (REPORT_DETAILS))
1856 if (def)
1858 fprintf (vect_dump, "def = ");
1859 print_generic_expr (vect_dump, def, TDF_SLIM);
1861 if (def_stmt)
1863 fprintf (vect_dump, " def_stmt = ");
1864 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1868 switch (dt)
1870 /* Case 1: operand is a constant. */
1871 case vect_constant_def:
1873 if (scalar_def)
1874 *scalar_def = op;
1876 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1877 if (vect_print_dump_info (REPORT_DETAILS))
1878 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1880 for (i = nunits - 1; i >= 0; --i)
1882 t = tree_cons (NULL_TREE, op, t);
1884 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1885 gcc_assert (vector_type);
1886 vec_cst = build_vector (vector_type, t);
1888 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1891 /* Case 2: operand is defined outside the loop - loop invariant. */
1892 case vect_invariant_def:
1894 if (scalar_def)
1895 *scalar_def = def;
1897 /* Create 'vec_inv = {inv,inv,..,inv}' */
1898 if (vect_print_dump_info (REPORT_DETAILS))
1899 fprintf (vect_dump, "Create vector_inv.");
1901 for (i = nunits - 1; i >= 0; --i)
1903 t = tree_cons (NULL_TREE, def, t);
1906 /* FIXME: use build_constructor directly. */
1907 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1908 gcc_assert (vector_type);
1909 vec_inv = build_constructor_from_list (vector_type, t);
1910 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1913 /* Case 3: operand is defined inside the loop. */
1914 case vect_loop_def:
1916 if (scalar_def)
1917 *scalar_def = def_stmt;
1919 /* Get the def from the vectorized stmt. */
1920 def_stmt_info = vinfo_for_stmt (def_stmt);
1921 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1922 gcc_assert (vec_stmt);
1923 if (TREE_CODE (vec_stmt) == PHI_NODE)
1924 vec_oprnd = PHI_RESULT (vec_stmt);
1925 else
1926 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1927 return vec_oprnd;
1930 /* Case 4: operand is defined by a loop header phi - reduction */
1931 case vect_reduction_def:
1933 struct loop *loop;
1935 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1936 loop = (bb_for_stmt (def_stmt))->loop_father;
1938 /* Get the def before the loop */
1939 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1940 return get_initial_def_for_reduction (stmt, op, scalar_def);
1943 /* Case 5: operand is defined by loop-header phi - induction. */
1944 case vect_induction_def:
1946 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1948 /* Get the def from the vectorized stmt. */
1949 def_stmt_info = vinfo_for_stmt (def_stmt);
1950 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1951 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1952 vec_oprnd = PHI_RESULT (vec_stmt);
1953 return vec_oprnd;
1956 default:
1957 gcc_unreachable ();
1962 /* Function vect_get_vec_def_for_stmt_copy
1964 Return a vector-def for an operand. This function is used when the
1965 vectorized stmt to be created (by the caller to this function) is a "copy"
1966 created in case the vectorized result cannot fit in one vector, and several
1967 copies of the vector-stmt are required. In this case the vector-def is
1968 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1969 of the stmt that defines VEC_OPRND.
1970 DT is the type of the vector def VEC_OPRND.
1972 Context:
1973 In case the vectorization factor (VF) is bigger than the number
1974 of elements that can fit in a vectype (nunits), we have to generate
1975 more than one vector stmt to vectorize the scalar stmt. This situation
1976 arises when there are multiple data-types operated upon in the loop; the
1977 smallest data-type determines the VF, and as a result, when vectorizing
1978 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1979 vector stmt (each computing a vector of 'nunits' results, and together
1980 computing 'VF' results in each iteration). This function is called when
1981 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1982 which VF=16 and nunits=4, so the number of copies required is 4):
1984 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1986 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1987 VS1.1: vx.1 = memref1 VS1.2
1988 VS1.2: vx.2 = memref2 VS1.3
1989 VS1.3: vx.3 = memref3
1991 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1992 VSnew.1: vz1 = vx.1 + ... VSnew.2
1993 VSnew.2: vz2 = vx.2 + ... VSnew.3
1994 VSnew.3: vz3 = vx.3 + ...
1996 The vectorization of S1 is explained in vectorizable_load.
1997 The vectorization of S2:
1998 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1999 the function 'vect_get_vec_def_for_operand' is called to
2000 get the relevant vector-def for each operand of S2. For operand x it
2001 returns the vector-def 'vx.0'.
2003 To create the remaining copies of the vector-stmt (VSnew.j), this
2004 function is called to get the relevant vector-def for each operand. It is
2005 obtained from the respective VS1.j stmt, which is recorded in the
2006 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2008 For example, to obtain the vector-def 'vx.1' in order to create the
2009 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2010 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2011 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2012 and return its def ('vx.1').
2013 Overall, to create the above sequence this function will be called 3 times:
2014 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2015 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2016 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2018 static tree
2019 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2021 tree vec_stmt_for_operand;
2022 stmt_vec_info def_stmt_info;
2024 /* Do nothing; can reuse same def. */
2025 if (dt == vect_invariant_def || dt == vect_constant_def)
2026 return vec_oprnd;
2028 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2029 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2030 gcc_assert (def_stmt_info);
2031 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2032 gcc_assert (vec_stmt_for_operand);
2033 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2034 return vec_oprnd;
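/* A small model of the chain walk performed above (illustrative
   sketch; the struct below is hypothetical, not a GCC type). Each
   call maps the def of copy j to the def of copy j+1 through the
   RELATED_STMT link:

   struct copy_stmt
   {
     int def;
     struct copy_stmt *related_stmt;
   };

   int
   def_for_next_copy (struct copy_stmt *def_stmt_of_vec_oprnd)
   {
     return def_stmt_of_vec_oprnd->related_stmt->def;
   }

   so vx.1 comes from the def of vx.0, vx.2 from the def of vx.1,
   and so on. */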
2038 /* Get vectorized definitions for the operands to create a copy of an original
2039 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2041 static void
2042 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2043 VEC(tree,heap) **vec_oprnds0,
2044 VEC(tree,heap) **vec_oprnds1)
2046 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2048 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2049 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2051 if (vec_oprnds1 && *vec_oprnds1)
2053 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2054 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2055 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2060 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
2062 static void
2063 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2064 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2066 if (slp_node)
2067 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2068 else
2070 tree vec_oprnd;
2072 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2073 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2074 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2076 if (op1)
2078 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2079 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2080 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2086 /* Function vect_finish_stmt_generation.
2088 Insert a new stmt. */
2090 static void
2091 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2092 block_stmt_iterator *bsi)
2094 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2095 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2097 gcc_assert (stmt == bsi_stmt (*bsi));
2098 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2100 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2102 set_stmt_info (get_stmt_ann (vec_stmt),
2103 new_stmt_vec_info (vec_stmt, loop_vinfo));
2105 if (vect_print_dump_info (REPORT_DETAILS))
2107 fprintf (vect_dump, "add new stmt: ");
2108 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2111 /* Make sure bsi points to the stmt that is being vectorized. */
2112 gcc_assert (stmt == bsi_stmt (*bsi));
2114 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2118 /* Function get_initial_def_for_reduction
2120 Input:
2121 STMT - a stmt that performs a reduction operation in the loop.
2122 INIT_VAL - the initial value of the reduction variable
2124 Output:
2125 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2126 of the reduction (used for adjusting the epilog - see below).
2127 Return a vector variable, initialized according to the operation that STMT
2128 performs. This vector will be used as the initial value of the
2129 vector of partial results.
2131 Option1 (adjust in epilog): Initialize the vector as follows:
2132 add: [0,0,...,0,0]
2133 mult: [1,1,...,1,1]
2134 min/max: [init_val,init_val,..,init_val,init_val]
2135 bit and/or: [init_val,init_val,..,init_val,init_val]
2136 and when necessary (e.g. add/mult case) let the caller know
2137 that it needs to adjust the result by init_val.
2139 Option2: Initialize the vector as follows:
2140 add: [0,0,...,0,init_val]
2141 mult: [1,1,...,1,init_val]
2142 min/max: [init_val,init_val,...,init_val]
2143 bit and/or: [init_val,init_val,...,init_val]
2144 and no adjustments are needed.
2146 For example, for the following code:
2148 s = init_val;
2149 for (i=0;i<n;i++)
2150 s = s + a[i];
2152 STMT is 's = s + a[i]', and the reduction variable is 's'.
2153 For a vector of 4 units, we want to return either [0,0,0,init_val],
2154 or [0,0,0,0] and let the caller know that it needs to adjust
2155 the result at the end by 'init_val'.
2157 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2158 initialization vector is simpler (same element in all entries).
2159 A cost model should help decide between these two schemes. */
2161 static tree
2162 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2164 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2165 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2166 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2167 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2168 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2169 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2170 tree type = TREE_TYPE (init_val);
2171 tree vecdef;
2172 tree def_for_init;
2173 tree init_def;
2174 tree t = NULL_TREE;
2175 int i;
2176 tree vector_type;
2177 bool nested_in_vect_loop = false;
2179 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2180 if (nested_in_vect_loop_p (loop, stmt))
2181 nested_in_vect_loop = true;
2182 else
2183 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2185 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2187 switch (code)
2189 case WIDEN_SUM_EXPR:
2190 case DOT_PROD_EXPR:
2191 case PLUS_EXPR:
2192 if (nested_in_vect_loop)
2193 *adjustment_def = vecdef;
2194 else
2195 *adjustment_def = init_val;
2196 /* Create a vector of zeros for init_def. */
2197 if (SCALAR_FLOAT_TYPE_P (type))
2198 def_for_init = build_real (type, dconst0);
2199 else
2200 def_for_init = build_int_cst (type, 0);
2201 for (i = nunits - 1; i >= 0; --i)
2202 t = tree_cons (NULL_TREE, def_for_init, t);
2203 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2204 gcc_assert (vector_type);
2205 init_def = build_vector (vector_type, t);
2206 break;
2208 case MIN_EXPR:
2209 case MAX_EXPR:
2210 *adjustment_def = NULL_TREE;
2211 init_def = vecdef;
2212 break;
2214 default:
2215 gcc_unreachable ();
2218 return init_def;
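/* A scalar model of the 'adjust in epilog' scheme implemented above
   (an illustrative sketch, not GCC code; assumes n is a multiple of 4
   so no scalar epilog loop is needed). The partial sums start at zero
   and init_val is added back once, after the loop:

   int
   sum_model (int *a, int n, int init_val)
   {
     int partial[4] = { 0, 0, 0, 0 };
     int i, lane, res = 0;

     for (i = 0; i + 4 <= n; i += 4)
       for (lane = 0; lane < 4; lane++)
         partial[lane] += a[i + lane];

     for (lane = 0; lane < 4; lane++)
       res += partial[lane];
     return res + init_val;
   }
*/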
2222 /* Function vect_create_epilog_for_reduction
2224 Create code at the loop-epilog to finalize the result of a reduction
2225 computation.
2227 VECT_DEF is a vector of partial results.
2228 REDUC_CODE is the tree-code for the epilog reduction.
2229 STMT is the scalar reduction stmt that is being vectorized.
2230 REDUCTION_PHI is the phi-node that carries the reduction computation.
2232 This function:
2233 1. Creates the reduction def-use cycle: sets the arguments for
2234 REDUCTION_PHI:
2235 The loop-entry argument is the vectorized initial-value of the reduction.
2236 The loop-latch argument is VECT_DEF - the vector of partial sums.
2237 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2238 by applying the operation specified by REDUC_CODE if available, or by
2239 other means (whole-vector shifts or a scalar loop).
2240 The function also creates a new phi node at the loop exit to preserve
2241 loop-closed form, as illustrated below.
2243 The flow at the entry to this function:
2245 loop:
2246 vec_def = phi <null, null> # REDUCTION_PHI
2247 VECT_DEF = vector_stmt # vectorized form of STMT
2248 s_loop = scalar_stmt # (scalar) STMT
2249 loop_exit:
2250 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2251 use <s_out0>
2252 use <s_out0>
2254 The above is transformed by this function into:
2256 loop:
2257 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2258 VECT_DEF = vector_stmt # vectorized form of STMT
2259 s_loop = scalar_stmt # (scalar) STMT
2260 loop_exit:
2261 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2262 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2263 v_out2 = reduce <v_out1>
2264 s_out3 = extract_field <v_out2, 0>
2265 s_out4 = adjust_result <s_out3>
2266 use <s_out4>
2267 use <s_out4>
2270 static void
2271 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2272 enum tree_code reduc_code, tree reduction_phi)
2274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2275 tree vectype;
2276 enum machine_mode mode;
2277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2279 basic_block exit_bb;
2280 tree scalar_dest;
2281 tree scalar_type;
2282 tree new_phi;
2283 block_stmt_iterator exit_bsi;
2284 tree vec_dest;
2285 tree new_temp = NULL_TREE;
2286 tree new_name;
2287 tree epilog_stmt = NULL_TREE;
2288 tree new_scalar_dest, exit_phi, new_dest;
2289 tree bitsize, bitpos, bytesize;
2290 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2291 tree adjustment_def;
2292 tree vec_initial_def;
2293 tree orig_name;
2294 imm_use_iterator imm_iter;
2295 use_operand_p use_p;
2296 bool extract_scalar_result = false;
2297 tree reduction_op, expr;
2298 tree orig_stmt;
2299 tree use_stmt;
2300 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2301 bool nested_in_vect_loop = false;
2302 int op_type;
2303 VEC(tree,heap) *phis = NULL;
2304 int i;
2306 if (nested_in_vect_loop_p (loop, stmt))
2308 loop = loop->inner;
2309 nested_in_vect_loop = true;
2312 op_type = TREE_OPERAND_LENGTH (operation);
2313 reduction_op = TREE_OPERAND (operation, op_type-1);
2314 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2315 gcc_assert (vectype);
2316 mode = TYPE_MODE (vectype);
2318 /*** 1. Create the reduction def-use cycle ***/
2320 /* 1.1 set the loop-entry arg of the reduction-phi: */
2321 /* For the case of reduction, vect_get_vec_def_for_operand returns
2322 the scalar def before the loop, that defines the initial value
2323 of the reduction variable. */
2324 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2325 &adjustment_def);
2326 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2328 /* 1.2 set the loop-latch arg for the reduction-phi: */
2329 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2331 if (vect_print_dump_info (REPORT_DETAILS))
2333 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2334 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2335 fprintf (vect_dump, "\n");
2336 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2340 /*** 2. Create epilog code
2341 The reduction epilog code operates across the elements of the vector
2342 of partial results computed by the vectorized loop.
2343 The reduction epilog code consists of:
2344 step 1: compute the scalar result in a vector (v_out2)
2345 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2346 step 3: adjust the scalar result (s_out3) if needed.
2348 Step 1 can be accomplished using one of the following three schemes:
2349 (scheme 1) using reduc_code, if available.
2350 (scheme 2) using whole-vector shifts, if available.
2351 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2352 combined.
2354 The overall epilog code looks like this:
2356 s_out0 = phi <s_loop> # original EXIT_PHI
2357 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2358 v_out2 = reduce <v_out1> # step 1
2359 s_out3 = extract_field <v_out2, 0> # step 2
2360 s_out4 = adjust_result <s_out3> # step 3
2362 (step 3 is optional, and steps 1 and 2 may be combined).
2363 Lastly, the uses of s_out0 are replaced by s_out4.
2365 ***/
2367 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2368 v_out1 = phi <v_loop> */
2370 exit_bb = single_exit (loop)->dest;
2371 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2372 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2373 exit_bsi = bsi_after_labels (exit_bb);
2375 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2376 (i.e. when reduc_code is not available) and in the final adjustment
2377 code (if needed). Also get the original scalar reduction variable as
2378 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2379 represents a reduction pattern), the tree-code and scalar-def are
2380 taken from the original stmt that the pattern-stmt (STMT) replaces.
2381 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2382 are taken from STMT. */
2384 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2385 if (!orig_stmt)
2387 /* Regular reduction */
2388 orig_stmt = stmt;
2390 else
2392 /* Reduction pattern */
2393 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2394 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2395 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2397 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2398 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2399 scalar_type = TREE_TYPE (scalar_dest);
2400 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2401 bitsize = TYPE_SIZE (scalar_type);
2402 bytesize = TYPE_SIZE_UNIT (scalar_type);
2405 /* In case this is a reduction in an inner-loop while vectorizing an outer
2406 loop - we don't need to extract a single scalar result at the end of the
2407 inner-loop. The final vector of partial results will be used in the
2408 vectorized outer-loop, or reduced to a scalar result at the end of the
2409 outer-loop. */
2410 if (nested_in_vect_loop)
2411 goto vect_finalize_reduction;
2413 /* 2.3 Create the reduction code, using one of the three schemes described
2414 above. */
2416 if (reduc_code < NUM_TREE_CODES)
2418 tree tmp;
2420 /*** Case 1: Create:
2421 v_out2 = reduc_expr <v_out1> */
2423 if (vect_print_dump_info (REPORT_DETAILS))
2424 fprintf (vect_dump, "Reduce using direct vector reduction.");
2426 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2427 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2428 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2429 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2430 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2431 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2433 extract_scalar_result = true;
2435 else
2437 enum tree_code shift_code = 0;
2438 bool have_whole_vector_shift = true;
2439 int bit_offset;
2440 int element_bitsize = tree_low_cst (bitsize, 1);
2441 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2442 tree vec_temp;
2444 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2445 shift_code = VEC_RSHIFT_EXPR;
2446 else
2447 have_whole_vector_shift = false;
2449 /* Regardless of whether we have a whole vector shift, if we're
2450 emulating the operation via tree-vect-generic, we don't want
2451 to use it. Only the first round of the reduction is likely
2452 to still be profitable via emulation. */
2453 /* ??? It might be better to emit a reduction tree code here, so that
2454 tree-vect-generic can expand the first round via bit tricks. */
2455 if (!VECTOR_MODE_P (mode))
2456 have_whole_vector_shift = false;
2457 else
2459 optab optab = optab_for_tree_code (code, vectype);
2460 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2461 have_whole_vector_shift = false;
2464 if (have_whole_vector_shift)
2466 /*** Case 2: Create:
2467 for (offset = VS/2; offset >= element_size; offset/=2)
2469 Create: va' = vec_shift <va, offset>
2470 Create: va = vop <va, va'>
2471 } */
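/* Concretely (an illustrative trace, assuming four 32-bit elements in
   a 128-bit vector; lane placement after the shift depends on the
   target's endianness):

   va' = vec_shift <va, 64> va' holds two of va's lanes
   va = vop <va, va'> two lanes now hold pairwise sums
   va' = vec_shift <va, 32>
   va = vop <va, va'> one lane now holds the full reduction */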
2473 if (vect_print_dump_info (REPORT_DETAILS))
2474 fprintf (vect_dump, "Reduce using vector shifts");
2476 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2477 new_temp = PHI_RESULT (new_phi);
2479 for (bit_offset = vec_size_in_bits/2;
2480 bit_offset >= element_bitsize;
2481 bit_offset /= 2)
2483 tree bitpos = size_int (bit_offset);
2484 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2485 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2486 new_name = make_ssa_name (vec_dest, epilog_stmt);
2487 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2488 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2490 tmp = build2 (code, vectype, new_name, new_temp);
2491 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2492 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2493 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2494 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2497 extract_scalar_result = true;
2499 else
2501 tree rhs;
2503 /*** Case 3: Create:
2504 s = extract_field <v_out2, 0>
2505 for (offset = element_size;
2506 offset < vector_size;
2507 offset += element_size)
2509 Create: s' = extract_field <v_out2, offset>
2510 Create: s = op <s, s'>
2511 } */
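/* In scalar terms (an illustrative sketch, again assuming four
   elements), this scheme is simply:

   s = extract_field <v_out2, 0>
   s = op <s, extract_field <v_out2, 1>>
   s = op <s, extract_field <v_out2, 2>>
   s = op <s, extract_field <v_out2, 3>> */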
2513 if (vect_print_dump_info (REPORT_DETAILS))
2514 fprintf (vect_dump, "Reduce using scalar code. ");
2516 vec_temp = PHI_RESULT (new_phi);
2517 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2518 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2519 bitsize_zero_node);
2520 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2521 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2522 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2523 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2525 for (bit_offset = element_bitsize;
2526 bit_offset < vec_size_in_bits;
2527 bit_offset += element_bitsize)
2529 tree tmp;
2530 tree bitpos = bitsize_int (bit_offset);
2531 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2532 bitpos);
2534 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2535 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2536 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2537 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2539 tmp = build2 (code, scalar_type, new_name, new_temp);
2540 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2541 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2542 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2543 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2546 extract_scalar_result = false;
2550 /* 2.4 Extract the final scalar result. Create:
2551 s_out3 = extract_field <v_out2, bitpos> */
2553 if (extract_scalar_result)
2555 tree rhs;
2557 gcc_assert (!nested_in_vect_loop);
2558 if (vect_print_dump_info (REPORT_DETAILS))
2559 fprintf (vect_dump, "extract scalar result");
2561 if (BYTES_BIG_ENDIAN)
2562 bitpos = size_binop (MULT_EXPR,
2563 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2564 TYPE_SIZE (scalar_type));
2565 else
2566 bitpos = bitsize_zero_node;
2568 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2569 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2570 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2571 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2572 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2575 vect_finalize_reduction:
2577 /* 2.5 Adjust the final result by the initial value of the reduction
2578 variable. (When such adjustment is not needed, then
2579 'adjustment_def' is NULL). For example, if code is PLUS we create:
2580 new_temp = loop_exit_def + adjustment_def */
2582 if (adjustment_def)
2584 if (nested_in_vect_loop)
2586 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2587 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2588 new_dest = vect_create_destination_var (scalar_dest, vectype);
2590 else
2592 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2593 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2594 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2596 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2597 new_temp = make_ssa_name (new_dest, epilog_stmt);
2598 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2599 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2603 /* 2.6 Handle the loop-exit phi */
2605 /* Replace uses of s_out0 with uses of s_out3:
2606 Find the loop-closed-use at the loop exit of the original scalar result.
2607 (The reduction result is expected to have two immediate uses - one at the
2608 latch block, and one at the loop exit). */
2609 phis = VEC_alloc (tree, heap, 10);
2610 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2612 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2614 exit_phi = USE_STMT (use_p);
2615 VEC_quick_push (tree, phis, exit_phi);
2618 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2619 gcc_assert (!VEC_empty (tree, phis));
2621 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2623 if (nested_in_vect_loop)
2625 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2627 /* FORNOW. We do not yet support the case in which an inner-loop reduction
2628 is used only outside the outer-loop (and not in the outer-loop itself). */
2629 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2630 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2632 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2633 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2634 set_stmt_info (get_stmt_ann (epilog_stmt),
2635 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2636 continue;
2639 /* Replace the uses: */
2640 orig_name = PHI_RESULT (exit_phi);
2641 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2642 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2643 SET_USE (use_p, new_temp);
2645 VEC_free (tree, heap, phis);
2649 /* Function vectorizable_reduction.
2651 Check if STMT performs a reduction operation that can be vectorized.
2652 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2653 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2654 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2656 This function also handles reduction idioms (patterns) that have been
2657 recognized in advance during vect_pattern_recog. In this case, STMT may be
2658 of this form:
2659 X = pattern_expr (arg0, arg1, ..., X)
2660 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2661 sequence that had been detected and replaced by the pattern-stmt (STMT).
2663 In some cases of reduction patterns, the type of the reduction variable X is
2664 different than the type of the other arguments of STMT.
2665 In such cases, the vectype that is used when transforming STMT into a vector
2666 stmt is different than the vectype that is used to determine the
2667 vectorization factor, because it consists of a different number of elements
2668 than the actual number of elements that are being operated upon in parallel.
2670 For example, consider an accumulation of shorts into an int accumulator.
2671 On some targets it's possible to vectorize this pattern operating on 8
2672 shorts at a time (hence, the vectype for purposes of determining the
2673 vectorization factor should be V8HI); on the other hand, the vectype that
2674 is used to create the vector form is actually V4SI (the type of the result).
2676 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2677 indicates what is the actual level of parallelism (V8HI in the example), so
2678 that the right vectorization factor would be derived. This vectype
2679 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2680 be used to create the vectorized stmt. The right vectype for the vectorized
2681 stmt is obtained from the type of the result X:
2682 get_vectype_for_scalar_type (TREE_TYPE (X))
2684 This means that, contrary to "regular" reductions (or "regular" stmts in
2685 general), the following equation:
2686 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2687 does *NOT* necessarily hold for reduction patterns. */
2689 bool
2690 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2692 tree vec_dest;
2693 tree scalar_dest;
2694 tree op;
2695 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2696 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2697 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2698 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2699 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2700 tree operation;
2701 enum tree_code code, orig_code, epilog_reduc_code = 0;
2702 enum machine_mode vec_mode;
2703 int op_type;
2704 optab optab, reduc_optab;
2705 tree new_temp = NULL_TREE;
2706 tree def, def_stmt;
2707 enum vect_def_type dt;
2708 tree new_phi;
2709 tree scalar_type;
2710 bool is_simple_use;
2711 tree orig_stmt;
2712 stmt_vec_info orig_stmt_info;
2713 tree expr = NULL_TREE;
2714 int i;
2715 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2716 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2717 stmt_vec_info prev_stmt_info;
2718 tree reduc_def;
2719 tree new_stmt = NULL_TREE;
2720 int j;
2722 if (nested_in_vect_loop_p (loop, stmt))
2724 loop = loop->inner;
2725 /* FORNOW. This restriction should be relaxed. */
2726 if (ncopies > 1)
2728 if (vect_print_dump_info (REPORT_DETAILS))
2729 fprintf (vect_dump, "multiple types in nested loop.");
2730 return false;
2734 gcc_assert (ncopies >= 1);
2736 /* FORNOW: SLP not supported. */
2737 if (STMT_SLP_TYPE (stmt_info))
2738 return false;
2740 /* 1. Is vectorizable reduction? */
2742 /* Not supportable if the reduction variable is used in the loop. */
2743 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2744 return false;
2746 /* Reductions that are not used even in an enclosing outer-loop
2747 are expected to be "live" (used out of the loop). */
2748 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2749 && !STMT_VINFO_LIVE_P (stmt_info))
2750 return false;
2752 /* Make sure it was already recognized as a reduction computation. */
2753 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2754 return false;
2756 /* 2. Has this been recognized as a reduction pattern?
2758 Check if STMT represents a pattern that has been recognized
2759 in earlier analysis stages. For stmts that represent a pattern,
2760 the STMT_VINFO_RELATED_STMT field records the last stmt in
2761 the original sequence that constitutes the pattern. */
2763 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2764 if (orig_stmt)
2766 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2767 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2768 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2769 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2772 /* 3. Check the operands of the operation. The first operands are defined
2773 inside the loop body. The last operand is the reduction variable,
2774 which is defined by the loop-header-phi. */
2776 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2778 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2779 code = TREE_CODE (operation);
2780 op_type = TREE_OPERAND_LENGTH (operation);
2781 if (op_type != binary_op && op_type != ternary_op)
2782 return false;
2783 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2784 scalar_type = TREE_TYPE (scalar_dest);
2785 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2786 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2787 return false;
2789 /* All uses but the last are expected to be defined in the loop.
2790 The last use is the reduction variable. */
2791 for (i = 0; i < op_type-1; i++)
2793 op = TREE_OPERAND (operation, i);
2794 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2795 gcc_assert (is_simple_use);
2796 if (dt != vect_loop_def
2797 && dt != vect_invariant_def
2798 && dt != vect_constant_def
2799 && dt != vect_induction_def)
2800 return false;
2803 op = TREE_OPERAND (operation, i);
2804 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2805 gcc_assert (is_simple_use);
2806 gcc_assert (dt == vect_reduction_def);
2807 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2808 if (orig_stmt)
2809 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2810 else
2811 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2813 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2814 return false;
2816 /* 4. Supportable by target? */
2818 /* 4.1. check support for the operation in the loop */
2819 optab = optab_for_tree_code (code, vectype);
2820 if (!optab)
2822 if (vect_print_dump_info (REPORT_DETAILS))
2823 fprintf (vect_dump, "no optab.");
2824 return false;
2826 vec_mode = TYPE_MODE (vectype);
2827 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2829 if (vect_print_dump_info (REPORT_DETAILS))
2830 fprintf (vect_dump, "op not supported by target.");
2831 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2832 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2833 < vect_min_worthwhile_factor (code))
2834 return false;
2835 if (vect_print_dump_info (REPORT_DETAILS))
2836 fprintf (vect_dump, "proceeding using word mode.");
2839 /* Worthwhile without SIMD support? */
2840 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2841 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2842 < vect_min_worthwhile_factor (code))
2844 if (vect_print_dump_info (REPORT_DETAILS))
2845 fprintf (vect_dump, "not worthwhile without SIMD support.");
2846 return false;
2849 /* 4.2. Check support for the epilog operation.
2851 If STMT represents a reduction pattern, then the type of the
2852 reduction variable may be different than the type of the rest
2853 of the arguments. For example, consider the case of accumulation
2854 of shorts into an int accumulator; The original code:
2855 S1: int_a = (int) short_a;
2856 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
2858 was replaced with:
2859 STMT: int_acc = widen_sum <short_a, int_acc>
2861 This means that:
2862 1. The tree-code that is used to create the vector operation in the
2863 epilog code (that reduces the partial results) is not the
2864 tree-code of STMT, but is rather the tree-code of the original
2865 stmt from the pattern that STMT is replacing. I.e, in the example
2866 above we want to use 'widen_sum' in the loop, but 'plus' in the
2867 epilog.
2868 2. The type (mode) we use to check available target support
2869 for the vector operation to be created in the *epilog*, is
2870 determined by the type of the reduction variable (in the example
2871 above we'd check this: plus_optab[vect_int_mode]).
2872 However the type (mode) we use to check available target support
2873 for the vector operation to be created *inside the loop*, is
2874 determined by the type of the other arguments to STMT (in the
2875 example we'd check this: widen_sum_optab[vect_short_mode]).
2877 This is contrary to "regular" reductions, in which the types of all
2878 the arguments are the same as the type of the reduction variable.
2879 For "regular" reductions we can therefore use the same vector type
2880 (and also the same tree-code) when generating the epilog code and
2881 when generating the code inside the loop. */
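/* For instance (an illustrative sketch of the short->int accumulation
   example above, not generated code): the loop stmt is checked against
   the widen_sum optab in V8HImode, while the epilog reduction is
   checked against the PLUS reduction optab in V4SImode:

   loop: vec_acc = widen_sum <v8hi_x, vec_acc>
   epilog: v_out2 = reduc_plus <vec_acc> */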
2883 if (orig_stmt)
2885 /* This is a reduction pattern: get the vectype from the type of the
2886 reduction variable, and get the tree-code from orig_stmt. */
2887 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2888 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2889 if (!vectype)
2891 if (vect_print_dump_info (REPORT_DETAILS))
2893 fprintf (vect_dump, "unsupported data-type ");
2894 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2896 return false;
2899 vec_mode = TYPE_MODE (vectype);
2901 else
2903 /* Regular reduction: the same vectype and tree-code that are used for
2904 the vector code inside the loop can also be used for the epilog code. */
2905 orig_code = code;
2908 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2909 return false;
2910 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2911 if (!reduc_optab)
2913 if (vect_print_dump_info (REPORT_DETAILS))
2914 fprintf (vect_dump, "no optab for reduction.");
2915 epilog_reduc_code = NUM_TREE_CODES;
2917 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2919 if (vect_print_dump_info (REPORT_DETAILS))
2920 fprintf (vect_dump, "reduc op not supported by target.");
2921 epilog_reduc_code = NUM_TREE_CODES;
2924 if (!vec_stmt) /* transformation not required. */
2926 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2927 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2928 return false;
2929 return true;
2932 /** Transform. **/
2934 if (vect_print_dump_info (REPORT_DETAILS))
2935 fprintf (vect_dump, "transform reduction.");
2937 /* Create the destination vector */
2938 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2940 /* Create the reduction-phi that defines the reduction-operand. */
2941 new_phi = create_phi_node (vec_dest, loop->header);
2943 /* In case the vectorization factor (VF) is bigger than the number
2944 of elements that we can fit in a vectype (nunits), we have to generate
2945 more than one vector stmt - i.e., we need to "unroll" the
2946 vector stmt by a factor VF/nunits. For more details see documentation
2947 in vectorizable_operation. */
2949 prev_stmt_info = NULL;
2950 for (j = 0; j < ncopies; j++)
2952 /* Handle uses. */
2953 if (j == 0)
2955 op = TREE_OPERAND (operation, 0);
2956 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2957 if (op_type == ternary_op)
2959 op = TREE_OPERAND (operation, 1);
2960 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2963 /* Get the vector def for the reduction variable from the phi node */
2964 reduc_def = PHI_RESULT (new_phi);
2966 else
2968 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2969 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2970 if (op_type == ternary_op)
2971 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2973 /* Get the vector def for the reduction variable from the vectorized
2974 reduction operation generated in the previous iteration (j-1) */
2975 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2978 /* Arguments are ready. create the new vector stmt. */
2979 if (op_type == binary_op)
2980 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2981 else
2982 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2983 reduc_def);
2984 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2985 new_temp = make_ssa_name (vec_dest, new_stmt);
2986 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2987 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2989 if (j == 0)
2990 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2991 else
2992 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2993 prev_stmt_info = vinfo_for_stmt (new_stmt);
2996 /* Finalize the reduction-phi (set its arguments) and create the
2997 epilog reduction code. */
2998 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2999 return true;
3002 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3003 a function declaration if the target has a vectorized version
3004 of the function, or NULL_TREE if the function cannot be vectorized. */
3006 tree
3007 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3009 tree fndecl = get_callee_fndecl (call);
3010 enum built_in_function code;
3012 /* We only handle functions that do not read or clobber memory -- i.e.
3013 const or novops ones. */
3014 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3015 return NULL_TREE;
3017 if (!fndecl
3018 || TREE_CODE (fndecl) != FUNCTION_DECL
3019 || !DECL_BUILT_IN (fndecl))
3020 return NULL_TREE;
3022 code = DECL_FUNCTION_CODE (fndecl);
3023 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3024 vectype_in);
3027 /* Function vectorizable_call.
3029 Check if STMT performs a function call that can be vectorized.
3030 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3031 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3032 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3034 bool
3035 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3037 tree vec_dest;
3038 tree scalar_dest;
3039 tree operation;
3040 tree op, type;
3041 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3042 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3043 tree vectype_out, vectype_in;
3044 int nunits_in;
3045 int nunits_out;
3046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3047 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3048 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3049 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3050 tree new_stmt;
3051 int ncopies, j, nargs;
3052 call_expr_arg_iterator iter;
3053 tree vargs;
3054 enum { NARROW, NONE, WIDEN } modifier;
3056 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3057 return false;
3059 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3060 return false;
3062 /* FORNOW: SLP not supported. */
3063 if (STMT_SLP_TYPE (stmt_info))
3064 return false;
3066 /* Is STMT a vectorizable call? */
3067 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3068 return false;
3070 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3071 return false;
3073 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3074 if (TREE_CODE (operation) != CALL_EXPR)
3075 return false;
3077 /* Process function arguments. */
3078 rhs_type = NULL_TREE;
3079 nargs = 0;
3080 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3082 /* Bail out if the function has more than two arguments; we
3083 do not have interesting builtin functions to vectorize with
3084 more than two arguments. */
3085 if (nargs >= 2)
3086 return false;
3088 /* We can only handle calls with arguments of the same type. */
3089 if (rhs_type
3090 && rhs_type != TREE_TYPE (op))
3092 if (vect_print_dump_info (REPORT_DETAILS))
3093 fprintf (vect_dump, "argument types differ.");
3094 return false;
3096 rhs_type = TREE_TYPE (op);
3098 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3100 if (vect_print_dump_info (REPORT_DETAILS))
3101 fprintf (vect_dump, "use not simple.");
3102 return false;
3105 ++nargs;
3108 /* No arguments is also not good. */
3109 if (nargs == 0)
3110 return false;
3112 vectype_in = get_vectype_for_scalar_type (rhs_type);
3113 if (!vectype_in)
3114 return false;
3115 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3117 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3118 vectype_out = get_vectype_for_scalar_type (lhs_type);
3119 if (!vectype_out)
3120 return false;
3121 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3123 /* FORNOW */
3124 if (nunits_in == nunits_out / 2)
3125 modifier = NARROW;
3126 else if (nunits_out == nunits_in)
3127 modifier = NONE;
3128 else if (nunits_out == nunits_in / 2)
3129 modifier = WIDEN;
3130 else
3131 return false;
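/* For example (illustrative types, assuming 128-bit vectors): V4SF
   arguments producing V4SF results give modifier NONE; V2DF arguments
   producing V4SF results give NARROW (nunits_in = 2 = nunits_out / 2);
   V4SF arguments producing V2DF results would be WIDEN. */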
3133 /* For now, we only vectorize functions if a target specific builtin
3134 is available. TODO -- in some cases, it might be profitable to
3135 insert the calls for pieces of the vector, in order to be able
3136 to vectorize other operations in the loop. */
3137 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3138 if (fndecl == NULL_TREE)
3140 if (vect_print_dump_info (REPORT_DETAILS))
3141 fprintf (vect_dump, "function is not vectorizable.");
3143 return false;
3146 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3148 if (modifier == NARROW)
3149 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3150 else
3151 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3153 /* Sanity check: make sure that at least one copy of the vectorized stmt
3154 needs to be generated. */
3155 gcc_assert (ncopies >= 1);
3157 /* FORNOW. This restriction should be relaxed. */
3158 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3160 if (vect_print_dump_info (REPORT_DETAILS))
3161 fprintf (vect_dump, "multiple types in nested loop.");
3162 return false;
3165 if (!vec_stmt) /* transformation not required. */
3167 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3168 if (vect_print_dump_info (REPORT_DETAILS))
3169 fprintf (vect_dump, "=== vectorizable_call ===");
3170 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3171 return true;
3174 /** Transform. **/
3176 if (vect_print_dump_info (REPORT_DETAILS))
3177 fprintf (vect_dump, "transform operation.");
3179 /* FORNOW. This restriction should be relaxed. */
3180 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3182 if (vect_print_dump_info (REPORT_DETAILS))
3183 fprintf (vect_dump, "multiple types in nested loop.");
3184 return false;
3187 /* Handle def. */
3188 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3189 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3191 prev_stmt_info = NULL;
3192 switch (modifier)
3194 case NONE:
3195 for (j = 0; j < ncopies; ++j)
3197 /* Build argument list for the vectorized call. */
3198 /* FIXME: Rewrite this so that it doesn't
3199 construct a temporary list. */
3200 vargs = NULL_TREE;
3201 nargs = 0;
3202 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3204 if (j == 0)
3205 vec_oprnd0
3206 = vect_get_vec_def_for_operand (op, stmt, NULL);
3207 else
3208 vec_oprnd0
3209 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3211 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3213 ++nargs;
3215 vargs = nreverse (vargs);
3217 rhs = build_function_call_expr (fndecl, vargs);
3218 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3219 new_temp = make_ssa_name (vec_dest, new_stmt);
3220 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3222 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3224 if (j == 0)
3225 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3226 else
3227 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3229 prev_stmt_info = vinfo_for_stmt (new_stmt);
3232 break;
3234 case NARROW:
3235 for (j = 0; j < ncopies; ++j)
3237 /* Build argument list for the vectorized call. */
3238 /* FIXME: Rewrite this so that it doesn't
3239 construct a temporary list. */
3240 vargs = NULL_TREE;
3241 nargs = 0;
3242 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3244 if (j == 0)
3246 vec_oprnd0
3247 = vect_get_vec_def_for_operand (op, stmt, NULL);
3248 vec_oprnd1
3249 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3251 else
3253 vec_oprnd0
3254 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3255 vec_oprnd1
3256 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3259 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3260 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3262 ++nargs;
3264 vargs = nreverse (vargs);
3266 rhs = build_function_call_expr (fndecl, vargs);
3267 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3268 new_temp = make_ssa_name (vec_dest, new_stmt);
3269 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3271 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3273 if (j == 0)
3274 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3275 else
3276 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3278 prev_stmt_info = vinfo_for_stmt (new_stmt);
3281 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3283 break;
3285 case WIDEN:
3286 /* No current target implements this case. */
3287 return false;
3290 /* The call in STMT might prevent it from being removed in dce.
3291 We however cannot remove it here, due to the way the ssa name
3292 it defines is mapped to the new definition. So just replace
3293 the rhs of the statement with something harmless. */
3294 type = TREE_TYPE (scalar_dest);
3295 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3296 update_stmt (stmt);
3298 return true;
3302 /* Function vect_gen_widened_results_half
3304 Create a vector stmt whose code, type, number of arguments, and result
3305 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3306 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3307 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3308 needs to be created (DECL is a function-decl of a target-builtin).
3309 STMT is the original scalar stmt that we are vectorizing. */
3311 static tree
3312 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3313 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3314 tree vec_dest, block_stmt_iterator *bsi,
3315 tree stmt)
3317 tree expr;
3318 tree new_stmt;
3319 tree new_temp;
3320 tree sym;
3321 ssa_op_iter iter;
3323 /* Generate half of the widened result: */
3324 if (code == CALL_EXPR)
3326 /* Target specific support */
3327 if (op_type == binary_op)
3328 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3329 else
3330 expr = build_call_expr (decl, 1, vec_oprnd0);
3332 else
3334 /* Generic support */
3335 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3336 if (op_type == binary_op)
3337 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3338 else
3339 expr = build1 (code, vectype, vec_oprnd0);
3341 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3342 new_temp = make_ssa_name (vec_dest, new_stmt);
3343 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3344 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3346 if (code == CALL_EXPR)
3348 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3350 if (TREE_CODE (sym) == SSA_NAME)
3351 sym = SSA_NAME_VAR (sym);
3352 mark_sym_for_renaming (sym);
3356 return new_stmt;
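/* For example (an illustrative sketch, not generated code): widening
   the eight shorts of a V8HI operand into ints yields two V4SI
   halves, each produced by one call to this function:

   vx_lo = widen_lo <v8hi_x> lanes 0..3
   vx_hi = widen_hi <v8hi_x> lanes 4..7

   where widen_lo/widen_hi stand for whatever the target provides via
   CODE/DECL. */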
3360 /* Check if STMT performs a conversion operation, that can be vectorized.
3361 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3362 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3363 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3365 bool
3366 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3367 tree *vec_stmt, slp_tree slp_node)
3369 tree vec_dest;
3370 tree scalar_dest;
3371 tree operation;
3372 tree op0;
3373 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3374 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3375 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3376 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3377 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3378 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3379 tree new_temp;
3380 tree def, def_stmt;
3381 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3382 tree new_stmt = NULL_TREE;
3383 stmt_vec_info prev_stmt_info;
3384 int nunits_in;
3385 int nunits_out;
3386 tree vectype_out, vectype_in;
3387 int ncopies, j;
3388 tree expr;
3389 tree rhs_type, lhs_type;
3390 tree builtin_decl;
3391 enum { NARROW, NONE, WIDEN } modifier;
3392 int i;
3393 VEC(tree,heap) *vec_oprnds0 = NULL;
3394 tree vop0;
3396 /* Is STMT a vectorizable conversion? */
3398 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3399 return false;
3401 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3402 return false;
3404 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3405 return false;
3407 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3408 return false;
3410 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3411 code = TREE_CODE (operation);
3412 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3413 return false;
3415 /* Check types of lhs and rhs. */
3416 op0 = TREE_OPERAND (operation, 0);
3417 rhs_type = TREE_TYPE (op0);
3418 vectype_in = get_vectype_for_scalar_type (rhs_type);
3419 if (!vectype_in)
3420 return false;
3421 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3423 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3424 lhs_type = TREE_TYPE (scalar_dest);
3425 vectype_out = get_vectype_for_scalar_type (lhs_type);
3426 if (!vectype_out)
3427 return false;
3428 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3430 /* FORNOW */
3431 if (nunits_in == nunits_out / 2)
3432 modifier = NARROW;
3433 else if (nunits_out == nunits_in)
3434 modifier = NONE;
3435 else if (nunits_out == nunits_in / 2)
3436 modifier = WIDEN;
3437 else
3438 return false;
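/* For example, with 128-bit vectors: FLOAT_EXPR int -> float uses
   V4SI/V4SF (nunits 4 -> 4, NONE); FLOAT_EXPR int -> double uses
   V4SI/V2DF (4 -> 2, WIDEN); FIX_TRUNC_EXPR double -> int uses
   V2DF/V4SI (2 -> 4, NARROW). */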
3440 if (modifier == NONE)
3441 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3443 /* Bail out if the types are both integral or both non-integral. */
3444 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3445 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3446 return false;
3448 if (modifier == NARROW)
3449 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3450 else
3451 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3453 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3454 this, so we can safely override NCOPIES with 1 here. */
3455 if (slp_node)
3456 ncopies = 1;
3458 /* Sanity check: make sure that at least one copy of the vectorized stmt
3459 needs to be generated. */
3460 gcc_assert (ncopies >= 1);
3462 /* FORNOW. This restriction should be relaxed. */
3463 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3465 if (vect_print_dump_info (REPORT_DETAILS))
3466 fprintf (vect_dump, "multiple types in nested loop.");
3467 return false;
3470 /* Check the operands of the operation. */
3471 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3473 if (vect_print_dump_info (REPORT_DETAILS))
3474 fprintf (vect_dump, "use not simple.");
3475 return false;
3478 /* Supportable by target? */
3479 if ((modifier == NONE
3480 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3481 || (modifier == WIDEN
3482 && !supportable_widening_operation (code, stmt, vectype_in,
3483 &decl1, &decl2,
3484 &code1, &code2))
3485 || (modifier == NARROW
3486 && !supportable_narrowing_operation (code, stmt, vectype_in,
3487 &code1)))
3489 if (vect_print_dump_info (REPORT_DETAILS))
3490 fprintf (vect_dump, "op not supported by target.");
3491 return false;
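/* E.g., in the NONE case some targets expose a direct builtin for the
   whole-vector conversion (on i386 with SSE2, int -> float on V4SI can
   map to a cvtdq2ps-style instruction); this is only an illustration -
   what is available is entirely up to the target hook. */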
3494 if (modifier != NONE)
3496 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3497 /* FORNOW: SLP not supported. */
3498 if (STMT_SLP_TYPE (stmt_info))
3499 return false;
3502 if (!vec_stmt) /* transformation not required. */
3504 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3505 return true;
3508 /** Transform. **/
3509 if (vect_print_dump_info (REPORT_DETAILS))
3510 fprintf (vect_dump, "transform conversion.");
3512 /* Handle def. */
3513 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3515 if (modifier == NONE && !slp_node)
3516 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3518 prev_stmt_info = NULL;
3519 switch (modifier)
3521 case NONE:
3522 for (j = 0; j < ncopies; j++)
3524 tree sym;
3525 ssa_op_iter iter;
3527 if (j == 0)
3528 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3529 else
3530 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3532 builtin_decl =
3533 targetm.vectorize.builtin_conversion (code, vectype_in);
3534 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3536 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3538 /* Arguments are ready. Create the new vector stmt. */
3539 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3540 new_temp = make_ssa_name (vec_dest, new_stmt);
3541 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3542 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3543 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3544 SSA_OP_ALL_VIRTUALS)
3546 if (TREE_CODE (sym) == SSA_NAME)
3547 sym = SSA_NAME_VAR (sym);
3548 mark_sym_for_renaming (sym);
3550 if (slp_node)
3551 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3554 if (j == 0)
3555 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3556 else
3557 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3558 prev_stmt_info = vinfo_for_stmt (new_stmt);
3560 break;
3562 case WIDEN:
3563 /* In case the vectorization factor (VF) is bigger than the number
3564 of elements that we can fit in a vectype (nunits), we have to
3565 generate more than one vector stmt - i.e., we need to "unroll"
3566 the vector stmt by a factor VF/nunits. */
3567 for (j = 0; j < ncopies; j++)
3569 if (j == 0)
3570 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3571 else
3572 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3574 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3576 /* Generate first half of the widened result: */
3577 new_stmt
3578 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3579 vec_oprnd0, vec_oprnd1,
3580 unary_op, vec_dest, bsi, stmt);
3581 if (j == 0)
3582 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3583 else
3584 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3585 prev_stmt_info = vinfo_for_stmt (new_stmt);
3587 /* Generate second half of the widened result: */
3588 new_stmt
3589 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3590 vec_oprnd0, vec_oprnd1,
3591 unary_op, vec_dest, bsi, stmt);
3592 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3593 prev_stmt_info = vinfo_for_stmt (new_stmt);
3595 break;
3597 case NARROW:
3598 /* In case the vectorization factor (VF) is bigger than the number
3599 of elements that we can fit in a vectype (nunits), we have to
3600 generate more than one vector stmt - i.e., we need to "unroll"
3601 the vector stmt by a factor VF/nunits. */
3602 for (j = 0; j < ncopies; j++)
3604 /* Handle uses. */
3605 if (j == 0)
3607 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3608 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3610 else
3612 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3613 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
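/* VEC_OPRND0 and VEC_OPRND1 now hold two consecutive vector defs of the
   wider source type; each narrowing stmt consumes both to produce one
   vector of the narrower type, so successive copies chain through the
   last def (VEC_OPRND1). */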
3616 /* Arguments are ready. Create the new vector stmt. */
3617 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3618 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3619 new_temp = make_ssa_name (vec_dest, new_stmt);
3620 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3621 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3623 if (j == 0)
3624 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3625 else
3626 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3628 prev_stmt_info = vinfo_for_stmt (new_stmt);
3631 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3634 if (vec_oprnds0)
3635 VEC_free (tree, heap, vec_oprnds0);
3637 return true;
3641 /* Function vectorizable_assignment.
3643 Check if STMT performs an assignment (copy) that can be vectorized.
3644 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3645 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3646 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3648 bool
3649 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3650 slp_tree slp_node)
3652 tree vec_dest;
3653 tree scalar_dest;
3654 tree op;
3655 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3656 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3657 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3658 tree new_temp;
3659 tree def, def_stmt;
3660 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3661 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3662 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3663 int i;
3664 VEC(tree,heap) *vec_oprnds = NULL;
3665 tree vop;
3667 gcc_assert (ncopies >= 1);
3668 if (ncopies > 1)
3669 return false; /* FORNOW */
3671 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3672 return false;
3674 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3675 return false;
3677 /* Is vectorizable assignment? */
3678 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3679 return false;
3681 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3682 if (TREE_CODE (scalar_dest) != SSA_NAME)
3683 return false;
3685 op = GIMPLE_STMT_OPERAND (stmt, 1);
3686 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3688 if (vect_print_dump_info (REPORT_DETAILS))
3689 fprintf (vect_dump, "use not simple.");
3690 return false;
3693 if (!vec_stmt) /* transformation not required. */
3695 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3696 if (vect_print_dump_info (REPORT_DETAILS))
3697 fprintf (vect_dump, "=== vectorizable_assignment ===");
3698 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3699 return true;
3702 /** Transform. **/
3703 if (vect_print_dump_info (REPORT_DETAILS))
3704 fprintf (vect_dump, "transform assignment.");
3706 /* Handle def. */
3707 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3709 /* Handle use. */
3710 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3712 /* Arguments are ready. Create the new vector stmt. */
3713 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3715 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3716 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3717 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3718 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3719 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3721 if (slp_node)
3722 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3725 VEC_free (tree, heap, vec_oprnds);
3726 return true;
3730 /* Function vect_min_worthwhile_factor.
3732 For a loop where we could vectorize the operation indicated by CODE,
3733 return the minimum vectorization factor that makes it worthwhile
3734 to use generic vectors. */
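/* For instance, if V4SI arithmetic must be emulated piecewise in word
   mode, a PLUS_EXPR is (roughly) only worth vectorizing when at least
   four elements are processed at once, whereas the bitwise operations
   map directly onto word-mode instructions and already pay off at a
   factor of two. */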
3735 static int
3736 vect_min_worthwhile_factor (enum tree_code code)
3738 switch (code)
3740 case PLUS_EXPR:
3741 case MINUS_EXPR:
3742 case NEGATE_EXPR:
3743 return 4;
3745 case BIT_AND_EXPR:
3746 case BIT_IOR_EXPR:
3747 case BIT_XOR_EXPR:
3748 case BIT_NOT_EXPR:
3749 return 2;
3751 default:
3752 return INT_MAX;
3757 /* Function vectorizable_induction
3759 Check if PHI performs an induction computation that can be vectorized.
3760 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3761 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3762 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3764 bool
3765 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3766 tree *vec_stmt)
3768 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3769 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3770 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3771 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3772 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3773 tree vec_def;
3775 gcc_assert (ncopies >= 1);
3777 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3778 return false;
3780 /* FORNOW: SLP not supported. */
3781 if (STMT_SLP_TYPE (stmt_info))
3782 return false;
3784 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3786 if (TREE_CODE (phi) != PHI_NODE)
3787 return false;
3789 if (!vec_stmt) /* transformation not required. */
3791 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3792 if (vect_print_dump_info (REPORT_DETAILS))
3793 fprintf (vect_dump, "=== vectorizable_induction ===");
3794 vect_model_induction_cost (stmt_info, ncopies);
3795 return true;
3798 /** Transform. **/
3800 if (vect_print_dump_info (REPORT_DETAILS))
3801 fprintf (vect_dump, "transform induction phi.");
3803 vec_def = get_initial_def_for_induction (phi);
3804 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3805 return true;
3809 /* Function vectorizable_operation.
3811 Check if STMT performs a binary or unary operation that can be vectorized.
3812 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3813 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3814 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3816 bool
3817 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3818 slp_tree slp_node)
3820 tree vec_dest;
3821 tree scalar_dest;
3822 tree operation;
3823 tree op0, op1 = NULL;
3824 tree vec_oprnd1 = NULL_TREE;
3825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3826 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3827 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3828 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3829 enum tree_code code;
3830 enum machine_mode vec_mode;
3831 tree new_temp;
3832 int op_type;
3833 optab optab;
3834 int icode;
3835 enum machine_mode optab_op2_mode;
3836 tree def, def_stmt;
3837 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3838 tree new_stmt = NULL_TREE;
3839 stmt_vec_info prev_stmt_info;
3840 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3841 int nunits_out;
3842 tree vectype_out;
3843 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3844 int j, i;
3845 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3846 tree vop0, vop1;
3847 unsigned int k;
3848 bool scalar_shift_arg = false;
3850 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3851 this, so we can safely override NCOPIES with 1 here. */
3852 if (slp_node)
3853 ncopies = 1;
3854 gcc_assert (ncopies >= 1);
3855 /* FORNOW. This restriction should be relaxed. */
3856 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3858 if (vect_print_dump_info (REPORT_DETAILS))
3859 fprintf (vect_dump, "multiple types in nested loop.");
3860 return false;
3863 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3864 return false;
3866 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3867 return false;
3869 /* Is STMT a vectorizable binary/unary operation? */
3870 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3871 return false;
3873 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3874 return false;
3876 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3877 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3878 if (!vectype_out)
3879 return false;
3880 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3881 if (nunits_out != nunits_in)
3882 return false;
3884 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3885 code = TREE_CODE (operation);
3887 /* For pointer addition, we should use the normal plus for
3888 the vector addition. */
3889 if (code == POINTER_PLUS_EXPR)
3890 code = PLUS_EXPR;
3892 optab = optab_for_tree_code (code, vectype);
3894 /* Support only unary or binary operations. */
3895 op_type = TREE_OPERAND_LENGTH (operation);
3896 if (op_type != unary_op && op_type != binary_op)
3898 if (vect_print_dump_info (REPORT_DETAILS))
3899 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3900 return false;
3903 op0 = TREE_OPERAND (operation, 0);
3904 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3906 if (vect_print_dump_info (REPORT_DETAILS))
3907 fprintf (vect_dump, "use not simple.");
3908 return false;
3911 if (op_type == binary_op)
3913 op1 = TREE_OPERAND (operation, 1);
3914 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3916 if (vect_print_dump_info (REPORT_DETAILS))
3917 fprintf (vect_dump, "use not simple.");
3918 return false;
3922 /* Supportable by target? */
3923 if (!optab)
3925 if (vect_print_dump_info (REPORT_DETAILS))
3926 fprintf (vect_dump, "no optab.");
3927 return false;
3929 vec_mode = TYPE_MODE (vectype);
3930 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3931 if (icode == CODE_FOR_nothing)
3933 if (vect_print_dump_info (REPORT_DETAILS))
3934 fprintf (vect_dump, "op not supported by target.");
3935 /* Check only during analysis. */
3936 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3937 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3938 < vect_min_worthwhile_factor (code)
3939 && !vec_stmt))
3940 return false;
3941 if (vect_print_dump_info (REPORT_DETAILS))
3942 fprintf (vect_dump, "proceeding using word mode.");
3945 /* Worthwhile without SIMD support? Check only during analysis. */
3946 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3947 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3948 < vect_min_worthwhile_factor (code)
3949 && !vec_stmt)
3951 if (vect_print_dump_info (REPORT_DETAILS))
3952 fprintf (vect_dump, "not worthwhile without SIMD support.");
3953 return false;
3956 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3958 /* FORNOW: not yet supported. */
3959 if (!VECTOR_MODE_P (vec_mode))
3960 return false;
3962 /* Invariant argument is needed for a vector shift
3963 by a scalar shift operand. */
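/* E.g., for "x << 3" on such a target the constant 3 is used directly
   as operand 2 of the vector shift; no {3,3,3,3} vector is built. */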
3964 optab_op2_mode = insn_data[icode].operand[2].mode;
3965 if (!VECTOR_MODE_P (optab_op2_mode))
3967 if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3969 if (vect_print_dump_info (REPORT_DETAILS))
3970 fprintf (vect_dump, "operand mode requires invariant"
3971 " argument.");
3972 return false;
3975 scalar_shift_arg = true;
3979 if (!vec_stmt) /* transformation not required. */
3981 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3982 if (vect_print_dump_info (REPORT_DETAILS))
3983 fprintf (vect_dump, "=== vectorizable_operation ===");
3984 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3985 return true;
3988 /** Transform. **/
3990 if (vect_print_dump_info (REPORT_DETAILS))
3991 fprintf (vect_dump, "transform binary/unary operation.");
3993 /* Handle def. */
3994 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3996 /* Allocate VECs for vector operands. In case of SLP, vector operands are
3997 created in the previous stages of the recursion, so no allocation is
3998 needed, except for the case of shift with scalar shift argument. In that
3999 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4000 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4001 In case of loop-based vectorization we allocate VECs of size 1. We
4002 allocate VEC_OPRNDS1 only in case of binary operation. */
4003 if (!slp_node)
4005 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4006 if (op_type == binary_op)
4007 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4009 else if (scalar_shift_arg)
4010 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4012 /* In case the vectorization factor (VF) is bigger than the number
4013 of elements that we can fit in a vectype (nunits), we have to generate
4014 more than one vector stmt - i.e., we need to "unroll" the
4015 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4016 from one copy of the vector stmt to the next, in the field
4017 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4018 stages to find the correct vector defs to be used when vectorizing
4019 stmts that use the defs of the current stmt. The example below illustrates
4020 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
4021 4 vectorized stmts):
4023 before vectorization:
4024 RELATED_STMT VEC_STMT
4025 S1: x = memref - -
4026 S2: z = x + 1 - -
4028 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4029 there):
4030 RELATED_STMT VEC_STMT
4031 VS1_0: vx0 = memref0 VS1_1 -
4032 VS1_1: vx1 = memref1 VS1_2 -
4033 VS1_2: vx2 = memref2 VS1_3 -
4034 VS1_3: vx3 = memref3 - -
4035 S1: x = load - VS1_0
4036 S2: z = x + 1 - -
4038 step 2: vectorize stmt S2 (done here):
4039 To vectorize stmt S2 we first need to find the relevant vector
4040 def for the first operand 'x'. This is, as usual, obtained from
4041 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4042 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4043 relevant vector def 'vx0'. Having found 'vx0' we can generate
4044 the vector stmt VS2_0, and as usual, record it in the
4045 STMT_VINFO_VEC_STMT of stmt S2.
4046 When creating the second copy (VS2_1), we obtain the relevant vector
4047 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4048 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4049 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4050 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4051 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4052 chain of stmts and pointers:
4053 RELATED_STMT VEC_STMT
4054 VS1_0: vx0 = memref0 VS1_1 -
4055 VS1_1: vx1 = memref1 VS1_2 -
4056 VS1_2: vx2 = memref2 VS1_3 -
4057 VS1_3: vx3 = memref3 - -
4058 S1: x = load - VS1_0
4059 VS2_0: vz0 = vx0 + v1 VS2_1 -
4060 VS2_1: vz1 = vx1 + v1 VS2_2 -
4061 VS2_2: vz2 = vx2 + v1 VS2_3 -
4062 VS2_3: vz3 = vx3 + v1 - -
4063 S2: z = x + 1 - VS2_0 */
4065 prev_stmt_info = NULL;
4066 for (j = 0; j < ncopies; j++)
4068 /* Handle uses. */
4069 if (j == 0)
4071 if (op_type == binary_op
4072 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4074 /* Vector shl and shr insn patterns can be defined with scalar
4075 operand 2 (shift operand). In this case, use constant or loop
4076 invariant op1 directly, without extending it to vector mode
4077 first. */
4078 optab_op2_mode = insn_data[icode].operand[2].mode;
4079 if (!VECTOR_MODE_P (optab_op2_mode))
4081 if (vect_print_dump_info (REPORT_DETAILS))
4082 fprintf (vect_dump, "operand 1 using scalar mode.");
4083 vec_oprnd1 = op1;
4084 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4085 if (slp_node)
4087 /* Store vec_oprnd1 for every vector stmt to be created
4088 for SLP_NODE. We check during the analysis that all the
4089 shift arguments are the same.
4090 TODO: Allow different constants for different vector
4091 stmts generated for an SLP instance. */
4092 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4093 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4098 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4099 (a special case for certain kinds of vector shifts); otherwise,
4100 operand 1 should be of a vector type (the usual case). */
4101 if (op_type == binary_op && !vec_oprnd1)
4102 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4103 slp_node);
4104 else
4105 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4106 slp_node);
4108 else
4109 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4111 /* Arguments are ready. Create the new vector stmt. */
4112 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4114 if (op_type == binary_op)
4116 vop1 = VEC_index (tree, vec_oprnds1, i);
4117 new_stmt = build_gimple_modify_stmt (vec_dest,
4118 build2 (code, vectype, vop0, vop1));
4120 else
4121 new_stmt = build_gimple_modify_stmt (vec_dest,
4122 build1 (code, vectype, vop0));
4124 new_temp = make_ssa_name (vec_dest, new_stmt);
4125 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4126 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4127 if (slp_node)
4128 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4131 if (j == 0)
4132 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4133 else
4134 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4135 prev_stmt_info = vinfo_for_stmt (new_stmt);
4138 VEC_free (tree, heap, vec_oprnds0);
4139 if (vec_oprnds1)
4140 VEC_free (tree, heap, vec_oprnds1);
4142 return true;
4146 /* Function vectorizable_type_demotion
4148 Check if STMT performs a binary or unary operation that involves
4149 type demotion, and if it can be vectorized.
4150 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4151 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4152 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4154 bool
4155 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4156 tree *vec_stmt)
4158 tree vec_dest;
4159 tree scalar_dest;
4160 tree operation;
4161 tree op0;
4162 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4165 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4166 enum tree_code code, code1 = ERROR_MARK;
4167 tree new_temp;
4168 tree def, def_stmt;
4169 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4170 tree new_stmt;
4171 stmt_vec_info prev_stmt_info;
4172 int nunits_in;
4173 int nunits_out;
4174 tree vectype_out;
4175 int ncopies;
4176 int j;
4177 tree expr;
4178 tree vectype_in;
4180 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4181 return false;
4183 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4184 return false;
4186 /* Is STMT a vectorizable type-demotion operation? */
4187 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4188 return false;
4190 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4191 return false;
4193 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4194 code = TREE_CODE (operation);
4195 if (code != NOP_EXPR && code != CONVERT_EXPR)
4196 return false;
4198 op0 = TREE_OPERAND (operation, 0);
4199 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4200 if (!vectype_in)
4201 return false;
4202 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4204 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4205 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4206 if (!vectype_out)
4207 return false;
4208 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4209 if (nunits_in != nunits_out / 2) /* FORNOW */
4210 return false;
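/* E.g., an int -> short demotion with V4SI/V8HI vectors has
   nunits_in == 4 == nunits_out / 2: two V4SI defs get packed into each
   V8HI result below. */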
4212 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4213 gcc_assert (ncopies >= 1);
4214 /* FORNOW. This restriction should be relaxed. */
4215 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4217 if (vect_print_dump_info (REPORT_DETAILS))
4218 fprintf (vect_dump, "multiple types in nested loop.");
4219 return false;
4222 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4223 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4224 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4225 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4226 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4227 return false;
4229 /* Check the operands of the operation. */
4230 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4232 if (vect_print_dump_info (REPORT_DETAILS))
4233 fprintf (vect_dump, "use not simple.");
4234 return false;
4237 /* Supportable by target? */
4238 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4239 return false;
4241 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4243 if (!vec_stmt) /* transformation not required. */
4245 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4246 if (vect_print_dump_info (REPORT_DETAILS))
4247 fprintf (vect_dump, "=== vectorizable_demotion ===");
4248 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4249 return true;
4252 /** Transform. **/
4253 if (vect_print_dump_info (REPORT_DETAILS))
4254 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4255 ncopies);
4257 /* Handle def. */
4258 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4260 /* In case the vectorization factor (VF) is bigger than the number
4261 of elements that we can fit in a vectype (nunits), we have to generate
4262 more than one vector stmt - i.e., we need to "unroll" the
4263 vector stmt by a factor VF/nunits. */
4264 prev_stmt_info = NULL;
4265 for (j = 0; j < ncopies; j++)
4267 /* Handle uses. */
4268 if (j == 0)
4270 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4271 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4273 else
4275 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4276 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4279 /* Arguments are ready. Create the new vector stmt. */
4280 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4281 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4282 new_temp = make_ssa_name (vec_dest, new_stmt);
4283 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4284 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4286 if (j == 0)
4287 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4288 else
4289 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4291 prev_stmt_info = vinfo_for_stmt (new_stmt);
4294 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4295 return true;
4299 /* Function vectorizable_type_promotion
4301 Check if STMT performs a binary or unary operation that involves
4302 type promotion, and if it can be vectorized.
4303 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4304 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4305 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4307 bool
4308 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4309 tree *vec_stmt)
4311 tree vec_dest;
4312 tree scalar_dest;
4313 tree operation;
4314 tree op0, op1 = NULL;
4315 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4316 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4317 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4318 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4319 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4320 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4321 int op_type;
4322 tree def, def_stmt;
4323 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4324 tree new_stmt;
4325 stmt_vec_info prev_stmt_info;
4326 int nunits_in;
4327 int nunits_out;
4328 tree vectype_out;
4329 int ncopies;
4330 int j;
4331 tree vectype_in;
4333 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4334 return false;
4336 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4337 return false;
4339 /* Is STMT a vectorizable type-promotion operation? */
4340 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4341 return false;
4343 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4344 return false;
4346 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4347 code = TREE_CODE (operation);
4348 if (code != NOP_EXPR && code != CONVERT_EXPR
4349 && code != WIDEN_MULT_EXPR)
4350 return false;
4352 op0 = TREE_OPERAND (operation, 0);
4353 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4354 if (!vectype_in)
4355 return false;
4356 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4358 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4359 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4360 if (!vectype_out)
4361 return false;
4362 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4363 if (nunits_out != nunits_in / 2) /* FORNOW */
4364 return false;
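/* E.g., a short -> int promotion with V8HI/V4SI vectors has
   nunits_out == 4 == nunits_in / 2: each V8HI def expands into two
   V4SI results (the two "halves" generated below). */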
4366 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4367 gcc_assert (ncopies >= 1);
4368 /* FORNOW. This restriction should be relaxed. */
4369 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4371 if (vect_print_dump_info (REPORT_DETAILS))
4372 fprintf (vect_dump, "multiple types in nested loop.");
4373 return false;
4376 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4377 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4378 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4379 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4380 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4381 return false;
4383 /* Check the operands of the operation. */
4384 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4386 if (vect_print_dump_info (REPORT_DETAILS))
4387 fprintf (vect_dump, "use not simple.");
4388 return false;
4391 op_type = TREE_CODE_LENGTH (code);
4392 if (op_type == binary_op)
4394 op1 = TREE_OPERAND (operation, 1);
4395 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4397 if (vect_print_dump_info (REPORT_DETAILS))
4398 fprintf (vect_dump, "use not simple.");
4399 return false;
4403 /* Supportable by target? */
4404 if (!supportable_widening_operation (code, stmt, vectype_in,
4405 &decl1, &decl2, &code1, &code2))
4406 return false;
4408 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4410 if (!vec_stmt) /* transformation not required. */
4412 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4413 if (vect_print_dump_info (REPORT_DETAILS))
4414 fprintf (vect_dump, "=== vectorizable_promotion ===");
4415 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4416 return true;
4419 /** Transform. **/
4421 if (vect_print_dump_info (REPORT_DETAILS))
4422 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4423 ncopies);
4425 /* Handle def. */
4426 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4428 /* In case the vectorization factor (VF) is bigger than the number
4429 of elements that we can fit in a vectype (nunits), we have to generate
4430 more than one vector stmt - i.e., we need to "unroll" the
4431 vector stmt by a factor VF/nunits. */
4433 prev_stmt_info = NULL;
4434 for (j = 0; j < ncopies; j++)
4436 /* Handle uses. */
4437 if (j == 0)
4439 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4440 if (op_type == binary_op)
4441 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4443 else
4445 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4446 if (op_type == binary_op)
4447 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4450 /* Arguments are ready. Create the new vector stmt. We are creating
4451 two vector defs because the widened result does not fit in one vector.
4452 The vectorized stmt can be expressed as a call to a target builtin,
4453 or using a tree-code. */
4454 /* Generate first half of the widened result: */
4455 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4456 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4457 if (j == 0)
4458 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4459 else
4460 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4461 prev_stmt_info = vinfo_for_stmt (new_stmt);
4463 /* Generate second half of the widened result: */
4464 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4465 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4466 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4467 prev_stmt_info = vinfo_for_stmt (new_stmt);
4471 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4472 return true;
4476 /* Function vect_strided_store_supported.
4478 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4479 and FALSE otherwise. */
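/* On typical targets these optabs correspond to element-interleave
   instructions, e.g. punpcklwd/punpckhwd on i386 or vmrglh/vmrghh on
   Altivec (illustrative only; what counts is solely the optab handler
   check below). */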
4481 static bool
4482 vect_strided_store_supported (tree vectype)
4484 optab interleave_high_optab, interleave_low_optab;
4485 int mode;
4487 mode = (int) TYPE_MODE (vectype);
4489 /* Check that the operation is supported. */
4490 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4491 vectype);
4492 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4493 vectype);
4494 if (!interleave_high_optab || !interleave_low_optab)
4496 if (vect_print_dump_info (REPORT_DETAILS))
4497 fprintf (vect_dump, "no optab for interleave.");
4498 return false;
4501 if (optab_handler (interleave_high_optab, mode)->insn_code
4502 == CODE_FOR_nothing
4503 || optab_handler (interleave_low_optab, mode)->insn_code
4504 == CODE_FOR_nothing)
4506 if (vect_print_dump_info (REPORT_DETAILS))
4507 fprintf (vect_dump, "interleave op not supported by target.");
4508 return false;
4511 return true;
4515 /* Function vect_permute_store_chain.
4517 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4518 a power of 2, generate interleave_high/low stmts to reorder the data
4519 correctly for the stores. Return the final references for stores in
4520 RESULT_CHAIN.
4522 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4523 The input is 4 vectors each containing 8 elements. We assign a number to each
4524 element; the input sequence is:
4526 1st vec: 0 1 2 3 4 5 6 7
4527 2nd vec: 8 9 10 11 12 13 14 15
4528 3rd vec: 16 17 18 19 20 21 22 23
4529 4th vec: 24 25 26 27 28 29 30 31
4531 The output sequence should be:
4533 1st vec: 0 8 16 24 1 9 17 25
4534 2nd vec: 2 10 18 26 3 11 19 27
4535 3rd vec: 4 12 20 28 5 13 21 29
4536 4th vec: 6 14 22 30 7 15 23 31
4538 i.e., we interleave the contents of the four vectors in their order.
4540 We use interleave_high/low instructions to create such output. The input of
4541 each interleave_high/low operation is two vectors:
4542 1st vec 2nd vec
4543 0 1 2 3 4 5 6 7
4544 the even elements of the result vector are obtained left-to-right from the
4545 high/low elements of the first vector. The odd elements of the result are
4546 obtained left-to-right from the high/low elements of the second vector.
4547 The output of interleave_high will be: 0 4 1 5
4548 and of interleave_low: 2 6 3 7
4551 The permutation is done in log LENGTH stages. In each stage interleave_high
4552 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4553 where the first argument is taken from the first half of DR_CHAIN and the
4554 second argument from its second half.
4555 In our example,
4557 I1: interleave_high (1st vec, 3rd vec)
4558 I2: interleave_low (1st vec, 3rd vec)
4559 I3: interleave_high (2nd vec, 4th vec)
4560 I4: interleave_low (2nd vec, 4th vec)
4562 The output for the first stage is:
4564 I1: 0 16 1 17 2 18 3 19
4565 I2: 4 20 5 21 6 22 7 23
4566 I3: 8 24 9 25 10 26 11 27
4567 I4: 12 28 13 29 14 30 15 31
4569 The output of the second stage, i.e. the final result is:
4571 I1: 0 8 16 24 1 9 17 25
4572 I2: 2 10 18 26 3 11 19 27
4573 I3: 4 12 20 28 5 13 21 29
4574 I4: 6 14 22 30 7 15 23 31. */
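/* A scalar model of a single interleave stage, for intuition only (the
   pass emits VEC_INTERLEAVE_HIGH_EXPR/VEC_INTERLEAVE_LOW_EXPR stmts
   rather than this loop):

     void
     interleave_step (int *high, int *low, const int *a, const int *b,
                      int n)
     {
       int i;
       for (i = 0; i < n / 2; i++)
         {
           high[2 * i] = a[i];
           high[2 * i + 1] = b[i];
           low[2 * i] = a[n / 2 + i];
           low[2 * i + 1] = b[n / 2 + i];
         }
     }

   Applying this stage exact_log2 (LENGTH) times, pairing vector J with
   vector J + LENGTH/2 as described above, yields the fully interleaved
   order. */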
4576 static bool
4577 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4578 unsigned int length,
4579 tree stmt,
4580 block_stmt_iterator *bsi,
4581 VEC(tree,heap) **result_chain)
4583 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4584 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4585 tree scalar_dest, tmp;
4586 int i;
4587 unsigned int j;
4589 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4591 /* Check that the operation is supported. */
4592 if (!vect_strided_store_supported (vectype))
4593 return false;
4595 *result_chain = VEC_copy (tree, heap, dr_chain);
4597 for (i = 0; i < exact_log2 (length); i++)
4599 for (j = 0; j < length/2; j++)
4601 vect1 = VEC_index (tree, dr_chain, j);
4602 vect2 = VEC_index (tree, dr_chain, j+length/2);
4604 /* Create interleaving stmt:
4605 in the case of big endian:
4606 high = interleave_high (vect1, vect2)
4607 and in the case of little endian:
4608 high = interleave_low (vect1, vect2). */
4609 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4610 DECL_GIMPLE_REG_P (perm_dest) = 1;
4611 add_referenced_var (perm_dest);
4612 if (BYTES_BIG_ENDIAN)
4613 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4614 else
4615 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4616 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4617 high = make_ssa_name (perm_dest, perm_stmt);
4618 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4619 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4620 VEC_replace (tree, *result_chain, 2*j, high);
4622 /* Create interleaving stmt:
4623 in the case of big endian:
4624 low = interleave_low (vect1, vect2)
4625 and in the case of little endian:
4626 low = interleave_high (vect1, vect2). */
4627 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4628 DECL_GIMPLE_REG_P (perm_dest) = 1;
4629 add_referenced_var (perm_dest);
4630 if (BYTES_BIG_ENDIAN)
4631 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4632 else
4633 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4634 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4635 low = make_ssa_name (perm_dest, perm_stmt);
4636 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4637 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4638 VEC_replace (tree, *result_chain, 2*j+1, low);
4640 dr_chain = VEC_copy (tree, heap, *result_chain);
4642 return true;
4646 /* Function vectorizable_store.
4648 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4649 can be vectorized.
4650 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4651 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4652 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4654 bool
4655 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4656 slp_tree slp_node)
4658 tree scalar_dest;
4659 tree data_ref;
4660 tree op;
4661 tree vec_oprnd = NULL_TREE;
4662 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4663 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4664 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4665 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4666 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4667 enum machine_mode vec_mode;
4668 tree dummy;
4669 enum dr_alignment_support alignment_support_scheme;
4670 tree def, def_stmt;
4671 enum vect_def_type dt;
4672 stmt_vec_info prev_stmt_info = NULL;
4673 tree dataref_ptr = NULL_TREE;
4674 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4675 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4676 int j;
4677 tree next_stmt, first_stmt = NULL_TREE;
4678 bool strided_store = false;
4679 unsigned int group_size, i;
4680 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4681 bool inv_p;
4682 VEC(tree,heap) *vec_oprnds = NULL;
4683 bool slp = (slp_node != NULL);
4684 stmt_vec_info first_stmt_vinfo;
4685 unsigned int vec_num;
4687 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4688 this, so we can safely override NCOPIES with 1 here. */
4689 if (slp)
4690 ncopies = 1;
4692 gcc_assert (ncopies >= 1);
4694 /* FORNOW. This restriction should be relaxed. */
4695 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4697 if (vect_print_dump_info (REPORT_DETAILS))
4698 fprintf (vect_dump, "multiple types in nested loop.");
4699 return false;
4702 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4703 return false;
4705 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4706 return false;
4708 /* Is vectorizable store? */
4710 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4711 return false;
4713 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4714 if (TREE_CODE (scalar_dest) != ARRAY_REF
4715 && TREE_CODE (scalar_dest) != INDIRECT_REF
4716 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4717 return false;
4719 op = GIMPLE_STMT_OPERAND (stmt, 1);
4720 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4722 if (vect_print_dump_info (REPORT_DETAILS))
4723 fprintf (vect_dump, "use not simple.");
4724 return false;
4727 vec_mode = TYPE_MODE (vectype);
4728 /* FORNOW. In some cases we can vectorize even if the data-type is
4729 not supported (e.g., array initialization with 0). */
4730 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4731 return false;
4733 if (!STMT_VINFO_DATA_REF (stmt_info))
4734 return false;
4736 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4738 strided_store = true;
4739 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4740 if (!vect_strided_store_supported (vectype)
4741 && !PURE_SLP_STMT (stmt_info) && !slp)
4742 return false;
4744 if (first_stmt == stmt)
4746 /* STMT is the leader of the group. Check the operands of all the
4747 stmts of the group. */
4748 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4749 while (next_stmt)
4751 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4752 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4754 if (vect_print_dump_info (REPORT_DETAILS))
4755 fprintf (vect_dump, "use not simple.");
4756 return false;
4758 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4763 if (!vec_stmt) /* transformation not required. */
4765 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4766 if (!PURE_SLP_STMT (stmt_info))
4767 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4768 return true;
4771 /** Transform. **/
4773 if (strided_store)
4775 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4776 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4778 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4780 /* FORNOW */
4781 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4783 /* We vectorize all the stmts of the interleaving group when we
4784 reach the last stmt in the group. */
4785 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4786 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4787 && !slp)
4789 *vec_stmt = NULL_TREE;
4790 return true;
4793 if (slp)
4794 strided_store = false;
4796 /* VEC_NUM is the number of vect stmts to be created for this group. */
4797 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4798 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4799 else
4800 vec_num = group_size;
4802 else
4804 first_stmt = stmt;
4805 first_dr = dr;
4806 group_size = vec_num = 1;
4807 first_stmt_vinfo = stmt_info;
4810 if (vect_print_dump_info (REPORT_DETAILS))
4811 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4813 dr_chain = VEC_alloc (tree, heap, group_size);
4814 oprnds = VEC_alloc (tree, heap, group_size);
4816 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4817 gcc_assert (alignment_support_scheme);
4818 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4820 /* In case the vectorization factor (VF) is bigger than the number
4821 of elements that we can fit in a vectype (nunits), we have to generate
4822 more than one vector stmt - i.e., we need to "unroll" the
4823 vector stmt by a factor VF/nunits. For more details see documentation in
4824 vect_get_vec_def_for_stmt_copy. */
4826 /* In case of interleaving (non-unit strided access):
4828 S1: &base + 2 = x2
4829 S2: &base = x0
4830 S3: &base + 1 = x1
4831 S4: &base + 3 = x3
4833 We create vectorized stores starting from base address (the access of the
4834 first stmt in the chain (S2 in the above example), when the last store stmt
4835 of the chain (S4) is reached:
4837 VS1: &base = vx2
4838 VS2: &base + vec_size*1 = vx0
4839 VS3: &base + vec_size*2 = vx1
4840 VS4: &base + vec_size*3 = vx3
4842 Then permutation statements are generated:
4844 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4845 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4848 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4849 (the order of the data-refs in the output of vect_permute_store_chain
4850 corresponds to the order of scalar stmts in the interleaving chain - see
4851 the documentation of vect_permute_store_chain()).
4853 In case of both multiple types and interleaving, above vector stores and
4854 permutation stmts are created for every copy. The result vector stmts are
4855 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4856 STMT_VINFO_RELATED_STMT for the next copies.
4859 prev_stmt_info = NULL;
4860 for (j = 0; j < ncopies; j++)
4862 tree new_stmt;
4863 tree ptr_incr;
4865 if (j == 0)
4867 if (slp)
4869 /* Get vectorized arguments for SLP_NODE. */
4870 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4872 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4874 else
4876 /* For interleaved stores we collect vectorized defs for all the
4877 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4878 used as an input to vect_permute_store_chain(), and OPRNDS as
4879 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4881 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4882 OPRNDS are of size 1. */
4883 next_stmt = first_stmt;
4884 for (i = 0; i < group_size; i++)
4886 /* Since gaps are not supported for interleaved stores,
4887 GROUP_SIZE is the exact number of stmts in the chain.
4888 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4889 there is no interleaving, GROUP_SIZE is 1, and only one
4890 iteration of the loop will be executed. */
4891 gcc_assert (next_stmt);
4892 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4894 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4895 NULL);
4896 VEC_quick_push (tree, dr_chain, vec_oprnd);
4897 VEC_quick_push (tree, oprnds, vec_oprnd);
4898 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4901 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4902 &dummy, &ptr_incr, false,
4903 TREE_TYPE (vec_oprnd), &inv_p);
4904 gcc_assert (!inv_p);
4906 else
4908 /* FORNOW SLP doesn't work for multiple types. */
4909 gcc_assert (!slp);
4911 /* For interleaved stores we created vectorized defs for all the
4912 defs stored in OPRNDS in the previous iteration (previous copy).
4913 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4914 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4915 next copy.
4916 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4917 OPRNDS are of size 1. */
4918 for (i = 0; i < group_size; i++)
4920 op = VEC_index (tree, oprnds, i);
4921 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4922 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4923 VEC_replace (tree, dr_chain, i, vec_oprnd);
4924 VEC_replace (tree, oprnds, i, vec_oprnd);
4926 dataref_ptr =
4927 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4930 if (strided_store)
4932 result_chain = VEC_alloc (tree, heap, group_size);
4933 /* Permute. */
4934 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4935 &result_chain))
4936 return false;
4939 next_stmt = first_stmt;
4940 for (i = 0; i < vec_num; i++)
4942 if (i > 0)
4943 /* Bump the vector pointer. */
4944 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4945 NULL_TREE);
4947 if (slp)
4948 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4949 else if (strided_store)
4950 /* For strided stores vectorized defs are interleaved in
4951 vect_permute_store_chain(). */
4952 vec_oprnd = VEC_index (tree, result_chain, i);
4954 data_ref = build_fold_indirect_ref (dataref_ptr);
4955 /* Arguments are ready. Create the new vector stmt. */
4956 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4957 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4958 mark_symbols_for_renaming (new_stmt);
4960 if (j == 0)
4961 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4962 else
4963 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4965 prev_stmt_info = vinfo_for_stmt (new_stmt);
4966 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4967 if (!next_stmt)
4968 break;
4972 VEC_free (tree, heap, dr_chain);
4973 VEC_free (tree, heap, oprnds);
4974 if (result_chain)
4975 VEC_free (tree, heap, result_chain);
4977 return true;
4981 /* Function vect_setup_realignment
4983 This function is called when vectorizing an unaligned load using
4984 the dr_explicit_realign[_optimized] scheme.
4985 This function generates the following code at the loop prolog:
4987 p = initial_addr;
4988 x msq_init = *(floor(p)); # prolog load
4989 realignment_token = call target_builtin;
4990 loop:
4991 x msq = phi (msq_init, ---)
4993 The stmts marked with x are generated only for the case of
4994 dr_explicit_realign_optimized.
4996 The code above sets up a new (vector) pointer, pointing to the first
4997 location accessed by STMT, and a "floor-aligned" load using that pointer.
4998 It also generates code to compute the "realignment-token" (if the relevant
4999 target hook was defined), and creates a phi-node at the loop-header bb
5000 whose arguments are the result of the prolog-load (created by this
5001 function) and the result of a load that takes place in the loop (to be
5002 created by the caller to this function).
5004 For the case of dr_explicit_realign_optimized:
5005 The caller to this function uses the phi-result (msq) to create the
5006 realignment code inside the loop, and sets up the missing phi argument,
5007 as follows:
5008 loop:
5009 msq = phi (msq_init, lsq)
5010 lsq = *(floor(p')); # load in loop
5011 result = realign_load (msq, lsq, realignment_token);
5013 For the case of dr_explicit_realign:
5014 loop:
5015 msq = *(floor(p)); # load in loop
5016 p' = p + (VS-1);
5017 lsq = *(floor(p')); # load in loop
5018 result = realign_load (msq, lsq, realignment_token);
5020 Input:
5021 STMT - (scalar) load stmt to be vectorized. This load accesses
5022 a memory location that may be unaligned.
5023 BSI - place where new code is to be inserted.
5024 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5025 is used.
5027 Output:
5028 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5029 target hook, if defined.
5030 Return value - the result of the loop-header phi node. */
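/* Conceptually, for an address P that is OFF bytes past a VS-byte
   boundary, the scheme behaves as if:

     msq = *(vectype *) (P - OFF);          floor (P): aligned load
     lsq = *(vectype *) (P - OFF + VS);     next aligned chunk
     result = realign (msq, lsq, token);    VS bytes starting at OFF

   where "realign" stands in for the target's REALIGN_LOAD operation and
   "token" for the mask computed via builtin_mask_for_load (e.g. lvsl on
   Altivec).  This sketches the semantics only, not the emitted code. */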
5032 static tree
5033 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5034 tree *realignment_token,
5035 enum dr_alignment_support alignment_support_scheme,
5036 tree init_addr,
5037 struct loop **at_loop)
5039 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5040 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5041 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5042 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5043 edge pe;
5044 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5045 tree vec_dest;
5046 tree inc;
5047 tree ptr;
5048 tree data_ref;
5049 tree new_stmt;
5050 basic_block new_bb;
5051 tree msq_init = NULL_TREE;
5052 tree new_temp;
5053 tree phi_stmt;
5054 tree msq = NULL_TREE;
5055 tree stmts = NULL_TREE;
5056 bool inv_p;
5057 bool compute_in_loop = false;
5058 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5059 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5060 struct loop *loop_for_initial_load;
5062 gcc_assert (alignment_support_scheme == dr_explicit_realign
5063 || alignment_support_scheme == dr_explicit_realign_optimized);
5065 /* We need to generate three things:
5066 1. the misalignment computation
5067 2. the extra vector load (for the optimized realignment scheme).
5068 3. the phi node for the two vectors from which the realignment is
5069 done (for the optimized realignment scheme).
5072 /* 1. Determine where to generate the misalignment computation.
5074 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5075 calculation will be generated by this function, outside the loop (in the
5076 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5077 caller, inside the loop.
5079 Background: If the misalignment remains fixed throughout the iterations of
5080 the loop, then both realignment schemes are applicable, and also the
5081 misalignment computation can be done outside LOOP. This is because we are
5082 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5083 are a multiple of VS (the Vector Size), and therefore the misalignment in
5084 different vectorized LOOP iterations is always the same.
5085 The problem arises only if the memory access is in an inner-loop nested
5086 inside LOOP, which is now being vectorized using outer-loop vectorization.
5087 This is the only case when the misalignment of the memory access may not
5088 remain fixed throughout the iterations of the inner-loop (as explained in
5089 detail in vect_supportable_dr_alignment). In this case, not only is the
5090 optimized realignment scheme not applicable, but also the misalignment
5091 computation (and generation of the realignment token that is passed to
5092 REALIGN_LOAD) have to be done inside the loop.
5094 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5095 or not, which in turn determines if the misalignment is computed inside
5096 the inner-loop, or outside LOOP. */
5098 if (init_addr != NULL_TREE)
5100 compute_in_loop = true;
5101 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5105 /* 2. Determine where to generate the extra vector load.
5107 For the optimized realignment scheme, instead of generating two vector
5108 loads in each iteration, we generate a single extra vector load in the
5109 preheader of the loop, and in each iteration reuse the result of the
5110 vector load from the previous iteration. In case the memory access is in
5111 an inner-loop nested inside LOOP, which is now being vectorized using
5112 outer-loop vectorization, we need to determine whether this initial vector
5113 load should be generated at the preheader of the inner-loop, or can be
5114 generated at the preheader of LOOP. If the memory access has no evolution
5115 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5116 to be generated inside LOOP (in the preheader of the inner-loop). */
5118 if (nested_in_vect_loop)
5120 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5121 bool invariant_in_outerloop =
5122 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5123 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5125 else
5126 loop_for_initial_load = loop;
5127 if (at_loop)
5128 *at_loop = loop_for_initial_load;
5130 /* 3. For the case of the optimized realignment, create the first vector
5131 load at the loop preheader. */
5133 if (alignment_support_scheme == dr_explicit_realign_optimized)
5137 /* Create msq_init = *(floor(p1)) in the loop preheader. */
5137 gcc_assert (!compute_in_loop);
5138 pe = loop_preheader_edge (loop_for_initial_load);
5139 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5140 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5141 &init_addr, &inc, true, NULL_TREE, &inv_p);
5142 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5143 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5144 new_temp = make_ssa_name (vec_dest, new_stmt);
5145 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5146 mark_symbols_for_renaming (new_stmt);
5147 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5148 gcc_assert (!new_bb);
5149 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5152 /* 4. Create realignment token using a target builtin, if available.
5153 It is done either inside the containing loop, or before LOOP (as
5154 determined above). */
5156 if (targetm.vectorize.builtin_mask_for_load)
5158 tree builtin_decl;
5160 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5161 if (compute_in_loop)
5162 gcc_assert (init_addr); /* already computed by the caller. */
5163 else
5165 /* Generate the INIT_ADDR computation outside LOOP. */
5166 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5167 NULL_TREE, loop);
5168 pe = loop_preheader_edge (loop);
5169 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5170 gcc_assert (!new_bb);
5173 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5174 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5175 vec_dest = vect_create_destination_var (scalar_dest,
5176 TREE_TYPE (new_stmt));
5177 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5178 new_temp = make_ssa_name (vec_dest, new_stmt);
5179 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5181 if (compute_in_loop)
5182 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5183 else
5185 /* Generate the misalignment computation outside LOOP. */
5186 pe = loop_preheader_edge (loop);
5187 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5188 gcc_assert (!new_bb);
5191 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5193 /* The result of the CALL_EXPR to this builtin is determined from
5194 the value of the parameter, and no global variables are touched,
5195 which makes the builtin a "const" function. Requiring the
5196 builtin to have the "const" attribute makes it unnecessary
5197 to call mark_call_clobbered. */
5198 gcc_assert (TREE_READONLY (builtin_decl));
5201 if (alignment_support_scheme == dr_explicit_realign)
5202 return msq;
5204 gcc_assert (!compute_in_loop);
5205 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5208 /* 5. Create msq = phi <msq_init, lsq> in the loop. */
5210 pe = loop_preheader_edge (containing_loop);
5211 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5212 msq = make_ssa_name (vec_dest, NULL_TREE);
5213 phi_stmt = create_phi_node (msq, containing_loop->header);
5214 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5215 add_phi_arg (phi_stmt, msq_init, pe);
5217 return msq;
5221 /* Function vect_strided_load_supported.
5223 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5224 and FALSE otherwise. */
5226 static bool
5227 vect_strided_load_supported (tree vectype)
5229 optab perm_even_optab, perm_odd_optab;
5230 int mode;
5232 mode = (int) TYPE_MODE (vectype);
5234 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5235 if (!perm_even_optab)
5237 if (vect_print_dump_info (REPORT_DETAILS))
5238 fprintf (vect_dump, "no optab for perm_even.");
5239 return false;
5242 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5244 if (vect_print_dump_info (REPORT_DETAILS))
5245 fprintf (vect_dump, "perm_even op not supported by target.");
5246 return false;
5249 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5250 if (!perm_odd_optab)
5252 if (vect_print_dump_info (REPORT_DETAILS))
5253 fprintf (vect_dump, "no optab for perm_odd.");
5254 return false;
5257 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5259 if (vect_print_dump_info (REPORT_DETAILS))
5260 fprintf (vect_dump, "perm_odd op not supported by target.");
5261 return false;
5263 return true;
5267 /* Function vect_permute_load_chain.
5269 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5270 a power of 2, generate extract_even/odd stmts to reorder the input data
5271 correctly. Return the final references for loads in RESULT_CHAIN.
5273 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5274 The input is 4 vectors each containing 8 elements. We assign a number to each
5275 element, the input sequence is:
5277 1st vec: 0 1 2 3 4 5 6 7
5278 2nd vec: 8 9 10 11 12 13 14 15
5279 3rd vec: 16 17 18 19 20 21 22 23
5280 4th vec: 24 25 26 27 28 29 30 31
5282 The output sequence should be:
5284 1st vec: 0 4 8 12 16 20 24 28
5285 2nd vec: 1 5 9 13 17 21 25 29
5286 3rd vec: 2 6 10 14 18 22 26 30
5287 4th vec: 3 7 11 15 19 23 27 31
5289 i.e., the first output vector should contain the first elements of each
5290 interleaving group, etc.
5292 We use extract_even/odd instructions to create such output. The input of each
5293 extract_even/odd operation is two vectors
5294 1st vec 2nd vec
5295 0 1 2 3 4 5 6 7
5297 and the output is the vector of extracted even/odd elements. The output of
5298 extract_even will be: 0 2 4 6
5299 and of extract_odd: 1 3 5 7
5302 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5303 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5304 order. In our example,
5306 E1: extract_even (1st vec, 2nd vec)
5307 E2: extract_odd (1st vec, 2nd vec)
5308 E3: extract_even (3rd vec, 4th vec)
5309 E4: extract_odd (3rd vec, 4th vec)
5311 The output for the first stage will be:
5313 E1: 0 2 4 6 8 10 12 14
5314 E2: 1 3 5 7 9 11 13 15
5315 E3: 16 18 20 22 24 26 28 30
5316 E4: 17 19 21 23 25 27 29 31
5318 In order to proceed and create the correct sequence for the next stage (or
5319 for the correct output, if the second stage is the last one, as in our
5320 example), we first put the output of extract_even operation and then the
5321 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5322 The input for the second stage is:
5324 1st vec (E1): 0 2 4 6 8 10 12 14
5325 2nd vec (E3): 16 18 20 22 24 26 28 30
5326 3rd vec (E2): 1 3 5 7 9 11 13 15
5327 4th vec (E4): 17 19 21 23 25 27 29 31
5329 The output of the second stage:
5331 E1: 0 4 8 12 16 20 24 28
5332 E2: 2 6 10 14 18 22 26 30
5333 E3: 1 5 9 13 17 21 25 29
5334 E4: 3 7 11 15 19 23 27 31
5336 And RESULT_CHAIN after reordering:
5338 1st vec (E1): 0 4 8 12 16 20 24 28
5339 2nd vec (E3): 1 5 9 13 17 21 25 29
5340 3rd vec (E2): 2 6 10 14 18 22 26 30
5341 4th vec (E4): 3 7 11 15 19 23 27 31. */
5343 static bool
5344 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5345 unsigned int length,
5346 tree stmt,
5347 block_stmt_iterator *bsi,
5348 VEC(tree,heap) **result_chain)
5350 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5351 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5352 tree tmp;
5353 int i;
5354 unsigned int j;
5356 /* Check that the operation is supported. */
5357 if (!vect_strided_load_supported (vectype))
5358 return false;
5360 *result_chain = VEC_copy (tree, heap, dr_chain);
5361 for (i = 0; i < exact_log2 (length); i++)
5363 for (j = 0; j < length; j +=2)
5365 first_vect = VEC_index (tree, dr_chain, j);
5366 second_vect = VEC_index (tree, dr_chain, j+1);
5368 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5369 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5370 DECL_GIMPLE_REG_P (perm_dest) = 1;
5371 add_referenced_var (perm_dest);
5373 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5374 first_vect, second_vect);
5375 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5377 data_ref = make_ssa_name (perm_dest, perm_stmt);
5378 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5379 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5380 mark_symbols_for_renaming (perm_stmt);
5382 VEC_replace (tree, *result_chain, j/2, data_ref);
5384 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5385 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5386 DECL_GIMPLE_REG_P (perm_dest) = 1;
5387 add_referenced_var (perm_dest);
5389 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5390 first_vect, second_vect);
5391 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5392 data_ref = make_ssa_name (perm_dest, perm_stmt);
5393 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5394 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5395 mark_symbols_for_renaming (perm_stmt);
5397 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5399 dr_chain = VEC_copy (tree, heap, *result_chain);
5401 return true;
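/* For illustration only: a scalar model of the permutation performed
   above, assuming LENGTH is a power of two and CHAIN/RESULT are arrays
   of LENGTH vectors:

   for (stage = 0; stage < log2 (length); stage++)
     {
       for (j = 0; j < length; j += 2)
         {
           result[j/2] = extract_even (chain[j], chain[j+1]);
           result[j/2 + length/2] = extract_odd (chain[j], chain[j+1]);
         }
       copy RESULT back into CHAIN;
     }

   After all stages the k-th vector in RESULT holds the elements
   k, k + LENGTH, k + 2*LENGTH, ... of the original interleaved
   sequence, which is exactly the order the scalar stmts expect. */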
5405 /* Function vect_transform_strided_load.
5407 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5408 to perform their permutation and attach the resulting vectorized
5409 statements to the scalar statements.
5412 static bool
5413 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5414 block_stmt_iterator *bsi)
5416 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5417 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5418 tree next_stmt, new_stmt;
5419 VEC(tree,heap) *result_chain = NULL;
5420 unsigned int i, gap_count;
5421 tree tmp_data_ref;
5423 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5424 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5425 vectors, that are ready for vector computation. */
5426 result_chain = VEC_alloc (tree, heap, size);
5427 /* Permute. */
5428 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5429 return false;
5431 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5432 Since we scan the chain starting from its first node, their order
5433 corresponds to the order of data-refs in RESULT_CHAIN. */
5434 next_stmt = first_stmt;
5435 gap_count = 1;
5436 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5438 if (!next_stmt)
5439 break;
5441 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5442 code elimination pass later.
5443 DR_GROUP_GAP is the number of steps in elements from the previous
5444 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5445 correspond to the gaps.
5447 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5449 gap_count++;
5450 continue;
5453 while (next_stmt)
5455 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5456 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5457 copies, and we put the new vector statement in the first available
5458 RELATED_STMT. */
5459 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5460 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5461 else
5463 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5464 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5465 vinfo_for_stmt (prev_stmt));
5466 while (rel_stmt)
5468 prev_stmt = rel_stmt;
5469 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5471 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5473 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5474 gap_count = 1;
5475 /* If NEXT_STMT accesses the same DR as the previous statement,
5476 put the same TMP_DATA_REF as its vectorized statement; otherwise
5477 get the next data-ref from RESULT_CHAIN. */
5478 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5479 break;
5483 VEC_free (tree, heap, result_chain);
5484 return true;
5488 /* vectorizable_load.
5490 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5491 can be vectorized.
5492 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5493 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5494 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5496 bool
5497 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5498 slp_tree slp_node)
5500 tree scalar_dest;
5501 tree vec_dest = NULL;
5502 tree data_ref = NULL;
5503 tree op;
5504 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5505 stmt_vec_info prev_stmt_info;
5506 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5507 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5508 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5509 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5510 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5511 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5512 tree new_temp;
5513 int mode;
5514 tree new_stmt = NULL_TREE;
5515 tree dummy;
5516 enum dr_alignment_support alignment_support_scheme;
5517 tree dataref_ptr = NULL_TREE;
5518 tree ptr_incr;
5519 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5520 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5521 int i, j, group_size;
5522 tree msq = NULL_TREE, lsq;
5523 tree offset = NULL_TREE;
5524 tree realignment_token = NULL_TREE;
5525 tree phi = NULL_TREE;
5526 VEC(tree,heap) *dr_chain = NULL;
5527 bool strided_load = false;
5528 tree first_stmt;
5529 tree scalar_type;
5530 bool inv_p;
5531 bool compute_in_loop = false;
5532 struct loop *at_loop;
5533 int vec_num;
5534 bool slp = (slp_node != NULL);
5536 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5537 this, so we can safely override NCOPIES with 1 here. */
5538 if (slp)
5539 ncopies = 1;
5541 gcc_assert (ncopies >= 1);
5543 /* FORNOW. This restriction should be relaxed. */
5544 if (nested_in_vect_loop && ncopies > 1)
5546 if (vect_print_dump_info (REPORT_DETAILS))
5547 fprintf (vect_dump, "multiple types in nested loop.");
5548 return false;
5551 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5552 return false;
5554 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5555 return false;
5557 /* Is vectorizable load? */
5558 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5559 return false;
5561 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5562 if (TREE_CODE (scalar_dest) != SSA_NAME)
5563 return false;
5565 op = GIMPLE_STMT_OPERAND (stmt, 1);
5566 if (TREE_CODE (op) != ARRAY_REF
5567 && TREE_CODE (op) != INDIRECT_REF
5568 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5569 return false;
5571 if (!STMT_VINFO_DATA_REF (stmt_info))
5572 return false;
5574 scalar_type = TREE_TYPE (DR_REF (dr));
5575 mode = (int) TYPE_MODE (vectype);
5577 /* FORNOW. In some cases we can vectorize even if the data-type is not
5578 supported (e.g., data copies). */
5579 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5581 if (vect_print_dump_info (REPORT_DETAILS))
5582 fprintf (vect_dump, "Aligned load, but unsupported type.");
5583 return false;
5586 /* Check if the load is a part of an interleaving chain. */
5587 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5589 strided_load = true;
5590 /* FORNOW */
5591 gcc_assert (! nested_in_vect_loop);
5593 /* Check if interleaving is supported. */
5594 if (!vect_strided_load_supported (vectype)
5595 && !PURE_SLP_STMT (stmt_info) && !slp)
5596 return false;
5599 if (!vec_stmt) /* transformation not required. */
5601 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5602 vect_model_load_cost (stmt_info, ncopies, NULL);
5603 return true;
5606 if (vect_print_dump_info (REPORT_DETAILS))
5607 fprintf (vect_dump, "transform load.");
5609 /** Transform. **/
5611 if (strided_load)
5613 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5614 /* Check if the chain of loads is already vectorized. */
5615 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5617 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5618 return true;
5620 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5621 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5622 dr_chain = VEC_alloc (tree, heap, group_size);
5624 /* VEC_NUM is the number of vect stmts to be created for this group. */
5625 if (slp)
5627 strided_load = false;
5628 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5630 else
5631 vec_num = group_size;
5633 else
5635 first_stmt = stmt;
5636 first_dr = dr;
5637 group_size = vec_num = 1;
5640 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5641 gcc_assert (alignment_support_scheme);
5643 /* In case the vectorization factor (VF) is bigger than the number
5644 of elements that we can fit in a vectype (nunits), we have to generate
5645 more than one vector stmt - i.e., we need to "unroll" the
5646 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5647 from one copy of the vector stmt to the next, in the field
5648 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5649 stages to find the correct vector defs to be used when vectorizing
5650 stmts that use the defs of the current stmt. The example below illustrates
5651 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
5652 4 vectorized stmts):
5654 before vectorization:
5655 RELATED_STMT VEC_STMT
5656 S1: x = memref - -
5657 S2: z = x + 1 - -
5659 step 1: vectorize stmt S1:
5660 We first create the vector stmt VS1_0, and, as usual, record a
5661 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5662 Next, we create the vector stmt VS1_1, and record a pointer to
5663 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5664 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5665 stmts and pointers:
5666 RELATED_STMT VEC_STMT
5667 VS1_0: vx0 = memref0 VS1_1 -
5668 VS1_1: vx1 = memref1 VS1_2 -
5669 VS1_2: vx2 = memref2 VS1_3 -
5670 VS1_3: vx3 = memref3 - -
5671 S1: x = load - VS1_0
5672 S2: z = x + 1 - -
5674 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5675 information we recorded in RELATED_STMT field is used to vectorize
5676 stmt S2. */
5678 /* In case of interleaving (non-unit strided access):
5680 S1: x2 = &base + 2
5681 S2: x0 = &base
5682 S3: x1 = &base + 1
5683 S4: x3 = &base + 3
5685 Vectorized loads are created in the order of memory accesses
5686 starting from the access of the first stmt of the chain:
5688 VS1: vx0 = &base
5689 VS2: vx1 = &base + vec_size*1
5690 VS3: vx2 = &base + vec_size*2
5691 VS4: vx3 = &base + vec_size*3
5693 Then permutation statements are generated:
5695 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5696 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5699 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5700 (the order of the data-refs in the output of vect_permute_load_chain
5701 corresponds to the order of scalar stmts in the interleaving chain - see
5702 the documentation of vect_permute_load_chain()).
5703 The generation of permutation stmts and recording them in
5704 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5706 In case of both multiple types and interleaving, the vector loads and
5707 permutation stmts above are created for every copy. The result vector stmts
5708 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5709 STMT_VINFO_RELATED_STMT for the next copies. */
5711 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5712 on a target that supports unaligned accesses (dr_unaligned_supported)
5713 we generate the following code:
5714 p = initial_addr;
5715 indx = 0;
5716 loop {
5717 p = p + indx * vectype_size;
5718 vec_dest = *(p);
5719 indx = indx + 1;
5722 Otherwise, the data reference is potentially unaligned on a target that
5723 does not support unaligned accesses (dr_explicit_realign_optimized) -
5724 then generate the following code, in which the data in each iteration is
5725 obtained by two vector loads, one from the previous iteration, and one
5726 from the current iteration:
5727 p1 = initial_addr;
5728 msq_init = *(floor(p1))
5729 p2 = initial_addr + VS - 1;
5730 realignment_token = call target_builtin;
5731 indx = 0;
5732 loop {
5733 p2 = p2 + indx * vectype_size
5734 lsq = *(floor(p2))
5735 vec_dest = realign_load (msq, lsq, realignment_token)
5736 indx = indx + 1;
5737 msq = lsq;
5738 } */
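/* For example (illustration only), assume VS = 4 and an access that
   starts one element past an aligned boundary of buffer b. Then:

   msq = *(floor(p1)) = { b[0] b[1] b[2] b[3] }
   lsq = *(floor(p2)) = { b[4] b[5] b[6] b[7] }
   vec_dest = realign_load (msq, lsq, realignment_token)
            = { b[1] b[2] b[3] b[4] }

   The realignment token encodes the misalignment (on some targets it
   is a permute mask computed from the initial address); in the
   optimized scheme each iteration then reuses LSQ as the next MSQ,
   so only one new vector load is needed per iteration. */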
5740 /* If the misalignment remains the same throughout the execution of the
5741 loop, we can create the init_addr and permutation mask at the loop
5742 preheader. Otherwise, it needs to be created inside the loop.
5743 This can only occur when vectorizing memory accesses in the inner-loop
5744 nested within an outer-loop that is being vectorized. */
5746 if (nested_in_vect_loop_p (loop, stmt)
5747 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5749 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5750 compute_in_loop = true;
5753 if ((alignment_support_scheme == dr_explicit_realign_optimized
5754 || alignment_support_scheme == dr_explicit_realign)
5755 && !compute_in_loop)
5757 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5758 alignment_support_scheme, NULL_TREE,
5759 &at_loop);
5760 if (alignment_support_scheme == dr_explicit_realign_optimized)
5762 phi = SSA_NAME_DEF_STMT (msq);
5763 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5766 else
5767 at_loop = loop;
5769 prev_stmt_info = NULL;
5770 for (j = 0; j < ncopies; j++)
5772 /* 1. Create the vector pointer update chain. */
5773 if (j == 0)
5774 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5775 at_loop, offset,
5776 &dummy, &ptr_incr, false,
5777 NULL_TREE, &inv_p);
5778 else
5779 dataref_ptr =
5780 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5782 for (i = 0; i < vec_num; i++)
5784 if (i > 0)
5785 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5786 NULL_TREE);
5788 /* 2. Create the vector-load in the loop. */
5789 switch (alignment_support_scheme)
5791 case dr_aligned:
5792 gcc_assert (aligned_access_p (first_dr));
5793 data_ref = build_fold_indirect_ref (dataref_ptr);
5794 break;
5795 case dr_unaligned_supported:
5797 int mis = DR_MISALIGNMENT (first_dr);
5798 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5800 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
5801 data_ref =
5802 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5803 break;
5805 case dr_explicit_realign:
5807 tree ptr, bump;
5808 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5810 if (compute_in_loop)
5811 msq = vect_setup_realignment (first_stmt, bsi,
5812 &realignment_token,
5813 dr_explicit_realign,
5814 dataref_ptr, NULL);
5816 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5817 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5818 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5819 new_temp = make_ssa_name (vec_dest, new_stmt);
5820 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5821 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5822 copy_virtual_operands (new_stmt, stmt);
5823 mark_symbols_for_renaming (new_stmt);
5824 msq = new_temp;
5826 bump = size_binop (MULT_EXPR, vs_minus_1,
5827 TYPE_SIZE_UNIT (scalar_type));
5828 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5829 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5830 break;
5832 case dr_explicit_realign_optimized:
5833 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5834 break;
5835 default:
5836 gcc_unreachable ();
5838 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5839 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5840 new_temp = make_ssa_name (vec_dest, new_stmt);
5841 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5842 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5843 mark_symbols_for_renaming (new_stmt);
5845 /* 3. Handle explicit realignment if necessary/supported. Create in
5846 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5847 if (alignment_support_scheme == dr_explicit_realign_optimized
5848 || alignment_support_scheme == dr_explicit_realign)
5850 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5851 if (!realignment_token)
5852 realignment_token = dataref_ptr;
5853 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5854 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5855 realignment_token);
5856 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5857 new_temp = make_ssa_name (vec_dest, new_stmt);
5858 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5859 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5861 if (alignment_support_scheme == dr_explicit_realign_optimized)
5863 if (i == vec_num - 1 && j == ncopies - 1)
5864 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5865 msq = lsq;
5869 /* 4. Handle invariant-load. */
5870 if (inv_p)
5872 gcc_assert (!strided_load);
5873 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5874 if (j == 0)
5876 int k;
5877 tree t = NULL_TREE;
5878 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5880 /* CHECKME: bitpos depends on endianness? */
5881 bitpos = bitsize_zero_node;
5882 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5883 bitsize, bitpos);
5884 vec_dest =
5885 vect_create_destination_var (scalar_dest, NULL_TREE);
5886 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5887 new_temp = make_ssa_name (vec_dest, new_stmt);
5888 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5889 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5891 for (k = nunits - 1; k >= 0; --k)
5892 t = tree_cons (NULL_TREE, new_temp, t);
5893 /* FIXME: use build_constructor directly. */
5894 vec_inv = build_constructor_from_list (vectype, t);
5895 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5896 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5898 else
5899 gcc_unreachable (); /* FORNOW. */
5902 /* Collect vector loads and later create their permutation in
5903 vect_transform_strided_load (). */
5904 if (strided_load)
5905 VEC_quick_push (tree, dr_chain, new_temp);
5907 /* Store vector loads in the corresponding SLP_NODE. */
5908 if (slp)
5909 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5912 /* FORNOW: SLP with multiple types is unsupported. */
5913 if (slp)
5914 return true;
5916 if (strided_load)
5918 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5919 return false;
5920 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5921 VEC_free (tree, heap, dr_chain);
5922 dr_chain = VEC_alloc (tree, heap, group_size);
5924 else
5926 if (j == 0)
5927 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5928 else
5929 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5930 prev_stmt_info = vinfo_for_stmt (new_stmt);
5934 if (dr_chain)
5935 VEC_free (tree, heap, dr_chain);
5937 return true;
5941 /* Function vectorizable_live_operation.
5943 STMT computes a value that is used outside the loop. Check if
5944 it can be supported. */
5946 bool
5947 vectorizable_live_operation (tree stmt,
5948 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5949 tree *vec_stmt ATTRIBUTE_UNUSED)
5951 tree operation;
5952 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5953 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5954 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5955 int i;
5956 int op_type;
5957 tree op;
5958 tree def, def_stmt;
5959 enum vect_def_type dt;
5961 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5963 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5964 return false;
5966 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5967 return false;
5969 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5970 return false;
5972 /* FORNOW. CHECKME. */
5973 if (nested_in_vect_loop_p (loop, stmt))
5974 return false;
5976 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5977 op_type = TREE_OPERAND_LENGTH (operation);
5979 /* FORNOW: support only if all uses are invariant. This means
5980 that the scalar operations can remain in place, unvectorized.
5981 The original last scalar value that they compute will be used. */
5983 for (i = 0; i < op_type; i++)
5985 op = TREE_OPERAND (operation, i);
5986 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5988 if (vect_print_dump_info (REPORT_DETAILS))
5989 fprintf (vect_dump, "use not simple.");
5990 return false;
5993 if (dt != vect_invariant_def && dt != vect_constant_def)
5994 return false;
5997 /* No transformation is required for the cases we currently support. */
5998 return true;
6002 /* Function vect_is_simple_cond.
6004 Input:
6005 LOOP - the loop that is being vectorized.
6006 COND - Condition that is checked for simple use.
6008 Returns whether a COND can be vectorized. Checks whether
6009 condition operands are supportable using vect_is_simple_use. */
6011 static bool
6012 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6014 tree lhs, rhs;
6015 tree def;
6016 enum vect_def_type dt;
6018 if (!COMPARISON_CLASS_P (cond))
6019 return false;
6021 lhs = TREE_OPERAND (cond, 0);
6022 rhs = TREE_OPERAND (cond, 1);
6024 if (TREE_CODE (lhs) == SSA_NAME)
6026 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6027 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6028 return false;
6030 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6031 && TREE_CODE (lhs) != FIXED_CST)
6032 return false;
6034 if (TREE_CODE (rhs) == SSA_NAME)
6036 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6037 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6038 return false;
6040 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6041 && TREE_CODE (rhs) != FIXED_CST)
6042 return false;
6044 return true;
6047 /* vectorizable_condition.
6049 Check if STMT is a conditional modify expression that can be vectorized.
6050 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6051 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6052 at BSI.
6054 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
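/* For example (types and operand names below are illustrative), the
   scalar stmt

   x = a < b ? c : d

   is replaced by

   vx = VEC_COND_EXPR < va < vb, vc, vd >

   where va, vb, vc and vd are the vector defs obtained for a, b, c
   and d by vect_get_vec_def_for_operand. */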
6056 bool
6057 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6059 tree scalar_dest = NULL_TREE;
6060 tree vec_dest = NULL_TREE;
6061 tree op = NULL_TREE;
6062 tree cond_expr, then_clause, else_clause;
6063 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6064 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6065 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6066 tree vec_compare, vec_cond_expr;
6067 tree new_temp;
6068 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6069 enum machine_mode vec_mode;
6070 tree def;
6071 enum vect_def_type dt;
6072 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6073 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6075 gcc_assert (ncopies >= 1);
6076 if (ncopies > 1)
6077 return false; /* FORNOW */
6079 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6080 return false;
6082 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6083 return false;
6085 /* FORNOW: SLP not supported. */
6086 if (STMT_SLP_TYPE (stmt_info))
6087 return false;
6089 /* FORNOW: not yet supported. */
6090 if (STMT_VINFO_LIVE_P (stmt_info))
6092 if (vect_print_dump_info (REPORT_DETAILS))
6093 fprintf (vect_dump, "value used after loop.");
6094 return false;
6097 /* Is vectorizable conditional operation? */
6098 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6099 return false;
6101 op = GIMPLE_STMT_OPERAND (stmt, 1);
6103 if (TREE_CODE (op) != COND_EXPR)
6104 return false;
6106 cond_expr = TREE_OPERAND (op, 0);
6107 then_clause = TREE_OPERAND (op, 1);
6108 else_clause = TREE_OPERAND (op, 2);
6110 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6111 return false;
6113 /* We do not handle two different vector types for the condition
6114 and the values. */
6115 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6116 return false;
6118 if (TREE_CODE (then_clause) == SSA_NAME)
6120 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6121 if (!vect_is_simple_use (then_clause, loop_vinfo,
6122 &then_def_stmt, &def, &dt))
6123 return false;
6125 else if (TREE_CODE (then_clause) != INTEGER_CST
6126 && TREE_CODE (then_clause) != REAL_CST
6127 && TREE_CODE (then_clause) != FIXED_CST)
6128 return false;
6130 if (TREE_CODE (else_clause) == SSA_NAME)
6132 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6133 if (!vect_is_simple_use (else_clause, loop_vinfo,
6134 &else_def_stmt, &def, &dt))
6135 return false;
6137 else if (TREE_CODE (else_clause) != INTEGER_CST
6138 && TREE_CODE (else_clause) != REAL_CST
6139 && TREE_CODE (else_clause) != FIXED_CST)
6140 return false;
6143 vec_mode = TYPE_MODE (vectype);
6145 if (!vec_stmt)
6147 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6148 return expand_vec_cond_expr_p (op, vec_mode);
6151 /* Transform */
6153 /* Handle def. */
6154 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6155 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6157 /* Handle cond expr. */
6158 vec_cond_lhs =
6159 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6160 vec_cond_rhs =
6161 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6162 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6163 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6165 /* Arguments are ready. Create the new vector stmt. */
6166 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6167 vec_cond_lhs, vec_cond_rhs);
6168 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6169 vec_compare, vec_then_clause, vec_else_clause);
6171 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6172 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6173 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6174 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6176 return true;
6180 /* Function vect_transform_stmt.
6182 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6184 static bool
6185 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6186 slp_tree slp_node)
6188 bool is_store = false;
6189 tree vec_stmt = NULL_TREE;
6190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6191 tree orig_stmt_in_pattern;
6192 bool done;
6194 switch (STMT_VINFO_TYPE (stmt_info))
6196 case type_demotion_vec_info_type:
6197 gcc_assert (!slp_node);
6198 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6199 gcc_assert (done);
6200 break;
6202 case type_promotion_vec_info_type:
6203 gcc_assert (!slp_node);
6204 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6205 gcc_assert (done);
6206 break;
6208 case type_conversion_vec_info_type:
6209 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6210 gcc_assert (done);
6211 break;
6213 case induc_vec_info_type:
6214 gcc_assert (!slp_node);
6215 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6216 gcc_assert (done);
6217 break;
6219 case op_vec_info_type:
6220 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6221 gcc_assert (done);
6222 break;
6224 case assignment_vec_info_type:
6225 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6226 gcc_assert (done);
6227 break;
6229 case load_vec_info_type:
6230 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6231 gcc_assert (done);
6232 break;
6234 case store_vec_info_type:
6235 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6236 gcc_assert (done);
6237 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6239 /* In case of interleaving, the whole chain is vectorized when the
6240 last store in the chain is reached. Store stmts before the last
6241 one are skipped, and their vec_stmt_info shouldn't be freed
6242 meanwhile. */
6243 *strided_store = true;
6244 if (STMT_VINFO_VEC_STMT (stmt_info))
6245 is_store = true;
6247 else
6248 is_store = true;
6249 break;
6251 case condition_vec_info_type:
6252 gcc_assert (!slp_node);
6253 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6254 gcc_assert (done);
6255 break;
6257 case call_vec_info_type:
6258 gcc_assert (!slp_node);
6259 done = vectorizable_call (stmt, bsi, &vec_stmt);
6260 break;
6262 case reduc_vec_info_type:
6263 gcc_assert (!slp_node);
6264 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6265 gcc_assert (done);
6266 break;
6268 default:
6269 if (!STMT_VINFO_LIVE_P (stmt_info))
6271 if (vect_print_dump_info (REPORT_DETAILS))
6272 fprintf (vect_dump, "stmt not supported.");
6273 gcc_unreachable ();
6277 if (STMT_VINFO_LIVE_P (stmt_info)
6278 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6280 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6281 gcc_assert (done);
6284 if (vec_stmt)
6286 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6287 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6288 if (orig_stmt_in_pattern)
6290 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6291 /* STMT was inserted by the vectorizer to replace a computation idiom.
6292 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6293 computed this idiom. We need to record a pointer to VEC_STMT in
6294 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6295 documentation of vect_pattern_recog. */
6296 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6298 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6299 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6304 return is_store;
6308 /* This function builds ni_name = the number of iterations the loop
6309 executes, and inserts the computation at the loop preheader. */
6311 static tree
6312 vect_build_loop_niters (loop_vec_info loop_vinfo)
6314 tree ni_name, stmt, var;
6315 edge pe;
6316 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6317 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6319 var = create_tmp_var (TREE_TYPE (ni), "niters");
6320 add_referenced_var (var);
6321 ni_name = force_gimple_operand (ni, &stmt, false, var);
6323 pe = loop_preheader_edge (loop);
6324 if (stmt)
6326 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6327 gcc_assert (!new_bb);
6330 return ni_name;
6334 /* This function generates the following statements:
6336 ni_name = number of iterations loop executes
6337 ratio = ni_name / vf
6338 ratio_mult_vf_name = ratio * vf
6340 and places them at the loop preheader edge. */
6342 static void
6343 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6344 tree *ni_name_ptr,
6345 tree *ratio_mult_vf_name_ptr,
6346 tree *ratio_name_ptr)
6349 edge pe;
6350 basic_block new_bb;
6351 tree stmt, ni_name;
6352 tree var;
6353 tree ratio_name;
6354 tree ratio_mult_vf_name;
6355 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6356 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6357 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6358 tree log_vf;
6360 pe = loop_preheader_edge (loop);
6362 /* Generate temporary variable that contains
6363 number of iterations loop executes. */
6365 ni_name = vect_build_loop_niters (loop_vinfo);
6366 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6368 /* Create: ratio = ni >> log2(vf) */
6370 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6371 if (!is_gimple_val (ratio_name))
6373 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6374 add_referenced_var (var);
6376 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6377 pe = loop_preheader_edge (loop);
6378 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6379 gcc_assert (!new_bb);
6382 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6384 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6385 ratio_name, log_vf);
6386 if (!is_gimple_val (ratio_mult_vf_name))
6388 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6389 add_referenced_var (var);
6391 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6392 true, var);
6393 pe = loop_preheader_edge (loop);
6394 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6395 gcc_assert (!new_bb);
6398 *ni_name_ptr = ni_name;
6399 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6400 *ratio_name_ptr = ratio_name;
6402 return;
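/* E.g., for vf = 4 and ni_name = 103 the stmts created above compute
   ratio = 103 >> 2 = 25 and ratio_mult_vf_name = 25 << 2 = 100; the
   vectorized loop then iterates 25 times, and the remaining
   103 - 100 = 3 iterations are handled by the epilog loop. */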
6406 /* Function vect_update_ivs_after_vectorizer.
6408 "Advance" the induction variables of LOOP to the value they should take
6409 after the execution of LOOP. This is currently necessary because the
6410 vectorizer does not handle induction variables that are used after the
6411 loop. Such a situation occurs when the last iterations of LOOP are
6412 peeled, because:
6413 1. We introduced new uses after LOOP for IVs that were not originally used
6414 after LOOP: the IVs of LOOP are now used by an epilog loop.
6415 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6416 times, whereas the loop IVs should be bumped N times.
6418 Input:
6419 - LOOP - a loop that is going to be vectorized. The last few iterations
6420 of LOOP were peeled.
6421 - NITERS - the number of iterations that LOOP executes (before it is
6422 vectorized). i.e., the number of times the ivs should be bumped.
6423 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6424 coming out from LOOP on which there are uses of the LOOP ivs
6425 (this is the path from LOOP->exit to epilog_loop->preheader).
6427 The new definitions of the ivs are placed in LOOP->exit.
6428 The phi args associated with the edge UPDATE_E in the bb
6429 UPDATE_E->dest are updated accordingly.
6431 Assumption 1: Like the rest of the vectorizer, this function assumes
6432 a single loop exit that has a single predecessor.
6434 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6435 organized in the same order.
6437 Assumption 3: The access function of the ivs is simple enough (see
6438 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6440 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6441 coming out of LOOP on which the ivs of LOOP are used (this is the path
6442 that leads to the epilog loop; other paths skip the epilog loop). This
6443 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6444 needs to have its phis updated.
6447 static void
6448 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6449 edge update_e)
6451 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6452 basic_block exit_bb = single_exit (loop)->dest;
6453 tree phi, phi1;
6454 basic_block update_bb = update_e->dest;
6456 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6458 /* Make sure there exists a single-predecessor exit bb: */
6459 gcc_assert (single_pred_p (exit_bb));
6461 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6462 phi && phi1;
6463 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6465 tree access_fn = NULL;
6466 tree evolution_part;
6467 tree init_expr;
6468 tree step_expr;
6469 tree var, ni, ni_name;
6470 block_stmt_iterator last_bsi;
6472 if (vect_print_dump_info (REPORT_DETAILS))
6474 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6475 print_generic_expr (vect_dump, phi, TDF_SLIM);
6478 /* Skip virtual phi's. */
6479 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6481 if (vect_print_dump_info (REPORT_DETAILS))
6482 fprintf (vect_dump, "virtual phi. skip.");
6483 continue;
6486 /* Skip reduction phis. */
6487 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6489 if (vect_print_dump_info (REPORT_DETAILS))
6490 fprintf (vect_dump, "reduc phi. skip.");
6491 continue;
6494 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6495 gcc_assert (access_fn);
6496 evolution_part =
6497 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6498 gcc_assert (evolution_part != NULL_TREE);
6500 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6501 of degree >= 2 or exponential. */
6502 gcc_assert (!tree_is_chrec (evolution_part));
6504 step_expr = evolution_part;
6505 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6506 loop->num));
6508 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6509 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6510 init_expr,
6511 fold_convert (sizetype,
6512 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6513 niters, step_expr)));
6514 else
6515 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6516 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6517 fold_convert (TREE_TYPE (init_expr),
6518 niters),
6519 step_expr),
6520 init_expr);
6524 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6525 add_referenced_var (var);
6527 last_bsi = bsi_last (exit_bb);
6528 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6529 true, BSI_SAME_STMT);
6531 /* Fix phi expressions in the successor bb. */
6532 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
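/* E.g., for an iv with init_expr 0 and step_expr 4 (say, a pointer
   bumped by 4 bytes each iteration), and NITERS = 100, the code above
   emits ni = 0 + 100 * 4 = 400 in the exit bb, and the corresponding
   phi in the epilog loop is made to start from that value. */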
6536 /* Return the more conservative threshold between the
6537 min_profitable_iters returned by the cost model and the user
6538 specified threshold, if provided. */
6540 static unsigned int
6541 conservative_cost_threshold (loop_vec_info loop_vinfo,
6542 int min_profitable_iters)
6544 unsigned int th;
6545 int min_scalar_loop_bound;
6547 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6548 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6550 /* Use the cost model only if it is more conservative than the
6551 user-specified threshold. */
6552 th = (unsigned) min_scalar_loop_bound;
6553 if (min_profitable_iters
6554 && (!min_scalar_loop_bound
6555 || min_profitable_iters > min_scalar_loop_bound))
6556 th = (unsigned) min_profitable_iters;
6558 if (th && vect_print_dump_info (REPORT_COST))
6559 fprintf (vect_dump, "Vectorization may not be profitable.");
6561 return th;
6564 /* Function vect_do_peeling_for_loop_bound
6566 Peel the last iterations of the loop represented by LOOP_VINFO.
6567 The peeled iterations form a new epilog loop. Given that the loop now
6568 iterates NITERS times, the new epilog loop iterates
6569 NITERS % VECTORIZATION_FACTOR times.
6571 The original loop will later be made to iterate
6572 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
6574 static void
6575 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6577 tree ni_name, ratio_mult_vf_name;
6578 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6579 struct loop *new_loop;
6580 edge update_e;
6581 basic_block preheader;
6582 int loop_num;
6583 bool check_profitability = false;
6584 unsigned int th = 0;
6585 int min_profitable_iters;
6587 if (vect_print_dump_info (REPORT_DETAILS))
6588 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6590 initialize_original_copy_tables ();
6592 /* Generate the following variables on the preheader of original loop:
6594 ni_name = number of iterations the original loop executes
6595 ratio = ni_name / vf
6596 ratio_mult_vf_name = ratio * vf */
6597 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6598 &ratio_mult_vf_name, ratio);
6600 loop_num = loop->num;
6602 /* If the cost model check was not done during versioning or
6603 peeling for alignment, do it here. */
6604 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6605 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6606 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6608 check_profitability = true;
6610 /* Get profitability threshold for vectorized loop. */
6611 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6613 th = conservative_cost_threshold (loop_vinfo,
6614 min_profitable_iters);
6617 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6618 ratio_mult_vf_name, ni_name, false,
6619 th, check_profitability);
6620 gcc_assert (new_loop);
6621 gcc_assert (loop_num == loop->num);
6622 #ifdef ENABLE_CHECKING
6623 slpeel_verify_cfg_after_peeling (loop, new_loop);
6624 #endif
6626 /* A guard that controls whether the new_loop is to be executed or skipped
6627 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6628 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6629 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6630 is on the path where the LOOP IVs are used and need to be updated. */
6632 preheader = loop_preheader_edge (new_loop)->src;
6633 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6634 update_e = EDGE_PRED (preheader, 0);
6635 else
6636 update_e = EDGE_PRED (preheader, 1);
6638 /* Update IVs of original loop as if they were advanced
6639 by ratio_mult_vf_name steps. */
6640 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6642 /* After peeling we have to reset scalar evolution analyzer. */
6643 scev_reset ();
6645 free_original_copy_tables ();
6649 /* Function vect_gen_niters_for_prolog_loop
6651 Set the number of iterations for the loop represented by LOOP_VINFO
6652 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6653 and the misalignment of DR - the data reference recorded in
6654 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6655 this loop, the data reference DR will refer to an aligned location.
6657 The following computation is generated:
6659 If the misalignment of DR is known at compile time:
6660 addr_mis = int mis = DR_MISALIGNMENT (dr);
6661 Else, compute address misalignment in bytes:
6662 addr_mis = addr & (vectype_size - 1)
6664 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6666 (elem_size = element type size; an element is the scalar element
6667 whose type is the inner type of the vectype)
6669 For interleaving,
6671 prolog_niters = min ( LOOP_NITERS ,
6672 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6673 where group_size is the size of the interleaved group.
6675 The above formulas assume that VF == number of elements in the vector. This
6676 may not hold when there are multiple types in the loop.
6677 In this case, for some data-references in the loop the VF does not represent
6678 the number of elements that fit in the vector. Therefore, instead of VF we
6679 use TYPE_VECTOR_SUBPARTS. */
6681 static tree
6682 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6684 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6685 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6686 tree var, stmt;
6687 tree iters, iters_name;
6688 edge pe;
6689 basic_block new_bb;
6690 tree dr_stmt = DR_STMT (dr);
6691 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6692 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6693 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6694 tree niters_type = TREE_TYPE (loop_niters);
6695 int group_size = 1;
6696 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6697 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6699 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6701 /* For interleaved access, the element size must be multiplied by
6702 the size of the interleaved group. */
6703 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6704 DR_GROUP_FIRST_DR (stmt_info)));
6705 element_size *= group_size;
6708 pe = loop_preheader_edge (loop);
6710 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6712 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6713 int elem_misalign = byte_misalign / element_size;
6715 if (vect_print_dump_info (REPORT_DETAILS))
6716 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6717 iters = build_int_cst (niters_type,
6718 (nelements - elem_misalign) & (nelements / group_size - 1));
6720 else
6722 tree new_stmts = NULL_TREE;
6723 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6724 &new_stmts, NULL_TREE, loop);
6725 tree ptr_type = TREE_TYPE (start_addr);
6726 tree size = TYPE_SIZE (ptr_type);
6727 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6728 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6729 tree elem_size_log =
6730 build_int_cst (type, exact_log2 (vectype_align/nelements));
6731 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6732 tree nelements_tree = build_int_cst (type, nelements);
6733 tree byte_misalign;
6734 tree elem_misalign;
6736 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6737 gcc_assert (!new_bb);
6739 /* Create: byte_misalign = addr & (vectype_size - 1) */
6740 byte_misalign =
6741 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6743 /* Create: elem_misalign = byte_misalign / element_size */
6744 elem_misalign =
6745 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6747 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6748 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6749 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6750 iters = fold_convert (niters_type, iters);
6753 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6754 /* If the loop bound is known at compile time we already verified that it is
6755 greater than vf; since the misalignment ('iters') is at most vf, there's
6756 no need to generate the MIN_EXPR in this case. */
6757 if (TREE_CODE (loop_niters) != INTEGER_CST)
6758 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6760 if (vect_print_dump_info (REPORT_DETAILS))
6762 fprintf (vect_dump, "niters for prolog loop: ");
6763 print_generic_expr (vect_dump, iters, TDF_SLIM);
6766 var = create_tmp_var (niters_type, "prolog_loop_niters");
6767 add_referenced_var (var);
6768 iters_name = force_gimple_operand (iters, &stmt, false, var);
6770 /* Insert stmt on loop preheader edge. */
6771 if (stmt)
6773 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6774 gcc_assert (!new_bb);
6777 return iters_name;
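/* E.g., for V4SI (nelements = 4, element_size = 4, vectype_align = 16)
   and a known byte misalignment of 8: elem_misalign = 8 / 4 = 2, so
   iters = (4 - 2) & (4 - 1) = 2; peeling two scalar iterations brings
   the access to a 16-byte boundary. */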
6781 /* Function vect_update_init_of_dr
6783 NITERS iterations were peeled from LOOP. DR represents a data reference
6784 in LOOP. This function updates the information recorded in DR to
6785 account for the fact that the first NITERS iterations had already been
6786 executed. Specifically, it updates the OFFSET field of DR. */
6788 static void
6789 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6791 tree offset = DR_OFFSET (dr);
6793 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6794 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6795 DR_OFFSET (dr) = offset;
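/* E.g., if DR_STEP is 4 (a data-ref that advances by 4 bytes per
   iteration) and NITERS = 3 iterations were peeled, DR_OFFSET grows
   by 3 * 4 = 12 bytes. */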
6799 /* Function vect_update_inits_of_drs
6801 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6802 This function updates the information recorded for the data references in
6803 the loop to account for the fact that the first NITERS iterations had
6804 already been executed. Specifically, it updates the initial_condition of
6805 the access_function of all the data_references in the loop. */
6807 static void
6808 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6810 unsigned int i;
6811 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6812 struct data_reference *dr;
6814 if (vect_print_dump_info (REPORT_DETAILS))
6815 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6817 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6818 vect_update_init_of_dr (dr, niters);
6822 /* Function vect_do_peeling_for_alignment
6824 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6825 'niters' is set to the misalignment of one of the data references in the
6826 loop, thereby forcing it to refer to an aligned location at the beginning
6827 of the execution of this loop. The data reference for which we are
6828 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6830 static void
6831 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6833 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6834 tree niters_of_prolog_loop, ni_name;
6835 tree n_iters;
6836 struct loop *new_loop;
6837 bool check_profitability = false;
6838 unsigned int th = 0;
6839 int min_profitable_iters;
6841 if (vect_print_dump_info (REPORT_DETAILS))
6842 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6844 initialize_original_copy_tables ();
6846 ni_name = vect_build_loop_niters (loop_vinfo);
6847 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6850 /* If the cost model check was not done during versioning, do it here. */
6851 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6852 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6854 check_profitability = true;
6856 /* Get profitability threshold for vectorized loop. */
6857 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6859 th = conservative_cost_threshold (loop_vinfo,
6860 min_profitable_iters);
6863 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
6864 new_loop =
6865 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6866 niters_of_prolog_loop, ni_name, true,
6867 th, check_profitability);
6869 gcc_assert (new_loop);
6870 #ifdef ENABLE_CHECKING
6871 slpeel_verify_cfg_after_peeling (new_loop, loop);
6872 #endif
6874 /* Update the number of times the loop executes. */
6875 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6876 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6877 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6879 /* Update the init conditions of the access functions of all data refs. */
6880 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6882 /* After peeling we have to reset scalar evolution analyzer. */
6883 scev_reset ();
6885 free_original_copy_tables ();
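/* Editor's sketch of the loop shape after peeling for alignment
   (hypothetical example, not generated code): a scalar prolog runs
   until the access is aligned, after which the main loop starts on an
   aligned address and can be vectorized.  E.g. a float access 8 bytes
   past a 16-byte boundary needs (16 - 8) / 4 = 2 prolog iterations.  */
#if 0
void
peeling_example (float *a, unsigned long n, unsigned long prolog_n)
{
  unsigned long i = 0;
  for (; i < prolog_n && i < n; i++)  /* peeled scalar prolog */
    a[i] = a[i] * 2.0f;
  for (; i < n; i++)                  /* main loop: now starts aligned */
    a[i] = a[i] * 2.0f;
}
#endif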
6889 /* Function vect_create_cond_for_align_checks.
6891 Create a conditional expression that represents the alignment checks for
6892 all of data references (array element references) whose alignment must be
6893 checked at runtime.
6895 Input:
6896 COND_EXPR - input conditional expression. New conditions will be chained
6897 with logical AND operation.
6898 LOOP_VINFO - two fields of the loop information are used.
6899 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6900 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6902 Output:
6903 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6904 expression.
6905 The returned value is the conditional expression to be used in the if
6906 statement that controls which version of the loop gets executed at runtime.
6908 The algorithm makes two assumptions:
6909 1) The number of bytes "n" in a vector is a power of 2.
6910 2) An address "a" is aligned if a%n is zero, so the
6911 test can be done as a&(n-1) == 0. For example, for 16
6912 byte vectors the test is a&0xf == 0. */
6914 static void
6915 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6916 tree *cond_expr,
6917 tree *cond_expr_stmt_list)
6919 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6920 VEC(tree,heap) *may_misalign_stmts
6921 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6922 tree ref_stmt, tmp;
6923 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6924 tree mask_cst;
6925 unsigned int i;
6926 tree psize;
6927 tree int_ptrsize_type;
6928 char tmp_name[20];
6929 tree or_tmp_name = NULL_TREE;
6930 tree and_tmp, and_tmp_name, and_stmt;
6931 tree ptrsize_zero;
6932 tree part_cond_expr;
6934 /* Check that mask is one less than a power of 2, i.e., mask is
6935 all zeros followed by all ones. */
6936 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
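/* For example (editor's note): mask = 0xf passes, since
   0xf & 0x10 == 0; mask = 0xe would fail, since 0xe & 0xf == 0xe.  */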
6938 /* CHECKME: what is the best integer or unsigned type to use to hold a
6939 cast from a pointer value? */
6940 psize = TYPE_SIZE (ptr_type_node);
6941 int_ptrsize_type
6942 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6944 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6945 of the first vector of the i'th data reference. */
6947 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6949 tree new_stmt_list = NULL_TREE;
6950 tree addr_base;
6951 tree addr_tmp, addr_tmp_name, addr_stmt;
6952 tree or_tmp, new_or_tmp_name, or_stmt;
6954 /* create: addr_tmp = (int)(address_of_first_vector) */
6955 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6956 &new_stmt_list, NULL_TREE, loop);
6958 if (new_stmt_list != NULL_TREE)
6959 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6961 sprintf (tmp_name, "%s%d", "addr2int", i);
6962 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6963 add_referenced_var (addr_tmp);
6964 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6965 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6966 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6967 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6968 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6970 /* The addresses are ORed together. */
6972 if (or_tmp_name != NULL_TREE)
6974 /* create: or_tmp = or_tmp | addr_tmp */
6975 sprintf (tmp_name, "%s%d", "orptrs", i);
6976 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6977 add_referenced_var (or_tmp);
6978 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6979 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6980 or_tmp_name, addr_tmp_name);
6981 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6982 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6983 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6984 or_tmp_name = new_or_tmp_name;
6986 else
6987 or_tmp_name = addr_tmp_name;
6989 } /* end for i */
6991 mask_cst = build_int_cst (int_ptrsize_type, mask);
6993 /* create: and_tmp = or_tmp & mask */
6994 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
6995 add_referenced_var (and_tmp);
6996 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6998 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6999 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
7000 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7001 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
7003 /* Make and_tmp the left operand of the conditional test against zero.
7004 If and_tmp has a nonzero bit then some address is unaligned. */
7005 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7006 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7007 and_tmp_name, ptrsize_zero);
7008 if (*cond_expr)
7009 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7010 *cond_expr, part_cond_expr);
7011 else
7012 *cond_expr = part_cond_expr;
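/* Editor's sketch of the run-time test built above, for 16-byte vectors
   (mask = 0xf): the addresses are ORed together and a single AND with
   the mask decides whether every reference is aligned.  Standalone,
   hypothetical code, not part of the vectorizer.  */
#if 0
#include <stdint.h>
static int
all_aligned_16 (const void *a, const void *b)
{
  uintptr_t or_tmp = (uintptr_t) a | (uintptr_t) b;
  return (or_tmp & 0xf) == 0;  /* a nonzero bit means some address is
                                  unaligned */
}
#endif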
7015 /* Function vect_vfa_segment_size.
7017 Create an expression that computes the size of the segment
7018 that will be accessed by a data reference. The function takes into
7019 account that realignment loads may access one additional vector.
7021 Input:
7022 DR: The data reference.
7023 VECT_FACTOR: vectorization factor.
7025 Return an expression whose value is the size of the segment which will be
7026 accessed by DR. */
7028 static tree
7029 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7031 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7032 DR_STEP (dr), vect_factor);
7034 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7036 tree vector_size = TYPE_SIZE_UNIT
7037 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7039 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7040 segment_length, vector_size);
7042 return fold_convert (sizetype, segment_length);
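/* Editor's model of the computation above (hypothetical names): with a
   DR_STEP of 4 bytes and VF = 4 the segment is 16 bytes; the optimized
   realignment scheme reads one extra vector, adding e.g. 16 bytes.  */
#if 0
static long
vfa_segment_size_model (long step, long vf, long vector_size,
                        int realign_optimized)
{
  long seg = step * vf;   /* bytes accessed per vector iteration */
  if (realign_optimized)
    seg += vector_size;   /* realignment may touch one more vector */
  return seg;
}
#endif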
7045 /* Function vect_create_cond_for_alias_checks.
7047 Create a conditional expression that represents the run-time checks for
7048 overlap of the address ranges represented by a list of data dependence
7049 relations passed as input.
7051 Input:
7052 COND_EXPR - input conditional expression. New conditions will be chained
7053 with logical AND operation.
7054 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7055 to be checked.
7057 Output:
7058 COND_EXPR - conditional expression.
7059 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7060 expression.
7063 The returned value is the conditional expression to be used in the if
7064 statement that controls which version of the loop gets executed at runtime.
7067 static void
7068 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7069 tree *cond_expr,
7070 tree *cond_expr_stmt_list)
7072 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7073 VEC (ddr_p, heap) *may_alias_ddrs =
7074 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7075 tree vect_factor =
7076 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7078 ddr_p ddr;
7079 unsigned int i;
7080 tree part_cond_expr;
7082 /* Create expression
7083 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7084 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
7085 &&
7086 ...
7087 &&
7088 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
7089 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7091 if (VEC_empty (ddr_p, may_alias_ddrs))
7092 return;
7094 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7096 struct data_reference *dr_a, *dr_b;
7097 tree dr_group_first_a, dr_group_first_b;
7098 tree addr_base_a, addr_base_b;
7099 tree segment_length_a, segment_length_b;
7100 tree stmt_a, stmt_b;
7102 dr_a = DDR_A (ddr);
7103 stmt_a = DR_STMT (DDR_A (ddr));
7104 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7105 if (dr_group_first_a)
7107 stmt_a = dr_group_first_a;
7108 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7111 dr_b = DDR_B (ddr);
7112 stmt_b = DR_STMT (DDR_B (ddr));
7113 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7114 if (dr_group_first_b)
7116 stmt_b = dr_group_first_b;
7117 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7120 addr_base_a =
7121 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7122 NULL_TREE, loop);
7123 addr_base_b =
7124 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7125 NULL_TREE, loop);
7127 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7128 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7130 if (vect_print_dump_info (REPORT_DR_DETAILS))
7132 fprintf (vect_dump,
7133 "create runtime check for data references ");
7134 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7135 fprintf (vect_dump, " and ");
7136 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7140 part_cond_expr =
7141 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7142 fold_build2 (LT_EXPR, boolean_type_node,
7143 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7144 addr_base_a,
7145 segment_length_a),
7146 addr_base_b),
7147 fold_build2 (LT_EXPR, boolean_type_node,
7148 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7149 addr_base_b,
7150 segment_length_b),
7151 addr_base_a));
7153 if (*cond_expr)
7154 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7155 *cond_expr, part_cond_expr);
7156 else
7157 *cond_expr = part_cond_expr;
7159 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7160 fprintf (vect_dump, "created %u run-time alias checks.\n",
7161 VEC_length (ddr_p, may_alias_ddrs));
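/* Editor's sketch of one disambiguation test built above: two segments
   [a, a+len_a] and [b, b+len_b] are treated as independent only when
   one ends strictly before the other begins, mirroring the LT_EXPR form
   of the generated condition (hypothetical names).  */
#if 0
static int
segments_independent (const char *a, unsigned long len_a,
                      const char *b, unsigned long len_b)
{
  return (a + len_a < b) || (b + len_b < a);
}
#endif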
7165 /* Function vect_loop_versioning.
7167 If the loop has data references that may or may not be aligned and/or
7168 has data reference relations whose independence was not proven, then
7169 two versions of the loop need to be generated, one which is vectorized
7170 and one which isn't. A test is then generated to control which of the
7171 loops is executed. The test checks for the alignment of all of the
7172 data references that may or may not be aligned. An additional
7173 sequence of runtime tests is generated for each pair of DDRs whose
7174 independence was not proven. The vectorized version of the loop is
7175 executed only if both the alias and alignment tests pass.
7177 The test generated to check which version of the loop is executed
7178 is also made to check for profitability, as indicated by the
7179 cost model. */
7181 static void
7182 vect_loop_versioning (loop_vec_info loop_vinfo)
7184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7185 struct loop *nloop;
7186 tree cond_expr = NULL_TREE;
7187 tree cond_expr_stmt_list = NULL_TREE;
7188 basic_block condition_bb;
7189 block_stmt_iterator cond_exp_bsi;
7190 basic_block merge_bb;
7191 basic_block new_exit_bb;
7192 edge new_exit_e, e;
7193 tree orig_phi, new_phi, arg;
7194 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7195 tree gimplify_stmt_list;
7196 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7197 int min_profitable_iters = 0;
7198 unsigned int th;
7200 /* Get profitability threshold for vectorized loop. */
7201 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7203 th = conservative_cost_threshold (loop_vinfo,
7204 min_profitable_iters);
7206 cond_expr =
7207 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7208 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7210 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7211 false, NULL_TREE);
7213 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7214 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7215 &cond_expr_stmt_list);
7217 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7218 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7219 &cond_expr_stmt_list);
7221 cond_expr =
7222 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7223 cond_expr =
7224 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7225 NULL_TREE);
7226 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7228 initialize_original_copy_tables ();
7229 nloop = loop_version (loop, cond_expr, &condition_bb,
7230 prob, prob, REG_BR_PROB_BASE - prob, true);
7231 free_original_copy_tables ();
7233 /* Loop versioning violates an assumption we try to maintain during
7234 vectorization - that the loop exit block has a single predecessor.
7235 After versioning, the exit block of both loop versions is the same
7236 basic block (i.e. it has two predecessors). To simplify
7237 subsequent transformations in the vectorizer, we fix this situation
7238 here by adding a new (empty) block on the exit-edge of the loop,
7239 with the proper loop-exit phis to maintain loop-closed-form. */
7241 merge_bb = single_exit (loop)->dest;
7242 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7243 new_exit_bb = split_edge (single_exit (loop));
7244 new_exit_e = single_exit (loop);
7245 e = EDGE_SUCC (new_exit_bb, 0);
7247 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7248 orig_phi = PHI_CHAIN (orig_phi))
7250 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7251 new_exit_bb);
7252 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7253 add_phi_arg (new_phi, arg, new_exit_e);
7254 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7257 /* End loop-exit-fixes after versioning. */
7259 update_ssa (TODO_update_ssa);
7260 if (cond_expr_stmt_list)
7262 cond_exp_bsi = bsi_last (condition_bb);
7263 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
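/* Editor's sketch of the control flow produced by versioning
   (hypothetical names): one guard combining the profitability,
   alignment and alias tests selects between the two loop copies.  */
#if 0
extern void vectorized_loop (void), scalar_loop (void);
static void
versioned (unsigned long niters, unsigned long th,
           int refs_aligned, int no_overlap)
{
  if (niters > th && refs_aligned && no_overlap)
    vectorized_loop ();  /* the copy the vectorizer transforms */
  else
    scalar_loop ();      /* unmodified scalar fallback */
}
#endif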
7267 /* Remove a group of stores (for SLP or interleaving) and free their
7268 stmt_vec_info. */
7270 static void
7271 vect_remove_stores (tree first_stmt)
7273 tree next = first_stmt;
7274 tree tmp;
7275 block_stmt_iterator next_si;
7277 while (next)
7279 /* Free the attached stmt_vec_info and remove the stmt. */
7280 next_si = bsi_for_stmt (next);
7281 bsi_remove (&next_si, true);
7282 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
7283 free_stmt_vec_info (next);
7284 next = tmp;
7289 /* Vectorize an SLP instance tree in postorder. */
7291 static bool
7292 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7294 tree stmt;
7295 bool strided_store, is_store;
7296 block_stmt_iterator si;
7297 stmt_vec_info stmt_info;
7299 if (!node)
7300 return false;
7302 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7303 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7305 stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (node), 0);
7306 stmt_info = vinfo_for_stmt (stmt);
7307 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7308 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7310 if (vect_print_dump_info (REPORT_DETAILS))
7312 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7313 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7316 si = bsi_for_stmt (stmt);
7317 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7318 if (is_store)
7320 if (DR_GROUP_FIRST_DR (stmt_info))
7321 /* If IS_STORE is TRUE, the vectorization of the
7322 interleaving chain was completed - free all the stores in
7323 the chain. */
7324 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7325 else
7326 /* FORNOW: SLP originates only from strided stores. */
7327 gcc_unreachable ();
7329 return true;
7332 /* FORNOW: SLP originates only from strided stores. */
7333 return false;
7337 static bool
7338 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7340 VEC (slp_instance, heap) *slp_instances =
7341 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7342 slp_instance instance;
7343 unsigned int vec_stmts_size;
7344 unsigned int group_size, i;
7345 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7346 bool is_store = false;
7348 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7350 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7351 /* For each SLP instance calculate the number of vector stmts to be created
7352 for the scalar stmts in each node of the SLP tree. The number of vector
7353 elements in one vector iteration is the number of scalar elements in
7354 one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
7355 size. */
7356 vec_stmts_size = vectorization_factor * group_size / nunits;
7358 /* Schedule the tree of INSTANCE. */
7359 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7360 vec_stmts_size);
7362 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7363 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7364 fprintf (vect_dump, "vectorizing stmts using SLP.");
7367 return is_store;
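/* Worked example (editor's note): with VF = 4, a store group of
   GROUP_SIZE = 2, and nunits = 4 elements per vector, each SLP node
   needs 4 * 2 / 4 = 2 vector stmts per vectorized iteration.  */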
7370 /* Function vect_transform_loop.
7372 The analysis phase has determined that the loop is vectorizable.
7373 Vectorize the loop - create vectorized stmts to replace the scalar
7374 stmts in the loop, and update the loop exit condition. */
7376 void
7377 vect_transform_loop (loop_vec_info loop_vinfo)
7379 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7380 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7381 int nbbs = loop->num_nodes;
7382 block_stmt_iterator si;
7383 int i;
7384 tree ratio = NULL;
7385 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7386 bool strided_store;
7387 bool slp_scheduled = false;
7388 unsigned int nunits;
7390 if (vect_print_dump_info (REPORT_DETAILS))
7391 fprintf (vect_dump, "=== vect_transform_loop ===");
7393 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7394 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7395 vect_loop_versioning (loop_vinfo);
7397 /* CHECKME: we wouldn't need this if we called update_ssa once
7398 for all loops. */
7399 bitmap_zero (vect_memsyms_to_rename);
7401 /* Peel the loop if there are data refs with unknown alignment.
7402 Peeling aligns only one such data ref (LOOP_VINFO_UNALIGNED_DR). */
7404 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7405 vect_do_peeling_for_alignment (loop_vinfo);
7407 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7408 compile time constant), or it is a constant that is not divisible by the
7409 vectorization factor, then an epilog loop needs to be created.
7410 We therefore duplicate the loop: the original loop will be vectorized,
7411 and will compute the first (n/VF) iterations. The second copy of the loop
7412 will remain scalar and will compute the remaining (n%VF) iterations.
7413 (VF is the vectorization factor). */
7415 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7416 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7417 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7418 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7419 else
7420 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7421 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
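/* Worked example (editor's note): with n = 10 and VF = 4 the vector
   loop executes ratio = 10 / 4 = 2 iterations (8 elements), and the
   scalar epilog created above executes the remaining 10 % 4 = 2.  */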
7423 /* 1) Make sure the loop header has exactly two entries
7424 2) Make sure we have a preheader basic block. */
7426 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7428 split_edge (loop_preheader_edge (loop));
7430 /* FORNOW: the vectorizer supports only loops whose body consists
7431 of one basic block (header + empty latch). When the vectorizer
7432 supports more involved loop forms, the order in which the BBs are
7433 traversed will need to be reconsidered. */
7435 for (i = 0; i < nbbs; i++)
7437 basic_block bb = bbs[i];
7438 stmt_vec_info stmt_info;
7439 tree phi;
7441 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7443 if (vect_print_dump_info (REPORT_DETAILS))
7445 fprintf (vect_dump, "------>vectorizing phi: ");
7446 print_generic_expr (vect_dump, phi, TDF_SLIM);
7448 stmt_info = vinfo_for_stmt (phi);
7449 if (!stmt_info)
7450 continue;
7452 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7453 && !STMT_VINFO_LIVE_P (stmt_info))
7454 continue;
7456 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7457 != (unsigned HOST_WIDE_INT) vectorization_factor)
7458 && vect_print_dump_info (REPORT_DETAILS))
7459 fprintf (vect_dump, "multiple-types.");
7461 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7463 if (vect_print_dump_info (REPORT_DETAILS))
7464 fprintf (vect_dump, "transform phi.");
7465 vect_transform_stmt (phi, NULL, NULL, NULL);
7469 for (si = bsi_start (bb); !bsi_end_p (si);)
7471 tree stmt = bsi_stmt (si);
7472 bool is_store;
7474 if (vect_print_dump_info (REPORT_DETAILS))
7476 fprintf (vect_dump, "------>vectorizing statement: ");
7477 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7480 stmt_info = vinfo_for_stmt (stmt);
7482 /* vector stmts created in the outer-loop during vectorization of
7483 stmts in an inner-loop may not have a stmt_info, and do not
7484 need to be vectorized. */
7485 if (!stmt_info)
7487 bsi_next (&si);
7488 continue;
7491 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7492 && !STMT_VINFO_LIVE_P (stmt_info))
7494 bsi_next (&si);
7495 continue;
7498 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7499 nunits =
7500 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7501 if (!STMT_SLP_TYPE (stmt_info)
7502 && nunits != (unsigned int) vectorization_factor
7503 && vect_print_dump_info (REPORT_DETAILS))
7504 /* For SLP, VF is set according to the unrolling factor, and not
7505 to the vector size; hence for SLP this diagnostic does not apply. */
7506 fprintf (vect_dump, "multiple-types.");
7508 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7509 reached. */
7510 if (STMT_SLP_TYPE (stmt_info))
7512 if (!slp_scheduled)
7514 slp_scheduled = true;
7516 if (vect_print_dump_info (REPORT_DETAILS))
7517 fprintf (vect_dump, "=== scheduling SLP instances ===");
7519 is_store = vect_schedule_slp (loop_vinfo, nunits);
7521 /* IS_STORE is true if STMT is a store. Stores cannot be of
7522 hybrid SLP type. They are removed in
7523 vect_schedule_slp_instance and their vinfo is destroyed. */
7524 if (is_store)
7526 bsi_next (&si);
7527 continue;
7531 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7532 if (PURE_SLP_STMT (stmt_info))
7534 bsi_next (&si);
7535 continue;
7539 /* -------- vectorize statement ------------ */
7540 if (vect_print_dump_info (REPORT_DETAILS))
7541 fprintf (vect_dump, "transform statement.");
7543 strided_store = false;
7544 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7545 if (is_store)
7547 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7549 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7550 interleaving chain was completed - free all the stores in
7551 the chain. */
7552 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7553 bsi_remove (&si, true);
7554 continue;
7556 else
7558 /* Free the attached stmt_vec_info and remove the stmt. */
7559 free_stmt_vec_info (stmt);
7560 bsi_remove (&si, true);
7561 continue;
7564 bsi_next (&si);
7565 } /* stmts in BB */
7566 } /* BBs in loop */
7568 slpeel_make_loop_iterate_ntimes (loop, ratio);
7570 mark_set_for_renaming (vect_memsyms_to_rename);
7572 /* The memory tags and pointers in vectorized statements need to
7573 have their SSA forms updated. FIXME, why can't this be delayed
7574 until all the loops have been transformed? */
7575 update_ssa (TODO_update_ssa);
7577 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7578 fprintf (vect_dump, "LOOP VECTORIZED.");
7579 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7580 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");