/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }
  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in the prologue and
     epilogue loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, the loop bound of the main loop
         becomes unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but the number of scalar loop
             iterations is unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
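
  /* Illustration of the known-niters case above (numbers are ours, not from
     the source): with niters = 103, peel_iters_prologue = 3 and vf = 4, the
     epilogue gets (103 - 3) % 4 = 0 iterations; with niters = 102 it would
     get (102 - 3) % 4 = 3.  */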
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;
  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence the scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during the prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */
  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }
  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }
  /* Calculate the number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
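
  /* A worked instance of the condition above (illustrative numbers only, not
     taken from any target): with SIC = 4, VIC = 6, VF = 4, VOC = 14, SOC = 0
     and no peeling, the division below gives
     ((14 - 0) * 4) / (4 * 4 - 6) = 56 / 10 = 5, and since
     4 * 4 * 5 = 80 <= 6 * 5 + 56 = 86 the result is bumped to 6.
     Indeed, 4 * n > 6 * (n / 4) + 14 first holds at n = 6.  */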
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }
  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }
  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}


/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */
/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }
  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;
  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
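
  /* To compare the two epilogue strategies above (illustrative numbers only):
     for nelements = 8, the whole-vector-shift variant costs
     exact_log2 (8) * 2 = 6 vector stmts plus one vec-to-scalar extract,
     whereas the extract-based variant costs 8 + 8 - 1 = 15 vector stmts.  */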
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provides strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;
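
      /* For example (illustrative, not from the source): interleaving a
         group of 4 vectors takes exact_log2 (4) = 2 rounds of high/low
         interleave ops over 4 vectors, i.e. 2 * 4 = 8 vector stmts per
         copy.  */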
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
  {
  case vect_simple_var:
    prefix = "vect_";
    break;
  case vect_scalar_var:
    prefix = "stmp_";
    break;
  case vect_pointer_var:
    prefix = "vect_p";
    break;
  default:
    gcc_unreachable ();
  }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:  Specify relative to which loop-nest the address should be computed.
          For example, when the dataref is in an inner-loop nested in an
          outer-loop that is now being vectorized, LOOP can be either the
          outer-loop, or the inner-loop.  The first memory location accessed
          by the following dataref ('in' points to short):

            for (i=0; i<N; i++)
              for (j=0; j<M; j++)
                s += in[i+j]

          is as follows:
          if LOOP=i_loop: &in            (relative to i_loop)
          if LOOP=j_loop: &in+i*2B       (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree new_base_stmt;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
                                        true, data_ref_base_var);
  append_to_statement_list_force (new_base_stmt, new_stmt_list);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  block_stmt_iterator incr_bsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  tree incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;
  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }
  /** (1) Create the new vector-pointer variable:  **/
  vect_ptr_type = build_pointer_type (vectype);

  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: one relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:  vp1 = phi(vp0,vp2)
               ...
               vp2 = vp1 + step
               goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:  vp1 = phi(vp0,vp2)
               ...
        inner:    vp3 = phi(vp1,vp4)
                  vp4 = vp3 + inner_step
                  if () goto inner
               ...
               vp2 = vp1 + step
               if () goto LOOP  */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  if (new_stmt_list)
    {
      new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
    }

  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);
  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;
  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_bsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (POINTER_PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type,
                  block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  if (bsi)
    vect_finish_stmt_generation (stmt, init_stmt, bsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  tree stmt = VEC_index (tree, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop, operation;
  int group_size = VEC_length (tree, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p;

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
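
  /* A further illustrative case (ours, not from the source) showing why the
     least common multiple is needed: with nunits = 4 and group_size = 6,
     least_common_multiple (4, 6) = 12, so NUMBER_OF_COPIES = 12 / 6 = 2 and
     the 12 slots are packed into three vectors:
     {s1,s2,s3,s4} {s5,s6,s1,s2} {s3,s4,s5,s6}.  */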
  number_of_places_left_in_vector = nunits;
  constant_p = true;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
        {
          operation = GIMPLE_STMT_OPERAND (stmt, 1);
          if (is_store)
            op = operation;
          else
            op = TREE_OPERAND (operation, op_num);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              constant_p = true;
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  tree vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  tree operation, first_stmt;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
  if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  tree stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  tree exit_phi;
  edge latch_e;
  tree loop_arg;
  block_stmt_iterator si;
  basic_block bb = bb_for_stmt (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);
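
  /* For illustration (numbers are ours): with vf = 8 and a vectype holding
     nunits = 4 elements, ncopies = 8 / 4 = 2, i.e. the induction needs two
     vector stmts per iteration of the vector loop; see the ncopies > 1
     handling further below.  */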
  /* Find the first insertion point in the BB.  */
  si = bsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);
  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = bsi_insert_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          tree tmp;

          /* Create: new_name_i = new_name + step_expr  */
          tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
          init_stmt = build_gimple_modify_stmt (new_var, tmp);
          new_name = make_ssa_name (new_var, init_stmt);
          GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

          new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (new_stmt),
                 new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1729 /* In case the vectorization factor (VF) is bigger than the number
1730 of elements that we can fit in a vectype (nunits), we have to generate
1731 more than one vector stmt - i.e., we need to "unroll" the
1732 vector stmt by a factor VF/nunits. For more details see documentation
1733 in vectorizable_operation. */
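/* For example, VF = 8 with V4SI (nunits = 4) gives ncopies = 2: the loop
PHI still advances by vec_step = {8*S, ...} per iteration, while each
additional copy below is obtained from the previous one by adding
{4*S, ...} - i.e. nunits*S per element. */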
1735 if (ncopies > 1)
1737 stmt_vec_info prev_stmt_vinfo;
1738 /* FORNOW. This restriction should be relaxed. */
1739 gcc_assert (!nested_in_vect_loop);
1741 /* Create the vector that holds the step of the induction. */
1742 expr = build_int_cst (scalar_type, nunits);
1743 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1744 t = NULL_TREE;
1745 for (i = 0; i < nunits; i++)
1746 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1747 gcc_assert (CONSTANT_CLASS_P (new_name));
1748 vec = build_vector (vectype, t);
1749 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1751 vec_def = induc_def;
1752 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1753 for (i = 1; i < ncopies; i++)
1755 tree tmp;
1757 /* vec_i = vec_prev + vec_step */
1758 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1759 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1760 vec_def = make_ssa_name (vec_dest, new_stmt);
1761 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1762 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1763 set_stmt_info (get_stmt_ann (new_stmt),
1764 new_stmt_vec_info (new_stmt, loop_vinfo));
1765 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1766 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1770 if (nested_in_vect_loop)
1772 /* Find the loop-closed exit-phi of the induction, and record
1773 the final vector of induction results: */
1774 exit_phi = NULL;
1775 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1777 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1779 exit_phi = USE_STMT (use_p);
1780 break;
1783 if (exit_phi)
1785 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1786 /* FORNOW. Currently not supporting the case that an inner-loop induction
1787 is not used in the outer-loop (i.e. is used only outside the outer-loop). */
1788 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1789 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1791 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1792 if (vect_print_dump_info (REPORT_DETAILS))
1794 fprintf (vect_dump, "vector of inductions after inner-loop:");
1795 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1801 if (vect_print_dump_info (REPORT_DETAILS))
1803 fprintf (vect_dump, "transform induction: created def-use cycle:");
1804 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1805 fprintf (vect_dump, "\n");
1806 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1809 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1810 return induc_def;
1814 /* Function vect_get_vec_def_for_operand.
1816 OP is an operand in STMT. This function returns a (vector) def that will be
1817 used in the vectorized stmt for STMT.
1819 In the case that OP is an SSA_NAME which is defined in the loop, then
1820 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1822 In case OP is an invariant or constant, a new stmt that creates a vector def
1823 needs to be introduced. */
1825 static tree
1826 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1828 tree vec_oprnd;
1829 tree vec_stmt;
1830 tree def_stmt;
1831 stmt_vec_info def_stmt_info = NULL;
1832 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1833 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1834 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1835 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1836 tree vec_inv;
1837 tree vec_cst;
1838 tree t = NULL_TREE;
1839 tree def;
1840 int i;
1841 enum vect_def_type dt;
1842 bool is_simple_use;
1843 tree vector_type;
1845 if (vect_print_dump_info (REPORT_DETAILS))
1847 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1848 print_generic_expr (vect_dump, op, TDF_SLIM);
1851 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1852 gcc_assert (is_simple_use);
1853 if (vect_print_dump_info (REPORT_DETAILS))
1855 if (def)
1857 fprintf (vect_dump, "def = ");
1858 print_generic_expr (vect_dump, def, TDF_SLIM);
1860 if (def_stmt)
1862 fprintf (vect_dump, " def_stmt = ");
1863 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1867 switch (dt)
1869 /* Case 1: operand is a constant. */
1870 case vect_constant_def:
1872 if (scalar_def)
1873 *scalar_def = op;
1875 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1876 if (vect_print_dump_info (REPORT_DETAILS))
1877 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1879 for (i = nunits - 1; i >= 0; --i)
1881 t = tree_cons (NULL_TREE, op, t);
1883 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1884 gcc_assert (vector_type);
1885 vec_cst = build_vector (vector_type, t);
1887 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1890 /* Case 2: operand is defined outside the loop - loop invariant. */
1891 case vect_invariant_def:
1893 if (scalar_def)
1894 *scalar_def = def;
1896 /* Create 'vec_inv = {inv,inv,..,inv}' */
1897 if (vect_print_dump_info (REPORT_DETAILS))
1898 fprintf (vect_dump, "Create vector_inv.");
1900 for (i = nunits - 1; i >= 0; --i)
1902 t = tree_cons (NULL_TREE, def, t);
1905 /* FIXME: use build_constructor directly. */
1906 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1907 gcc_assert (vector_type);
1908 vec_inv = build_constructor_from_list (vector_type, t);
1909 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1912 /* Case 3: operand is defined inside the loop. */
1913 case vect_loop_def:
1915 if (scalar_def)
1916 *scalar_def = def_stmt;
1918 /* Get the def from the vectorized stmt. */
1919 def_stmt_info = vinfo_for_stmt (def_stmt);
1920 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1921 gcc_assert (vec_stmt);
1922 if (TREE_CODE (vec_stmt) == PHI_NODE)
1923 vec_oprnd = PHI_RESULT (vec_stmt);
1924 else
1925 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1926 return vec_oprnd;
1929 /* Case 4: operand is defined by a loop-header phi - reduction. */
1930 case vect_reduction_def:
1932 struct loop *loop;
1934 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1935 loop = (bb_for_stmt (def_stmt))->loop_father;
1937 /* Get the def before the loop */
1938 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1939 return get_initial_def_for_reduction (stmt, op, scalar_def);
1942 /* Case 5: operand is defined by loop-header phi - induction. */
1943 case vect_induction_def:
1945 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1947 /* Get the def from the vectorized stmt. */
1948 def_stmt_info = vinfo_for_stmt (def_stmt);
1949 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1950 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1951 vec_oprnd = PHI_RESULT (vec_stmt);
1952 return vec_oprnd;
1955 default:
1956 gcc_unreachable ();
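/* As a concrete example for the cases above, assuming V4SI and a stmt
'a[i] = b[i] + x' with x loop-invariant: the def for b[i] is taken from
the STMT_VINFO_VEC_STMT of the vectorized load (case 3), while for x a
preheader stmt vec_inv = {x, x, x, x} is created (case 2); a literal
operand such as 5 yields vect_cst_ = {5, 5, 5, 5} (case 1). */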
1961 /* Function vect_get_vec_def_for_stmt_copy
1963 Return a vector-def for an operand. This function is used when the
1964 vectorized stmt to be created (by the caller to this function) is a "copy"
1965 created in case the vectorized result cannot fit in one vector, and several
1966 copies of the vector-stmt are required. In this case the vector-def is
1967 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1968 of the stmt that defines VEC_OPRND.
1969 DT is the type of the vector def VEC_OPRND.
1971 Context:
1972 In case the vectorization factor (VF) is bigger than the number
1973 of elements that can fit in a vectype (nunits), we have to generate
1974 more than one vector stmt to vectorize the scalar stmt. This situation
1975 arises when there are multiple data-types operated upon in the loop; the
1976 smallest data-type determines the VF, and as a result, when vectorizing
1977 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1978 vector stmt (each computing a vector of 'nunits' results, and together
1979 computing 'VF' results in each iteration). This function is called when
1980 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1981 which VF=16 and nunits=4, so the number of copies required is 4):
1983 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1985 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1986 VS1.1: vx.1 = memref1 VS1.2
1987 VS1.2: vx.2 = memref2 VS1.3
1988 VS1.3: vx.3 = memref3
1990 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1991 VSnew.1: vz1 = vx.1 + ... VSnew.2
1992 VSnew.2: vz2 = vx.2 + ... VSnew.3
1993 VSnew.3: vz3 = vx.3 + ...
1995 The vectorization of S1 is explained in vectorizable_load.
1996 The vectorization of S2:
1997 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1998 the function 'vect_get_vec_def_for_operand' is called to
1999 get the relevant vector-def for each operand of S2. For operand x it
2000 returns the vector-def 'vx.0'.
2002 To create the remaining copies of the vector-stmt (VSnew.j), this
2003 function is called to get the relevant vector-def for each operand. It is
2004 obtained from the respective VS1.j stmt, which is recorded in the
2005 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2007 For example, to obtain the vector-def 'vx.1' in order to create the
2008 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2009 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2010 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2011 and return its def ('vx.1').
2012 Overall, to create the above sequence this function will be called 3 times:
2013 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2014 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2015 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
2017 static tree
2018 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2020 tree vec_stmt_for_operand;
2021 stmt_vec_info def_stmt_info;
2023 /* Do nothing; can reuse same def. */
2024 if (dt == vect_invariant_def || dt == vect_constant_def)
2025 return vec_oprnd;
2027 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2028 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2029 gcc_assert (def_stmt_info);
2030 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2031 gcc_assert (vec_stmt_for_operand);
2032 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2033 return vec_oprnd;
2037 /* Get vectorized definitions for the operands to create a copy of an original
2038 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2040 static void
2041 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2042 VEC(tree,heap) **vec_oprnds0,
2043 VEC(tree,heap) **vec_oprnds1)
2045 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2047 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2048 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2050 if (vec_oprnds1 && *vec_oprnds1)
2052 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2053 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2054 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2059 /* Get vectorized definitions for OP0 and OP1; if SLP_NODE is not NULL, get them from SLP_NODE instead. */
2061 static void
2062 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2063 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2065 if (slp_node)
2066 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2067 else
2069 tree vec_oprnd;
2071 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2072 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2073 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2075 if (op1)
2077 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2078 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2079 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
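/* A sketch of the typical caller pattern for the two routines above,
assuming an unrolled vectorized stmt with NCOPIES copies:

for (j = 0; j < ncopies; j++)
{
if (j == 0)
vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1, slp_node);
else
vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
... generate copy j from the defs in vec_oprnds0/vec_oprnds1 ...
} */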
2085 /* Function vect_finish_stmt_generation.
2087 Insert a new stmt VEC_STMT before the stmt being vectorized (pointed to
by BSI), and attach stmt-info and STMT's location to it. */
2089 static void
2090 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2091 block_stmt_iterator *bsi)
2093 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2094 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2096 gcc_assert (stmt == bsi_stmt (*bsi));
2097 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2099 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2101 set_stmt_info (get_stmt_ann (vec_stmt),
2102 new_stmt_vec_info (vec_stmt, loop_vinfo));
2104 if (vect_print_dump_info (REPORT_DETAILS))
2106 fprintf (vect_dump, "add new stmt: ");
2107 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2110 /* Make sure bsi points to the stmt that is being vectorized. */
2111 gcc_assert (stmt == bsi_stmt (*bsi));
2113 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2117 /* Function get_initial_def_for_reduction
2119 Input:
2120 STMT - a stmt that performs a reduction operation in the loop.
2121 INIT_VAL - the initial value of the reduction variable
2123 Output:
2124 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2125 of the reduction (used for adjusting the epilog - see below).
2126 Return a vector variable, initialized according to the operation that STMT
2127 performs. This vector will be used as the initial value of the
2128 vector of partial results.
2130 Option1 (adjust in epilog): Initialize the vector as follows:
2131 add: [0,0,...,0,0]
2132 mult: [1,1,...,1,1]
2133 min/max: [init_val,init_val,..,init_val,init_val]
2134 bit and/or: [init_val,init_val,..,init_val,init_val]
2135 and when necessary (e.g. add/mult case) let the caller know
2136 that it needs to adjust the result by init_val.
2138 Option2: Initialize the vector as follows:
2139 add: [0,0,...,0,init_val]
2140 mult: [1,1,...,1,init_val]
2141 min/max: [init_val,init_val,...,init_val]
2142 bit and/or: [init_val,init_val,...,init_val]
2143 and no adjustments are needed.
2145 For example, for the following code:
2147 s = init_val;
2148 for (i=0;i<n;i++)
2149 s = s + a[i];
2151 STMT is 's = s + a[i]', and the reduction variable is 's'.
2152 For a vector of 4 units, we want to return either [0,0,0,init_val],
2153 or [0,0,0,0] and let the caller know that it needs to adjust
2154 the result at the end by 'init_val'.
2156 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2157 initialization vector is simpler (same element in all entries).
2158 A cost model should help decide between these two schemes. */
2160 static tree
2161 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2163 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2164 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2165 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2166 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2167 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2168 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2169 tree type = TREE_TYPE (init_val);
2170 tree vecdef;
2171 tree def_for_init;
2172 tree init_def;
2173 tree t = NULL_TREE;
2174 int i;
2175 tree vector_type;
2176 bool nested_in_vect_loop = false;
2178 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2179 if (nested_in_vect_loop_p (loop, stmt))
2180 nested_in_vect_loop = true;
2181 else
2182 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2184 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2186 switch (code)
2188 case WIDEN_SUM_EXPR:
2189 case DOT_PROD_EXPR:
2190 case PLUS_EXPR:
2191 if (nested_in_vect_loop)
2192 *adjustment_def = vecdef;
2193 else
2194 *adjustment_def = init_val;
2195 /* Create a vector of zeros for init_def. */
2196 if (SCALAR_FLOAT_TYPE_P (type))
2197 def_for_init = build_real (type, dconst0);
2198 else
2199 def_for_init = build_int_cst (type, 0);
2200 for (i = nunits - 1; i >= 0; --i)
2201 t = tree_cons (NULL_TREE, def_for_init, t);
2202 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2203 gcc_assert (vector_type);
2204 init_def = build_vector (vector_type, t);
2205 break;
2207 case MIN_EXPR:
2208 case MAX_EXPR:
2209 *adjustment_def = NULL_TREE;
2210 init_def = vecdef;
2211 break;
2213 default:
2214 gcc_unreachable ();
2217 return init_def;
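/* For example, for the summation above with init_val = 10 and V4SI:
init_def = {0, 0, 0, 0} and *adjustment_def = 10; the loop then
accumulates four partial sums, and the epilog adds the 10 back after
reducing them (see vect_create_epilog_for_reduction below). */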
2221 /* Function vect_create_epilog_for_reduction
2223 Create code at the loop-epilog to finalize the result of a reduction
2224 computation.
2226 VECT_DEF is a vector of partial results.
2227 REDUC_CODE is the tree-code for the epilog reduction.
2228 STMT is the scalar reduction stmt that is being vectorized.
2229 REDUCTION_PHI is the phi-node that carries the reduction computation.
2231 This function:
2232 1. Creates the reduction def-use cycle: sets the arguments for
2233 REDUCTION_PHI:
2234 The loop-entry argument is the vectorized initial-value of the reduction.
2235 The loop-latch argument is VECT_DEF - the vector of partial sums.
2236 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2237 by applying the operation specified by REDUC_CODE if available, or by
2238 other means (whole-vector shifts or a scalar loop).
2239 The function also creates a new phi node at the loop exit to preserve
2240 loop-closed form, as illustrated below.
2242 The flow at the entry to this function:
2244 loop:
2245 vec_def = phi <null, null> # REDUCTION_PHI
2246 VECT_DEF = vector_stmt # vectorized form of STMT
2247 s_loop = scalar_stmt # (scalar) STMT
2248 loop_exit:
2249 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2250 use <s_out0>
2251 use <s_out0>
2253 The above is transformed by this function into:
2255 loop:
2256 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2257 VECT_DEF = vector_stmt # vectorized form of STMT
2258 s_loop = scalar_stmt # (scalar) STMT
2259 loop_exit:
2260 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2261 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2262 v_out2 = reduce <v_out1>
2263 s_out3 = extract_field <v_out2, 0>
2264 s_out4 = adjust_result <s_out3>
2265 use <s_out4>
2266 use <s_out4> */
2269 static void
2270 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2271 enum tree_code reduc_code, tree reduction_phi)
2273 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2274 tree vectype;
2275 enum machine_mode mode;
2276 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2277 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2278 basic_block exit_bb;
2279 tree scalar_dest;
2280 tree scalar_type;
2281 tree new_phi;
2282 block_stmt_iterator exit_bsi;
2283 tree vec_dest;
2284 tree new_temp = NULL_TREE;
2285 tree new_name;
2286 tree epilog_stmt = NULL_TREE;
2287 tree new_scalar_dest, exit_phi, new_dest;
2288 tree bitsize, bitpos, bytesize;
2289 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2290 tree adjustment_def;
2291 tree vec_initial_def;
2292 tree orig_name;
2293 imm_use_iterator imm_iter;
2294 use_operand_p use_p;
2295 bool extract_scalar_result = false;
2296 tree reduction_op, expr;
2297 tree orig_stmt;
2298 tree use_stmt;
2299 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2300 bool nested_in_vect_loop = false;
2301 int op_type;
2302 VEC(tree,heap) *phis = NULL;
2303 int i;
2305 if (nested_in_vect_loop_p (loop, stmt))
2307 loop = loop->inner;
2308 nested_in_vect_loop = true;
2311 op_type = TREE_OPERAND_LENGTH (operation);
2312 reduction_op = TREE_OPERAND (operation, op_type-1);
2313 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2314 gcc_assert (vectype);
2315 mode = TYPE_MODE (vectype);
2317 /*** 1. Create the reduction def-use cycle ***/
2319 /* 1.1 set the loop-entry arg of the reduction-phi: */
2320 /* For the case of reduction, vect_get_vec_def_for_operand returns
2321 the scalar def before the loop, that defines the initial value
2322 of the reduction variable. */
2323 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2324 &adjustment_def);
2325 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2327 /* 1.2 set the loop-latch arg for the reduction-phi: */
2328 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2330 if (vect_print_dump_info (REPORT_DETAILS))
2332 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2333 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2334 fprintf (vect_dump, "\n");
2335 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2339 /*** 2. Create epilog code
2340 The reduction epilog code operates across the elements of the vector
2341 of partial results computed by the vectorized loop.
2342 The reduction epilog code consists of:
2343 step 1: compute the scalar result in a vector (v_out2)
2344 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2345 step 3: adjust the scalar result (s_out3) if needed.
2347 Step 1 can be accomplished using one of the following three schemes:
2348 (scheme 1) using reduc_code, if available.
2349 (scheme 2) using whole-vector shifts, if available.
2350 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2351 combined.
2353 The overall epilog code looks like this:
2355 s_out0 = phi <s_loop> # original EXIT_PHI
2356 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2357 v_out2 = reduce <v_out1> # step 1
2358 s_out3 = extract_field <v_out2, 0> # step 2
2359 s_out4 = adjust_result <s_out3> # step 3
2361 (step 3 is optional, and steps 1 and 2 may be combined).
2362 Lastly, the uses of s_out0 are replaced by s_out4.
2364 ***/
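/* For instance, for a V4SI sum whose partial results are {s0,s1,s2,s3}:
step 1 computes s0+s1+s2+s3 into one element of v_out2 (by one of the
three schemes), step 2 extracts that element into s_out3, and step 3
adds back the adjustment recorded by get_initial_def_for_reduction. */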
2366 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2367 v_out1 = phi <v_loop> */
2369 exit_bb = single_exit (loop)->dest;
2370 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2371 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2372 exit_bsi = bsi_after_labels (exit_bb);
2374 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2375 (i.e. when reduc_code is not available) and in the final adjustment
2376 code (if needed). Also get the original scalar reduction variable as
2377 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2378 represents a reduction pattern), the tree-code and scalar-def are
2379 taken from the original stmt that the pattern-stmt (STMT) replaces.
2380 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2381 are taken from STMT. */
2383 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2384 if (!orig_stmt)
2386 /* Regular reduction */
2387 orig_stmt = stmt;
2389 else
2391 /* Reduction pattern */
2392 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2393 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2394 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2396 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2397 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2398 scalar_type = TREE_TYPE (scalar_dest);
2399 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2400 bitsize = TYPE_SIZE (scalar_type);
2401 bytesize = TYPE_SIZE_UNIT (scalar_type);
2404 /* In case this is a reduction in an inner-loop while vectorizing an outer
2405 loop - we don't need to extract a single scalar result at the end of the
2406 inner-loop. The final vector of partial results will be used in the
2407 vectorized outer-loop, or reduced to a scalar result at the end of the
2408 outer-loop. */
2409 if (nested_in_vect_loop)
2410 goto vect_finalize_reduction;
2412 /* 2.3 Create the reduction code, using one of the three schemes described
2413 above. */
2415 if (reduc_code < NUM_TREE_CODES)
2417 tree tmp;
2419 /*** Case 1: Create:
2420 v_out2 = reduc_expr <v_out1> */
2422 if (vect_print_dump_info (REPORT_DETAILS))
2423 fprintf (vect_dump, "Reduce using direct vector reduction.");
2425 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2426 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2427 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2428 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2429 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2430 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2432 extract_scalar_result = true;
2434 else
2436 enum tree_code shift_code = 0;
2437 bool have_whole_vector_shift = true;
2438 int bit_offset;
2439 int element_bitsize = tree_low_cst (bitsize, 1);
2440 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2441 tree vec_temp;
2443 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2444 shift_code = VEC_RSHIFT_EXPR;
2445 else
2446 have_whole_vector_shift = false;
2448 /* Regardless of whether we have a whole vector shift, if we're
2449 emulating the operation via tree-vect-generic, we don't want
2450 to use it. Only the first round of the reduction is likely
2451 to still be profitable via emulation. */
2452 /* ??? It might be better to emit a reduction tree code here, so that
2453 tree-vect-generic can expand the first round via bit tricks. */
2454 if (!VECTOR_MODE_P (mode))
2455 have_whole_vector_shift = false;
2456 else
2458 optab optab = optab_for_tree_code (code, vectype, optab_default);
2459 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2460 have_whole_vector_shift = false;
2463 if (have_whole_vector_shift)
2465 /*** Case 2: Create:
2466 for (offset = VS/2; offset >= element_size; offset/=2)
2468 Create: va' = vec_shift <va, offset>
2469 Create: va = vop <va, va'>
2470 } */
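/* E.g. for V4SI (vec_size_in_bits = 128, element_bitsize = 32) the loop
below runs twice, shifting by 64 and then 32 bits; after the two
shift/add rounds one element of va holds the complete reduction, and
the scalar result is extracted from it in step 2.4 below. */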
2472 if (vect_print_dump_info (REPORT_DETAILS))
2473 fprintf (vect_dump, "Reduce using vector shifts");
2475 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2476 new_temp = PHI_RESULT (new_phi);
2478 for (bit_offset = vec_size_in_bits/2;
2479 bit_offset >= element_bitsize;
2480 bit_offset /= 2)
2482 tree bitpos = size_int (bit_offset);
2483 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2484 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2485 new_name = make_ssa_name (vec_dest, epilog_stmt);
2486 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2487 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2489 tmp = build2 (code, vectype, new_name, new_temp);
2490 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2491 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2492 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2493 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2496 extract_scalar_result = true;
2498 else
2500 tree rhs;
2502 /*** Case 3: Create:
2503 s = extract_field <v_out2, 0>
2504 for (offset = element_size;
2505 offset < vector_size;
2506 offset += element_size)
2508 Create: s' = extract_field <v_out2, offset>
2509 Create: s = op <s, s'>
2510 } */
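/* E.g. for V4SI this emits s = v[0], then s = op (s, v[1]),
s = op (s, v[2]), s = op (s, v[3]): nunits - 1 scalar operations,
yielding the final scalar directly, so no extraction step is needed. */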
2512 if (vect_print_dump_info (REPORT_DETAILS))
2513 fprintf (vect_dump, "Reduce using scalar code. ");
2515 vec_temp = PHI_RESULT (new_phi);
2516 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2517 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2518 bitsize_zero_node);
2519 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2520 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2521 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2522 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2524 for (bit_offset = element_bitsize;
2525 bit_offset < vec_size_in_bits;
2526 bit_offset += element_bitsize)
2528 tree tmp;
2529 tree bitpos = bitsize_int (bit_offset);
2530 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2531 bitpos);
2533 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2534 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2535 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2536 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2538 tmp = build2 (code, scalar_type, new_name, new_temp);
2539 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2540 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2541 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2542 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2545 extract_scalar_result = false;
2549 /* 2.4 Extract the final scalar result. Create:
2550 s_out3 = extract_field <v_out2, bitpos> */
2552 if (extract_scalar_result)
2554 tree rhs;
2556 gcc_assert (!nested_in_vect_loop);
2557 if (vect_print_dump_info (REPORT_DETAILS))
2558 fprintf (vect_dump, "extract scalar result");
2560 if (BYTES_BIG_ENDIAN)
2561 bitpos = size_binop (MULT_EXPR,
2562 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2563 TYPE_SIZE (scalar_type));
2564 else
2565 bitpos = bitsize_zero_node;
2567 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2568 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2569 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2570 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2571 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2574 vect_finalize_reduction:
2576 /* 2.5 Adjust the final result by the initial value of the reduction
2577 variable. (When such adjustment is not needed, then
2578 'adjustment_def' is NULL_TREE). For example, if code is PLUS we create:
2579 new_temp = loop_exit_def + adjustment_def */
2581 if (adjustment_def)
2583 if (nested_in_vect_loop)
2585 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2586 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2587 new_dest = vect_create_destination_var (scalar_dest, vectype);
2589 else
2591 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2592 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2593 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2595 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2596 new_temp = make_ssa_name (new_dest, epilog_stmt);
2597 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2598 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2602 /* 2.6 Handle the loop-exit phi */
2604 /* Replace uses of s_out0 with uses of s_out3:
2605 Find the loop-closed-use at the loop exit of the original scalar result.
2606 (The reduction result is expected to have two immediate uses - one at the
2607 latch block, and one at the loop exit). */
2608 phis = VEC_alloc (tree, heap, 10);
2609 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2611 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2613 exit_phi = USE_STMT (use_p);
2614 VEC_quick_push (tree, phis, exit_phi);
2617 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2618 gcc_assert (!VEC_empty (tree, phis));
2620 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2622 if (nested_in_vect_loop)
2624 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2626 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2627 is not used in the outer-loop (but only outside the outer-loop). */
2628 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2629 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2631 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2632 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2633 set_stmt_info (get_stmt_ann (epilog_stmt),
2634 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2635 continue;
2638 /* Replace the uses: */
2639 orig_name = PHI_RESULT (exit_phi);
2640 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2641 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2642 SET_USE (use_p, new_temp);
2644 VEC_free (tree, heap, phis);
2648 /* Function vectorizable_reduction.
2650 Check if STMT performs a reduction operation that can be vectorized.
2651 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2652 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2653 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2655 This function also handles reduction idioms (patterns) that have been
2656 recognized in advance during vect_pattern_recog. In this case, STMT may be
2657 of this form:
2658 X = pattern_expr (arg0, arg1, ..., X)
2659 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2660 sequence that had been detected and replaced by the pattern-stmt (STMT).
2662 In some cases of reduction patterns, the type of the reduction variable X is
2663 different than the type of the other arguments of STMT.
2664 In such cases, the vectype that is used when transforming STMT into a vector
2665 stmt is different than the vectype that is used to determine the
2666 vectorization factor, because it consists of a different number of elements
2667 than the actual number of elements that are being operated upon in parallel.
2669 For example, consider an accumulation of shorts into an int accumulator.
2670 On some targets it's possible to vectorize this pattern operating on 8
2671 shorts at a time (hence, the vectype for purposes of determining the
2672 vectorization factor should be V8HI); on the other hand, the vectype that
2673 is used to create the vector form is actually V4SI (the type of the result).
2675 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2676 indicates the actual level of parallelism (V8HI in the example), so
2677 that the right vectorization factor is derived. This vectype
2678 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2679 be used to create the vectorized stmt. The right vectype for the vectorized
2680 stmt is obtained from the type of the result X:
2681 get_vectype_for_scalar_type (TREE_TYPE (X))
2683 This means that, contrary to "regular" reductions (or "regular" stmts in
2684 general), the following equation:
2685 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2686 does *NOT* necessarily hold for reduction patterns. */
2688 bool
2689 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2691 tree vec_dest;
2692 tree scalar_dest;
2693 tree op;
2694 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2695 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2696 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2697 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2698 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2699 tree operation;
2700 enum tree_code code, orig_code, epilog_reduc_code = 0;
2701 enum machine_mode vec_mode;
2702 int op_type;
2703 optab optab, reduc_optab;
2704 tree new_temp = NULL_TREE;
2705 tree def, def_stmt;
2706 enum vect_def_type dt;
2707 tree new_phi;
2708 tree scalar_type;
2709 bool is_simple_use;
2710 tree orig_stmt;
2711 stmt_vec_info orig_stmt_info;
2712 tree expr = NULL_TREE;
2713 int i;
2714 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2715 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2716 stmt_vec_info prev_stmt_info;
2717 tree reduc_def;
2718 tree new_stmt = NULL_TREE;
2719 int j;
2721 if (nested_in_vect_loop_p (loop, stmt))
2723 loop = loop->inner;
2724 /* FORNOW. This restriction should be relaxed. */
2725 if (ncopies > 1)
2727 if (vect_print_dump_info (REPORT_DETAILS))
2728 fprintf (vect_dump, "multiple types in nested loop.");
2729 return false;
2733 gcc_assert (ncopies >= 1);
2735 /* FORNOW: SLP not supported. */
2736 if (STMT_SLP_TYPE (stmt_info))
2737 return false;
2739 /* 1. Is vectorizable reduction? */
2741 /* Not supportable if the reduction variable is used in the loop. */
2742 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2743 return false;
2745 /* Reductions that are not used even in an enclosing outer-loop
2746 are expected to be "live" (used out of the loop). */
2747 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2748 && !STMT_VINFO_LIVE_P (stmt_info))
2749 return false;
2751 /* Make sure it was already recognized as a reduction computation. */
2752 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2753 return false;
2755 /* 2. Has this been recognized as a reduction pattern?
2757 Check if STMT represents a pattern that has been recognized
2758 in earlier analysis stages. For stmts that represent a pattern,
2759 the STMT_VINFO_RELATED_STMT field records the last stmt in
2760 the original sequence that constitutes the pattern. */
2762 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2763 if (orig_stmt)
2765 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2766 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2767 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2768 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2771 /* 3. Check the operands of the operation. The first operands are defined
2772 inside the loop body. The last operand is the reduction variable,
2773 which is defined by the loop-header-phi. */
2775 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2777 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2778 code = TREE_CODE (operation);
2779 op_type = TREE_OPERAND_LENGTH (operation);
2780 if (op_type != binary_op && op_type != ternary_op)
2781 return false;
2782 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2783 scalar_type = TREE_TYPE (scalar_dest);
2784 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2785 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2786 return false;
2788 /* All uses but the last are expected to be defined in the loop.
2789 The last use is the reduction variable. */
2790 for (i = 0; i < op_type-1; i++)
2792 op = TREE_OPERAND (operation, i);
2793 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2794 gcc_assert (is_simple_use);
2795 if (dt != vect_loop_def
2796 && dt != vect_invariant_def
2797 && dt != vect_constant_def
2798 && dt != vect_induction_def)
2799 return false;
2802 op = TREE_OPERAND (operation, i);
2803 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2804 gcc_assert (is_simple_use);
2805 gcc_assert (dt == vect_reduction_def);
2806 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2807 if (orig_stmt)
2808 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2809 else
2810 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2812 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2813 return false;
2815 /* 4. Supportable by target? */
2817 /* 4.1. Check support for the operation in the loop. */
2818 optab = optab_for_tree_code (code, vectype, optab_default);
2819 if (!optab)
2821 if (vect_print_dump_info (REPORT_DETAILS))
2822 fprintf (vect_dump, "no optab.");
2823 return false;
2825 vec_mode = TYPE_MODE (vectype);
2826 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2828 if (vect_print_dump_info (REPORT_DETAILS))
2829 fprintf (vect_dump, "op not supported by target.");
2830 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2831 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2832 < vect_min_worthwhile_factor (code))
2833 return false;
2834 if (vect_print_dump_info (REPORT_DETAILS))
2835 fprintf (vect_dump, "proceeding using word mode.");
2838 /* Worthwhile without SIMD support? */
2839 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2840 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2841 < vect_min_worthwhile_factor (code))
2843 if (vect_print_dump_info (REPORT_DETAILS))
2844 fprintf (vect_dump, "not worthwhile without SIMD support.");
2845 return false;
2848 /* 4.2. Check support for the epilog operation.
2850 If STMT represents a reduction pattern, then the type of the
2851 reduction variable may be different than the type of the rest
2852 of the arguments. For example, consider the case of accumulation
2853 of shorts into an int accumulator. The original code:
2854 S1: int_a = (int) short_a;
2855 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2857 was replaced with:
2858 STMT: int_acc = widen_sum <short_a, int_acc>
2860 This means that:
2861 1. The tree-code that is used to create the vector operation in the
2862 epilog code (that reduces the partial results) is not the
2863 tree-code of STMT, but is rather the tree-code of the original
2864 stmt from the pattern that STMT is replacing. I.e., in the example
2865 above we want to use 'widen_sum' in the loop, but 'plus' in the
2866 epilog.
2867 2. The type (mode) we use to check available target support
2868 for the vector operation to be created in the *epilog*, is
2869 determined by the type of the reduction variable (in the example
2870 above we'd check this: plus_optab[vect_int_mode]).
2871 However the type (mode) we use to check available target support
2872 for the vector operation to be created *inside the loop*, is
2873 determined by the type of the other arguments to STMT (in the
2874 example we'd check this: widen_sum_optab[vect_short_mode]).
2876 This is contrary to "regular" reductions, in which the types of all
2877 the arguments are the same as the type of the reduction variable.
2878 For "regular" reductions we can therefore use the same vector type
2879 (and also the same tree-code) when generating the epilog code and
2880 when generating the code inside the loop. */
2882 if (orig_stmt)
2884 /* This is a reduction pattern: get the vectype from the type of the
2885 reduction variable, and get the tree-code from orig_stmt. */
2886 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2887 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2888 if (!vectype)
2890 if (vect_print_dump_info (REPORT_DETAILS))
2892 fprintf (vect_dump, "unsupported data-type ");
2893 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2895 return false;
2898 vec_mode = TYPE_MODE (vectype);
2900 else
2902 /* Regular reduction: the same vectype and tree-code as used for
2903 the vector code inside the loop can be used for the epilog code. */
2904 orig_code = code;
2907 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2908 return false;
2909 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
2910 if (!reduc_optab)
2912 if (vect_print_dump_info (REPORT_DETAILS))
2913 fprintf (vect_dump, "no optab for reduction.");
2914 epilog_reduc_code = NUM_TREE_CODES;
2916 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2918 if (vect_print_dump_info (REPORT_DETAILS))
2919 fprintf (vect_dump, "reduc op not supported by target.");
2920 epilog_reduc_code = NUM_TREE_CODES;
2923 if (!vec_stmt) /* transformation not required. */
2925 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2926 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2927 return false;
2928 return true;
2931 /** Transform. **/
2933 if (vect_print_dump_info (REPORT_DETAILS))
2934 fprintf (vect_dump, "transform reduction.");
2936 /* Create the destination vector */
2937 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2939 /* Create the reduction-phi that defines the reduction-operand. */
2940 new_phi = create_phi_node (vec_dest, loop->header);
2942 /* In case the vectorization factor (VF) is bigger than the number
2943 of elements that we can fit in a vectype (nunits), we have to generate
2944 more than one vector stmt - i.e., we need to "unroll" the
2945 vector stmt by a factor VF/nunits. For more details see documentation
2946 in vectorizable_operation. */
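/* E.g. VF = 8 with V4SI gives ncopies = 2: copy 0 takes reduc_def from
the reduction PHI, and copy 1 chains on the result of copy 0 (see the
j > 0 case below). */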
2948 prev_stmt_info = NULL;
2949 for (j = 0; j < ncopies; j++)
2951 /* Handle uses. */
2952 if (j == 0)
2954 op = TREE_OPERAND (operation, 0);
2955 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2956 if (op_type == ternary_op)
2958 op = TREE_OPERAND (operation, 1);
2959 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2962 /* Get the vector def for the reduction variable from the phi node */
2963 reduc_def = PHI_RESULT (new_phi);
2965 else
2967 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2968 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2969 if (op_type == ternary_op)
2970 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2972 /* Get the vector def for the reduction variable from the vectorized
2973 reduction operation generated in the previous iteration (j-1) */
2974 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2977 /* Arguments are ready. Create the new vector stmt. */
2978 if (op_type == binary_op)
2979 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2980 else
2981 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2982 reduc_def);
2983 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2984 new_temp = make_ssa_name (vec_dest, new_stmt);
2985 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2986 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2988 if (j == 0)
2989 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2990 else
2991 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2992 prev_stmt_info = vinfo_for_stmt (new_stmt);
2995 /* Finalize the reduction-phi (set its arguments) and create the
2996 epilog reduction code. */
2997 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2998 return true;
3001 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3002 a function declaration if the target has a vectorized version
3003 of the function, or NULL_TREE if the function cannot be vectorized. */
3005 tree
3006 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3008 tree fndecl = get_callee_fndecl (call);
3009 enum built_in_function code;
3011 /* We only handle functions that do not read or clobber memory -- i.e.
3012 const or novops ones. */
3013 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3014 return NULL_TREE;
3016 if (!fndecl
3017 || TREE_CODE (fndecl) != FUNCTION_DECL
3018 || !DECL_BUILT_IN (fndecl))
3019 return NULL_TREE;
3021 code = DECL_FUNCTION_CODE (fndecl);
3022 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3023 vectype_in);
3026 /* Function vectorizable_call.
3028 Check if STMT performs a function call that can be vectorized.
3029 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3030 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3031 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3033 bool
3034 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3036 tree vec_dest;
3037 tree scalar_dest;
3038 tree operation;
3039 tree op, type;
3040 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3041 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3042 tree vectype_out, vectype_in;
3043 int nunits_in;
3044 int nunits_out;
3045 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3046 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3047 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3048 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3049 tree new_stmt;
3050 int ncopies, j, nargs;
3051 call_expr_arg_iterator iter;
3052 tree vargs;
3053 enum { NARROW, NONE, WIDEN } modifier;
3055 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3056 return false;
3058 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3059 return false;
3061 /* FORNOW: SLP not supported. */
3062 if (STMT_SLP_TYPE (stmt_info))
3063 return false;
3065 /* Is STMT a vectorizable call? */
3066 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3067 return false;
3069 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3070 return false;
3072 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3073 if (TREE_CODE (operation) != CALL_EXPR)
3074 return false;
3076 /* Process function arguments. */
3077 rhs_type = NULL_TREE;
3078 nargs = 0;
3079 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3081 /* Bail out if the function has more than two arguments; we
3082 do not have interesting builtin functions to vectorize with
3083 more than two arguments. */
3084 if (nargs >= 2)
3085 return false;
3087 /* We can only handle calls with arguments of the same type. */
3088 if (rhs_type
3089 && rhs_type != TREE_TYPE (op))
3091 if (vect_print_dump_info (REPORT_DETAILS))
3092 fprintf (vect_dump, "argument types differ.");
3093 return false;
3095 rhs_type = TREE_TYPE (op);
3097 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3099 if (vect_print_dump_info (REPORT_DETAILS))
3100 fprintf (vect_dump, "use not simple.");
3101 return false;
3104 ++nargs;
3107 /* No arguments is also not good. */
3108 if (nargs == 0)
3109 return false;
3111 vectype_in = get_vectype_for_scalar_type (rhs_type);
3112 if (!vectype_in)
3113 return false;
3114 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3116 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3117 vectype_out = get_vectype_for_scalar_type (lhs_type);
3118 if (!vectype_out)
3119 return false;
3120 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3122 /* FORNOW */
3123 if (nunits_in == nunits_out / 2)
3124 modifier = NARROW;
3125 else if (nunits_out == nunits_in)
3126 modifier = NONE;
3127 else if (nunits_out == nunits_in / 2)
3128 modifier = WIDEN;
3129 else
3130 return false;
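/* For instance, assuming 128-bit vectors: a call taking floats and
returning doubles has nunits_in = 4 (V4SF) and nunits_out = 2 (V2DF),
hence WIDEN; the reverse (doubles to floats) gives nunits_in = 2 and
nunits_out = 4, hence NARROW; equal unit counts give NONE. */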
3132 /* For now, we only vectorize functions if a target specific builtin
3133 is available. TODO -- in some cases, it might be profitable to
3134 insert the calls for pieces of the vector, in order to be able
3135 to vectorize other operations in the loop. */
3136 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3137 if (fndecl == NULL_TREE)
3139 if (vect_print_dump_info (REPORT_DETAILS))
3140 fprintf (vect_dump, "function is not vectorizable.");
3142 return false;
3145 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3147 if (modifier == NARROW)
3148 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3149 else
3150 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3152 /* Sanity check: make sure that at least one copy of the vectorized stmt
3153 needs to be generated. */
3154 gcc_assert (ncopies >= 1);
3156 /* FORNOW. This restriction should be relaxed. */
3157 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3159 if (vect_print_dump_info (REPORT_DETAILS))
3160 fprintf (vect_dump, "multiple types in nested loop.");
3161 return false;
3164 if (!vec_stmt) /* transformation not required. */
3166 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3167 if (vect_print_dump_info (REPORT_DETAILS))
3168 fprintf (vect_dump, "=== vectorizable_call ===");
3169 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3170 return true;
3173 /** Transform. **/
3175 if (vect_print_dump_info (REPORT_DETAILS))
3176 fprintf (vect_dump, "transform operation.");
3178 /* FORNOW. This restriction should be relaxed. */
3179 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3181 if (vect_print_dump_info (REPORT_DETAILS))
3182 fprintf (vect_dump, "multiple types in nested loop.");
3183 return false;
3186 /* Handle def. */
3187 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3188 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3190 prev_stmt_info = NULL;
3191 switch (modifier)
3193 case NONE:
3194 for (j = 0; j < ncopies; ++j)
3196 /* Build argument list for the vectorized call. */
3197 /* FIXME: Rewrite this so that it doesn't
3198 construct a temporary list. */
3199 vargs = NULL_TREE;
3200 nargs = 0;
3201 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3203 if (j == 0)
3204 vec_oprnd0
3205 = vect_get_vec_def_for_operand (op, stmt, NULL);
3206 else
3207 vec_oprnd0
3208 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3210 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3212 ++nargs;
3214 vargs = nreverse (vargs);
3216 rhs = build_function_call_expr (fndecl, vargs);
3217 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3218 new_temp = make_ssa_name (vec_dest, new_stmt);
3219 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3221 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3223 if (j == 0)
3224 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3225 else
3226 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3228 prev_stmt_info = vinfo_for_stmt (new_stmt);
3231 break;
3233 case NARROW:
3234 for (j = 0; j < ncopies; ++j)
3236 /* Build argument list for the vectorized call. */
3237 /* FIXME: Rewrite this so that it doesn't
3238 construct a temporary list. */
3239 vargs = NULL_TREE;
3240 nargs = 0;
3241 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3243 if (j == 0)
3245 vec_oprnd0
3246 = vect_get_vec_def_for_operand (op, stmt, NULL);
3247 vec_oprnd1
3248 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3250 else
3252 vec_oprnd0
3253 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3254 vec_oprnd1
3255 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3258 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3259 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3261 ++nargs;
3263 vargs = nreverse (vargs);
3265 rhs = build_function_call_expr (fndecl, vargs);
3266 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3267 new_temp = make_ssa_name (vec_dest, new_stmt);
3268 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3270 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3272 if (j == 0)
3273 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3274 else
3275 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3277 prev_stmt_info = vinfo_for_stmt (new_stmt);
3280 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3282 break;
3284 case WIDEN:
3285 /* No current target implements this case. */
3286 return false;
3289 /* The call in STMT might prevent it from being removed in dce.
3290 However, we cannot remove it here, due to the way the ssa name
3291 it defines is mapped to the new definition. So just replace
3292 rhs of the statement with something harmless. */
3293 type = TREE_TYPE (scalar_dest);
3294 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3295 update_stmt (stmt);
3297 return true;
3301 /* Function vect_gen_widened_results_half
3303 Create a vector stmt whose code, type, number of arguments, and result
3304 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3305 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3306 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3307 needs to be created (DECL is a function-decl of a target-builtin).
3308 STMT is the original scalar stmt that we are vectorizing. */
3310 static tree
3311 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3312 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3313 tree vec_dest, block_stmt_iterator *bsi,
3314 tree stmt)
3316 tree expr;
3317 tree new_stmt;
3318 tree new_temp;
3319 tree sym;
3320 ssa_op_iter iter;
3322 /* Generate half of the widened result: */
3323 if (code == CALL_EXPR)
3325 /* Target specific support */
3326 if (op_type == binary_op)
3327 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3328 else
3329 expr = build_call_expr (decl, 1, vec_oprnd0);
3331 else
3333 /* Generic support */
3334 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3335 if (op_type == binary_op)
3336 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3337 else
3338 expr = build1 (code, vectype, vec_oprnd0);
3340 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3341 new_temp = make_ssa_name (vec_dest, new_stmt);
3342 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3343 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3345 if (code == CALL_EXPR)
3347 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3349 if (TREE_CODE (sym) == SSA_NAME)
3350 sym = SSA_NAME_VAR (sym);
3351 mark_sym_for_renaming (sym);
3355 return new_stmt;
3359 /* Check if STMT performs a conversion operation that can be vectorized.
3360 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3361 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3362 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3364 bool
3365 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3366 tree *vec_stmt, slp_tree slp_node)
3368 tree vec_dest;
3369 tree scalar_dest;
3370 tree operation;
3371 tree op0;
3372 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3373 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3374 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3375 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3376 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3377 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3378 tree new_temp;
3379 tree def, def_stmt;
3380 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3381 tree new_stmt = NULL_TREE;
3382 stmt_vec_info prev_stmt_info;
3383 int nunits_in;
3384 int nunits_out;
3385 tree vectype_out, vectype_in;
3386 int ncopies, j;
3387 tree expr;
3388 tree rhs_type, lhs_type;
3389 tree builtin_decl;
3390 enum { NARROW, NONE, WIDEN } modifier;
3391 int i;
3392 VEC(tree,heap) *vec_oprnds0 = NULL;
3393 tree vop0;
3395 /* Is STMT a vectorizable conversion? */
3397 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3398 return false;
3400 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3401 return false;
3403 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3404 return false;
3406 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3407 return false;
3409 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3410 code = TREE_CODE (operation);
3411 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3412 return false;
3414 /* Check types of lhs and rhs. */
3415 op0 = TREE_OPERAND (operation, 0);
3416 rhs_type = TREE_TYPE (op0);
3417 vectype_in = get_vectype_for_scalar_type (rhs_type);
3418 if (!vectype_in)
3419 return false;
3420 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3422 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3423 lhs_type = TREE_TYPE (scalar_dest);
3424 vectype_out = get_vectype_for_scalar_type (lhs_type);
3425 if (!vectype_out)
3426 return false;
3427 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3429 /* FORNOW */
3430 if (nunits_in == nunits_out / 2)
3431 modifier = NARROW;
3432 else if (nunits_out == nunits_in)
3433 modifier = NONE;
3434 else if (nunits_out == nunits_in / 2)
3435 modifier = WIDEN;
3436 else
3437 return false;
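/* For example, on a target with 128-bit vectors: V4SI <-> V4SF has
   nunits_in == nunits_out (NONE); FLOAT_EXPR from V4SI to V2DF has
   nunits_out == nunits_in / 2 (WIDEN); FIX_TRUNC_EXPR from V2DF to V4SI
   has nunits_in == nunits_out / 2 (NARROW). */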
3439 if (modifier == NONE)
3440 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3442 /* Bail out if both types are integral or both are non-integral. */
3443 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3444 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3445 return false;
3447 if (modifier == NARROW)
3448 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3449 else
3450 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
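/* E.g., with a vectorization factor of 8: a NARROW conversion producing
   V4SI results needs ncopies = 8 / 4 = 2, and a WIDEN conversion
   consuming V4SI operands also needs 8 / 4 = 2 copies, each of which
   produces two output vectors (a high and a low half). */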
3452 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3453 this, so we can safely override NCOPIES with 1 here. */
3454 if (slp_node)
3455 ncopies = 1;
3457 /* Sanity check: make sure that at least one copy of the vectorized stmt
3458 needs to be generated. */
3459 gcc_assert (ncopies >= 1);
3461 /* FORNOW. This restriction should be relaxed. */
3462 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3464 if (vect_print_dump_info (REPORT_DETAILS))
3465 fprintf (vect_dump, "multiple types in nested loop.");
3466 return false;
3469 /* Check the operands of the operation. */
3470 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3472 if (vect_print_dump_info (REPORT_DETAILS))
3473 fprintf (vect_dump, "use not simple.");
3474 return false;
3477 /* Supportable by target? */
3478 if ((modifier == NONE
3479 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3480 || (modifier == WIDEN
3481 && !supportable_widening_operation (code, stmt, vectype_in,
3482 &decl1, &decl2,
3483 &code1, &code2))
3484 || (modifier == NARROW
3485 && !supportable_narrowing_operation (code, stmt, vectype_in,
3486 &code1)))
3488 if (vect_print_dump_info (REPORT_DETAILS))
3489 fprintf (vect_dump, "op not supported by target.");
3490 return false;
3493 if (modifier != NONE)
3495 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3496 /* FORNOW: SLP not supported. */
3497 if (STMT_SLP_TYPE (stmt_info))
3498 return false;
3501 if (!vec_stmt) /* transformation not required. */
3503 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3504 return true;
3507 /** Transform. **/
3508 if (vect_print_dump_info (REPORT_DETAILS))
3509 fprintf (vect_dump, "transform conversion.");
3511 /* Handle def. */
3512 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3514 if (modifier == NONE && !slp_node)
3515 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3517 prev_stmt_info = NULL;
3518 switch (modifier)
3520 case NONE:
3521 for (j = 0; j < ncopies; j++)
3523 tree sym;
3524 ssa_op_iter iter;
3526 if (j == 0)
3527 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3528 else
3529 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3531 builtin_decl =
3532 targetm.vectorize.builtin_conversion (code, vectype_in);
3533 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3535 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3537 /* Arguments are ready. Create the new vector stmt. */
3538 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3539 new_temp = make_ssa_name (vec_dest, new_stmt);
3540 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3541 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3542 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3543 SSA_OP_ALL_VIRTUALS)
3545 if (TREE_CODE (sym) == SSA_NAME)
3546 sym = SSA_NAME_VAR (sym);
3547 mark_sym_for_renaming (sym);
3549 if (slp_node)
3550 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3553 if (j == 0)
3554 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3555 else
3556 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3557 prev_stmt_info = vinfo_for_stmt (new_stmt);
3559 break;
3561 case WIDEN:
3562 /* In case the vectorization factor (VF) is bigger than the number
3563 of elements that we can fit in a vectype (nunits), we have to
3564 generate more than one vector stmt - i.e - we need to "unroll"
3565 the vector stmt by a factor VF/nunits. */
3566 for (j = 0; j < ncopies; j++)
3568 if (j == 0)
3569 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3570 else
3571 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3573 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3575 /* Generate first half of the widened result: */
3576 new_stmt
3577 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3578 vec_oprnd0, vec_oprnd1,
3579 unary_op, vec_dest, bsi, stmt);
3580 if (j == 0)
3581 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3582 else
3583 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3584 prev_stmt_info = vinfo_for_stmt (new_stmt);
3586 /* Generate second half of the widened result: */
3587 new_stmt
3588 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3589 vec_oprnd0, vec_oprnd1,
3590 unary_op, vec_dest, bsi, stmt);
3591 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3592 prev_stmt_info = vinfo_for_stmt (new_stmt);
3594 break;
3596 case NARROW:
3597 /* In case the vectorization factor (VF) is bigger than the number
3598 of elements that we can fit in a vectype (nunits), we have to
3599 generate more than one vector stmt - i.e - we need to "unroll"
3600 the vector stmt by a factor VF/nunits. */
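/* Each narrowed vector stmt consumes a pair of input vectors, so the
   j == 0 / j > 0 handling below walks the input defs two at a time:
   VEC_OPRND0 and VEC_OPRND1 always hold the next consecutive pair. */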
3601 for (j = 0; j < ncopies; j++)
3603 /* Handle uses. */
3604 if (j == 0)
3606 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3607 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3609 else
3611 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3612 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3615 /* Arguments are ready. Create the new vector stmt. */
3616 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3617 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3618 new_temp = make_ssa_name (vec_dest, new_stmt);
3619 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3620 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3622 if (j == 0)
3623 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3624 else
3625 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3627 prev_stmt_info = vinfo_for_stmt (new_stmt);
3630 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3633 if (vec_oprnds0)
3634 VEC_free (tree, heap, vec_oprnds0);
3636 return true;
3640 /* Function vectorizable_assignment.
3642 Check if STMT performs an assignment (copy) that can be vectorized.
3643 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3644 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3645 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
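/* E.g., a scalar copy "x_1 = y_2" in the loop body becomes a single
   vector-to-vector copy per vectorized iteration (FORNOW only when one
   vector copy suffices, i.e., when ncopies == 1; see below). */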
3647 bool
3648 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3649 slp_tree slp_node)
3651 tree vec_dest;
3652 tree scalar_dest;
3653 tree op;
3654 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3655 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3656 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3657 tree new_temp;
3658 tree def, def_stmt;
3659 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3660 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3661 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3662 int i;
3663 VEC(tree,heap) *vec_oprnds = NULL;
3664 tree vop;
3666 /* FORNOW: SLP with multiple types is not supported. The SLP analysis
3667 verifies this, so we can safely override NCOPIES with 1 here. */
3668 if (slp_node)
3669 ncopies = 1;
3671 gcc_assert (ncopies >= 1);
3672 if (ncopies > 1)
3673 return false; /* FORNOW */
3675 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3676 return false;
3678 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3679 return false;
3681 /* Is vectorizable assignment? */
3682 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3683 return false;
3685 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3686 if (TREE_CODE (scalar_dest) != SSA_NAME)
3687 return false;
3689 op = GIMPLE_STMT_OPERAND (stmt, 1);
3690 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3692 if (vect_print_dump_info (REPORT_DETAILS))
3693 fprintf (vect_dump, "use not simple.");
3694 return false;
3697 if (!vec_stmt) /* transformation not required. */
3699 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3700 if (vect_print_dump_info (REPORT_DETAILS))
3701 fprintf (vect_dump, "=== vectorizable_assignment ===");
3702 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3703 return true;
3706 /** Transform. **/
3707 if (vect_print_dump_info (REPORT_DETAILS))
3708 fprintf (vect_dump, "transform assignment.");
3710 /* Handle def. */
3711 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3713 /* Handle use. */
3714 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3716 /* Arguments are ready. Create the new vector stmt. */
3717 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3719 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3720 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3721 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3722 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3723 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3725 if (slp_node)
3726 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3729 VEC_free (tree, heap, vec_oprnds);
3730 return true;
3734 /* Function vect_min_worthwhile_factor.
3736 For a loop where we could vectorize the operation indicated by CODE,
3737 return the minimum vectorization factor that makes it worthwhile
3738 to use generic vectors. */
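/* E.g., for a PLUS_EXPR emulated in word mode the returned factor is 4,
   so vectorization only proceeds without SIMD support when the
   vectorization factor reaches 4; vectorizable_operation compares
   LOOP_VINFO_VECT_FACTOR against this threshold. */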
3739 static int
3740 vect_min_worthwhile_factor (enum tree_code code)
3742 switch (code)
3744 case PLUS_EXPR:
3745 case MINUS_EXPR:
3746 case NEGATE_EXPR:
3747 return 4;
3749 case BIT_AND_EXPR:
3750 case BIT_IOR_EXPR:
3751 case BIT_XOR_EXPR:
3752 case BIT_NOT_EXPR:
3753 return 2;
3755 default:
3756 return INT_MAX;
3761 /* Function vectorizable_induction
3763 Check if PHI performs an induction computation that can be vectorized.
3764 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3765 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3766 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
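/* E.g., with VF = 4, an induction "i = i + 1" is vectorized (by
   get_initial_def_for_induction) into a vector phi whose initial value
   is {i0, i0+1, i0+2, i0+3} and which is incremented by {4, 4, 4, 4} in
   each vectorized iteration. */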
3768 bool
3769 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3770 tree *vec_stmt)
3772 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3775 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3776 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3777 tree vec_def;
3779 gcc_assert (ncopies >= 1);
3781 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3782 return false;
3784 /* FORNOW: SLP not supported. */
3785 if (STMT_SLP_TYPE (stmt_info))
3786 return false;
3788 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3790 if (TREE_CODE (phi) != PHI_NODE)
3791 return false;
3793 if (!vec_stmt) /* transformation not required. */
3795 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3796 if (vect_print_dump_info (REPORT_DETAILS))
3797 fprintf (vect_dump, "=== vectorizable_induction ===");
3798 vect_model_induction_cost (stmt_info, ncopies);
3799 return true;
3802 /** Transform. **/
3804 if (vect_print_dump_info (REPORT_DETAILS))
3805 fprintf (vect_dump, "transform induction phi.");
3807 vec_def = get_initial_def_for_induction (phi);
3808 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3809 return true;
3813 /* Function vectorizable_operation.
3815 Check if STMT performs a binary or unary operation that can be vectorized.
3816 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3817 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3818 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3820 bool
3821 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3822 slp_tree slp_node)
3824 tree vec_dest;
3825 tree scalar_dest;
3826 tree operation;
3827 tree op0, op1 = NULL;
3828 tree vec_oprnd1 = NULL_TREE;
3829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3831 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3832 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3833 enum tree_code code;
3834 enum machine_mode vec_mode;
3835 tree new_temp;
3836 int op_type;
3837 optab optab;
3838 int icode;
3839 enum machine_mode optab_op2_mode;
3840 tree def, def_stmt;
3841 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3842 tree new_stmt = NULL_TREE;
3843 stmt_vec_info prev_stmt_info;
3844 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3845 int nunits_out;
3846 tree vectype_out;
3847 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3848 int j, i;
3849 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3850 tree vop0, vop1;
3851 unsigned int k;
3852 bool shift_p = false;
3853 bool scalar_shift_arg = false;
3855 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3856 this, so we can safely override NCOPIES with 1 here. */
3857 if (slp_node)
3858 ncopies = 1;
3859 gcc_assert (ncopies >= 1);
3860 /* FORNOW. This restriction should be relaxed. */
3861 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3863 if (vect_print_dump_info (REPORT_DETAILS))
3864 fprintf (vect_dump, "multiple types in nested loop.");
3865 return false;
3868 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3869 return false;
3871 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3872 return false;
3874 /* Is STMT a vectorizable binary/unary operation? */
3875 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3876 return false;
3878 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3879 return false;
3881 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3882 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3883 if (!vectype_out)
3884 return false;
3885 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3886 if (nunits_out != nunits_in)
3887 return false;
3889 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3890 code = TREE_CODE (operation);
3892 /* For pointer addition, we should use the normal plus for
3893 the vector addition. */
3894 if (code == POINTER_PLUS_EXPR)
3895 code = PLUS_EXPR;
3897 /* Support only unary or binary operations. */
3898 op_type = TREE_OPERAND_LENGTH (operation);
3899 if (op_type != unary_op && op_type != binary_op)
3901 if (vect_print_dump_info (REPORT_DETAILS))
3902 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3903 return false;
3906 op0 = TREE_OPERAND (operation, 0);
3907 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3909 if (vect_print_dump_info (REPORT_DETAILS))
3910 fprintf (vect_dump, "use not simple.");
3911 return false;
3914 if (op_type == binary_op)
3916 op1 = TREE_OPERAND (operation, 1);
3917 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3919 if (vect_print_dump_info (REPORT_DETAILS))
3920 fprintf (vect_dump, "use not simple.");
3921 return false;
3925 /* If this is a shift/rotate, determine whether the shift amount is a vector
3926 or a scalar. If the shift/rotate amount is a vector, use the vector/vector
3927 shift optabs. */
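/* E.g., in "a[i] = b[i] << c[i]" the shift amount is itself defined in
   the loop (dt[1] == vect_loop_def) and the vector/vector optab is
   needed, whereas in "a[i] = b[i] << 3" the amount is loop invariant
   and a vector/scalar shift insn can be used if the target has one. */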
3928 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
3929 || code == RROTATE_EXPR)
3931 shift_p = true;
3933 /* vector shifted by vector */
3934 if (dt[1] == vect_loop_def)
3936 optab = optab_for_tree_code (code, vectype, optab_vector);
3937 if (vect_print_dump_info (REPORT_DETAILS))
3938 fprintf (vect_dump, "vector/vector shift/rotate found.");
3941 /* See if the machine has a vector shifted by scalar insn and if not
3942 then see if it has a vector shifted by vector insn. */
3943 else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
3945 optab = optab_for_tree_code (code, vectype, optab_scalar);
3946 if (optab
3947 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
3948 != CODE_FOR_nothing))
3950 scalar_shift_arg = true;
3951 if (vect_print_dump_info (REPORT_DETAILS))
3952 fprintf (vect_dump, "vector/scalar shift/rotate found.");
3954 else
3956 optab = optab_for_tree_code (code, vectype, optab_vector);
3957 if (vect_print_dump_info (REPORT_DETAILS)
3958 && optab
3959 && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
3960 != CODE_FOR_nothing))
3961 fprintf (vect_dump, "vector/vector shift/rotate found.");
3965 else
3967 if (vect_print_dump_info (REPORT_DETAILS))
3968 fprintf (vect_dump, "operand mode requires invariant argument.");
3969 return false;
3972 else
3973 optab = optab_for_tree_code (code, vectype, optab_default);
3975 /* Supportable by target? */
3976 if (!optab)
3978 if (vect_print_dump_info (REPORT_DETAILS))
3979 fprintf (vect_dump, "no optab.");
3980 return false;
3982 vec_mode = TYPE_MODE (vectype);
3983 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3984 if (icode == CODE_FOR_nothing)
3986 if (vect_print_dump_info (REPORT_DETAILS))
3987 fprintf (vect_dump, "op not supported by target.");
3988 /* Check only during analysis. */
3989 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3990 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3991 < vect_min_worthwhile_factor (code)
3992 && !vec_stmt))
3993 return false;
3994 if (vect_print_dump_info (REPORT_DETAILS))
3995 fprintf (vect_dump, "proceeding using word mode.");
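/* E.g., a V4QI addition on a 32-bit target with no SIMD insn can still
   be carried out with word-sized integer operations; this is accepted
   only when the vector fits in one word and, during analysis, when the
   vectorization factor reaches vect_min_worthwhile_factor (code). */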
3998 /* Worthwhile without SIMD support? Check only during analysis. */
3999 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
4000 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4001 < vect_min_worthwhile_factor (code)
4002 && !vec_stmt)
4004 if (vect_print_dump_info (REPORT_DETAILS))
4005 fprintf (vect_dump, "not worthwhile without SIMD support.");
4006 return false;
4009 if (!vec_stmt) /* transformation not required. */
4011 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
4012 if (vect_print_dump_info (REPORT_DETAILS))
4013 fprintf (vect_dump, "=== vectorizable_operation ===");
4014 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4015 return true;
4018 /** Transform. **/
4020 if (vect_print_dump_info (REPORT_DETAILS))
4021 fprintf (vect_dump, "transform binary/unary operation.");
4023 /* Handle def. */
4024 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4026 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4027 created in the previous stages of the recursion, so no allocation is
4028 needed, except for the case of shift with scalar shift argument. In that
4029 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4030 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4031 In case of loop-based vectorization we allocate VECs of size 1. We
4032 allocate VEC_OPRNDS1 only in case of binary operation. */
4033 if (!slp_node)
4035 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4036 if (op_type == binary_op)
4037 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4039 else if (scalar_shift_arg)
4040 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4042 /* In case the vectorization factor (VF) is bigger than the number
4043 of elements that we can fit in a vectype (nunits), we have to generate
4044 more than one vector stmt - i.e - we need to "unroll" the
4045 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4046 from one copy of the vector stmt to the next, in the field
4047 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4048 stages to find the correct vector defs to be used when vectorizing
4049 stmts that use the defs of the current stmt. The example below illustrates
4050 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
4051 4 vectorized stmts):
4053 before vectorization:
4054 RELATED_STMT VEC_STMT
4055 S1: x = memref - -
4056 S2: z = x + 1 - -
4058 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4059 there):
4060 RELATED_STMT VEC_STMT
4061 VS1_0: vx0 = memref0 VS1_1 -
4062 VS1_1: vx1 = memref1 VS1_2 -
4063 VS1_2: vx2 = memref2 VS1_3 -
4064 VS1_3: vx3 = memref3 - -
4065 S1: x = load - VS1_0
4066 S2: z = x + 1 - -
4068 step2: vectorize stmt S2 (done here):
4069 To vectorize stmt S2 we first need to find the relevant vector
4070 def for the first operand 'x'. This is, as usual, obtained from
4071 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4072 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4073 relevant vector def 'vx0'. Having found 'vx0' we can generate
4074 the vector stmt VS2_0, and as usual, record it in the
4075 STMT_VINFO_VEC_STMT of stmt S2.
4076 When creating the second copy (VS2_1), we obtain the relevant vector
4077 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4078 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4079 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4080 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4081 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4082 chain of stmts and pointers:
4083 RELATED_STMT VEC_STMT
4084 VS1_0: vx0 = memref0 VS1_1 -
4085 VS1_1: vx1 = memref1 VS1_2 -
4086 VS1_2: vx2 = memref2 VS1_3 -
4087 VS1_3: vx3 = memref3 - -
4088 S1: x = load - VS1_0
4089 VS2_0: vz0 = vx0 + v1 VS2_1 -
4090 VS2_1: vz1 = vx1 + v1 VS2_2 -
4091 VS2_2: vz2 = vx2 + v1 VS2_3 -
4092 VS2_3: vz3 = vx3 + v1 - -
4093 S2: z = x + 1 - VS2_0 */
4095 prev_stmt_info = NULL;
4096 for (j = 0; j < ncopies; j++)
4098 /* Handle uses. */
4099 if (j == 0)
4101 if (op_type == binary_op && scalar_shift_arg)
4103 /* Vector shl and shr insn patterns can be defined with scalar
4104 operand 2 (shift operand). In this case, use constant or loop
4105 invariant op1 directly, without extending it to vector mode
4106 first. */
4107 optab_op2_mode = insn_data[icode].operand[2].mode;
4108 if (!VECTOR_MODE_P (optab_op2_mode))
4110 if (vect_print_dump_info (REPORT_DETAILS))
4111 fprintf (vect_dump, "operand 1 using scalar mode.");
4112 vec_oprnd1 = op1;
4113 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4114 if (slp_node)
4116 /* Store vec_oprnd1 for every vector stmt to be created
4117 for SLP_NODE. We check during the analysis that all the
4118 shift arguments are the same.
4119 TODO: Allow different constants for different vector
4120 stmts generated for an SLP instance. */
4121 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4122 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4127 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4128 (a special case for certain kind of vector shifts); otherwise,
4129 operand 1 should be of a vector type (the usual case). */
4130 if (op_type == binary_op && !vec_oprnd1)
4131 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4132 slp_node);
4133 else
4134 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4135 slp_node);
4137 else
4138 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4140 /* Arguments are ready. Create the new vector stmt. */
4141 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4143 if (op_type == binary_op)
4145 vop1 = VEC_index (tree, vec_oprnds1, i);
4146 new_stmt = build_gimple_modify_stmt (vec_dest,
4147 build2 (code, vectype, vop0, vop1));
4149 else
4150 new_stmt = build_gimple_modify_stmt (vec_dest,
4151 build1 (code, vectype, vop0));
4153 new_temp = make_ssa_name (vec_dest, new_stmt);
4154 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4155 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4156 if (slp_node)
4157 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4160 if (j == 0)
4161 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4162 else
4163 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4164 prev_stmt_info = vinfo_for_stmt (new_stmt);
4167 VEC_free (tree, heap, vec_oprnds0);
4168 if (vec_oprnds1)
4169 VEC_free (tree, heap, vec_oprnds1);
4171 return true;
4175 /* Function vectorizable_type_demotion
4177 Check if STMT performs a binary or unary operation that involves
4178 type demotion, and if it can be vectorized.
4179 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4180 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4181 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4183 bool
4184 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4185 tree *vec_stmt)
4187 tree vec_dest;
4188 tree scalar_dest;
4189 tree operation;
4190 tree op0;
4191 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4192 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4193 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4194 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4195 enum tree_code code, code1 = ERROR_MARK;
4196 tree new_temp;
4197 tree def, def_stmt;
4198 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4199 tree new_stmt;
4200 stmt_vec_info prev_stmt_info;
4201 int nunits_in;
4202 int nunits_out;
4203 tree vectype_out;
4204 int ncopies;
4205 int j;
4206 tree expr;
4207 tree vectype_in;
4209 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4210 return false;
4212 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4213 return false;
4215 /* Is STMT a vectorizable type-demotion operation? */
4216 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4217 return false;
4219 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4220 return false;
4222 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4223 code = TREE_CODE (operation);
4224 if (code != NOP_EXPR && code != CONVERT_EXPR)
4225 return false;
4227 op0 = TREE_OPERAND (operation, 0);
4228 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4229 if (!vectype_in)
4230 return false;
4231 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4233 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4234 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4235 if (!vectype_out)
4236 return false;
4237 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4238 if (nunits_in != nunits_out / 2) /* FORNOW */
4239 return false;
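/* E.g., on a target with 128-bit vectors, a short -> char demotion
   takes V8HI operands (nunits_in = 8) to a V16QI result
   (nunits_out = 16), packing two input vectors into each output
   vector. */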
4241 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4242 gcc_assert (ncopies >= 1);
4243 /* FORNOW. This restriction should be relaxed. */
4244 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4246 if (vect_print_dump_info (REPORT_DETAILS))
4247 fprintf (vect_dump, "multiple types in nested loop.");
4248 return false;
4251 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4252 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4253 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4254 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4255 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4256 return false;
4258 /* Check the operands of the operation. */
4259 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4261 if (vect_print_dump_info (REPORT_DETAILS))
4262 fprintf (vect_dump, "use not simple.");
4263 return false;
4266 /* Supportable by target? */
4267 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4268 return false;
4270 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4272 if (!vec_stmt) /* transformation not required. */
4274 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4275 if (vect_print_dump_info (REPORT_DETAILS))
4276 fprintf (vect_dump, "=== vectorizable_demotion ===");
4277 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4278 return true;
4281 /** Transform. **/
4282 if (vect_print_dump_info (REPORT_DETAILS))
4283 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4284 ncopies);
4286 /* Handle def. */
4287 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4289 /* In case the vectorization factor (VF) is bigger than the number
4290 of elements that we can fit in a vectype (nunits), we have to generate
4291 more than one vector stmt - i.e - we need to "unroll" the
4292 vector stmt by a factor VF/nunits. */
4293 prev_stmt_info = NULL;
4294 for (j = 0; j < ncopies; j++)
4296 /* Handle uses. */
4297 if (j == 0)
4299 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4300 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4302 else
4304 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4305 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4308 /* Arguments are ready. Create the new vector stmt. */
4309 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4310 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4311 new_temp = make_ssa_name (vec_dest, new_stmt);
4312 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4313 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4315 if (j == 0)
4316 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4317 else
4318 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4320 prev_stmt_info = vinfo_for_stmt (new_stmt);
4323 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4324 return true;
4328 /* Function vectorizable_type_promotion
4330 Check if STMT performs a binary or unary operation that involves
4331 type promotion, and if it can be vectorized.
4332 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4333 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4334 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4336 bool
4337 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4338 tree *vec_stmt)
4340 tree vec_dest;
4341 tree scalar_dest;
4342 tree operation;
4343 tree op0, op1 = NULL;
4344 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4345 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4346 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4347 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4348 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4349 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4350 int op_type;
4351 tree def, def_stmt;
4352 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4353 tree new_stmt;
4354 stmt_vec_info prev_stmt_info;
4355 int nunits_in;
4356 int nunits_out;
4357 tree vectype_out;
4358 int ncopies;
4359 int j;
4360 tree vectype_in;
4362 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4363 return false;
4365 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4366 return false;
4368 /* Is STMT a vectorizable type-promotion operation? */
4369 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4370 return false;
4372 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4373 return false;
4375 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4376 code = TREE_CODE (operation);
4377 if (code != NOP_EXPR && code != CONVERT_EXPR
4378 && code != WIDEN_MULT_EXPR)
4379 return false;
4381 op0 = TREE_OPERAND (operation, 0);
4382 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4383 if (!vectype_in)
4384 return false;
4385 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4387 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4388 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4389 if (!vectype_out)
4390 return false;
4391 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4392 if (nunits_out != nunits_in / 2) /* FORNOW */
4393 return false;
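/* E.g., on a target with 128-bit vectors, a char -> short promotion
   takes V16QI operands (nunits_in = 16) to V8HI results
   (nunits_out = 8), so each input vector is widened into two output
   vectors (the high and low halves generated below). */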
4395 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4396 gcc_assert (ncopies >= 1);
4397 /* FORNOW. This restriction should be relaxed. */
4398 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4400 if (vect_print_dump_info (REPORT_DETAILS))
4401 fprintf (vect_dump, "multiple types in nested loop.");
4402 return false;
4405 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4406 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4407 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4408 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4409 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4410 return false;
4412 /* Check the operands of the operation. */
4413 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4415 if (vect_print_dump_info (REPORT_DETAILS))
4416 fprintf (vect_dump, "use not simple.");
4417 return false;
4420 op_type = TREE_CODE_LENGTH (code);
4421 if (op_type == binary_op)
4423 op1 = TREE_OPERAND (operation, 1);
4424 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4426 if (vect_print_dump_info (REPORT_DETAILS))
4427 fprintf (vect_dump, "use not simple.");
4428 return false;
4432 /* Supportable by target? */
4433 if (!supportable_widening_operation (code, stmt, vectype_in,
4434 &decl1, &decl2, &code1, &code2))
4435 return false;
4437 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4439 if (!vec_stmt) /* transformation not required. */
4441 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4442 if (vect_print_dump_info (REPORT_DETAILS))
4443 fprintf (vect_dump, "=== vectorizable_promotion ===");
4444 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4445 return true;
4448 /** Transform. **/
4450 if (vect_print_dump_info (REPORT_DETAILS))
4451 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4452 ncopies);
4454 /* Handle def. */
4455 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4457 /* In case the vectorization factor (VF) is bigger than the number
4458 of elements that we can fit in a vectype (nunits), we have to generate
4459 more than one vector stmt - i.e - we need to "unroll" the
4460 vector stmt by a factor VF/nunits. */
4462 prev_stmt_info = NULL;
4463 for (j = 0; j < ncopies; j++)
4465 /* Handle uses. */
4466 if (j == 0)
4468 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4469 if (op_type == binary_op)
4470 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4472 else
4474 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4475 if (op_type == binary_op)
4476 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4479 /* Arguments are ready. Create the new vector stmt. We are creating
4480 two vector defs because the widened result does not fit in one vector.
4481 The vectorized stmt can be expressed as a call to a target builtin,
4482 or by using a tree-code. */
4483 /* Generate first half of the widened result: */
4484 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4485 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4486 if (j == 0)
4487 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4488 else
4489 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4490 prev_stmt_info = vinfo_for_stmt (new_stmt);
4492 /* Generate second half of the widened result: */
4493 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4494 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4495 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4496 prev_stmt_info = vinfo_for_stmt (new_stmt);
4500 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4501 return true;
4505 /* Function vect_strided_store_supported.
4507 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4508 and FALSE otherwise. */
4510 static bool
4511 vect_strided_store_supported (tree vectype)
4513 optab interleave_high_optab, interleave_low_optab;
4514 int mode;
4516 mode = (int) TYPE_MODE (vectype);
4518 /* Check that the operation is supported. */
4519 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4520 vectype, optab_default);
4521 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4522 vectype, optab_default);
4523 if (!interleave_high_optab || !interleave_low_optab)
4525 if (vect_print_dump_info (REPORT_DETAILS))
4526 fprintf (vect_dump, "no optab for interleave.");
4527 return false;
4530 if (optab_handler (interleave_high_optab, mode)->insn_code
4531 == CODE_FOR_nothing
4532 || optab_handler (interleave_low_optab, mode)->insn_code
4533 == CODE_FOR_nothing)
4535 if (vect_print_dump_info (REPORT_DETAILS))
4536 fprintf (vect_dump, "interleave op not supported by target.");
4537 return false;
4540 return true;
4544 /* Function vect_permute_store_chain.
4546 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4547 a power of 2, generate interleave_high/low stmts to reorder the data
4548 correctly for the stores. Return the final references for stores in
4549 RESULT_CHAIN.
4551 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4552 The input is 4 vectors each containing 8 elements. We assign a number to each
4553 element; the input sequence is:
4555 1st vec: 0 1 2 3 4 5 6 7
4556 2nd vec: 8 9 10 11 12 13 14 15
4557 3rd vec: 16 17 18 19 20 21 22 23
4558 4th vec: 24 25 26 27 28 29 30 31
4560 The output sequence should be:
4562 1st vec: 0 8 16 24 1 9 17 25
4563 2nd vec: 2 10 18 26 3 11 19 27
4564 3rd vec: 4 12 20 28 5 13 21 29
4565 4th vec: 6 14 22 30 7 15 23 31
4567 i.e., we interleave the contents of the four vectors in their order.
4569 We use interleave_high/low instructions to create such output. The input of
4570 each interleave_high/low operation is two vectors:
4571 1st vec 2nd vec
4572 0 1 2 3 4 5 6 7
4573 the even elements of the result vector are obtained left-to-right from the
4574 high/low elements of the first vector. The odd elements of the result are
4575 obtained left-to-right from the high/low elements of the second vector.
4576 The output of interleave_high will be: 0 4 1 5
4577 and of interleave_low: 2 6 3 7
4580 The permutation is done in log LENGTH stages. In each stage interleave_high
4581 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4582 where the first argument is taken from the first half of DR_CHAIN and the
4583 second argument from its second half.
4584 In our example,
4586 I1: interleave_high (1st vec, 3rd vec)
4587 I2: interleave_low (1st vec, 3rd vec)
4588 I3: interleave_high (2nd vec, 4th vec)
4589 I4: interleave_low (2nd vec, 4th vec)
4591 The output for the first stage is:
4593 I1: 0 16 1 17 2 18 3 19
4594 I2: 4 20 5 21 6 22 7 23
4595 I3: 8 24 9 25 10 26 11 27
4596 I4: 12 28 13 29 14 30 15 31
4598 The output of the second stage, i.e. the final result is:
4600 I1: 0 8 16 24 1 9 17 25
4601 I2: 2 10 18 26 3 11 19 27
4602 I3: 4 12 20 28 5 13 21 29
4603 I4: 6 14 22 30 7 15 23 31. */
4605 static bool
4606 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4607 unsigned int length,
4608 tree stmt,
4609 block_stmt_iterator *bsi,
4610 VEC(tree,heap) **result_chain)
4612 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4613 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4614 tree scalar_dest, tmp;
4615 int i;
4616 unsigned int j;
4618 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4620 /* Check that the operation is supported. */
4621 if (!vect_strided_store_supported (vectype))
4622 return false;
4624 *result_chain = VEC_copy (tree, heap, dr_chain);
4626 for (i = 0; i < exact_log2 (length); i++)
4628 for (j = 0; j < length/2; j++)
4630 vect1 = VEC_index (tree, dr_chain, j);
4631 vect2 = VEC_index (tree, dr_chain, j+length/2);
4633 /* Create interleaving stmt:
4634 in the case of big endian:
4635 high = interleave_high (vect1, vect2)
4636 and in the case of little endian:
4637 high = interleave_low (vect1, vect2). */
4638 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4639 DECL_GIMPLE_REG_P (perm_dest) = 1;
4640 add_referenced_var (perm_dest);
4641 if (BYTES_BIG_ENDIAN)
4642 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4643 else
4644 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4645 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4646 high = make_ssa_name (perm_dest, perm_stmt);
4647 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4648 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4649 VEC_replace (tree, *result_chain, 2*j, high);
4651 /* Create interleaving stmt:
4652 in the case of big endian:
4653 low = interleave_low (vect1, vect2)
4654 and in the case of little endian:
4655 low = interleave_high (vect1, vect2). */
4656 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4657 DECL_GIMPLE_REG_P (perm_dest) = 1;
4658 add_referenced_var (perm_dest);
4659 if (BYTES_BIG_ENDIAN)
4660 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4661 else
4662 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4663 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4664 low = make_ssa_name (perm_dest, perm_stmt);
4665 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4666 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4667 VEC_replace (tree, *result_chain, 2*j+1, low);
4669 dr_chain = VEC_copy (tree, heap, *result_chain);
4671 return true;
4675 /* Function vectorizable_store.
4677 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4678 can be vectorized.
4679 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4680 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4681 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4683 bool
4684 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4685 slp_tree slp_node)
4687 tree scalar_dest;
4688 tree data_ref;
4689 tree op;
4690 tree vec_oprnd = NULL_TREE;
4691 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4692 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4694 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4695 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4696 enum machine_mode vec_mode;
4697 tree dummy;
4698 enum dr_alignment_support alignment_support_scheme;
4699 tree def, def_stmt;
4700 enum vect_def_type dt;
4701 stmt_vec_info prev_stmt_info = NULL;
4702 tree dataref_ptr = NULL_TREE;
4703 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4704 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4705 int j;
4706 tree next_stmt, first_stmt = NULL_TREE;
4707 bool strided_store = false;
4708 unsigned int group_size, i;
4709 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4710 bool inv_p;
4711 VEC(tree,heap) *vec_oprnds = NULL;
4712 bool slp = (slp_node != NULL);
4713 stmt_vec_info first_stmt_vinfo;
4714 unsigned int vec_num;
4716 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4717 this, so we can safely override NCOPIES with 1 here. */
4718 if (slp)
4719 ncopies = 1;
4721 gcc_assert (ncopies >= 1);
4723 /* FORNOW. This restriction should be relaxed. */
4724 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4726 if (vect_print_dump_info (REPORT_DETAILS))
4727 fprintf (vect_dump, "multiple types in nested loop.");
4728 return false;
4731 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4732 return false;
4734 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4735 return false;
4737 /* Is vectorizable store? */
4739 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4740 return false;
4742 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4743 if (TREE_CODE (scalar_dest) != ARRAY_REF
4744 && TREE_CODE (scalar_dest) != INDIRECT_REF
4745 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4746 return false;
4748 op = GIMPLE_STMT_OPERAND (stmt, 1);
4749 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4751 if (vect_print_dump_info (REPORT_DETAILS))
4752 fprintf (vect_dump, "use not simple.");
4753 return false;
4756 /* If accesses through a pointer to vectype do not alias the original
4757 memory reference we have a problem. */
4758 if (get_alias_set (vectype) != get_alias_set (TREE_TYPE (scalar_dest))
4759 && !alias_set_subset_of (get_alias_set (vectype),
4760 get_alias_set (TREE_TYPE (scalar_dest))))
4762 if (vect_print_dump_info (REPORT_DETAILS))
4763 fprintf (vect_dump, "vector type does not alias scalar type");
4764 return false;
4767 if (!useless_type_conversion_p (TREE_TYPE (op), TREE_TYPE (scalar_dest)))
4769 if (vect_print_dump_info (REPORT_DETAILS))
4770 fprintf (vect_dump, "operands of different types");
4771 return false;
4774 vec_mode = TYPE_MODE (vectype);
4775 /* FORNOW. In some cases we can vectorize even if the data-type is not
4776 supported (e.g. - array initialization with 0). */
4777 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4778 return false;
4780 if (!STMT_VINFO_DATA_REF (stmt_info))
4781 return false;
4783 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4785 strided_store = true;
4786 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4787 if (!vect_strided_store_supported (vectype)
4788 && !PURE_SLP_STMT (stmt_info) && !slp)
4789 return false;
4791 if (first_stmt == stmt)
4793 /* STMT is the leader of the group. Check the operands of all the
4794 stmts of the group. */
4795 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4796 while (next_stmt)
4798 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4799 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4801 if (vect_print_dump_info (REPORT_DETAILS))
4802 fprintf (vect_dump, "use not simple.");
4803 return false;
4805 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4810 if (!vec_stmt) /* transformation not required. */
4812 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4813 if (!PURE_SLP_STMT (stmt_info))
4814 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4815 return true;
4818 /** Transform. **/
4820 if (strided_store)
4822 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4823 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4825 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4827 /* FORNOW */
4828 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4830 /* We vectorize all the stmts of the interleaving group when we
4831 reach the last stmt in the group. */
4832 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4833 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4834 && !slp)
4836 *vec_stmt = NULL_TREE;
4837 return true;
4840 if (slp)
4841 strided_store = false;
4843 /* VEC_NUM is the number of vect stmts to be created for this group. */
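/* E.g., an interleaved group of 4 scalar stores normally needs
   vec_num = 4 vector stmts per copy; under SLP fewer may suffice,
   since several scalar stores can be packed into one vector stmt. */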
4844 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4845 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4846 else
4847 vec_num = group_size;
4849 else
4851 first_stmt = stmt;
4852 first_dr = dr;
4853 group_size = vec_num = 1;
4854 first_stmt_vinfo = stmt_info;
4857 if (vect_print_dump_info (REPORT_DETAILS))
4858 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4860 dr_chain = VEC_alloc (tree, heap, group_size);
4861 oprnds = VEC_alloc (tree, heap, group_size);
4863 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4864 gcc_assert (alignment_support_scheme);
4865 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4867 /* In case the vectorization factor (VF) is bigger than the number
4868 of elements that we can fit in a vectype (nunits), we have to generate
4869 more than one vector stmt - i.e - we need to "unroll" the
4870 vector stmt by a factor VF/nunits. For more details see documentation in
4871 vect_get_vec_def_for_copy_stmt. */
4873 /* In case of interleaving (non-unit strided access):
4875 S1: &base + 2 = x2
4876 S2: &base = x0
4877 S3: &base + 1 = x1
4878 S4: &base + 3 = x3
4880 We create vectorized stores starting from the base address (the access of
4881 the first stmt in the chain, S2 in the above example) when the last store
4882 stmt of the chain (S4) is reached:
4884 VS1: &base = vx2
4885 VS2: &base + vec_size*1 = vx0
4886 VS3: &base + vec_size*2 = vx1
4887 VS4: &base + vec_size*3 = vx3
4889 Then permutation statements are generated:
4891 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4892 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4895 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4896 (the order of the data-refs in the output of vect_permute_store_chain
4897 corresponds to the order of scalar stmts in the interleaving chain - see
4898 the documentation of vect_permute_store_chain()).
4900 In case of both multiple types and interleaving, the above vector stores and
4901 permutation stmts are created for every copy. The result vector stmts are
4902 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4903 STMT_VINFO_RELATED_STMT for the next copies.
4906 prev_stmt_info = NULL;
4907 for (j = 0; j < ncopies; j++)
4909 tree new_stmt;
4910 tree ptr_incr;
4912 if (j == 0)
4914 if (slp)
4916 /* Get vectorized arguments for SLP_NODE. */
4917 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4919 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4921 else
4923 /* For interleaved stores we collect vectorized defs for all the
4924 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4925 used as an input to vect_permute_store_chain(), and OPRNDS as
4926 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4928 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4929 OPRNDS are of size 1. */
4930 next_stmt = first_stmt;
4931 for (i = 0; i < group_size; i++)
4933 /* Since gaps are not supported for interleaved stores,
4934 GROUP_SIZE is the exact number of stmts in the chain.
4935 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4936 there is no interleaving, GROUP_SIZE is 1, and only one
4937 iteration of the loop will be executed. */
4938 gcc_assert (next_stmt);
4939 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4941 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4942 NULL);
4943 VEC_quick_push(tree, dr_chain, vec_oprnd);
4944 VEC_quick_push(tree, oprnds, vec_oprnd);
4945 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4949 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4950 &dummy, &ptr_incr, false,
4951 &inv_p);
4952 gcc_assert (!inv_p);
4954 else
4956 /* FORNOW SLP doesn't work for multiple types. */
4957 gcc_assert (!slp);
4959 /* For interleaved stores we created vectorized defs for all the
4960 defs stored in OPRNDS in the previous iteration (previous copy).
4961 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4962 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4963 next copy.
4964 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4965 OPRNDS are of size 1. */
4966 for (i = 0; i < group_size; i++)
4968 op = VEC_index (tree, oprnds, i);
4969 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4970 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4971 VEC_replace(tree, dr_chain, i, vec_oprnd);
4972 VEC_replace(tree, oprnds, i, vec_oprnd);
4974 dataref_ptr =
4975 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4978 if (strided_store)
4980 result_chain = VEC_alloc (tree, heap, group_size);
4981 /* Permute. */
4982 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4983 &result_chain))
4984 return false;
4987 next_stmt = first_stmt;
4988 for (i = 0; i < vec_num; i++)
4990 if (i > 0)
4991 /* Bump the vector pointer. */
4992 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4993 NULL_TREE);
4995 if (slp)
4996 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4997 else if (strided_store)
4998 /* For strided stores vectorized defs are interleaved in
4999 vect_permute_store_chain(). */
5000 vec_oprnd = VEC_index (tree, result_chain, i);
5002 data_ref = build_fold_indirect_ref (dataref_ptr);
5003 /* Arguments are ready. Create the new vector stmt. */
5004 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
5005 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5006 mark_symbols_for_renaming (new_stmt);
5008 if (j == 0)
5009 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5010 else
5011 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5013 prev_stmt_info = vinfo_for_stmt (new_stmt);
5014 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5015 if (!next_stmt)
5016 break;
5020 VEC_free (tree, heap, dr_chain);
5021 VEC_free (tree, heap, oprnds);
5022 if (result_chain)
5023 VEC_free (tree, heap, result_chain);
5025 return true;
5029 /* Function vect_setup_realignment
5031 This function is called when vectorizing an unaligned load using
5032 the dr_explicit_realign[_optimized] scheme.
5033 This function generates the following code at the loop prolog:
5035 p = initial_addr;
5036 x msq_init = *(floor(p)); # prolog load
5037 realignment_token = call target_builtin;
5038 loop:
5039 x msq = phi (msq_init, ---)
5041 The stmts marked with x are generated only for the case of
5042 dr_explicit_realign_optimized.
5044 The code above sets up a new (vector) pointer, pointing to the first
5045 location accessed by STMT, and a "floor-aligned" load using that pointer.
5046 It also generates code to compute the "realignment-token" (if the relevant
5047 target hook was defined), and creates a phi-node at the loop-header bb
5048 whose arguments are the result of the prolog-load (created by this
5049 function) and the result of a load that takes place in the loop (to be
5050 created by the caller to this function).
5052 For the case of dr_explicit_realign_optimized:
5053 The caller to this function uses the phi-result (msq) to create the
5054 realignment code inside the loop, and sets up the missing phi argument,
5055 as follows:
5056 loop:
5057 msq = phi (msq_init, lsq)
5058 lsq = *(floor(p')); # load in loop
5059 result = realign_load (msq, lsq, realignment_token);
5061 For the case of dr_explicit_realign:
5062 loop:
5063 msq = *(floor(p)); # load in loop
5064 p' = p + (VS-1);
5065 lsq = *(floor(p')); # load in loop
5066 result = realign_load (msq, lsq, realignment_token);
5068 Input:
5069 STMT - (scalar) load stmt to be vectorized. This load accesses
5070 a memory location that may be unaligned.
5071 BSI - place where new code is to be inserted.
5072 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5073 is used.
5075 Output:
5076 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5077 target hook, if defined.
5078 Return value - the result of the loop-header phi node. */
5080 static tree
5081 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5082 tree *realignment_token,
5083 enum dr_alignment_support alignment_support_scheme,
5084 tree init_addr,
5085 struct loop **at_loop)
5087 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5088 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5090 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5091 edge pe;
5092 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5093 tree vec_dest;
5094 tree inc;
5095 tree ptr;
5096 tree data_ref;
5097 tree new_stmt;
5098 basic_block new_bb;
5099 tree msq_init = NULL_TREE;
5100 tree new_temp;
5101 tree phi_stmt;
5102 tree msq = NULL_TREE;
5103 tree stmts = NULL_TREE;
5104 bool inv_p;
5105 bool compute_in_loop = false;
5106 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5107 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5108 struct loop *loop_for_initial_load;
5110 gcc_assert (alignment_support_scheme == dr_explicit_realign
5111 || alignment_support_scheme == dr_explicit_realign_optimized);
5113 /* We need to generate three things:
5114 1. the misalignment computation.
5115 2. the extra vector load (for the optimized realignment scheme).
5116 3. the phi node for the two vectors from which the realignment is
5117 done (for the optimized realignment scheme).
5120 /* 1. Determine where to generate the misalignment computation.
5122 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5123 calculation will be generated by this function, outside the loop (in the
5124 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5125 caller, inside the loop.
5127 Background: If the misalignment remains fixed throughout the iterations of
5128 the loop, then both realignment schemes are applicable, and also the
5129 misalignment computation can be done outside LOOP. This is because we are
5130 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5131 are a multiple of VS (the Vector Size), and therefore the misalignment in
5132 different vectorized LOOP iterations is always the same.
5133 The problem arises only if the memory access is in an inner-loop nested
5134 inside LOOP, which is now being vectorized using outer-loop vectorization.
5135 This is the only case when the misalignment of the memory access may not
5136 remain fixed throughout the iterations of the inner-loop (as explained in
5137 detail in vect_supportable_dr_alignment). In this case, not only is the
5138 optimized realignment scheme not applicable, but also the misalignment
5139 computation (and generation of the realignment token that is passed to
5140 REALIGN_LOAD) have to be done inside the loop.
5142 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5143 or not, which in turn determines if the misalignment is computed inside
5144 the inner-loop, or outside LOOP. */
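/* For illustration (hypothetical access, not from the original sources):
   when outer-loop vectorizing

     for (i = 0; i < N; i++)       <-- LOOP
       for (j = 0; j < M; j++)
         ... = a[i * 3 + j];       <-- inner-loop load

   the access advances by 3 elements per outer-loop iteration; if that is
   not a multiple of VS, each outer-loop iteration starts at a different
   misalignment, and so the realignment token must be recomputed inside
   LOOP rather than once in its preheader. */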
5146 if (init_addr != NULL_TREE)
5148 compute_in_loop = true;
5149 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5153 /* 2. Determine where to generate the extra vector load.
5155 For the optimized realignment scheme, instead of generating two vector
5156 loads in each iteration, we generate a single extra vector load in the
5157 preheader of the loop, and in each iteration reuse the result of the
5158 vector load from the previous iteration. In case the memory access is in
5159 an inner-loop nested inside LOOP, which is now being vectorized using
5160 outer-loop vectorization, we need to determine whether this initial vector
5161 load should be generated at the preheader of the inner-loop, or can be
5162 generated at the preheader of LOOP. If the memory access has no evolution
5163 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5164 to be generated inside LOOP (in the preheader of the inner-loop). */
5166 if (nested_in_vect_loop)
5168 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5169 bool invariant_in_outerloop =
5170 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5171 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5173 else
5174 loop_for_initial_load = loop;
5175 if (at_loop)
5176 *at_loop = loop_for_initial_load;
5178 /* 3. For the case of the optimized realignment, create the first vector
5179 load at the loop preheader. */
5181 if (alignment_support_scheme == dr_explicit_realign_optimized)
5183 /* Create msq_init = *(floor(p1)) in the loop preheader */
5185 gcc_assert (!compute_in_loop);
5186 pe = loop_preheader_edge (loop_for_initial_load);
5187 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5188 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5189 &init_addr, &inc, true, &inv_p);
5190 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5191 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5192 new_temp = make_ssa_name (vec_dest, new_stmt);
5193 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5194 mark_symbols_for_renaming (new_stmt);
5195 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5196 gcc_assert (!new_bb);
5197 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5200 /* 4. Create realignment token using a target builtin, if available.
5201 It is done either inside the containing loop, or before LOOP (as
5202 determined above). */
5204 if (targetm.vectorize.builtin_mask_for_load)
5206 tree builtin_decl;
5208 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5209 if (compute_in_loop)
5210 gcc_assert (init_addr); /* already computed by the caller. */
5211 else
5213 /* Generate the INIT_ADDR computation outside LOOP. */
5214 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5215 NULL_TREE, loop);
5216 pe = loop_preheader_edge (loop);
5217 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5218 gcc_assert (!new_bb);
5221 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5222 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5223 vec_dest = vect_create_destination_var (scalar_dest,
5224 TREE_TYPE (new_stmt));
5225 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5226 new_temp = make_ssa_name (vec_dest, new_stmt);
5227 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5229 if (compute_in_loop)
5230 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5231 else
5233 /* Generate the misalignment computation outside LOOP. */
5234 pe = loop_preheader_edge (loop);
5235 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5236 gcc_assert (!new_bb);
5239 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5241 /* The result of the CALL_EXPR to this builtin is determined from
5242 the value of the parameter and no global variables are touched,
5243 which makes the builtin a "const" function. Requiring the
5244 builtin to have the "const" attribute makes it unnecessary
5245 to call mark_call_clobbered. */
5246 gcc_assert (TREE_READONLY (builtin_decl));
5249 if (alignment_support_scheme == dr_explicit_realign)
5250 return msq;
5252 gcc_assert (!compute_in_loop);
5253 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5256 /* 5. Create msq = phi <msq_init, lsq> in loop */
5258 pe = loop_preheader_edge (containing_loop);
5259 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5260 msq = make_ssa_name (vec_dest, NULL_TREE);
5261 phi_stmt = create_phi_node (msq, containing_loop->header);
5262 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5263 add_phi_arg (phi_stmt, msq_init, pe);
5265 return msq;
5269 /* Function vect_strided_load_supported.
5271 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5272 and FALSE otherwise. */
5274 static bool
5275 vect_strided_load_supported (tree vectype)
5277 optab perm_even_optab, perm_odd_optab;
5278 int mode;
5280 mode = (int) TYPE_MODE (vectype);
5282 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
5283 optab_default);
5284 if (!perm_even_optab)
5286 if (vect_print_dump_info (REPORT_DETAILS))
5287 fprintf (vect_dump, "no optab for perm_even.");
5288 return false;
5291 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5293 if (vect_print_dump_info (REPORT_DETAILS))
5294 fprintf (vect_dump, "perm_even op not supported by target.");
5295 return false;
5298 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
5299 optab_default);
5300 if (!perm_odd_optab)
5302 if (vect_print_dump_info (REPORT_DETAILS))
5303 fprintf (vect_dump, "no optab for perm_odd.");
5304 return false;
5307 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5309 if (vect_print_dump_info (REPORT_DETAILS))
5310 fprintf (vect_dump, "perm_odd op not supported by target.");
5311 return false;
5313 return true;
5317 /* Function vect_permute_load_chain.
5319 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5320 a power of 2, generate extract_even/odd stmts to reorder the input data
5321 correctly. Return the final references for loads in RESULT_CHAIN.
5323 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5324 The input is 4 vectors, each containing 8 elements. We assign a number to
5325 each element; the input sequence is:
5327 1st vec: 0 1 2 3 4 5 6 7
5328 2nd vec: 8 9 10 11 12 13 14 15
5329 3rd vec: 16 17 18 19 20 21 22 23
5330 4th vec: 24 25 26 27 28 29 30 31
5332 The output sequence should be:
5334 1st vec: 0 4 8 12 16 20 24 28
5335 2nd vec: 1 5 9 13 17 21 25 29
5336 3rd vec: 2 6 10 14 18 22 26 30
5337 4th vec: 3 7 11 15 19 23 27 31
5339 i.e., the first output vector should contain the first elements of each
5340 interleaving group, etc.
5342 We use extract_even/odd instructions to create such output. The input of each
5343 extract_even/odd operation is two vectors
5344 1st vec 2nd vec
5345 0 1 2 3 4 5 6 7
5347 and the output is the vector of extracted even/odd elements. The output of
5348 extract_even will be: 0 2 4 6
5349 and of extract_odd: 1 3 5 7
5352 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5353 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5354 order. In our example,
5356 E1: extract_even (1st vec, 2nd vec)
5357 E2: extract_odd (1st vec, 2nd vec)
5358 E3: extract_even (3rd vec, 4th vec)
5359 E4: extract_odd (3rd vec, 4th vec)
5361 The output for the first stage will be:
5363 E1: 0 2 4 6 8 10 12 14
5364 E2: 1 3 5 7 9 11 13 15
5365 E3: 16 18 20 22 24 26 28 30
5366 E4: 17 19 21 23 25 27 29 31
5368 In order to proceed and create the correct sequence for the next stage (or
5369 for the correct output, if the second stage is the last one, as in our
5370 example), we first put the output of the extract_even operation and then the
5371 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5372 The input for the second stage is:
5374 1st vec (E1): 0 2 4 6 8 10 12 14
5375 2nd vec (E3): 16 18 20 22 24 26 28 30
5376 3rd vec (E2): 1 3 5 7 9 11 13 15
5377 4th vec (E4): 17 19 21 23 25 27 29 31
5379 The output of the second stage:
5381 E1: 0 4 8 12 16 20 24 28
5382 E2: 2 6 10 14 18 22 26 30
5383 E3: 1 5 9 13 17 21 25 29
5384 E4: 3 7 11 15 19 23 27 31
5386 And RESULT_CHAIN after reordering:
5388 1st vec (E1): 0 4 8 12 16 20 24 28
5389 2nd vec (E3): 1 5 9 13 17 21 25 29
5390 3rd vec (E2): 2 6 10 14 18 22 26 30
5391 4th vec (E4): 3 7 11 15 19 23 27 31. */
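/* A scalar reference model of one extract_even/odd stage (an illustrative
   sketch only; A and B stand for two N-element input vectors and C for
   their 2N-element concatenation {A[0],...,A[N-1],B[0],...,B[N-1]}):

     for (i = 0; i < N; i++)
       {
         even[i] = C[2 * i];
         odd[i]  = C[2 * i + 1];
       }

   Applying this stage log2 (LENGTH) times, with the even outputs placed
   before the odd outputs, yields the reordering shown above. */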
5393 static bool
5394 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5395 unsigned int length,
5396 tree stmt,
5397 block_stmt_iterator *bsi,
5398 VEC(tree,heap) **result_chain)
5400 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5401 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5402 tree tmp;
5403 int i;
5404 unsigned int j;
5406 /* Check that the operation is supported. */
5407 if (!vect_strided_load_supported (vectype))
5408 return false;
5410 *result_chain = VEC_copy (tree, heap, dr_chain);
5411 for (i = 0; i < exact_log2 (length); i++)
5413 for (j = 0; j < length; j +=2)
5415 first_vect = VEC_index (tree, dr_chain, j);
5416 second_vect = VEC_index (tree, dr_chain, j+1);
5418 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5419 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5420 DECL_GIMPLE_REG_P (perm_dest) = 1;
5421 add_referenced_var (perm_dest);
5423 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5424 first_vect, second_vect);
5425 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5427 data_ref = make_ssa_name (perm_dest, perm_stmt);
5428 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5429 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5430 mark_symbols_for_renaming (perm_stmt);
5432 VEC_replace (tree, *result_chain, j/2, data_ref);
5434 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5435 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5436 DECL_GIMPLE_REG_P (perm_dest) = 1;
5437 add_referenced_var (perm_dest);
5439 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5440 first_vect, second_vect);
5441 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5442 data_ref = make_ssa_name (perm_dest, perm_stmt);
5443 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5444 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5445 mark_symbols_for_renaming (perm_stmt);
5447 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5449 dr_chain = VEC_copy (tree, heap, *result_chain);
5451 return true;
5455 /* Function vect_transform_strided_load.
5457 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5458 to perform their permutation, and record the resulting vectorized statements
5459 in the stmt_info of the corresponding scalar statements. */
5462 static bool
5463 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5464 block_stmt_iterator *bsi)
5466 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5467 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5468 tree next_stmt, new_stmt;
5469 VEC(tree,heap) *result_chain = NULL;
5470 unsigned int i, gap_count;
5471 tree tmp_data_ref;
5473 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5474 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5475 vectors, that are ready for vector computation. */
5476 result_chain = VEC_alloc (tree, heap, size);
5477 /* Permute. */
5478 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5479 return false;
5481 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5482 Since we scan the chain starting from its first node, their order
5483 corresponds to the order of data-refs in RESULT_CHAIN. */
5484 next_stmt = first_stmt;
5485 gap_count = 1;
5486 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5488 if (!next_stmt)
5489 break;
5491 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5492 code elimination pass later. No need to check for the first stmt in
5493 the group, since it always exists.
5494 DR_GROUP_GAP is the number of steps in elements from the previous
5495 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5496 correspond to the gaps. */
5498 if (next_stmt != first_stmt
5499 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5501 gap_count++;
5502 continue;
5505 while (next_stmt)
5507 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5508 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5509 copies, and we put the new vector statement in the first available
5510 RELATED_STMT. */
5511 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5512 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5513 else
5515 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5516 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5517 vinfo_for_stmt (prev_stmt));
5518 while (rel_stmt)
5520 prev_stmt = rel_stmt;
5521 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5523 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5525 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5526 gap_count = 1;
5527 /* If NEXT_STMT accesses the same DR as the previous statement,
5528 put the same TMP_DATA_REF as its vectorized statement; otherwise
5529 get the next data-ref from RESULT_CHAIN. */
5530 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5531 break;
5535 VEC_free (tree, heap, result_chain);
5536 return true;
5540 /* vectorizable_load.
5542 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5543 can be vectorized.
5544 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5545 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5546 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5548 bool
5549 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5550 slp_tree slp_node)
5552 tree scalar_dest;
5553 tree vec_dest = NULL;
5554 tree data_ref = NULL;
5555 tree op;
5556 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5557 stmt_vec_info prev_stmt_info;
5558 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5559 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5560 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5561 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5562 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5563 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5564 tree new_temp;
5565 int mode;
5566 tree new_stmt = NULL_TREE;
5567 tree dummy;
5568 enum dr_alignment_support alignment_support_scheme;
5569 tree dataref_ptr = NULL_TREE;
5570 tree ptr_incr;
5571 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5572 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5573 int i, j, group_size;
5574 tree msq = NULL_TREE, lsq;
5575 tree offset = NULL_TREE;
5576 tree realignment_token = NULL_TREE;
5577 tree phi = NULL_TREE;
5578 VEC(tree,heap) *dr_chain = NULL;
5579 bool strided_load = false;
5580 tree first_stmt;
5581 tree scalar_type;
5582 bool inv_p;
5583 bool compute_in_loop = false;
5584 struct loop *at_loop;
5585 int vec_num;
5586 bool slp = (slp_node != NULL);
5588 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5589 this, so we can safely override NCOPIES with 1 here. */
5590 if (slp)
5591 ncopies = 1;
5593 gcc_assert (ncopies >= 1);
5595 /* FORNOW. This restriction should be relaxed. */
5596 if (nested_in_vect_loop && ncopies > 1)
5598 if (vect_print_dump_info (REPORT_DETAILS))
5599 fprintf (vect_dump, "multiple types in nested loop.");
5600 return false;
5603 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5604 return false;
5606 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5607 return false;
5609 /* Is vectorizable load? */
5610 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5611 return false;
5613 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5614 if (TREE_CODE (scalar_dest) != SSA_NAME)
5615 return false;
5617 op = GIMPLE_STMT_OPERAND (stmt, 1);
5618 if (TREE_CODE (op) != ARRAY_REF
5619 && TREE_CODE (op) != INDIRECT_REF
5620 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5621 return false;
5623 if (!STMT_VINFO_DATA_REF (stmt_info))
5624 return false;
5626 scalar_type = TREE_TYPE (DR_REF (dr));
5627 mode = (int) TYPE_MODE (vectype);
5629 /* FORNOW. In some cases we can vectorize even if the data-type is not
5630 supported (e.g. data copies). */
5631 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5633 if (vect_print_dump_info (REPORT_DETAILS))
5634 fprintf (vect_dump, "Aligned load, but unsupported type.");
5635 return false;
5638 /* If accesses through a pointer to vectype do not alias the original
5639 memory reference we have a problem. */
5640 if (get_alias_set (vectype) != get_alias_set (scalar_type)
5641 && !alias_set_subset_of (get_alias_set (vectype),
5642 get_alias_set (scalar_type)))
5644 if (vect_print_dump_info (REPORT_DETAILS))
5645 fprintf (vect_dump, "vector type does not alias scalar type");
5646 return false;
5649 /* Check if the load is a part of an interleaving chain. */
5650 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5652 strided_load = true;
5653 /* FORNOW */
5654 gcc_assert (! nested_in_vect_loop);
5656 /* Check if interleaving is supported. */
5657 if (!vect_strided_load_supported (vectype)
5658 && !PURE_SLP_STMT (stmt_info) && !slp)
5659 return false;
5662 if (!vec_stmt) /* transformation not required. */
5664 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5665 vect_model_load_cost (stmt_info, ncopies, NULL);
5666 return true;
5669 if (vect_print_dump_info (REPORT_DETAILS))
5670 fprintf (vect_dump, "transform load.");
5672 /** Transform. **/
5674 if (strided_load)
5676 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5677 /* Check if the chain of loads is already vectorized. */
5678 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5680 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5681 return true;
5683 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5684 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5685 dr_chain = VEC_alloc (tree, heap, group_size);
5687 /* VEC_NUM is the number of vect stmts to be created for this group. */
5688 if (slp)
5690 strided_load = false;
5691 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5693 else
5694 vec_num = group_size;
5696 else
5698 first_stmt = stmt;
5699 first_dr = dr;
5700 group_size = vec_num = 1;
5703 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5704 gcc_assert (alignment_support_scheme);
5706 /* In case the vectorization factor (VF) is bigger than the number
5707 of elements that we can fit in a vectype (nunits), we have to generate
5708 more than one vector stmt - i.e - we need to "unroll" the
5709 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5710 from one copy of the vector stmt to the next, in the field
5711 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5712 stages to find the correct vector defs to be used when vectorizing
5713 stmts that use the defs of the current stmt. The example below illustrates
5714 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5715 4 vectorized stmts):
5717 before vectorization:
5718 RELATED_STMT VEC_STMT
5719 S1: x = memref - -
5720 S2: z = x + 1 - -
5722 step 1: vectorize stmt S1:
5723 We first create the vector stmt VS1_0, and, as usual, record a
5724 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5725 Next, we create the vector stmt VS1_1, and record a pointer to
5726 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5727 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5728 stmts and pointers:
5729 RELATED_STMT VEC_STMT
5730 VS1_0: vx0 = memref0 VS1_1 -
5731 VS1_1: vx1 = memref1 VS1_2 -
5732 VS1_2: vx2 = memref2 VS1_3 -
5733 VS1_3: vx3 = memref3 - -
5734 S1: x = load - VS1_0
5735 S2: z = x + 1 - -
5737 See the documentation of vect_get_vec_def_for_stmt_copy for how the
5738 information we recorded in RELATED_STMT field is used to vectorize
5739 stmt S2. */
5741 /* In case of interleaving (non-unit strided access):
5743 S1: x2 = &base + 2
5744 S2: x0 = &base
5745 S3: x1 = &base + 1
5746 S4: x3 = &base + 3
5748 Vectorized loads are created in the order of memory accesses
5749 starting from the access of the first stmt of the chain:
5751 VS1: vx0 = &base
5752 VS2: vx1 = &base + vec_size*1
5753 VS3: vx3 = &base + vec_size*2
5754 VS4: vx4 = &base + vec_size*3
5756 Then permutation statements are generated:
5758 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5759 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5762 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5763 (the order of the data-refs in the output of vect_permute_load_chain
5764 corresponds to the order of scalar stmts in the interleaving chain - see
5765 the documentation of vect_permute_load_chain()).
5766 The generation of permutation stmts and recording them in
5767 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5769 In case of both multiple types and interleaving, the vector loads and
5770 permutation stmts above are created for every copy. The result vector stmts
5771 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5772 STMT_VINFO_RELATED_STMT for the next copies. */
5774 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5775 on a target that supports unaligned accesses (dr_unaligned_supported)
5776 we generate the following code:
5777 p = initial_addr;
5778 indx = 0;
5779 loop {
5780 p = p + indx * vectype_size;
5781 vec_dest = *(p);
5782 indx = indx + 1;
5785 Otherwise, the data reference is potentially unaligned on a target that
5786 does not support unaligned accesses (dr_explicit_realign_optimized) -
5787 then generate the following code, in which the data in each iteration is
5788 obtained by two vector loads, one from the previous iteration, and one
5789 from the current iteration:
5790 p1 = initial_addr;
5791 msq_init = *(floor(p1))
5792 p2 = initial_addr + VS - 1;
5793 realignment_token = call target_builtin;
5794 indx = 0;
5795 loop {
5796 p2 = p2 + indx * vectype_size
5797 lsq = *(floor(p2))
5798 vec_dest = realign_load (msq, lsq, realignment_token)
5799 indx = indx + 1;
5800 msq = lsq;
5801 } */
5803 /* If the misalignment remains the same throughout the execution of the
5804 loop, we can create the init_addr and permutation mask at the loop
5805 preheader. Otherwise, it needs to be created inside the loop.
5806 This can only occur when vectorizing memory accesses in the inner-loop
5807 nested within an outer-loop that is being vectorized. */
5809 if (nested_in_vect_loop_p (loop, stmt)
5810 && (TREE_INT_CST_LOW (DR_STEP (dr))
5811 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
5813 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5814 compute_in_loop = true;
5817 if ((alignment_support_scheme == dr_explicit_realign_optimized
5818 || alignment_support_scheme == dr_explicit_realign)
5819 && !compute_in_loop)
5821 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5822 alignment_support_scheme, NULL_TREE,
5823 &at_loop);
5824 if (alignment_support_scheme == dr_explicit_realign_optimized)
5826 phi = SSA_NAME_DEF_STMT (msq);
5827 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5830 else
5831 at_loop = loop;
5833 prev_stmt_info = NULL;
5834 for (j = 0; j < ncopies; j++)
5836 /* 1. Create the vector pointer update chain. */
5837 if (j == 0)
5838 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5839 at_loop, offset,
5840 &dummy, &ptr_incr, false,
5841 &inv_p);
5842 else
5843 dataref_ptr =
5844 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5846 for (i = 0; i < vec_num; i++)
5848 if (i > 0)
5849 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5850 NULL_TREE);
5852 /* 2. Create the vector-load in the loop. */
5853 switch (alignment_support_scheme)
5855 case dr_aligned:
5856 gcc_assert (aligned_access_p (first_dr));
5857 data_ref = build_fold_indirect_ref (dataref_ptr);
5858 break;
5859 case dr_unaligned_supported:
5861 int mis = DR_MISALIGNMENT (first_dr);
5862 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5864 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5865 data_ref =
5866 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5867 break;
5869 case dr_explicit_realign:
5871 tree ptr, bump;
5872 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5874 if (compute_in_loop)
5875 msq = vect_setup_realignment (first_stmt, bsi,
5876 &realignment_token,
5877 dr_explicit_realign,
5878 dataref_ptr, NULL);
5880 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5881 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5882 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5883 new_temp = make_ssa_name (vec_dest, new_stmt);
5884 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5885 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5886 copy_virtual_operands (new_stmt, stmt);
5887 mark_symbols_for_renaming (new_stmt);
5888 msq = new_temp;
5890 bump = size_binop (MULT_EXPR, vs_minus_1,
5891 TYPE_SIZE_UNIT (scalar_type));
5892 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5893 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5894 break;
5896 case dr_explicit_realign_optimized:
5897 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5898 break;
5899 default:
5900 gcc_unreachable ();
5902 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5903 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5904 new_temp = make_ssa_name (vec_dest, new_stmt);
5905 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5906 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5907 mark_symbols_for_renaming (new_stmt);
5909 /* 3. Handle explicit realignment if necessary/supported. Create in
5910 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5911 if (alignment_support_scheme == dr_explicit_realign_optimized
5912 || alignment_support_scheme == dr_explicit_realign)
5914 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5915 if (!realignment_token)
5916 realignment_token = dataref_ptr;
5917 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5918 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5919 realignment_token);
5920 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5921 new_temp = make_ssa_name (vec_dest, new_stmt);
5922 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5923 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5925 if (alignment_support_scheme == dr_explicit_realign_optimized)
5927 if (i == vec_num - 1 && j == ncopies - 1)
5928 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5929 msq = lsq;
5933 /* 4. Handle invariant-load. */
5934 if (inv_p)
5936 gcc_assert (!strided_load);
5937 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5938 if (j == 0)
5940 int k;
5941 tree t = NULL_TREE;
5942 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5944 /* CHECKME: bitpos depends on endianness? */
5945 bitpos = bitsize_zero_node;
5946 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5947 bitsize, bitpos);
5948 vec_dest =
5949 vect_create_destination_var (scalar_dest, NULL_TREE);
5950 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5951 new_temp = make_ssa_name (vec_dest, new_stmt);
5952 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5953 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5955 for (k = nunits - 1; k >= 0; --k)
5956 t = tree_cons (NULL_TREE, new_temp, t);
5957 /* FIXME: use build_constructor directly. */
5958 vec_inv = build_constructor_from_list (vectype, t);
5959 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5960 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5962 else
5963 gcc_unreachable (); /* FORNOW. */
5966 /* Collect vector loads and later create their permutation in
5967 vect_transform_strided_load (). */
5968 if (strided_load)
5969 VEC_quick_push (tree, dr_chain, new_temp);
5971 /* Store vector loads in the corresponding SLP_NODE. */
5972 if (slp)
5973 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5976 /* FORNOW: SLP with multiple types is unsupported. */
5977 if (slp)
5978 return true;
5980 if (strided_load)
5982 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5983 return false;
5984 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5985 VEC_free (tree, heap, dr_chain);
5986 dr_chain = VEC_alloc (tree, heap, group_size);
5988 else
5990 if (j == 0)
5991 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5992 else
5993 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5994 prev_stmt_info = vinfo_for_stmt (new_stmt);
5998 if (dr_chain)
5999 VEC_free (tree, heap, dr_chain);
6001 return true;
6005 /* Function vectorizable_live_operation.
6007 STMT computes a value that is used outside the loop. Check if
6008 it can be supported. */
6010 bool
6011 vectorizable_live_operation (tree stmt,
6012 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
6013 tree *vec_stmt ATTRIBUTE_UNUSED)
6015 tree operation;
6016 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6018 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6019 int i;
6020 int op_type;
6021 tree op;
6022 tree def, def_stmt;
6023 enum vect_def_type dt;
6025 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6027 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6028 return false;
6030 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6031 return false;
6033 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
6034 return false;
6036 /* FORNOW. CHECKME. */
6037 if (nested_in_vect_loop_p (loop, stmt))
6038 return false;
6040 operation = GIMPLE_STMT_OPERAND (stmt, 1);
6041 op_type = TREE_OPERAND_LENGTH (operation);
6043 /* FORNOW: support only if all uses are invariant. This means
6044 that the scalar operations can remain in place, unvectorized.
6045 The original last scalar value that they compute will be used. */
6047 for (i = 0; i < op_type; i++)
6049 op = TREE_OPERAND (operation, i);
6050 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
6052 if (vect_print_dump_info (REPORT_DETAILS))
6053 fprintf (vect_dump, "use not simple.");
6054 return false;
6057 if (dt != vect_invariant_def && dt != vect_constant_def)
6058 return false;
6061 /* No transformation is required for the cases we currently support. */
6062 return true;
6066 /* Function vect_is_simple_cond.
6068 Input:
6069 LOOP - the loop that is being vectorized.
6070 COND - Condition that is checked for simple use.
6072 Returns whether COND can be vectorized. Checks whether the
6073 condition operands are supportable using vect_is_simple_use. */
6075 static bool
6076 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6078 tree lhs, rhs;
6079 tree def;
6080 enum vect_def_type dt;
6082 if (!COMPARISON_CLASS_P (cond))
6083 return false;
6085 lhs = TREE_OPERAND (cond, 0);
6086 rhs = TREE_OPERAND (cond, 1);
6088 if (TREE_CODE (lhs) == SSA_NAME)
6090 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6091 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6092 return false;
6094 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6095 && TREE_CODE (lhs) != FIXED_CST)
6096 return false;
6098 if (TREE_CODE (rhs) == SSA_NAME)
6100 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6101 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6102 return false;
6104 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6105 && TREE_CODE (rhs) != FIXED_CST)
6106 return false;
6108 return true;
6111 /* vectorizable_condition.
6113 Check if STMT is conditional modify expression that can be vectorized.
6114 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6115 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6116 at BSI.
6118 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
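/* For illustration (hypothetical scalar statement, not from the original
   sources): the conditional assignment

     x = a < b ? c : d;

   is replaced by the single vector statement

     vx = VEC_COND_EXPR <va < vb, vc, vd>;

   where va, vb, vc and vd are the vector defs obtained for the operands
   by vect_get_vec_def_for_operand. */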
6120 bool
6121 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6123 tree scalar_dest = NULL_TREE;
6124 tree vec_dest = NULL_TREE;
6125 tree op = NULL_TREE;
6126 tree cond_expr, then_clause, else_clause;
6127 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6128 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6129 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6130 tree vec_compare, vec_cond_expr;
6131 tree new_temp;
6132 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6133 enum machine_mode vec_mode;
6134 tree def;
6135 enum vect_def_type dt;
6136 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6137 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6139 gcc_assert (ncopies >= 1);
6140 if (ncopies > 1)
6141 return false; /* FORNOW */
6143 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6144 return false;
6146 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6147 return false;
6149 /* FORNOW: SLP not supported. */
6150 if (STMT_SLP_TYPE (stmt_info))
6151 return false;
6153 /* FORNOW: not yet supported. */
6154 if (STMT_VINFO_LIVE_P (stmt_info))
6156 if (vect_print_dump_info (REPORT_DETAILS))
6157 fprintf (vect_dump, "value used after loop.");
6158 return false;
6161 /* Is vectorizable conditional operation? */
6162 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6163 return false;
6165 op = GIMPLE_STMT_OPERAND (stmt, 1);
6167 if (TREE_CODE (op) != COND_EXPR)
6168 return false;
6170 cond_expr = TREE_OPERAND (op, 0);
6171 then_clause = TREE_OPERAND (op, 1);
6172 else_clause = TREE_OPERAND (op, 2);
6174 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6175 return false;
6177 /* We do not handle two different vector types for the condition
6178 and the values. */
6179 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6180 return false;
6182 if (TREE_CODE (then_clause) == SSA_NAME)
6184 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6185 if (!vect_is_simple_use (then_clause, loop_vinfo,
6186 &then_def_stmt, &def, &dt))
6187 return false;
6189 else if (TREE_CODE (then_clause) != INTEGER_CST
6190 && TREE_CODE (then_clause) != REAL_CST
6191 && TREE_CODE (then_clause) != FIXED_CST)
6192 return false;
6194 if (TREE_CODE (else_clause) == SSA_NAME)
6196 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6197 if (!vect_is_simple_use (else_clause, loop_vinfo,
6198 &else_def_stmt, &def, &dt))
6199 return false;
6201 else if (TREE_CODE (else_clause) != INTEGER_CST
6202 && TREE_CODE (else_clause) != REAL_CST
6203 && TREE_CODE (else_clause) != FIXED_CST)
6204 return false;
6207 vec_mode = TYPE_MODE (vectype);
6209 if (!vec_stmt)
6211 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6212 return expand_vec_cond_expr_p (op, vec_mode);
6215 /* Transform */
6217 /* Handle def. */
6218 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6219 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6221 /* Handle cond expr. */
6222 vec_cond_lhs =
6223 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6224 vec_cond_rhs =
6225 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6226 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6227 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6229 /* Arguments are ready. Create the new vector stmt. */
6230 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6231 vec_cond_lhs, vec_cond_rhs);
6232 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6233 vec_compare, vec_then_clause, vec_else_clause);
6235 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6236 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6237 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6238 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6240 return true;
6244 /* Function vect_transform_stmt.
6246 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6248 static bool
6249 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6250 slp_tree slp_node)
6252 bool is_store = false;
6253 tree vec_stmt = NULL_TREE;
6254 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6255 tree orig_stmt_in_pattern;
6256 bool done;
6258 switch (STMT_VINFO_TYPE (stmt_info))
6260 case type_demotion_vec_info_type:
6261 gcc_assert (!slp_node);
6262 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6263 gcc_assert (done);
6264 break;
6266 case type_promotion_vec_info_type:
6267 gcc_assert (!slp_node);
6268 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6269 gcc_assert (done);
6270 break;
6272 case type_conversion_vec_info_type:
6273 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6274 gcc_assert (done);
6275 break;
6277 case induc_vec_info_type:
6278 gcc_assert (!slp_node);
6279 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6280 gcc_assert (done);
6281 break;
6283 case op_vec_info_type:
6284 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6285 gcc_assert (done);
6286 break;
6288 case assignment_vec_info_type:
6289 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6290 gcc_assert (done);
6291 break;
6293 case load_vec_info_type:
6294 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6295 gcc_assert (done);
6296 break;
6298 case store_vec_info_type:
6299 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6300 gcc_assert (done);
6301 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6303 /* In case of interleaving, the whole chain is vectorized when the
6304 last store in the chain is reached. Store stmts before the last
6305 one are skipped, and their vec_stmt_info shouldn't be freed
6306 meanwhile. */
6307 *strided_store = true;
6308 if (STMT_VINFO_VEC_STMT (stmt_info))
6309 is_store = true;
6311 else
6312 is_store = true;
6313 break;
6315 case condition_vec_info_type:
6316 gcc_assert (!slp_node);
6317 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6318 gcc_assert (done);
6319 break;
6321 case call_vec_info_type:
6322 gcc_assert (!slp_node);
6323 done = vectorizable_call (stmt, bsi, &vec_stmt);
6324 break;
6326 case reduc_vec_info_type:
6327 gcc_assert (!slp_node);
6328 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6329 gcc_assert (done);
6330 break;
6332 default:
6333 if (!STMT_VINFO_LIVE_P (stmt_info))
6335 if (vect_print_dump_info (REPORT_DETAILS))
6336 fprintf (vect_dump, "stmt not supported.");
6337 gcc_unreachable ();
6341 if (STMT_VINFO_LIVE_P (stmt_info)
6342 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6344 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6345 gcc_assert (done);
6348 if (vec_stmt)
6350 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6351 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6352 if (orig_stmt_in_pattern)
6354 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6355 /* STMT was inserted by the vectorizer to replace a computation idiom.
6356 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6357 computed this idiom. We need to record a pointer to VEC_STMT in
6358 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6359 documentation of vect_pattern_recog. */
6360 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6362 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6363 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6368 return is_store;
6372 /* This function builds ni_name = the number of iterations the loop executes,
6373 placing the computation on the loop preheader edge. */
6375 static tree
6376 vect_build_loop_niters (loop_vec_info loop_vinfo)
6378 tree ni_name, stmt, var;
6379 edge pe;
6380 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6381 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6383 var = create_tmp_var (TREE_TYPE (ni), "niters");
6384 add_referenced_var (var);
6385 ni_name = force_gimple_operand (ni, &stmt, false, var);
6387 pe = loop_preheader_edge (loop);
6388 if (stmt)
6390 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6391 gcc_assert (!new_bb);
6394 return ni_name;
6398 /* This function generates the following statements:
6400 ni_name = number of iterations loop executes
6401 ratio = ni_name / vf
6402 ratio_mult_vf_name = ratio * vf
6404 and places them at the loop preheader edge. */
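/* Worked example (illustrative numbers): with ni_name = 103 and vf = 8,
   the statements generated on the preheader edge compute

     ratio              = 103 >> 3 = 12
     ratio_mult_vf_name = 12 << 3  = 96

   so the vectorized loop runs 12 times and the remaining 103 - 96 = 7
   iterations are left for the epilog loop. */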
6406 static void
6407 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6408 tree *ni_name_ptr,
6409 tree *ratio_mult_vf_name_ptr,
6410 tree *ratio_name_ptr)
6413 edge pe;
6414 basic_block new_bb;
6415 tree stmt, ni_name;
6416 tree var;
6417 tree ratio_name;
6418 tree ratio_mult_vf_name;
6419 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6420 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6421 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6422 tree log_vf;
6424 pe = loop_preheader_edge (loop);
6426 /* Generate a temporary variable that contains
6427 the number of iterations the loop executes. */
6429 ni_name = vect_build_loop_niters (loop_vinfo);
6430 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6432 /* Create: ratio = ni >> log2(vf) */
6434 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6435 if (!is_gimple_val (ratio_name))
6437 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6438 add_referenced_var (var);
6440 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6441 pe = loop_preheader_edge (loop);
6442 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6443 gcc_assert (!new_bb);
6446 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6448 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6449 ratio_name, log_vf);
6450 if (!is_gimple_val (ratio_mult_vf_name))
6452 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6453 add_referenced_var (var);
6455 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6456 true, var);
6457 pe = loop_preheader_edge (loop);
6458 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6459 gcc_assert (!new_bb);
6462 *ni_name_ptr = ni_name;
6463 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6464 *ratio_name_ptr = ratio_name;
6466 return;
6470 /* Function vect_update_ivs_after_vectorizer.
6472 "Advance" the induction variables of LOOP to the value they should take
6473 after the execution of LOOP. This is currently necessary because the
6474 vectorizer does not handle induction variables that are used after the
6475 loop. Such a situation occurs when the last iterations of LOOP are
6476 peeled, because:
6477 1. We introduced new uses after LOOP for IVs that were not originally used
6478 after LOOP: the IVs of LOOP are now used by an epilog loop.
6479 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6480 times, whereas the loop IVs should be bumped N times.
6482 Input:
6483 - LOOP - a loop that is going to be vectorized. The last few iterations
6484 of LOOP were peeled.
6485 - NITERS - the number of iterations that LOOP executes (before it is
6486 vectorized). i.e, the number of times the ivs should be bumped.
6487 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6488 coming out from LOOP on which there are uses of the LOOP ivs
6489 (this is the path from LOOP->exit to epilog_loop->preheader).
6491 The new definitions of the ivs are placed in LOOP->exit.
6492 The phi args associated with the edge UPDATE_E in the bb
6493 UPDATE_E->dest are updated accordingly.
6495 Assumption 1: Like the rest of the vectorizer, this function assumes
6496 a single loop exit that has a single predecessor.
6498 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6499 organized in the same order.
6501 Assumption 3: The access function of the ivs is simple enough (see
6502 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6504 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6505 coming out of LOOP on which the ivs of LOOP are used (this is the path
6506 that leads to the epilog loop; other paths skip the epilog loop). This
6507 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6508 needs to have its phis updated. */
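/* For example (illustrative): an iv with access function {init, +, step}
   is advanced past NITERS iterations by emitting

     ni_name = init + NITERS * step

   at the end of the exit bb; NI_NAME then replaces the corresponding phi
   argument on the UPDATE_E edge into update_bb. */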
6511 static void
6512 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6513 edge update_e)
6515 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6516 basic_block exit_bb = single_exit (loop)->dest;
6517 tree phi, phi1;
6518 basic_block update_bb = update_e->dest;
6520 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6522 /* Make sure there exists a single-predecessor exit bb: */
6523 gcc_assert (single_pred_p (exit_bb));
6525 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6526 phi && phi1;
6527 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6529 tree access_fn = NULL;
6530 tree evolution_part;
6531 tree init_expr;
6532 tree step_expr;
6533 tree var, ni, ni_name;
6534 block_stmt_iterator last_bsi;
6536 if (vect_print_dump_info (REPORT_DETAILS))
6538 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6539 print_generic_expr (vect_dump, phi, TDF_SLIM);
6542 /* Skip virtual phis. */
6543 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6545 if (vect_print_dump_info (REPORT_DETAILS))
6546 fprintf (vect_dump, "virtual phi. skip.");
6547 continue;
6550 /* Skip reduction phis. */
6551 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6553 if (vect_print_dump_info (REPORT_DETAILS))
6554 fprintf (vect_dump, "reduc phi. skip.");
6555 continue;
6558 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6559 gcc_assert (access_fn);
6560 evolution_part =
6561 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6562 gcc_assert (evolution_part != NULL_TREE);
6564 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6565 of degree >= 2 or exponential. */
6566 gcc_assert (!tree_is_chrec (evolution_part));
6568 step_expr = evolution_part;
6569 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6570 loop->num));
6572 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6573 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6574 init_expr,
6575 fold_convert (sizetype,
6576 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6577 niters, step_expr)));
6578 else
6579 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6580 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6581 fold_convert (TREE_TYPE (init_expr),
6582 niters),
6583 step_expr),
6584 init_expr);
6588 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6589 add_referenced_var (var);
6591 last_bsi = bsi_last (exit_bb);
6592 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6593 true, BSI_SAME_STMT);
6595 /* Fix phi expressions in the successor bb. */
6596 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6600 /* Return the more conservative threshold between the
6601 min_profitable_iters returned by the cost model and the user
6602 specified threshold, if provided. */
6604 static unsigned int
6605 conservative_cost_threshold (loop_vec_info loop_vinfo,
6606 int min_profitable_iters)
6608 unsigned int th;
6609 int min_scalar_loop_bound;
6611 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6612 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6614 /* Use the cost model only if it is more conservative than user specified
6615 threshold. */
6616 th = (unsigned) min_scalar_loop_bound;
6617 if (min_profitable_iters
6618 && (!min_scalar_loop_bound
6619 || min_profitable_iters > min_scalar_loop_bound))
6620 th = (unsigned) min_profitable_iters;
6622 if (th && vect_print_dump_info (REPORT_COST))
6623 fprintf (vect_dump, "Vectorization may not be profitable.");
6625 return th;
6628 /* Function vect_do_peeling_for_loop_bound
6630 Peel the last iterations of the loop represented by LOOP_VINFO.
6631 The peeled iterations form a new epilog loop. Given that the loop now
6632 iterates NITERS times, the new epilog loop iterates
6633 NITERS % VECTORIZATION_FACTOR times.
6635 The original loop will later be made to iterate
6636 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
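/* Worked example (illustrative numbers): with NITERS = 103 and a
   vectorization factor of 8, the loop is made to iterate RATIO = 12
   times and the peeled epilog loop executes the remaining
   103 % 8 = 7 scalar iterations. */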
6638 static void
6639 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6641 tree ni_name, ratio_mult_vf_name;
6642 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6643 struct loop *new_loop;
6644 edge update_e;
6645 basic_block preheader;
6646 int loop_num;
6647 bool check_profitability = false;
6648 unsigned int th = 0;
6649 int min_profitable_iters;
6651 if (vect_print_dump_info (REPORT_DETAILS))
6652 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6654 initialize_original_copy_tables ();
6656 /* Generate the following variables on the preheader of the original loop:
6658 ni_name = the number of iterations the original loop executes
6659 ratio = ni_name / vf
6660 ratio_mult_vf_name = ratio * vf */
6661 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6662 &ratio_mult_vf_name, ratio);
6664 loop_num = loop->num;
6666 /* If the cost model check was not done during versioning or during
6667 peeling for alignment, do it now. */
6668 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6669 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6670 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6672 check_profitability = true;
6674 /* Get profitability threshold for vectorized loop. */
6675 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6677 th = conservative_cost_threshold (loop_vinfo,
6678 min_profitable_iters);
6681 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6682 ratio_mult_vf_name, ni_name, false,
6683 th, check_profitability);
6684 gcc_assert (new_loop);
6685 gcc_assert (loop_num == loop->num);
6686 #ifdef ENABLE_CHECKING
6687 slpeel_verify_cfg_after_peeling (loop, new_loop);
6688 #endif
6690 /* A guard that controls whether the new_loop is to be executed or skipped
6691 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6692 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6693 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6694 is on the path where the LOOP IVs are used and need to be updated. */
6696 preheader = loop_preheader_edge (new_loop)->src;
6697 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6698 update_e = EDGE_PRED (preheader, 0);
6699 else
6700 update_e = EDGE_PRED (preheader, 1);
6702 /* Update IVs of original loop as if they were advanced
6703 by ratio_mult_vf_name steps. */
6704 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6706 /* After peeling we have to reset scalar evolution analyzer. */
6707 scev_reset ();
6709 free_original_copy_tables ();
6713 /* Function vect_gen_niters_for_prolog_loop
6715 Set the number of iterations for the loop represented by LOOP_VINFO
6716 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6717 and the misalignment of DR - the data reference recorded in
6718 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6719 this loop, the data reference DR will refer to an aligned location.
6721 The following computation is generated:
6723 If the misalignment of DR is known at compile time:
6724 addr_mis = DR_MISALIGNMENT (dr);
6725 Else, compute address misalignment in bytes:
6726 addr_mis = addr & (vectype_size - 1)
6728 prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
6730 (elem_size = element type size; an element is the scalar element whose type
6731 is the inner type of the vectype)
6733 When the step of the data-ref in the loop is not 1 (as in interleaved data
6734 and SLP), the number of iterations of the prolog must be divided by the step
6735 (which is equal to the size of the interleaved group).
6737 The above formulas assume that VF == number of elements in the vector. This
6738 may not hold when there are multiple-types in the loop.
6739 In this case, for some data-references in the loop the VF does not represent
6740 the number of elements that fit in the vector. Therefore, instead of VF we
6741 use TYPE_VECTOR_SUBPARTS. */
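/* Worked example (illustrative numbers): for a 16-byte vectype holding
   four 4-byte elements (nelements = 4, step = 1), a known byte
   misalignment of 8 gives elem_misalign = 8 / 4 = 2, so

     iters = ((4 - 2) & (4 - 1)) / 1 = 2

   prolog iterations are needed before DR refers to a 16-byte aligned
   location. */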
6743 static tree
6744 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6746 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6747 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6748 tree var, stmt;
6749 tree iters, iters_name;
6750 edge pe;
6751 basic_block new_bb;
6752 tree dr_stmt = DR_STMT (dr);
6753 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6754 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6755 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6756 tree niters_type = TREE_TYPE (loop_niters);
6757 int step = 1;
6758 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6759 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6761 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6762 step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
6764 pe = loop_preheader_edge (loop);
6766 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6768 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6769 int elem_misalign = byte_misalign / element_size;
6771 if (vect_print_dump_info (REPORT_DETAILS))
6772 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6774 iters = build_int_cst (niters_type,
6775 (((nelements - elem_misalign) & (nelements - 1)) / step));
6777 else
6779 tree new_stmts = NULL_TREE;
6780 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6781 &new_stmts, NULL_TREE, loop);
6782 tree ptr_type = TREE_TYPE (start_addr);
6783 tree size = TYPE_SIZE (ptr_type);
6784 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6785 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6786 tree elem_size_log =
6787 build_int_cst (type, exact_log2 (vectype_align/nelements));
6788 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6789 tree nelements_tree = build_int_cst (type, nelements);
6790 tree byte_misalign;
6791 tree elem_misalign;
6793 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6794 gcc_assert (!new_bb);
6796 /* Create: byte_misalign = addr & (vectype_size - 1) */
6797 byte_misalign =
6798 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6800 /* Create: elem_misalign = byte_misalign / element_size */
6801 elem_misalign =
6802 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6804 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6805 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6806 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6807 iters = fold_convert (niters_type, iters);
6810 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6811 /* If the loop bound is known at compile time we already verified that it is
6812 greater than vf; since the misalignment ('iters') is at most vf, there's
6813 no need to generate the MIN_EXPR in this case. */
6814 if (TREE_CODE (loop_niters) != INTEGER_CST)
6815 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6817 if (vect_print_dump_info (REPORT_DETAILS))
6819 fprintf (vect_dump, "niters for prolog loop: ");
6820 print_generic_expr (vect_dump, iters, TDF_SLIM);
6823 var = create_tmp_var (niters_type, "prolog_loop_niters");
6824 add_referenced_var (var);
6825 iters_name = force_gimple_operand (iters, &stmt, false, var);
6827 /* Insert stmt on loop preheader edge. */
6828 if (stmt)
6830 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6831 gcc_assert (!new_bb);
6834 return iters_name;
6838 /* Function vect_update_init_of_dr
6840 NITERS iterations were peeled from LOOP. DR represents a data reference
6841 in LOOP. This function updates the information recorded in DR to
6842 account for the fact that the first NITERS iterations had already been
6843 executed. Specifically, it updates the OFFSET field of DR. */
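/* For example (illustrative): with DR_STEP = 4, peeling NITERS = 3
   iterations turns an offset OFF into OFF + 3 * 4 = OFF + 12, so that DR
   describes the first access that was not peeled. */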
6845 static void
6846 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6848 tree offset = DR_OFFSET (dr);
6850 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6851 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6852 DR_OFFSET (dr) = offset;
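/* [Editor's worked example; the numbers are assumptions.]  If DR_STEP is
   4 bytes and NITERS = 3 iterations were peeled, DR_OFFSET grows by
   3 * 4 = 12 bytes, i.e. the data-ref now describes the access that starts
   at the fourth element.  */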
6856 /* Function vect_update_inits_of_drs
6858 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6859 This function updates the information recorded for the data references in
6860 the loop to account for the fact that the first NITERS iterations had
6861 already been executed. Specifically, it updates the OFFSET field of
6862 each of the data_references in the loop. */
6864 static void
6865 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6867 unsigned int i;
6868 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6869 struct data_reference *dr;
6871 if (vect_print_dump_info (REPORT_DETAILS))
6872 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6874 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6875 vect_update_init_of_dr (dr, niters);
6879 /* Function vect_do_peeling_for_alignment
6881 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6882 'niters' is set to the misalignment of one of the data references in the
6883 loop, thereby forcing it to refer to an aligned location at the beginning
6884 of the execution of this loop. The data reference for which we are
6885 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6887 static void
6888 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6890 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6891 tree niters_of_prolog_loop, ni_name;
6892 tree n_iters;
6893 struct loop *new_loop;
6894 bool check_profitability = false;
6895 unsigned int th = 0;
6896 int min_profitable_iters;
6898 if (vect_print_dump_info (REPORT_DETAILS))
6899 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6901 initialize_original_copy_tables ();
6903 ni_name = vect_build_loop_niters (loop_vinfo);
6904 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6907 /* If the cost model check was not done during versioning, do it here. */
6908 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6909 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6911 check_profitability = true;
6913 /* Get profitability threshold for vectorized loop. */
6914 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6916 th = conservative_cost_threshold (loop_vinfo,
6917 min_profitable_iters);
6920 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
6921 new_loop =
6922 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6923 niters_of_prolog_loop, ni_name, true,
6924 th, check_profitability);
6926 gcc_assert (new_loop);
6927 #ifdef ENABLE_CHECKING
6928 slpeel_verify_cfg_after_peeling (new_loop, loop);
6929 #endif
6931 /* Update number of times loop executes. */
6932 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6933 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6934 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6936 /* Update the init conditions of the access functions of all data refs. */
6937 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6939 /* After peeling we have to reset scalar evolution analyzer. */
6940 scev_reset ();
6942 free_original_copy_tables ();
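/* [Editor's sketch of the resulting loop structure; illustrative only.]

     for (i = 0; i < prolog_niters; i++)    <- peeled scalar prolog; aligns
       a[i] = ...;                             the DR in LOOP_VINFO_UNALIGNED_DR
     for (; i < n; i += VF)                 <- vectorized loop, DR now aligned
       a[i .. i+VF-1] = ...;
 */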
6946 /* Function vect_create_cond_for_align_checks.
6948 Create a conditional expression that represents the alignment checks for
6949 all of data references (array element references) whose alignment must be
6950 checked at runtime.
6952 Input:
6953 COND_EXPR - input conditional expression. New conditions will be chained
6954 with logical AND operation.
6955 LOOP_VINFO - two fields of the loop information are used.
6956 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6957 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6959 Output:
6960 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6961 expression.
6962 The returned value is the conditional expression to be used in the if
6963 statement that controls which version of the loop gets executed at runtime.
6965 The algorithm makes two assumptions:
6966 1) The number of bytes "n" in a vector is a power of 2.
6967 2) An address "a" is aligned if a%n is zero, so this
6968 test can be done as a&(n-1) == 0. For example, for 16
6969 byte vectors the test is a&0xf == 0. */
6971 static void
6972 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6973 tree *cond_expr,
6974 tree *cond_expr_stmt_list)
6976 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6977 VEC(tree,heap) *may_misalign_stmts
6978 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6979 tree ref_stmt, tmp;
6980 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6981 tree mask_cst;
6982 unsigned int i;
6983 tree psize;
6984 tree int_ptrsize_type;
6985 char tmp_name[20];
6986 tree or_tmp_name = NULL_TREE;
6987 tree and_tmp, and_tmp_name, and_stmt;
6988 tree ptrsize_zero;
6989 tree part_cond_expr;
6991 /* Check that mask is one less than a power of 2, i.e., mask is
6992 all zeros followed by all ones. */
6993 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6995 /* CHECKME: what is the best integer or unsigned type to use to hold a
6996 cast from a pointer value? */
6997 psize = TYPE_SIZE (ptr_type_node);
6998 int_ptrsize_type
6999 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
7001 /* Create expression (mask & (addr_1 | ... | addr_n)) where addr_i is the
7002 address of the first vector of the i'th data reference. */
7004 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
7006 tree new_stmt_list = NULL_TREE;
7007 tree addr_base;
7008 tree addr_tmp, addr_tmp_name, addr_stmt;
7009 tree or_tmp, new_or_tmp_name, or_stmt;
7011 /* create: addr_tmp = (int)(address_of_first_vector) */
7012 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
7013 &new_stmt_list, NULL_TREE, loop);
7015 if (new_stmt_list != NULL_TREE)
7016 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
7018 sprintf (tmp_name, "%s%d", "addr2int", i);
7019 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7020 add_referenced_var (addr_tmp);
7021 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
7022 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
7023 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
7024 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
7025 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
7027 /* The addresses are ORed together. */
7029 if (or_tmp_name != NULL_TREE)
7031 /* create: or_tmp = or_tmp | addr_tmp */
7032 sprintf (tmp_name, "%s%d", "orptrs", i);
7033 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
7034 add_referenced_var (or_tmp);
7035 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
7036 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
7037 or_tmp_name, addr_tmp_name);
7038 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
7039 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
7040 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
7041 or_tmp_name = new_or_tmp_name;
7043 else
7044 or_tmp_name = addr_tmp_name;
7046 } /* end for i */
7048 mask_cst = build_int_cst (int_ptrsize_type, mask);
7050 /* create: and_tmp = or_tmp & mask */
7051 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
7052 add_referenced_var (and_tmp);
7053 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
7055 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
7056 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
7057 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
7058 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
7060 /* Make and_tmp the left operand of the conditional test against zero.
7061 If and_tmp has a nonzero bit then some address is unaligned. */
7062 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7063 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7064 and_tmp_name, ptrsize_zero);
7065 if (*cond_expr)
7066 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7067 *cond_expr, part_cond_expr);
7068 else
7069 *cond_expr = part_cond_expr;
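/* [Editor's sketch, not part of the original file.]  The runtime guard built
   above, written as plain C for two may-misalign addresses and a 16-byte
   vector (mask = 0xf).  The function name is hypothetical, and casting a
   pointer to unsigned long is assumed to preserve its low bits.  */

static int
addrs_all_aligned_sketch (const void *p1, const void *p2)
{
  /* or_tmp = addr1 | addr2;  and_tmp = or_tmp & mask;  test and_tmp == 0.  */
  unsigned long or_tmp = (unsigned long) p1 | (unsigned long) p2;
  return (or_tmp & 0xf) == 0;  /* any nonzero low bit => some ref unaligned */
}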
7072 /* Function vect_vfa_segment_size.
7074 Create an expression that computes the size of the segment
7075 that will be accessed for a data reference. The function takes into
7076 account that realignment loads may access one more vector.
7078 Input:
7079 DR: The data reference.
7080 VECT_FACTOR: vectorization factor.
7082 Return an expression whose value is the size of the segment which will be
7083 accessed by DR. */
7085 static tree
7086 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7088 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7089 DR_STEP (dr), vect_factor);
7091 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7093 tree vector_size = TYPE_SIZE_UNIT
7094 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7096 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7097 segment_length, vector_size);
7099 return fold_convert (sizetype, segment_length);
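/* [Editor's worked example; the numbers are assumptions.]  For a unit-stride
   int store, DR_STEP = 4 bytes and VF = 4 give a segment of 4 * 4 = 16 bytes
   per vector iteration; with dr_explicit_realign_optimized one extra vector
   is read, e.g. 16 + 16 = 32 bytes.  */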
7102 /* Function vect_create_cond_for_alias_checks.
7104 Create a conditional expression that represents the run-time checks for
7105 overlapping of address ranges represented by a list of data dependence
7106 relations passed as input.
7108 Input:
7109 COND_EXPR - input conditional expression. New conditions will be chained
7110 with logical AND operation.
7111 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
7112 to be checked.
7114 Output:
7115 COND_EXPR - conditional expression.
7116 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7117 expression.
7120 The returned value is the conditional expression to be used in the if
7121 statement that controls which version of the loop gets executed at runtime. */
7124 static void
7125 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7126 tree * cond_expr,
7127 tree * cond_expr_stmt_list)
7129 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7130 VEC (ddr_p, heap) * may_alias_ddrs =
7131 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7132 tree vect_factor =
7133 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7135 ddr_p ddr;
7136 unsigned int i;
7137 tree part_cond_expr;
7139 /* Create expression
7140 (((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7141 || ((load_ptr_0 + load_segment_length_0) < store_ptr_0))
7142 &&
7143 ...
7144 &&
7145 (((store_ptr_n + store_segment_length_n) < load_ptr_n)
7146 || ((load_ptr_n + load_segment_length_n) < store_ptr_n)) */
7148 if (VEC_empty (ddr_p, may_alias_ddrs))
7149 return;
7151 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7153 struct data_reference *dr_a, *dr_b;
7154 tree dr_group_first_a, dr_group_first_b;
7155 tree addr_base_a, addr_base_b;
7156 tree segment_length_a, segment_length_b;
7157 tree stmt_a, stmt_b;
7159 dr_a = DDR_A (ddr);
7160 stmt_a = DR_STMT (DDR_A (ddr));
7161 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7162 if (dr_group_first_a)
7164 stmt_a = dr_group_first_a;
7165 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7168 dr_b = DDR_B (ddr);
7169 stmt_b = DR_STMT (DDR_B (ddr));
7170 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7171 if (dr_group_first_b)
7173 stmt_b = dr_group_first_b;
7174 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7177 addr_base_a =
7178 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7179 NULL_TREE, loop);
7180 addr_base_b =
7181 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7182 NULL_TREE, loop);
7184 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7185 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7187 if (vect_print_dump_info (REPORT_DR_DETAILS))
7189 fprintf (vect_dump,
7190 "create runtime check for data references ");
7191 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7192 fprintf (vect_dump, " and ");
7193 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7197 part_cond_expr =
7198 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7199 fold_build2 (LT_EXPR, boolean_type_node,
7200 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7201 addr_base_a,
7202 segment_length_a),
7203 addr_base_b),
7204 fold_build2 (LT_EXPR, boolean_type_node,
7205 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7206 addr_base_b,
7207 segment_length_b),
7208 addr_base_a));
7210 if (*cond_expr)
7211 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7212 *cond_expr, part_cond_expr);
7213 else
7214 *cond_expr = part_cond_expr;
7216 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7217 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7218 VEC_length (ddr_p, may_alias_ddrs));
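/* [Editor's sketch, not part of the original file.]  One emitted pair check,
   as plain C: two segments cannot overlap if either one ends before the
   other begins.  Like part_cond_expr above, the test uses strict '<', which
   is conservative (exactly adjacent segments fall back to the scalar loop).
   The function name is hypothetical.  */

static int
segments_do_not_overlap_sketch (const char *addr_a, unsigned long len_a,
                                const char *addr_b, unsigned long len_b)
{
  return (addr_a + len_a < addr_b) || (addr_b + len_b < addr_a);
}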
7222 /* Function vect_loop_versioning.
7224 If the loop has data references that may or may not be aligned and/or
7225 has data reference relations whose independence was not proven, then
7226 two versions of the loop need to be generated, one which is vectorized
7227 and one which isn't. A test is then generated to control which of the
7228 loops is executed. The test checks the alignment of all of the
7229 data references that may or may not be aligned. An additional
7230 sequence of runtime tests is generated for each pair of DDRs whose
7231 independence was not proven. The vectorized version of the loop is
7232 executed only if both the alias and the alignment tests pass.
7234 The test generated to check which version of the loop is executed
7235 is also extended to check for profitability, as indicated by the
7236 cost model. */
7238 static void
7239 vect_loop_versioning (loop_vec_info loop_vinfo)
7241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7242 struct loop *nloop;
7243 tree cond_expr = NULL_TREE;
7244 tree cond_expr_stmt_list = NULL_TREE;
7245 basic_block condition_bb;
7246 block_stmt_iterator cond_exp_bsi;
7247 basic_block merge_bb;
7248 basic_block new_exit_bb;
7249 edge new_exit_e, e;
7250 tree orig_phi, new_phi, arg;
7251 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7252 tree gimplify_stmt_list;
7253 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7254 int min_profitable_iters = 0;
7255 unsigned int th;
7257 /* Get profitability threshold for vectorized loop. */
7258 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7260 th = conservative_cost_threshold (loop_vinfo,
7261 min_profitable_iters);
7263 cond_expr =
7264 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7265 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7267 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7268 false, NULL_TREE);
7270 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7271 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7272 &cond_expr_stmt_list);
7274 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7275 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7276 &cond_expr_stmt_list);
7278 cond_expr =
7279 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7280 cond_expr =
7281 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7282 NULL_TREE);
7283 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7285 initialize_original_copy_tables ();
7286 nloop = loop_version (loop, cond_expr, &condition_bb,
7287 prob, prob, REG_BR_PROB_BASE - prob, true);
7288 free_original_copy_tables();
7290 /* Loop versioning violates an assumption we try to maintain during
7291 vectorization - that the loop exit block has a single predecessor.
7292 After versioning, the exit block of both loop versions is the same
7293 basic block (i.e. it has two predecessors). To simplify the
7294 transformations that follow in the vectorizer, we fix this situation
7295 here by adding a new (empty) block on the exit edge of the loop,
7296 with the proper loop-exit phis to maintain loop-closed-form. */
7298 merge_bb = single_exit (loop)->dest;
7299 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7300 new_exit_bb = split_edge (single_exit (loop));
7301 new_exit_e = single_exit (loop);
7302 e = EDGE_SUCC (new_exit_bb, 0);
7304 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7305 orig_phi = PHI_CHAIN (orig_phi))
7307 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7308 new_exit_bb);
7309 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7310 add_phi_arg (new_phi, arg, new_exit_e);
7311 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7314 /* End loop-exit-fixes after versioning. */
7316 update_ssa (TODO_update_ssa);
7317 if (cond_expr_stmt_list)
7319 cond_exp_bsi = bsi_last (condition_bb);
7320 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
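/* [Editor's sketch of the control flow built above; illustrative only.]

     if (scalar_loop_iters > th && addrs_all_aligned && no_alias)
       { vectorized loop }
     else
       { scalar loop }

   Both versions initially share one exit block; the empty block inserted
   above restores the single-predecessor exit the vectorizer relies on.  */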
7324 /* Remove a group of stores (for SLP or interleaving) and free their
7325 stmt_vec_info. */
7327 static void
7328 vect_remove_stores (tree first_stmt)
7330 tree next = first_stmt;
7331 tree tmp;
7332 block_stmt_iterator next_si;
7334 while (next)
7336 /* Free the attached stmt_vec_info and remove the stmt. */
7337 next_si = bsi_for_stmt (next);
7338 bsi_remove (&next_si, true);
7339 tmp = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
7340 free_stmt_vec_info (next);
7341 next = tmp;
7346 /* Vectorize SLP instance tree in postorder. */
7348 static bool
7349 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7351 tree stmt;
7352 bool strided_store, is_store;
7353 block_stmt_iterator si;
7354 stmt_vec_info stmt_info;
7356 if (!node)
7357 return false;
7359 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7360 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7362 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7363 stmt_info = vinfo_for_stmt (stmt);
7364 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7365 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7367 if (vect_print_dump_info (REPORT_DETAILS))
7369 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7370 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7373 si = bsi_for_stmt (stmt);
7374 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7375 if (is_store)
7377 if (DR_GROUP_FIRST_DR (stmt_info))
7378 /* If IS_STORE is TRUE, the vectorization of the
7379 interleaving chain was completed - free all the stores in
7380 the chain. */
7381 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7382 else
7383 /* FORNOW: SLP originates only from strided stores. */
7384 gcc_unreachable ();
7386 return true;
7389 /* FORNOW: SLP originates only from strided stores. */
7390 return false;
7394 static bool
7395 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7397 VEC (slp_instance, heap) *slp_instances =
7398 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7399 slp_instance instance;
7400 unsigned int vec_stmts_size;
7401 unsigned int group_size, i;
7402 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7403 bool is_store = false;
7405 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7407 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7408 /* For each SLP instance calculate the number of vector stmts to be
7409 created for the scalar stmts in each node of the SLP tree. The number
7410 of scalar elements handled in one vectorized iteration is GROUP_SIZE
7411 multiplied by VF, so dividing by the number of elements in one vector
7412 (NUNITS) gives the number of vector stmts per node. */
7413 vec_stmts_size = vectorization_factor * group_size / nunits;
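/* [Editor's worked example; the numbers are assumptions.]  With
   GROUP_SIZE = 2 interleaved stores, VF = 4 and 4-element vectors
   (nunits = 4), each node needs 4 * 2 / 4 = 2 vector stmts.  */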
7415 /* Schedule the tree of INSTANCE. */
7416 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7417 vec_stmts_size);
7419 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7420 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7421 fprintf (vect_dump, "vectorizing stmts using SLP.");
7424 return is_store;
7427 /* Function vect_transform_loop.
7429 The analysis phase has determined that the loop is vectorizable.
7430 Vectorize the loop - create vectorized stmts to replace the scalar
7431 stmts in the loop, and update the loop exit condition. */
7433 void
7434 vect_transform_loop (loop_vec_info loop_vinfo)
7436 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7437 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7438 int nbbs = loop->num_nodes;
7439 block_stmt_iterator si;
7440 int i;
7441 tree ratio = NULL;
7442 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7443 bool strided_store;
7444 bool slp_scheduled = false;
7445 unsigned int nunits;
7447 if (vect_print_dump_info (REPORT_DETAILS))
7448 fprintf (vect_dump, "=== vect_transform_loop ===");
7450 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7451 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7452 vect_loop_versioning (loop_vinfo);
7454 /* CHECKME: we wouldn't need this if we called update_ssa once
7455 for all loops. */
7456 bitmap_zero (vect_memsyms_to_rename);
7458 /* Peel the loop if there are data refs with unknown alignment.
7459 Only one data ref with unknown alignment is allowed. */
7461 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7462 vect_do_peeling_for_alignment (loop_vinfo);
7464 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7465 compile time constant), or it is a constant that is not divisible by
7466 the vectorization factor, then an epilog loop needs to be created.
7467 We therefore duplicate the loop: the original loop will be vectorized,
7468 and will compute the first (n/VF) iterations. The second copy of the loop
7469 will remain scalar and will compute the remaining (n%VF) iterations.
7470 (VF is the vectorization factor). */
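/* [Editor's worked example; the numbers are assumptions.]  For n = 37 and
   VF = 4, the vectorized loop runs ratio = 37 / 4 = 9 iterations
   (36 elements) and the scalar epilog loop computes the remaining
   37 % 4 = 1 iteration.  */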
7472 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7473 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7474 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7475 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7476 else
7477 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7478 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7480 /* 1) Make sure the loop header has exactly two entries
7481 2) Make sure we have a preheader basic block. */
7483 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7485 split_edge (loop_preheader_edge (loop));
7487 /* FORNOW: the vectorizer supports only loops whose body consists
7488 of one basic block (header + empty latch). When the vectorizer
7489 supports more involved loop forms, the order in which the BBs are
7490 traversed will need to be reconsidered. */
7492 for (i = 0; i < nbbs; i++)
7494 basic_block bb = bbs[i];
7495 stmt_vec_info stmt_info;
7496 tree phi;
7498 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7500 if (vect_print_dump_info (REPORT_DETAILS))
7502 fprintf (vect_dump, "------>vectorizing phi: ");
7503 print_generic_expr (vect_dump, phi, TDF_SLIM);
7505 stmt_info = vinfo_for_stmt (phi);
7506 if (!stmt_info)
7507 continue;
7509 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7510 && !STMT_VINFO_LIVE_P (stmt_info))
7511 continue;
7513 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7514 != (unsigned HOST_WIDE_INT) vectorization_factor)
7515 && vect_print_dump_info (REPORT_DETAILS))
7516 fprintf (vect_dump, "multiple-types.");
7518 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7520 if (vect_print_dump_info (REPORT_DETAILS))
7521 fprintf (vect_dump, "transform phi.");
7522 vect_transform_stmt (phi, NULL, NULL, NULL);
7526 for (si = bsi_start (bb); !bsi_end_p (si);)
7528 tree stmt = bsi_stmt (si);
7529 bool is_store;
7531 if (vect_print_dump_info (REPORT_DETAILS))
7533 fprintf (vect_dump, "------>vectorizing statement: ");
7534 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7537 stmt_info = vinfo_for_stmt (stmt);
7539 /* vector stmts created in the outer-loop during vectorization of
7540 stmts in an inner-loop may not have a stmt_info, and do not
7541 need to be vectorized. */
7542 if (!stmt_info)
7544 bsi_next (&si);
7545 continue;
7548 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7549 && !STMT_VINFO_LIVE_P (stmt_info))
7551 bsi_next (&si);
7552 continue;
7555 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7556 nunits =
7557 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7558 if (!STMT_SLP_TYPE (stmt_info)
7559 && nunits != (unsigned int) vectorization_factor
7560 && vect_print_dump_info (REPORT_DETAILS))
7561 /* For SLP, VF is set according to the unrolling factor, not the
7562 vector size, hence this diagnostic is not valid for SLP. */
7563 fprintf (vect_dump, "multiple-types.");
7565 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7566 reached. */
7567 if (STMT_SLP_TYPE (stmt_info))
7569 if (!slp_scheduled)
7571 slp_scheduled = true;
7573 if (vect_print_dump_info (REPORT_DETAILS))
7574 fprintf (vect_dump, "=== scheduling SLP instances ===");
7576 is_store = vect_schedule_slp (loop_vinfo, nunits);
7578 /* IS_STORE is true if STMT is a store. Stores cannot be of
7579 hybrid SLP type. They are removed in
7580 vect_schedule_slp_instance and their vinfo is destroyed. */
7581 if (is_store)
7583 bsi_next (&si);
7584 continue;
7588 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7589 if (PURE_SLP_STMT (stmt_info))
7591 bsi_next (&si);
7592 continue;
7596 /* -------- vectorize statement ------------ */
7597 if (vect_print_dump_info (REPORT_DETAILS))
7598 fprintf (vect_dump, "transform statement.");
7600 strided_store = false;
7601 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7602 if (is_store)
7604 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7606 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7607 interleaving chain was completed - free all the stores in
7608 the chain. */
7609 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7610 bsi_remove (&si, true);
7611 continue;
7613 else
7615 /* Free the attached stmt_vec_info and remove the stmt. */
7616 free_stmt_vec_info (stmt);
7617 bsi_remove (&si, true);
7618 continue;
7621 bsi_next (&si);
7622 } /* stmts in BB */
7623 } /* BBs in loop */
7625 slpeel_make_loop_iterate_ntimes (loop, ratio);
7627 mark_set_for_renaming (vect_memsyms_to_rename);
7629 /* The memory tags and pointers in vectorized statements need to
7630 have their SSA forms updated. FIXME, why can't this be delayed
7631 until all the loops have been transformed? */
7632 update_ssa (TODO_update_ssa);
7634 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7635 fprintf (vect_dump, "LOOP VECTORIZED.");
7636 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7637 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");