gcc/tree-vect-transform.c
/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
  {
  case load_vec_info_type:
    return TARG_SCALAR_LOAD_COST;
  case store_vec_info_type:
    return TARG_SCALAR_STORE_COST;
  case op_vec_info_type:
  case condition_vec_info_type:
  case assignment_vec_info_type:
  case reduc_vec_info_type:
  case induc_vec_info_type:
  case type_promotion_vec_info_type:
  case type_demotion_vec_info_type:
  case type_conversion_vec_info_type:
  case call_vec_info_type:
    return TARG_SCALAR_STMT_COST;
  case undef_vec_info_type:
  default:
    gcc_unreachable ();
  }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */
int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign;
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue
                                ? niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
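  /* Illustration of the peeling arithmetic above (hypothetical numbers, not
     taken from any particular target): for a V4SI access (nelements = 4,
     element_size = 4 bytes) with a known byte_misalign of 8, the prologue
     peels 4 - 8/4 = 2 iterations; if additionally niters = 100 and vf = 4,
     the epilogue then peels (100 - 2) % 4 = 2 iterations.  */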
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* Allow targets to add additional (outside-of-loop) costs.  FORNOW, the
     only information we provide for the target is whether testing against
     the threshold involves a runtime test.  */
  if (targetm.vectorize.builtin_vectorization_cost)
    {
      bool runtime_test = false;

      /* If the number of iterations is unknown, or the
         peeling-for-misalignment amount is unknown, we will have to generate
         a runtime test to test the loop count against the threshold.  */
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          || (byte_misalign < 0))
        runtime_test = true;
      vec_outside_cost +=
        targetm.vectorize.builtin_vectorization_cost (runtime_test);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding target out-of-loop cost = %d",
                 targetm.vectorize.builtin_vectorization_cost (runtime_test));
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost and VF = vectorization factor.  */

  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = (vec_outside_cost * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + (vec_outside_cost * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar cost: %d\n", scalar_single_iter_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
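/* Illustration of the threshold computation above (hypothetical costs):
   with SIC = 4, VIC = 6, VOC = 20, VF = 4 and no peeling, the division
   gives (20 * 4) / (4 * 4 - 6) = 8; the correction test then compares
   4 * 4 * 8 = 128 with 6 * 8 + 20 * 4 = 128, so min_profitable_iters is
   bumped to 9, and after the final decrement the runtime guard skips the
   vectorized loop unless niters > 8.  */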
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */
static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type - 1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}
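/* Worked example of the epilogue cost above (hypothetical): reducing a
   V4SI vector with a whole-vector shift available costs
   exact_log2 (4) * 2 = 4 vector stmts plus one vec-to-scalar extract;
   without the shift it costs 4 + 3 = 7 vector stmts (4 extracts and
   3 scalar reduction ops).  */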
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmt.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
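/* Illustration (hypothetical): interleaving a group of 4 stores with
   ncopies = 1 costs exact_log2 (4) * 4 = 8 vector stmts for the permutes,
   plus 1 vector store, so inside_cost = 8 * TARG_VEC_STMT_COST
   + TARG_VEC_STORE_COST.  */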
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2 (group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2 * TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2 * TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }
    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
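/* Sketch of how the schemes above compare (illustrative, ncopies = 1):
   dr_aligned costs one vector load; dr_unaligned_supported one unaligned
   vector load; dr_explicit_realign two loads plus a realignment stmt per
   copy; dr_explicit_realign_optimized one load plus a realignment stmt per
   copy, with the address/priming work (2-3 stmts) paid once outside the
   loop.  */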
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to the
   name of vectorizer generated variables, using NAME as the base name if it
   is provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
  {
  case vect_simple_var:
    prefix = "vect_";
    break;
  case vect_scalar_var:
    prefix = "stmp_";
    break;
  case vect_pointer_var:
    prefix = "vect_p";
    break;
  default:
    gcc_unreachable ();
  }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
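/* For example (illustrative only): a call with VAR_KIND = vect_pointer_var
   and NAME = "a" yields a temporary named along the lines of "vect_pa",
   while passing NULL for NAME falls back to the bare "vect_p" prefix.  */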
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:  Specify relative to which loop-nest the address should be computed.
          For example, when the dataref is in an inner-loop nested in an
          outer-loop that is now being vectorized, LOOP can be either the
          outer-loop, or the inner-loop.  The first memory location accessed
          by the following dataref ('in' points to short):

          for (i=0; i<N; i++)
            for (j=0; j<M; j++)
              s += in[i+j]

          is as follows:
          if LOOP=i_loop:  &in        (relative to i_loop)
          if LOOP=j_loop:  &in+i*2B   (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree new_base_stmt;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
                                        true, data_ref_base_var);
  append_to_statement_list_force (new_base_stmt, new_stmt_list);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  block_stmt_iterator incr_bsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  tree incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: One relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:  vp1 = phi(vp0,vp2)
               ...
               vp2 = vp1 + step
               goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:  vp1 = phi(vp0,vp2)
               ...
        inner:   vp3 = phi(vp1,vp4)
                 vp4 = vp3 + inner_step
                 if () goto inner
               ...
               vp2 = vp1 + step
               if () goto LOOP   */

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop vectorization).
   **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_bsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (POINTER_PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
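/* For instance (illustrative): for a V8HI vector pointer the default bump
   is TYPE_SIZE_UNIT (V8HI) = 16 bytes, so the new stmt is
   NEW_DATAREF_PTR = DATAREF_PTR + 16, and PTR_INCR's use of DATAREF_PTR is
   redirected to NEW_DATAREF_PTR so the cross-iteration chain stays intact.  */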
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type,
                  block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  if (bsi)
    vect_finish_stmt_generation (stmt, init_stmt, bsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  tree stmt = VEC_index (tree, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop, operation;
  int group_size = VEC_length (tree, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);

  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;

  number_of_places_left_in_vector = nunits;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
        {
          operation = GIMPLE_STMT_OPERAND (stmt, 1);
          if (is_store)
            op = operation;
          else
            op = TREE_OPERAND (operation, op_num);

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              vec_cst = build_constructor_from_list (vector_type, t);
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  tree vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must already be vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  tree operation, first_stmt;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
  if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
1482 static tree
1483 get_initial_def_for_induction (tree iv_phi)
1485 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1486 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1487 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1488 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1489 tree vectype;
1490 int nunits;
1491 edge pe = loop_preheader_edge (loop);
1492 struct loop *iv_loop;
1493 basic_block new_bb;
1494 tree vec, vec_init, vec_step, t;
1495 tree access_fn;
1496 tree new_var;
1497 tree new_name;
1498 tree init_stmt;
1499 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1500 tree init_expr, step_expr;
1501 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1502 int i;
1503 bool ok;
1504 int ncopies;
1505 tree expr;
1506 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1507 bool nested_in_vect_loop = false;
1508 tree stmts;
1509 imm_use_iterator imm_iter;
1510 use_operand_p use_p;
1511 tree exit_phi;
1512 edge latch_e;
1513 tree loop_arg;
1514 block_stmt_iterator si;
1515 basic_block bb = bb_for_stmt (iv_phi);
1517 vectype = get_vectype_for_scalar_type (scalar_type);
1518 gcc_assert (vectype);
1519 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1520 ncopies = vf / nunits;
1522 gcc_assert (phi_info);
1523 gcc_assert (ncopies >= 1);
1525 /* Find the first insertion point in the BB. */
1526 si = bsi_after_labels (bb);
1528 if (INTEGRAL_TYPE_P (scalar_type))
1529 step_expr = build_int_cst (scalar_type, 0);
1530 else
1531 step_expr = build_real (scalar_type, dconst0);
1533 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1534 if (nested_in_vect_loop_p (loop, iv_phi))
1536 nested_in_vect_loop = true;
1537 iv_loop = loop->inner;
1539 else
1540 iv_loop = loop;
1541 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1543 latch_e = loop_latch_edge (iv_loop);
1544 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1546 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1547 gcc_assert (access_fn);
1548 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1549 &init_expr, &step_expr);
1550 gcc_assert (ok);
1551 pe = loop_preheader_edge (iv_loop);
1553 /* Create the vector that holds the initial_value of the induction. */
1554 if (nested_in_vect_loop)
1556 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1557 been created during vectorization of previous stmts; We obtain it from
1558 the STMT_VINFO_VEC_STMT of the defining stmt. */
1559 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1560 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1562 else
1564 /* iv_loop is the loop to be vectorized. Create:
1565 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1566 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1567 add_referenced_var (new_var);
1569 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1570 if (stmts)
1572 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1573 gcc_assert (!new_bb);
1576 t = NULL_TREE;
1577 t = tree_cons (NULL_TREE, init_expr, t);
1578 for (i = 1; i < nunits; i++)
1580 tree tmp;
1582 /* Create: new_name_i = new_name + step_expr */
1583 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1584 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1585 new_name = make_ssa_name (new_var, init_stmt);
1586 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1588 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1589 gcc_assert (!new_bb);
1591 if (vect_print_dump_info (REPORT_DETAILS))
1593 fprintf (vect_dump, "created new init_stmt: ");
1594 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1596 t = tree_cons (NULL_TREE, new_name, t);
1598 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1599 vec = build_constructor_from_list (vectype, nreverse (t));
1600 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1604 /* Create the vector that holds the step of the induction. */
1605 if (nested_in_vect_loop)
1606 /* iv_loop is nested in the loop to be vectorized. Generate:
1607 vec_step = [S, S, S, S] */
1608 new_name = step_expr;
1609 else
1611 /* iv_loop is the loop to be vectorized. Generate:
1612 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1613 expr = build_int_cst (scalar_type, vf);
1614 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1617 t = NULL_TREE;
1618 for (i = 0; i < nunits; i++)
1619 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1620 vec = build_constructor_from_list (vectype, t);
1621 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1624 /* Create the following def-use cycle:
1625 loop prolog:
1626 vec_init = ...
1627 vec_step = ...
1628 loop:
1629 vec_iv = PHI <vec_init, vec_loop>
1631 STMT
1633 vec_loop = vec_iv + vec_step; */
1635 /* Create the induction-phi that defines the induction-operand. */
1636 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1637 add_referenced_var (vec_dest);
1638 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1639 set_stmt_info (get_stmt_ann (induction_phi),
1640 new_stmt_vec_info (induction_phi, loop_vinfo));
1641 induc_def = PHI_RESULT (induction_phi);
1643 /* Create the iv update inside the loop */
1644 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1645 build2 (PLUS_EXPR, vectype,
1646 induc_def, vec_step));
1647 vec_def = make_ssa_name (vec_dest, new_stmt);
1648 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1649 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1650 set_stmt_info (get_stmt_ann (new_stmt),
1651 new_stmt_vec_info (new_stmt, loop_vinfo));
1653 /* Set the arguments of the phi node: */
1654 add_phi_arg (induction_phi, vec_init, pe);
1655 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1658 /* In case that vectorization factor (VF) is bigger than the number
1659 of elements that we can fit in a vectype (nunits), we have to generate
1660 more than one vector stmt - i.e - we need to "unroll" the
1661 vector stmt by a factor VF/nunits. For more details see documentation
1662 in vectorizable_operation. */
1664 if (ncopies > 1)
1666 stmt_vec_info prev_stmt_vinfo;
1667 /* FORNOW. This restriction should be relaxed. */
1668 gcc_assert (!nested_in_vect_loop);
1670 /* Create the vector that holds the step of the induction. */
1671 expr = build_int_cst (scalar_type, nunits);
1672 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1673 t = NULL_TREE;
1674 for (i = 0; i < nunits; i++)
1675 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1676 vec = build_constructor_from_list (vectype, t);
1677 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1679 vec_def = induc_def;
1680 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1681 for (i = 1; i < ncopies; i++)
1683 tree tmp;
1685 /* vec_i = vec_prev + vec_step */
1686 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1687 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1688 vec_def = make_ssa_name (vec_dest, new_stmt);
1689 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1690 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1691 set_stmt_info (get_stmt_ann (new_stmt),
1692 new_stmt_vec_info (new_stmt, loop_vinfo));
1693 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1694 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1698 if (nested_in_vect_loop)
1700 /* Find the loop-closed exit-phi of the induction, and record
1701 the final vector of induction results: */
1702 exit_phi = NULL;
1703 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1705 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1707 exit_phi = USE_STMT (use_p);
1708 break;
1711 if (exit_phi)
1713 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1714 /* FORNOW. We do not yet support the case in which an inner-loop induction
1715 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
1716 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1717 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1719 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1720 if (vect_print_dump_info (REPORT_DETAILS))
1722 fprintf (vect_dump, "vector of inductions after inner-loop:");
1723 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1729 if (vect_print_dump_info (REPORT_DETAILS))
1731 fprintf (vect_dump, "transform induction: created def-use cycle:");
1732 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1733 fprintf (vect_dump, "\n");
1734 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1737 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1738 return induc_def;
1742 /* Function vect_get_vec_def_for_operand.
1744 OP is an operand in STMT. This function returns a (vector) def that will be
1745 used in the vectorized stmt for STMT.
1747 In the case that OP is an SSA_NAME which is defined in the loop, then
1748 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1750 In case OP is an invariant or constant, a new stmt that creates a vector def
1751 needs to be introduced. */
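/* A minimal sketch of the cases handled below (assuming a V4SI vectype;
   the operand values are illustrative):
     OP is the constant 3       -> return 'vect_cst_ = {3,3,3,3}'
     OP is a loop-invariant x   -> return 'vec_inv = {x,x,x,x}'
     OP is defined in the loop  -> return the result of the vectorized
                                   def-stmt recorded in STMT_VINFO_VEC_STMT.  */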
1753 static tree
1754 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1756 tree vec_oprnd;
1757 tree vec_stmt;
1758 tree def_stmt;
1759 stmt_vec_info def_stmt_info = NULL;
1760 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1761 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1762 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1763 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1764 tree vec_inv;
1765 tree vec_cst;
1766 tree t = NULL_TREE;
1767 tree def;
1768 int i;
1769 enum vect_def_type dt;
1770 bool is_simple_use;
1771 tree vector_type;
1773 if (vect_print_dump_info (REPORT_DETAILS))
1775 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1776 print_generic_expr (vect_dump, op, TDF_SLIM);
1779 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1780 gcc_assert (is_simple_use);
1781 if (vect_print_dump_info (REPORT_DETAILS))
1783 if (def)
1785 fprintf (vect_dump, "def = ");
1786 print_generic_expr (vect_dump, def, TDF_SLIM);
1788 if (def_stmt)
1790 fprintf (vect_dump, " def_stmt = ");
1791 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1795 switch (dt)
1797 /* Case 1: operand is a constant. */
1798 case vect_constant_def:
1800 if (scalar_def)
1801 *scalar_def = op;
1803 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1804 if (vect_print_dump_info (REPORT_DETAILS))
1805 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1807 for (i = nunits - 1; i >= 0; --i)
1809 t = tree_cons (NULL_TREE, op, t);
1811 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1812 gcc_assert (vector_type);
1813 vec_cst = build_vector (vector_type, t);
1815 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1818 /* Case 2: operand is defined outside the loop - loop invariant. */
1819 case vect_invariant_def:
1821 if (scalar_def)
1822 *scalar_def = def;
1824 /* Create 'vec_inv = {inv,inv,..,inv}' */
1825 if (vect_print_dump_info (REPORT_DETAILS))
1826 fprintf (vect_dump, "Create vector_inv.");
1828 for (i = nunits - 1; i >= 0; --i)
1830 t = tree_cons (NULL_TREE, def, t);
1833 /* FIXME: use build_constructor directly. */
1834 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1835 gcc_assert (vector_type);
1836 vec_inv = build_constructor_from_list (vector_type, t);
1837 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1840 /* Case 3: operand is defined inside the loop. */
1841 case vect_loop_def:
1843 if (scalar_def)
1844 *scalar_def = def_stmt;
1846 /* Get the def from the vectorized stmt. */
1847 def_stmt_info = vinfo_for_stmt (def_stmt);
1848 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1849 gcc_assert (vec_stmt);
1850 if (TREE_CODE (vec_stmt) == PHI_NODE)
1851 vec_oprnd = PHI_RESULT (vec_stmt);
1852 else
1853 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1854 return vec_oprnd;
1857 /* Case 4: operand is defined by a loop-header phi - reduction. */
1858 case vect_reduction_def:
1860 struct loop *loop;
1862 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1863 loop = (bb_for_stmt (def_stmt))->loop_father;
1865 /* Get the def before the loop */
1866 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1867 return get_initial_def_for_reduction (stmt, op, scalar_def);
1870 /* Case 5: operand is defined by loop-header phi - induction. */
1871 case vect_induction_def:
1873 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1875 /* Get the def from the vectorized stmt. */
1876 def_stmt_info = vinfo_for_stmt (def_stmt);
1877 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1878 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1879 vec_oprnd = PHI_RESULT (vec_stmt);
1880 return vec_oprnd;
1883 default:
1884 gcc_unreachable ();
1889 /* Function vect_get_vec_def_for_stmt_copy
1891 Return a vector-def for an operand. This function is used when the
1892 vectorized stmt to be created (by the caller to this function) is a "copy"
1893 created in case the vectorized result cannot fit in one vector, and several
1894 copies of the vector-stmt are required. In this case the vector-def is
1895 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1896 of the stmt that defines VEC_OPRND.
1897 DT is the type of the vector def VEC_OPRND.
1899 Context:
1900 In case the vectorization factor (VF) is bigger than the number
1901 of elements that can fit in a vectype (nunits), we have to generate
1902 more than one vector stmt to vectorize the scalar stmt. This situation
1903 arises when there are multiple data-types operated upon in the loop; the
1904 smallest data-type determines the VF, and as a result, when vectorizing
1905 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1906 vector stmt (each computing a vector of 'nunits' results, and together
1907 computing 'VF' results in each iteration). This function is called when
1908 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1909 which VF=16 and nunits=4, so the number of copies required is 4):
1911 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1913 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1914 VS1.1: vx.1 = memref1 VS1.2
1915 VS1.2: vx.2 = memref2 VS1.3
1916 VS1.3: vx.3 = memref3
1918 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1919 VSnew.1: vz1 = vx.1 + ... VSnew.2
1920 VSnew.2: vz2 = vx.2 + ... VSnew.3
1921 VSnew.3: vz3 = vx.3 + ...
1923 The vectorization of S1 is explained in vectorizable_load.
1924 The vectorization of S2:
1925 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1926 the function 'vect_get_vec_def_for_operand' is called to
1927 get the relevant vector-def for each operand of S2. For operand x it
1928 returns the vector-def 'vx.0'.
1930 To create the remaining copies of the vector-stmt (VSnew.j), this
1931 function is called to get the relevant vector-def for each operand. It is
1932 obtained from the respective VS1.j stmt, which is recorded in the
1933 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1935 For example, to obtain the vector-def 'vx.1' in order to create the
1936 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1937 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
1938 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1939 and return its def ('vx.1').
1940 Overall, to create the above sequence this function will be called 3 times:
1941 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1942 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1943 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1945 static tree
1946 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1948 tree vec_stmt_for_operand;
1949 stmt_vec_info def_stmt_info;
1951 /* Do nothing; can reuse same def. */
1952 if (dt == vect_invariant_def || dt == vect_constant_def)
1953 return vec_oprnd;
1955 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1956 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1957 gcc_assert (def_stmt_info);
1958 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1959 gcc_assert (vec_stmt_for_operand);
1960 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1961 return vec_oprnd;
1965 /* Get vectorized definitions for the operands to create a copy of an original
1966 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
1968 static void
1969 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
1970 VEC(tree,heap) **vec_oprnds0,
1971 VEC(tree,heap) **vec_oprnds1)
1973 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
1975 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
1976 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
1978 if (vec_oprnds1 && *vec_oprnds1)
1980 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
1981 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
1982 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
1987 /* Get vectorized definitions for OP0 and OP1; if SLP_NODE is not NULL, the defs are taken from it instead. */
1989 static void
1990 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
1991 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
1993 if (slp_node)
1994 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
1995 else
1997 tree vec_oprnd;
1999 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2000 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2001 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2003 if (op1)
2005 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2006 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2007 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2013 /* Function vect_finish_stmt_generation.
2015 Insert the new stmt VEC_STMT before BSI, and record a stmt_vec_info for it. */
2017 static void
2018 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2019 block_stmt_iterator *bsi)
2021 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2024 gcc_assert (stmt == bsi_stmt (*bsi));
2025 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2027 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2029 set_stmt_info (get_stmt_ann (vec_stmt),
2030 new_stmt_vec_info (vec_stmt, loop_vinfo));
2032 if (vect_print_dump_info (REPORT_DETAILS))
2034 fprintf (vect_dump, "add new stmt: ");
2035 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2038 /* Make sure bsi points to the stmt that is being vectorized. */
2039 gcc_assert (stmt == bsi_stmt (*bsi));
2041 #ifdef USE_MAPPED_LOCATION
2042 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2043 #else
2044 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2045 #endif
2049 /* Function get_initial_def_for_reduction
2051 Input:
2052 STMT - a stmt that performs a reduction operation in the loop.
2053 INIT_VAL - the initial value of the reduction variable
2055 Output:
2056 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2057 of the reduction (used for adjusting the epilog - see below).
2058 Return a vector variable, initialized according to the operation that STMT
2059 performs. This vector will be used as the initial value of the
2060 vector of partial results.
2062 Option1 (adjust in epilog): Initialize the vector as follows:
2063 add: [0,0,...,0,0]
2064 mult: [1,1,...,1,1]
2065 min/max: [init_val,init_val,..,init_val,init_val]
2066 bit and/or: [init_val,init_val,..,init_val,init_val]
2067 and when necessary (e.g. add/mult case) let the caller know
2068 that it needs to adjust the result by init_val.
2070 Option2: Initialize the vector as follows:
2071 add: [0,0,...,0,init_val]
2072 mult: [1,1,...,1,init_val]
2073 min/max: [init_val,init_val,...,init_val]
2074 bit and/or: [init_val,init_val,...,init_val]
2075 and no adjustments are needed.
2077 For example, for the following code:
2079 s = init_val;
2080 for (i=0;i<n;i++)
2081 s = s + a[i];
2083 STMT is 's = s + a[i]', and the reduction variable is 's'.
2084 For a vector of 4 units, we want to return either [0,0,0,init_val],
2085 or [0,0,0,0] and let the caller know that it needs to adjust
2086 the result at the end by 'init_val'.
2088 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2089 initialization vector is simpler (same element in all entries).
2090 A cost model should help decide between these two schemes. */
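/* For example (illustrative, assuming a V4SI vectype): for the MAX
   reduction 'm = MAX (m, a[i])' with initial value init_val, this
   function returns [init_val,init_val,init_val,init_val] and sets
   ADJUSTMENT_DEF to NULL_TREE - min/max need no epilog adjustment.  */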
2092 static tree
2093 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2095 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2096 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2097 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2098 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2099 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2100 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2101 tree type = TREE_TYPE (init_val);
2102 tree vecdef;
2103 tree def_for_init;
2104 tree init_def;
2105 tree t = NULL_TREE;
2106 int i;
2107 tree vector_type;
2108 bool nested_in_vect_loop = false;
2110 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2111 if (nested_in_vect_loop_p (loop, stmt))
2112 nested_in_vect_loop = true;
2113 else
2114 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2116 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2118 switch (code)
2120 case WIDEN_SUM_EXPR:
2121 case DOT_PROD_EXPR:
2122 case PLUS_EXPR:
2123 if (nested_in_vect_loop)
2124 *adjustment_def = vecdef;
2125 else
2126 *adjustment_def = init_val;
2127 /* Create a vector of zeros for init_def. */
2128 if (INTEGRAL_TYPE_P (type))
2129 def_for_init = build_int_cst (type, 0);
2130 else
2131 def_for_init = build_real (type, dconst0);
2132 for (i = nunits - 1; i >= 0; --i)
2133 t = tree_cons (NULL_TREE, def_for_init, t);
2134 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2135 gcc_assert (vector_type);
2136 init_def = build_vector (vector_type, t);
2137 break;
2139 case MIN_EXPR:
2140 case MAX_EXPR:
2141 *adjustment_def = NULL_TREE;
2142 init_def = vecdef;
2143 break;
2145 default:
2146 gcc_unreachable ();
2149 return init_def;
2153 /* Function vect_create_epilog_for_reduction
2155 Create code at the loop-epilog to finalize the result of a reduction
2156 computation.
2158 VECT_DEF is a vector of partial results.
2159 REDUC_CODE is the tree-code for the epilog reduction.
2160 STMT is the scalar reduction stmt that is being vectorized.
2161 REDUCTION_PHI is the phi-node that carries the reduction computation.
2163 This function:
2164 1. Creates the reduction def-use cycle: sets the arguments for
2165 REDUCTION_PHI:
2166 The loop-entry argument is the vectorized initial-value of the reduction.
2167 The loop-latch argument is VECT_DEF - the vector of partial sums.
2168 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2169 by applying the operation specified by REDUC_CODE if available, or by
2170 other means (whole-vector shifts or a scalar loop).
2171 The function also creates a new phi node at the loop exit to preserve
2172 loop-closed form, as illustrated below.
2174 The flow at the entry to this function:
2176 loop:
2177 vec_def = phi <null, null> # REDUCTION_PHI
2178 VECT_DEF = vector_stmt # vectorized form of STMT
2179 s_loop = scalar_stmt # (scalar) STMT
2180 loop_exit:
2181 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2182 use <s_out0>
2183 use <s_out0>
2185 The above is transformed by this function into:
2187 loop:
2188 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2189 VECT_DEF = vector_stmt # vectorized form of STMT
2190 s_loop = scalar_stmt # (scalar) STMT
2191 loop_exit:
2192 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2193 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2194 v_out2 = reduce <v_out1>
2195 s_out3 = extract_field <v_out2, 0>
2196 s_out4 = adjust_result <s_out3>
2197 use <s_out4>
2198 use <s_out4>
2201 static void
2202 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2203 enum tree_code reduc_code, tree reduction_phi)
2205 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2206 tree vectype;
2207 enum machine_mode mode;
2208 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2210 basic_block exit_bb;
2211 tree scalar_dest;
2212 tree scalar_type;
2213 tree new_phi;
2214 block_stmt_iterator exit_bsi;
2215 tree vec_dest;
2216 tree new_temp = NULL_TREE;
2217 tree new_name;
2218 tree epilog_stmt = NULL_TREE;
2219 tree new_scalar_dest, exit_phi, new_dest;
2220 tree bitsize, bitpos, bytesize;
2221 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2222 tree adjustment_def;
2223 tree vec_initial_def;
2224 tree orig_name;
2225 imm_use_iterator imm_iter;
2226 use_operand_p use_p;
2227 bool extract_scalar_result = false;
2228 tree reduction_op, expr;
2229 tree orig_stmt;
2230 tree use_stmt;
2231 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2232 bool nested_in_vect_loop = false;
2233 int op_type;
2234 VEC(tree,heap) *phis = NULL;
2235 int i;
2237 if (nested_in_vect_loop_p (loop, stmt))
2239 loop = loop->inner;
2240 nested_in_vect_loop = true;
2243 op_type = TREE_OPERAND_LENGTH (operation);
2244 reduction_op = TREE_OPERAND (operation, op_type-1);
2245 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2246 gcc_assert (vectype);
2247 mode = TYPE_MODE (vectype);
2249 /*** 1. Create the reduction def-use cycle ***/
2251 /* 1.1 set the loop-entry arg of the reduction-phi: */
2252 /* For the case of reduction, vect_get_vec_def_for_operand returns
2253 the scalar def before the loop, that defines the initial value
2254 of the reduction variable. */
2255 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2256 &adjustment_def);
2257 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2259 /* 1.2 set the loop-latch arg for the reduction-phi: */
2260 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2262 if (vect_print_dump_info (REPORT_DETAILS))
2264 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2265 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2266 fprintf (vect_dump, "\n");
2267 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2271 /*** 2. Create epilog code
2272 The reduction epilog code operates across the elements of the vector
2273 of partial results computed by the vectorized loop.
2274 The reduction epilog code consists of:
2275 step 1: compute the scalar result in a vector (v_out2)
2276 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2277 step 3: adjust the scalar result (s_out3) if needed.
2279 Step 1 can be accomplished using one of the following three schemes:
2280 (scheme 1) using reduc_code, if available.
2281 (scheme 2) using whole-vector shifts, if available.
2282 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2283 combined.
2285 The overall epilog code looks like this:
2287 s_out0 = phi <s_loop> # original EXIT_PHI
2288 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2289 v_out2 = reduce <v_out1> # step 1
2290 s_out3 = extract_field <v_out2, 0> # step 2
2291 s_out4 = adjust_result <s_out3> # step 3
2293 (step 3 is optional, and steps 1 and 2 may be combined).
2294 Lastly, the uses of s_out0 are replaced by s_out4.
2296 ***/
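/* A sketch of scheme 2 under assumed parameters (128-bit V4SI, 32-bit
   elements, PLUS reduction; lane placement is schematic and the final
   position of the scalar result depends on endianness):
     v.1 = vec_shift <v_out1, 64>         # move upper half down
     v.2 = v_out1 + v.1                   # two partial sums
     v.3 = vec_shift <v.2, 32>
     v.4 = v.2 + v.3                      # full sum in one element
     s   = extract_field <v.4, 0>  */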
2298 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2299 v_out1 = phi <v_loop> */
2301 exit_bb = single_exit (loop)->dest;
2302 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2303 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2304 exit_bsi = bsi_after_labels (exit_bb);
2306 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2307 (i.e. when reduc_code is not available) and in the final adjustment
2308 code (if needed). Also get the original scalar reduction variable as
2309 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2310 represents a reduction pattern), the tree-code and scalar-def are
2311 taken from the original stmt that the pattern-stmt (STMT) replaces.
2312 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2313 are taken from STMT. */
2315 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2316 if (!orig_stmt)
2318 /* Regular reduction */
2319 orig_stmt = stmt;
2321 else
2323 /* Reduction pattern */
2324 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2325 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2326 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2328 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2329 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2330 scalar_type = TREE_TYPE (scalar_dest);
2331 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2332 bitsize = TYPE_SIZE (scalar_type);
2333 bytesize = TYPE_SIZE_UNIT (scalar_type);
2336 /* In case this is a reduction in an inner-loop while vectorizing an outer
2337 loop - we don't need to extract a single scalar result at the end of the
2338 inner-loop. The final vector of partial results will be used in the
2339 vectorized outer-loop, or reduced to a scalar result at the end of the
2340 outer-loop. */
2341 if (nested_in_vect_loop)
2342 goto vect_finalize_reduction;
2344 /* 2.3 Create the reduction code, using one of the three schemes described
2345 above. */
2347 if (reduc_code < NUM_TREE_CODES)
2349 tree tmp;
2351 /*** Case 1: Create:
2352 v_out2 = reduc_expr <v_out1> */
2354 if (vect_print_dump_info (REPORT_DETAILS))
2355 fprintf (vect_dump, "Reduce using direct vector reduction.");
2357 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2358 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2359 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2360 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2361 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2362 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2364 extract_scalar_result = true;
2366 else
2368 enum tree_code shift_code = 0;
2369 bool have_whole_vector_shift = true;
2370 int bit_offset;
2371 int element_bitsize = tree_low_cst (bitsize, 1);
2372 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2373 tree vec_temp;
2375 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2376 shift_code = VEC_RSHIFT_EXPR;
2377 else
2378 have_whole_vector_shift = false;
2380 /* Regardless of whether we have a whole vector shift, if we're
2381 emulating the operation via tree-vect-generic, we don't want
2382 to use it. Only the first round of the reduction is likely
2383 to still be profitable via emulation. */
2384 /* ??? It might be better to emit a reduction tree code here, so that
2385 tree-vect-generic can expand the first round via bit tricks. */
2386 if (!VECTOR_MODE_P (mode))
2387 have_whole_vector_shift = false;
2388 else
2390 optab optab = optab_for_tree_code (code, vectype);
2391 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2392 have_whole_vector_shift = false;
2395 if (have_whole_vector_shift)
2397 /*** Case 2: Create:
2398 for (offset = VS/2; offset >= element_size; offset/=2)
2400 Create: va' = vec_shift <va, offset>
2401 Create: va = vop <va, va'>
2402 } */
2404 if (vect_print_dump_info (REPORT_DETAILS))
2405 fprintf (vect_dump, "Reduce using vector shifts");
2407 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2408 new_temp = PHI_RESULT (new_phi);
2410 for (bit_offset = vec_size_in_bits/2;
2411 bit_offset >= element_bitsize;
2412 bit_offset /= 2)
2414 tree bitpos = size_int (bit_offset);
2415 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2416 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2417 new_name = make_ssa_name (vec_dest, epilog_stmt);
2418 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2419 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2421 tmp = build2 (code, vectype, new_name, new_temp);
2422 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2423 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2424 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2425 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2428 extract_scalar_result = true;
2430 else
2432 tree rhs;
2434 /*** Case 3: Create:
2435 s = extract_field <v_out2, 0>
2436 for (offset = element_size;
2437 offset < vector_size;
2438 offset += element_size;)
2440 Create: s' = extract_field <v_out2, offset>
2441 Create: s = op <s, s'>
2442 } */
2444 if (vect_print_dump_info (REPORT_DETAILS))
2445 fprintf (vect_dump, "Reduce using scalar code. ");
2447 vec_temp = PHI_RESULT (new_phi);
2448 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2449 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2450 bitsize_zero_node);
2451 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2452 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2453 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2454 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2455 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2457 for (bit_offset = element_bitsize;
2458 bit_offset < vec_size_in_bits;
2459 bit_offset += element_bitsize)
2461 tree tmp;
2462 tree bitpos = bitsize_int (bit_offset);
2463 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2464 bitpos);
2466 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2467 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2468 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2469 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2470 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2472 tmp = build2 (code, scalar_type, new_name, new_temp);
2473 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2474 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2475 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2476 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2479 extract_scalar_result = false;
2483 /* 2.4 Extract the final scalar result. Create:
2484 s_out3 = extract_field <v_out2, bitpos> */
2486 if (extract_scalar_result)
2488 tree rhs;
2490 gcc_assert (!nested_in_vect_loop);
2491 if (vect_print_dump_info (REPORT_DETAILS))
2492 fprintf (vect_dump, "extract scalar result");
2494 if (BYTES_BIG_ENDIAN)
2495 bitpos = size_binop (MULT_EXPR,
2496 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2497 TYPE_SIZE (scalar_type));
2498 else
2499 bitpos = bitsize_zero_node;
2501 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2502 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2503 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2504 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2505 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2506 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2509 vect_finalize_reduction:
2511 /* 2.5 Adjust the final result by the initial value of the reduction
2512 variable. (When such adjustment is not needed, then
2513 'adjustment_def' is zero). For example, if code is PLUS we create:
2514 new_temp = loop_exit_def + adjustment_def */
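/* E.g. (illustrative, non-nested case): for 's = 10; for (...) s += a[i];'
   the partial sums were initialized from [0,0,0,0], adjustment_def is 10,
   and the stmt created below is 'new_temp = s_out3 + 10'.  */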
2516 if (adjustment_def)
2518 if (nested_in_vect_loop)
2520 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2521 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2522 new_dest = vect_create_destination_var (scalar_dest, vectype);
2524 else
2526 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2527 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2528 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2530 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2531 new_temp = make_ssa_name (new_dest, epilog_stmt);
2532 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2533 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2537 /* 2.6 Handle the loop-exit phi */
2539 /* Replace uses of s_out0 with uses of s_out3:
2540 Find the loop-closed-use at the loop exit of the original scalar result.
2541 (The reduction result is expected to have two immediate uses - one at the
2542 latch block, and one at the loop exit). */
2543 phis = VEC_alloc (tree, heap, 10);
2544 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2546 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2548 exit_phi = USE_STMT (use_p);
2549 VEC_quick_push (tree, phis, exit_phi);
2552 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2553 gcc_assert (!VEC_empty (tree, phis));
2555 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2557 if (nested_in_vect_loop)
2559 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2561 /* FORNOW. We do not yet support the case in which an inner-loop reduction
2562 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
2563 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2564 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2566 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2567 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2568 set_stmt_info (get_stmt_ann (epilog_stmt),
2569 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2570 continue;
2573 /* Replace the uses: */
2574 orig_name = PHI_RESULT (exit_phi);
2575 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2576 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2577 SET_USE (use_p, new_temp);
2579 VEC_free (tree, heap, phis);
2583 /* Function vectorizable_reduction.
2585 Check if STMT performs a reduction operation that can be vectorized.
2586 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2587 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2588 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2590 This function also handles reduction idioms (patterns) that have been
2591 recognized in advance during vect_pattern_recog. In this case, STMT may be
2592 of this form:
2593 X = pattern_expr (arg0, arg1, ..., X)
2594 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2595 sequence that had been detected and replaced by the pattern-stmt (STMT).
2597 In some cases of reduction patterns, the type of the reduction variable X is
2598 different than the type of the other arguments of STMT.
2599 In such cases, the vectype that is used when transforming STMT into a vector
2600 stmt is different than the vectype that is used to determine the
2601 vectorization factor, because it consists of a different number of elements
2602 than the actual number of elements that are being operated upon in parallel.
2604 For example, consider an accumulation of shorts into an int accumulator.
2605 On some targets it's possible to vectorize this pattern operating on 8
2606 shorts at a time (hence, the vectype for purposes of determining the
2607 vectorization factor should be V8HI); on the other hand, the vectype that
2608 is used to create the vector form is actually V4SI (the type of the result).
2610 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2611 indicates what is the actual level of parallelism (V8HI in the example), so
2612 that the right vectorization factor would be derived. This vectype
2613 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2614 be used to create the vectorized stmt. The right vectype for the vectorized
2615 stmt is obtained from the type of the result X:
2616 get_vectype_for_scalar_type (TREE_TYPE (X))
2618 This means that, contrary to "regular" reductions (or "regular" stmts in
2619 general), the following equation:
2620 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2621 does *NOT* necessarily hold for reduction patterns. */
2623 bool
2624 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2626 tree vec_dest;
2627 tree scalar_dest;
2628 tree op;
2629 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2630 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2631 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2632 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2633 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2634 tree operation;
2635 enum tree_code code, orig_code, epilog_reduc_code = 0;
2636 enum machine_mode vec_mode;
2637 int op_type;
2638 optab optab, reduc_optab;
2639 tree new_temp = NULL_TREE;
2640 tree def, def_stmt;
2641 enum vect_def_type dt;
2642 tree new_phi;
2643 tree scalar_type;
2644 bool is_simple_use;
2645 tree orig_stmt;
2646 stmt_vec_info orig_stmt_info;
2647 tree expr = NULL_TREE;
2648 int i;
2649 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2650 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2651 stmt_vec_info prev_stmt_info;
2652 tree reduc_def;
2653 tree new_stmt = NULL_TREE;
2654 int j;
2656 if (nested_in_vect_loop_p (loop, stmt))
2658 loop = loop->inner;
2659 /* FORNOW. This restriction should be relaxed. */
2660 if (ncopies > 1)
2662 if (vect_print_dump_info (REPORT_DETAILS))
2663 fprintf (vect_dump, "multiple types in nested loop.");
2664 return false;
2668 gcc_assert (ncopies >= 1);
2670 /* FORNOW: SLP not supported. */
2671 if (STMT_SLP_TYPE (stmt_info))
2672 return false;
2674 /* 1. Is vectorizable reduction? */
2676 /* Not supportable if the reduction variable is used in the loop. */
2677 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2678 return false;
2680 /* Reductions that are not used even in an enclosing outer-loop
2681 are expected to be "live" (used out of the loop). */
2682 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2683 && !STMT_VINFO_LIVE_P (stmt_info))
2684 return false;
2686 /* Make sure it was already recognized as a reduction computation. */
2687 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2688 return false;
2690 /* 2. Has this been recognized as a reduction pattern?
2692 Check if STMT represents a pattern that has been recognized
2693 in earlier analysis stages. For stmts that represent a pattern,
2694 the STMT_VINFO_RELATED_STMT field records the last stmt in
2695 the original sequence that constitutes the pattern. */
2697 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2698 if (orig_stmt)
2700 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2701 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2702 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2703 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2706 /* 3. Check the operands of the operation. The first operands are defined
2707 inside the loop body. The last operand is the reduction variable,
2708 which is defined by the loop-header-phi. */
2710 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2712 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2713 code = TREE_CODE (operation);
2714 op_type = TREE_OPERAND_LENGTH (operation);
2715 if (op_type != binary_op && op_type != ternary_op)
2716 return false;
2717 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2718 scalar_type = TREE_TYPE (scalar_dest);
2720 /* All uses but the last are expected to be defined in the loop.
2721 The last use is the reduction variable. */
2722 for (i = 0; i < op_type-1; i++)
2724 op = TREE_OPERAND (operation, i);
2725 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2726 gcc_assert (is_simple_use);
2727 if (dt != vect_loop_def
2728 && dt != vect_invariant_def
2729 && dt != vect_constant_def
2730 && dt != vect_induction_def)
2731 return false;
2734 op = TREE_OPERAND (operation, i);
2735 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2736 gcc_assert (is_simple_use);
2737 gcc_assert (dt == vect_reduction_def);
2738 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2739 if (orig_stmt)
2740 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2741 else
2742 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2744 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2745 return false;
2747 /* 4. Supportable by target? */
2749 /* 4.1. Check support for the operation in the loop. */
2750 optab = optab_for_tree_code (code, vectype);
2751 if (!optab)
2753 if (vect_print_dump_info (REPORT_DETAILS))
2754 fprintf (vect_dump, "no optab.");
2755 return false;
2757 vec_mode = TYPE_MODE (vectype);
2758 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2760 if (vect_print_dump_info (REPORT_DETAILS))
2761 fprintf (vect_dump, "op not supported by target.");
2762 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2763 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2764 < vect_min_worthwhile_factor (code))
2765 return false;
2766 if (vect_print_dump_info (REPORT_DETAILS))
2767 fprintf (vect_dump, "proceeding using word mode.");
2770 /* Worthwhile without SIMD support? */
2771 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2772 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2773 < vect_min_worthwhile_factor (code))
2775 if (vect_print_dump_info (REPORT_DETAILS))
2776 fprintf (vect_dump, "not worthwhile without SIMD support.");
2777 return false;
2780 /* 4.2. Check support for the epilog operation.
2782 If STMT represents a reduction pattern, then the type of the
2783 reduction variable may be different than the type of the rest
2784 of the arguments. For example, consider the case of accumulation
2785 of shorts into an int accumulator. The original code:
2786 S1: int_a = (int) short_a;
2787 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2789 was replaced with:
2790 STMT: int_acc = widen_sum <short_a, int_acc>
2792 This means that:
2793 1. The tree-code that is used to create the vector operation in the
2794 epilog code (that reduces the partial results) is not the
2795 tree-code of STMT, but is rather the tree-code of the original
2796 stmt from the pattern that STMT is replacing. I.e, in the example
2797 above we want to use 'widen_sum' in the loop, but 'plus' in the
2798 epilog.
2799 2. The type (mode) we use to check available target support
2800 for the vector operation to be created in the *epilog*, is
2801 determined by the type of the reduction variable (in the example
2802 above we'd check this: plus_optab[vect_int_mode]).
2803 However the type (mode) we use to check available target support
2804 for the vector operation to be created *inside the loop*, is
2805 determined by the type of the other arguments to STMT (in the
2806 example we'd check this: widen_sum_optab[vect_short_mode]).
2808 This is contrary to "regular" reductions, in which the types of all
2809 the arguments are the same as the type of the reduction variable.
2810 For "regular" reductions we can therefore use the same vector type
2811 (and also the same tree-code) when generating the epilog code and
2812 when generating the code inside the loop. */
2814 if (orig_stmt)
2816 /* This is a reduction pattern: get the vectype from the type of the
2817 reduction variable, and get the tree-code from orig_stmt. */
2818 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2819 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2820 if (!vectype)
2822 if (vect_print_dump_info (REPORT_DETAILS))
2824 fprintf (vect_dump, "unsupported data-type ");
2825 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2827 return false;
2830 vec_mode = TYPE_MODE (vectype);
2832 else
2834 /* Regular reduction: the same vectype and tree-code that are used for
2835 the vector code inside the loop can also be used for the epilog code. */
2836 orig_code = code;
2839 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2840 return false;
2841 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2842 if (!reduc_optab)
2844 if (vect_print_dump_info (REPORT_DETAILS))
2845 fprintf (vect_dump, "no optab for reduction.");
2846 epilog_reduc_code = NUM_TREE_CODES;
2848 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2850 if (vect_print_dump_info (REPORT_DETAILS))
2851 fprintf (vect_dump, "reduc op not supported by target.");
2852 epilog_reduc_code = NUM_TREE_CODES;
2855 if (!vec_stmt) /* transformation not required. */
2857 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2858 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2859 return false;
2860 return true;
2863 /** Transform. **/
2865 if (vect_print_dump_info (REPORT_DETAILS))
2866 fprintf (vect_dump, "transform reduction.");
2868 /* Create the destination vector */
2869 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2871 /* Create the reduction-phi that defines the reduction-operand. */
2872 new_phi = create_phi_node (vec_dest, loop->header);
2874 /* In case the vectorization factor (VF) is bigger than the number
2875 of elements that we can fit in a vectype (nunits), we have to generate
2876 more than one vector stmt - i.e., we need to "unroll" the
2877 vector stmt by a factor VF/nunits. For more details see documentation
2878 in vectorizable_operation. */
2880 prev_stmt_info = NULL;
2881 for (j = 0; j < ncopies; j++)
2883 /* Handle uses. */
2884 if (j == 0)
2886 op = TREE_OPERAND (operation, 0);
2887 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2888 if (op_type == ternary_op)
2890 op = TREE_OPERAND (operation, 1);
2891 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2894 /* Get the vector def for the reduction variable from the phi node */
2895 reduc_def = PHI_RESULT (new_phi);
2897 else
2899 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2900 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2901 if (op_type == ternary_op)
2902 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2904 /* Get the vector def for the reduction variable from the vectorized
2905 reduction operation generated in the previous iteration (j-1) */
2906 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2909 /* Arguments are ready. Create the new vector stmt. */
2910 if (op_type == binary_op)
2911 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2912 else
2913 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2914 reduc_def);
2915 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2916 new_temp = make_ssa_name (vec_dest, new_stmt);
2917 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2918 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2920 if (j == 0)
2921 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2922 else
2923 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2924 prev_stmt_info = vinfo_for_stmt (new_stmt);
2927 /* Finalize the reduction-phi (set its arguments) and create the
2928 epilog reduction code. */
2929 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2930 return true;
2933 /* Checks if CALL can be vectorized in type VECTYPE. Returns
2934 a function declaration if the target has a vectorized version
2935 of the function, or NULL_TREE if the function cannot be vectorized. */
2937 tree
2938 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2940 tree fndecl = get_callee_fndecl (call);
2941 enum built_in_function code;
2943 /* We only handle functions that do not read or clobber memory -- i.e.
2944 const or novops ones. */
2945 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
2946 return NULL_TREE;
2948 if (!fndecl
2949 || TREE_CODE (fndecl) != FUNCTION_DECL
2950 || !DECL_BUILT_IN (fndecl))
2951 return NULL_TREE;
2953 code = DECL_FUNCTION_CODE (fndecl);
2954 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2955 vectype_in);
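/* A hypothetical example (the builtin is an assumption, not part of this
   file): for 'y = sqrtf (x)' with vectype_out == vectype_in == V4SF, a
   target hook may return the decl of a vector square-root builtin, which
   vectorizable_call below then emits once per copy; if the hook returns
   NULL_TREE, the call is left scalar.  */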
2958 /* Function vectorizable_call.
2960 Check if STMT performs a function call that can be vectorized.
2961 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2962 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2963 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2965 bool
2966 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2968 tree vec_dest;
2969 tree scalar_dest;
2970 tree operation;
2971 tree op, type;
2972 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2974 tree vectype_out, vectype_in;
2975 int nunits_in;
2976 int nunits_out;
2977 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2978 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2979 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2980 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2981 tree new_stmt;
2982 int ncopies, j, nargs;
2983 call_expr_arg_iterator iter;
2984 tree vargs;
2985 enum { NARROW, NONE, WIDEN } modifier;
2987 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2988 return false;
2990 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2991 return false;
2993 /* FORNOW: SLP not supported. */
2994 if (STMT_SLP_TYPE (stmt_info))
2995 return false;
2997 /* Is STMT a vectorizable call? */
2998 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2999 return false;
3001 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3002 return false;
3004 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3005 if (TREE_CODE (operation) != CALL_EXPR)
3006 return false;
3008 /* Process function arguments. */
3009 rhs_type = NULL_TREE;
3010 nargs = 0;
3011 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3013 /* Bail out if the function has more than two arguments; we
3014 do not have interesting builtin functions to vectorize with
3015 more than two arguments. */
3016 if (nargs >= 2)
3017 return false;
3019 /* We can only handle calls with arguments of the same type. */
3020 if (rhs_type
3021 && rhs_type != TREE_TYPE (op))
3023 if (vect_print_dump_info (REPORT_DETAILS))
3024 fprintf (vect_dump, "argument types differ.");
3025 return false;
3027 rhs_type = TREE_TYPE (op);
3029 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3031 if (vect_print_dump_info (REPORT_DETAILS))
3032 fprintf (vect_dump, "use not simple.");
3033 return false;
3036 ++nargs;
3039 /* No arguments is also not good. */
3040 if (nargs == 0)
3041 return false;
3043 vectype_in = get_vectype_for_scalar_type (rhs_type);
3044 if (!vectype_in)
3045 return false;
3046 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3048 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3049 vectype_out = get_vectype_for_scalar_type (lhs_type);
3050 if (!vectype_out)
3051 return false;
3052 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3054 /* FORNOW */
3055 if (nunits_in == nunits_out / 2)
3056 modifier = NARROW;
3057 else if (nunits_out == nunits_in)
3058 modifier = NONE;
3059 else if (nunits_out == nunits_in / 2)
3060 modifier = WIDEN;
3061 else
3062 return false;
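/* Illustrative classification (assumed types): V4SF args -> V4SF result
   is NONE; V2DF args -> V4SF result (nunits_in == 2, nunits_out == 4)
   is NARROW; V4SF args -> V2DF result (nunits_in == 4, nunits_out == 2)
   is WIDEN.  */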
3064 /* For now, we only vectorize functions if a target specific builtin
3065 is available. TODO -- in some cases, it might be profitable to
3066 insert the calls for pieces of the vector, in order to be able
3067 to vectorize other operations in the loop. */
3068 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3069 if (fndecl == NULL_TREE)
3071 if (vect_print_dump_info (REPORT_DETAILS))
3072 fprintf (vect_dump, "function is not vectorizable.");
3074 return false;
3077 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3079 if (modifier == NARROW)
3080 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3081 else
3082 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3084 /* Sanity check: make sure that at least one copy of the vectorized stmt
3085 needs to be generated. */
3086 gcc_assert (ncopies >= 1);
3088 /* FORNOW. This restriction should be relaxed. */
3089 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3091 if (vect_print_dump_info (REPORT_DETAILS))
3092 fprintf (vect_dump, "multiple types in nested loop.");
3093 return false;
3096 if (!vec_stmt) /* transformation not required. */
3098 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3099 if (vect_print_dump_info (REPORT_DETAILS))
3100 fprintf (vect_dump, "=== vectorizable_call ===");
3101 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3102 return true;
3105 /** Transform. **/
3107 if (vect_print_dump_info (REPORT_DETAILS))
3108 fprintf (vect_dump, "transform operation.");
3110 /* FORNOW. This restriction should be relaxed. */
3111 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3113 if (vect_print_dump_info (REPORT_DETAILS))
3114 fprintf (vect_dump, "multiple types in nested loop.");
3115 return false;
3118 /* Handle def. */
3119 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3120 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3122 prev_stmt_info = NULL;
3123 switch (modifier)
3125 case NONE:
3126 for (j = 0; j < ncopies; ++j)
3128 /* Build argument list for the vectorized call. */
3129 /* FIXME: Rewrite this so that it doesn't
3130 construct a temporary list. */
3131 vargs = NULL_TREE;
3132 nargs = 0;
3133 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3135 if (j == 0)
3136 vec_oprnd0
3137 = vect_get_vec_def_for_operand (op, stmt, NULL);
3138 else
3139 vec_oprnd0
3140 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3142 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3144 ++nargs;
3146 vargs = nreverse (vargs);
3148 rhs = build_function_call_expr (fndecl, vargs);
3149 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3150 new_temp = make_ssa_name (vec_dest, new_stmt);
3151 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3153 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3155 if (j == 0)
3156 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3157 else
3158 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3160 prev_stmt_info = vinfo_for_stmt (new_stmt);
3163 break;
3165 case NARROW:
3166 for (j = 0; j < ncopies; ++j)
3168 /* Build argument list for the vectorized call. */
3169 /* FIXME: Rewrite this so that it doesn't
3170 construct a temporary list. */
3171 vargs = NULL_TREE;
3172 nargs = 0;
3173 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3175 if (j == 0)
3177 vec_oprnd0
3178 = vect_get_vec_def_for_operand (op, stmt, NULL);
3179 vec_oprnd1
3180 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3182 else
3184 vec_oprnd0
3185 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3186 vec_oprnd1
3187 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3190 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3191 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3193 ++nargs;
3195 vargs = nreverse (vargs);
3197 rhs = build_function_call_expr (fndecl, vargs);
3198 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3199 new_temp = make_ssa_name (vec_dest, new_stmt);
3200 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3202 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3204 if (j == 0)
3205 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3206 else
3207 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3209 prev_stmt_info = vinfo_for_stmt (new_stmt);
3212 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3214 break;
3216 case WIDEN:
3217 /* No current target implements this case. */
3218 return false;
3221 /* The call in STMT might prevent it from being removed in dce.
3222 We however cannot remove it here, due to the way the ssa name
3223 it defines is mapped to the new definition. So just replace
3224 the rhs of the statement with something harmless. */
3225 type = TREE_TYPE (scalar_dest);
3226 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3227 update_stmt (stmt);
3229 return true;
3233 /* Function vect_gen_widened_results_half
3235 Create a vector stmt whose code, type, number of arguments, and result
3236 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3237 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3238 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3239 needs to be created (DECL is a function-decl of a target-builtin).
3240 STMT is the original scalar stmt that we are vectorizing. */
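/* E.g. (illustrative): when widening V8HI operands into V4SI results,
   the caller invokes this helper twice - once with the "low" code or
   decl and once with the "high" one - each call producing a vector that
   holds half of the widened elements.  */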
3242 static tree
3243 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3244 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3245 tree vec_dest, block_stmt_iterator *bsi,
3246 tree stmt)
3248 tree expr;
3249 tree new_stmt;
3250 tree new_temp;
3251 tree sym;
3252 ssa_op_iter iter;
3254 /* Generate half of the widened result: */
3255 if (code == CALL_EXPR)
3257 /* Target specific support */
3258 if (op_type == binary_op)
3259 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3260 else
3261 expr = build_call_expr (decl, 1, vec_oprnd0);
3263 else
3265 /* Generic support */
3266 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3267 if (op_type == binary_op)
3268 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3269 else
3270 expr = build1 (code, vectype, vec_oprnd0);
3272 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3273 new_temp = make_ssa_name (vec_dest, new_stmt);
3274 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3275 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3277 if (code == CALL_EXPR)
3279 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3281 if (TREE_CODE (sym) == SSA_NAME)
3282 sym = SSA_NAME_VAR (sym);
3283 mark_sym_for_renaming (sym);
3287 return new_stmt;
3291 /* Check if STMT performs a conversion operation that can be vectorized.
3292 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3293 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3294 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3296 bool
3297 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3298 tree *vec_stmt, slp_tree slp_node)
3300 tree vec_dest;
3301 tree scalar_dest;
3302 tree operation;
3303 tree op0;
3304 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3305 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3306 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3307 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3308 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3309 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3310 tree new_temp;
3311 tree def, def_stmt;
3312 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3313 tree new_stmt = NULL_TREE;
3314 stmt_vec_info prev_stmt_info;
3315 int nunits_in;
3316 int nunits_out;
3317 tree vectype_out, vectype_in;
3318 int ncopies, j;
3319 tree expr;
3320 tree rhs_type, lhs_type;
3321 tree builtin_decl;
3322 enum { NARROW, NONE, WIDEN } modifier;
3323 int i;
3324 VEC(tree,heap) *vec_oprnds0 = NULL;
3325 tree vop0;
3327 /* Is STMT a vectorizable conversion? */
3329 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3330 return false;
3332 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3333 return false;
3335 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3336 return false;
3338 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3339 return false;
3341 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3342 code = TREE_CODE (operation);
3343 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3344 return false;
3346 /* Check types of lhs and rhs. */
3347 op0 = TREE_OPERAND (operation, 0);
3348 rhs_type = TREE_TYPE (op0);
3349 vectype_in = get_vectype_for_scalar_type (rhs_type);
3350 if (!vectype_in)
3351 return false;
3352 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3354 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3355 lhs_type = TREE_TYPE (scalar_dest);
3356 vectype_out = get_vectype_for_scalar_type (lhs_type);
3357 if (!vectype_out)
3358 return false;
3359 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3361 /* FORNOW */
3362 if (nunits_in == nunits_out / 2)
3363 modifier = NARROW;
3364 else if (nunits_out == nunits_in)
3365 modifier = NONE;
3366 else if (nunits_out == nunits_in / 2)
3367 modifier = WIDEN;
3368 else
3369 return false;
3371 if (modifier == NONE)
3372 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3374 /* Bail out if the types are both integral or non-integral. */
3375 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3376 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3377 return false;
3379 if (modifier == NARROW)
3380 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3381 else
3382 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3384 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3385 this, so we can safely override NCOPIES with 1 here. */
3386 if (slp_node)
3387 ncopies = 1;
3389 /* Sanity check: make sure that at least one copy of the vectorized stmt
3390 needs to be generated. */
3391 gcc_assert (ncopies >= 1);
3393 /* FORNOW. This restriction should be relaxed. */
3394 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3396 if (vect_print_dump_info (REPORT_DETAILS))
3397 fprintf (vect_dump, "multiple types in nested loop.");
3398 return false;
3401 /* Check the operands of the operation. */
3402 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3404 if (vect_print_dump_info (REPORT_DETAILS))
3405 fprintf (vect_dump, "use not simple.");
3406 return false;
3409 /* Supportable by target? */
3410 if ((modifier == NONE
3411 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3412 || (modifier == WIDEN
3413 && !supportable_widening_operation (code, stmt, vectype_in,
3414 &decl1, &decl2,
3415 &code1, &code2))
3416 || (modifier == NARROW
3417 && !supportable_narrowing_operation (code, stmt, vectype_in,
3418 &code1)))
3420 if (vect_print_dump_info (REPORT_DETAILS))
3421 fprintf (vect_dump, "op not supported by target.");
3422 return false;
3425 if (modifier != NONE)
3427 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3428 /* FORNOW: SLP not supported. */
3429 if (STMT_SLP_TYPE (stmt_info))
3430 return false;
3433 if (!vec_stmt) /* transformation not required. */
3435 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3436 return true;
3439 /** Transform. **/
3440 if (vect_print_dump_info (REPORT_DETAILS))
3441 fprintf (vect_dump, "transform conversion.");
3443 /* Handle def. */
3444 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3446 if (modifier == NONE && !slp_node)
3447 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3449 prev_stmt_info = NULL;
3450 switch (modifier)
3452 case NONE:
3453 for (j = 0; j < ncopies; j++)
3455 tree sym;
3456 ssa_op_iter iter;
3458 if (j == 0)
3459 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3460 else
3461 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3463 builtin_decl =
3464 targetm.vectorize.builtin_conversion (code, vectype_in);
3465 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3467 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3469 /* Arguments are ready. Create the new vector stmt. */
3470 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3471 new_temp = make_ssa_name (vec_dest, new_stmt);
3472 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3473 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3474 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3475 SSA_OP_ALL_VIRTUALS)
3477 if (TREE_CODE (sym) == SSA_NAME)
3478 sym = SSA_NAME_VAR (sym);
3479 mark_sym_for_renaming (sym);
3481 if (slp_node)
3482 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3485 if (j == 0)
3486 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3487 else
3488 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3489 prev_stmt_info = vinfo_for_stmt (new_stmt);
3491 break;
3493 case WIDEN:
3494 /* In case the vectorization factor (VF) is bigger than the number
3495 of elements that we can fit in a vectype (nunits), we have to
3496 generate more than one vector stmt - i.e - we need to "unroll"
3497 the vector stmt by a factor VF/nunits. */
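/* A sketch of the result shape (illustrative): each copy J produces
   two vector stmts, one per half of the widened result; code1/decl1
   compute the first half and code2/decl2 the second, and the two
   halves are chained through STMT_VINFO_RELATED_STMT just like
   ordinary copies.  */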
3498 for (j = 0; j < ncopies; j++)
3500 if (j == 0)
3501 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3502 else
3503 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3505 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3507 /* Generate first half of the widened result: */
3508 new_stmt
3509 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3510 vec_oprnd0, vec_oprnd1,
3511 unary_op, vec_dest, bsi, stmt);
3512 if (j == 0)
3513 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3514 else
3515 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3516 prev_stmt_info = vinfo_for_stmt (new_stmt);
3518 /* Generate second half of the widened result: */
3519 new_stmt
3520 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3521 vec_oprnd0, vec_oprnd1,
3522 unary_op, vec_dest, bsi, stmt);
3523 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3524 prev_stmt_info = vinfo_for_stmt (new_stmt);
3526 break;
3528 case NARROW:
3529 /* In case the vectorization factor (VF) is bigger than the number
3530 of elements that we can fit in a vectype (nunits), we have to
3531 generate more than one vector stmt - i.e - we need to "unroll"
3532 the vector stmt by a factor VF/nunits. */
3533 for (j = 0; j < ncopies; j++)
3535 /* Handle uses. */
3536 if (j == 0)
3538 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3539 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3541 else
3543 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3544 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3547 /* Arguments are ready. Create the new vector stmt. */
3548 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3549 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3550 new_temp = make_ssa_name (vec_dest, new_stmt);
3551 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3552 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3554 if (j == 0)
3555 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3556 else
3557 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3559 prev_stmt_info = vinfo_for_stmt (new_stmt);
3562 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3565 return true;
3569 /* Function vectorizable_assignment.
3571 Check if STMT performs an assignment (copy) that can be vectorized.
3572 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3573 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3574 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3576 bool
3577 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3578 slp_tree slp_node)
3580 tree vec_dest;
3581 tree scalar_dest;
3582 tree op;
3583 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3584 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3585 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3586 tree new_temp;
3587 tree def, def_stmt;
3588 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3589 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3590 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3591 int i;
3592 VEC(tree,heap) *vec_oprnds = NULL;
3593 tree vop;
3595 gcc_assert (ncopies >= 1);
3596 if (ncopies > 1)
3597 return false; /* FORNOW */
3599 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3600 return false;
3602 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3603 return false;
3605 /* Is vectorizable assignment? */
3606 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3607 return false;
3609 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3610 if (TREE_CODE (scalar_dest) != SSA_NAME)
3611 return false;
3613 op = GIMPLE_STMT_OPERAND (stmt, 1);
3614 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3616 if (vect_print_dump_info (REPORT_DETAILS))
3617 fprintf (vect_dump, "use not simple.");
3618 return false;
3621 if (!vec_stmt) /* transformation not required. */
3623 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3624 if (vect_print_dump_info (REPORT_DETAILS))
3625 fprintf (vect_dump, "=== vectorizable_assignment ===");
3626 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3627 return true;
3630 /** Transform. **/
3631 if (vect_print_dump_info (REPORT_DETAILS))
3632 fprintf (vect_dump, "transform assignment.");
3634 /* Handle def. */
3635 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3637 /* Handle use. */
3638 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3640 /* Arguments are ready. Create the new vector stmt. */
3641 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3643 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3644 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3645 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3646 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3647 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3649 if (slp_node)
3650 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3653 VEC_free (tree, heap, vec_oprnds);
3654 return true;
3658 /* Function vect_min_worthwhile_factor.
3660 For a loop where we could vectorize the operation indicated by CODE,
3661 return the minimum vectorization factor that makes it worthwhile
3662 to use generic vectors. */
3663 static int
3664 vect_min_worthwhile_factor (enum tree_code code)
3666 switch (code)
3668 case PLUS_EXPR:
3669 case MINUS_EXPR:
3670 case NEGATE_EXPR:
3671 return 4;
3673 case BIT_AND_EXPR:
3674 case BIT_IOR_EXPR:
3675 case BIT_XOR_EXPR:
3676 case BIT_NOT_EXPR:
3677 return 2;
3679 default:
3680 return INT_MAX;
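/* E.g.: in vectorizable_operation below, when the target has no SIMD
   support for CODE, word-mode "generic vectors" are used only if
   LOOP_VINFO_VECT_FACTOR >= vect_min_worthwhile_factor (code); so
   with VF = 2 a PLUS_EXPR loop is rejected (2 < 4) while a
   BIT_AND_EXPR loop still proceeds (2 >= 2).  Codes not listed above
   get INT_MAX and never take this fallback.  */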
3685 /* Function vectorizable_induction
3687 Check if PHI performs an induction computation that can be vectorized.
3688 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3689 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3690 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3692 bool
3693 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3694 tree *vec_stmt)
3696 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3697 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3698 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3699 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3700 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3701 tree vec_def;
3703 gcc_assert (ncopies >= 1);
3705 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3706 return false;
3708 /* FORNOW: SLP not supported. */
3709 if (STMT_SLP_TYPE (stmt_info))
3710 return false;
3712 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3714 if (TREE_CODE (phi) != PHI_NODE)
3715 return false;
3717 if (!vec_stmt) /* transformation not required. */
3719 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3720 if (vect_print_dump_info (REPORT_DETAILS))
3721 fprintf (vect_dump, "=== vectorizable_induction ===");
3722 vect_model_induction_cost (stmt_info, ncopies);
3723 return true;
3726 /** Transform. **/
3728 if (vect_print_dump_info (REPORT_DETAILS))
3729 fprintf (vect_dump, "transform induction phi.");
3731 vec_def = get_initial_def_for_induction (phi);
3732 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3733 return true;
3737 /* Function vectorizable_operation.
3739 Check if STMT performs a binary or unary operation that can be vectorized.
3740 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3741 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3742 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3744 bool
3745 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3746 slp_tree slp_node)
3748 tree vec_dest;
3749 tree scalar_dest;
3750 tree operation;
3751 tree op0, op1 = NULL;
3752 tree vec_oprnd1 = NULL_TREE;
3753 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3754 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3755 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3756 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3757 enum tree_code code;
3758 enum machine_mode vec_mode;
3759 tree new_temp;
3760 int op_type;
3761 optab optab;
3762 int icode;
3763 enum machine_mode optab_op2_mode;
3764 tree def, def_stmt;
3765 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3766 tree new_stmt = NULL_TREE;
3767 stmt_vec_info prev_stmt_info;
3768 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3769 int nunits_out;
3770 tree vectype_out;
3771 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3772 int j, i;
3773 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3774 tree vop0, vop1;
3776 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3777 this, so we can safely override NCOPIES with 1 here. */
3778 if (slp_node)
3779 ncopies = 1;
3780 gcc_assert (ncopies >= 1);
3781 /* FORNOW. This restriction should be relaxed. */
3782 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3784 if (vect_print_dump_info (REPORT_DETAILS))
3785 fprintf (vect_dump, "multiple types in nested loop.");
3786 return false;
3789 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3790 return false;
3792 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3793 return false;
3795 /* Is STMT a vectorizable binary/unary operation? */
3796 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3797 return false;
3799 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3800 return false;
3802 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3803 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3804 if (!vectype_out)
3805 return false;
3806 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3807 if (nunits_out != nunits_in)
3808 return false;
3810 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3811 code = TREE_CODE (operation);
3813 /* For pointer addition, we should use the normal plus for
3814 the vector addition. */
3815 if (code == POINTER_PLUS_EXPR)
3816 code = PLUS_EXPR;
3818 optab = optab_for_tree_code (code, vectype);
3820 /* Support only unary or binary operations. */
3821 op_type = TREE_OPERAND_LENGTH (operation);
3822 if (op_type != unary_op && op_type != binary_op)
3824 if (vect_print_dump_info (REPORT_DETAILS))
3825 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3826 return false;
3829 op0 = TREE_OPERAND (operation, 0);
3830 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3832 if (vect_print_dump_info (REPORT_DETAILS))
3833 fprintf (vect_dump, "use not simple.");
3834 return false;
3837 if (op_type == binary_op)
3839 op1 = TREE_OPERAND (operation, 1);
3840 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3842 if (vect_print_dump_info (REPORT_DETAILS))
3843 fprintf (vect_dump, "use not simple.");
3844 return false;
3848 /* Supportable by target? */
3849 if (!optab)
3851 if (vect_print_dump_info (REPORT_DETAILS))
3852 fprintf (vect_dump, "no optab.");
3853 return false;
3855 vec_mode = TYPE_MODE (vectype);
3856 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3857 if (icode == CODE_FOR_nothing)
3859 if (vect_print_dump_info (REPORT_DETAILS))
3860 fprintf (vect_dump, "op not supported by target.");
3861 /* Check only during analysis. */
3862 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3863 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3864 < vect_min_worthwhile_factor (code)
3865 && !vec_stmt))
3866 return false;
3867 if (vect_print_dump_info (REPORT_DETAILS))
3868 fprintf (vect_dump, "proceeding using word mode.");
3871 /* Worthwhile without SIMD support? Check only during analysis. */
3872 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3873 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3874 < vect_min_worthwhile_factor (code)
3875 && !vec_stmt)
3877 if (vect_print_dump_info (REPORT_DETAILS))
3878 fprintf (vect_dump, "not worthwhile without SIMD support.");
3879 return false;
3882 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3884 /* FORNOW: not yet supported. */
3885 if (!VECTOR_MODE_P (vec_mode))
3886 return false;
3888 /* Invariant argument is needed for a vector shift
3889 by a scalar shift operand. */
3890 optab_op2_mode = insn_data[icode].operand[2].mode;
3891 if (! (VECTOR_MODE_P (optab_op2_mode)
3892 || dt[1] == vect_constant_def
3893 || dt[1] == vect_invariant_def))
3895 if (vect_print_dump_info (REPORT_DETAILS))
3896 fprintf (vect_dump, "operand mode requires invariant argument.");
3897 return false;
3901 if (!vec_stmt) /* transformation not required. */
3903 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3904 if (vect_print_dump_info (REPORT_DETAILS))
3905 fprintf (vect_dump, "=== vectorizable_operation ===");
3906 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3907 return true;
3910 /** Transform. **/
3912 if (vect_print_dump_info (REPORT_DETAILS))
3913 fprintf (vect_dump, "transform binary/unary operation.");
3915 /* Handle def. */
3916 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3918 if (!slp_node)
3919 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3920 if (op_type == binary_op)
3921 vec_oprnds1 = VEC_alloc (tree, heap, 1);
3923 /* In case the vectorization factor (VF) is bigger than the number
3924 of elements that we can fit in a vectype (nunits), we have to generate
3925 more than one vector stmt - i.e - we need to "unroll" the
3926 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3927 from one copy of the vector stmt to the next, in the field
3928 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3929 stages to find the correct vector defs to be used when vectorizing
3930 stmts that use the defs of the current stmt. The example below illustrates
3931 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3932 4 vectorized stmts):
3934 before vectorization:
3935 RELATED_STMT VEC_STMT
3936 S1: x = memref - -
3937 S2: z = x + 1 - -
3939 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3940 there):
3941 RELATED_STMT VEC_STMT
3942 VS1_0: vx0 = memref0 VS1_1 -
3943 VS1_1: vx1 = memref1 VS1_2 -
3944 VS1_2: vx2 = memref2 VS1_3 -
3945 VS1_3: vx3 = memref3 - -
3946 S1: x = load - VS1_0
3947 S2: z = x + 1 - -
3949 step2: vectorize stmt S2 (done here):
3950 To vectorize stmt S2 we first need to find the relevant vector
3951 def for the first operand 'x'. This is, as usual, obtained from
3952 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3953 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3954 relevant vector def 'vx0'. Having found 'vx0' we can generate
3955 the vector stmt VS2_0, and as usual, record it in the
3956 STMT_VINFO_VEC_STMT of stmt S2.
3957 When creating the second copy (VS2_1), we obtain the relevant vector
3958 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3959 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3960 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3961 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3962 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3963 chain of stmts and pointers:
3964 RELATED_STMT VEC_STMT
3965 VS1_0: vx0 = memref0 VS1_1 -
3966 VS1_1: vx1 = memref1 VS1_2 -
3967 VS1_2: vx2 = memref2 VS1_3 -
3968 VS1_3: vx3 = memref3 - -
3969 S1: x = load - VS1_0
3970 VS2_0: vz0 = vx0 + v1 VS2_1 -
3971 VS2_1: vz1 = vx1 + v1 VS2_2 -
3972 VS2_2: vz2 = vx2 + v1 VS2_3 -
3973 VS2_3: vz3 = vx3 + v1 - -
3974 S2: z = x + 1 - VS2_0 */
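/* In other words (a sketch of the invariant, not extra code): for a
   scalar def D with vectorized copies VD_0..VD_{n-1}, VD_0 is
   reachable as STMT_VINFO_VEC_STMT (vinfo_for_stmt (D)) and VD_{j+1}
   as STMT_VINFO_RELATED_STMT (vinfo_for_stmt (VD_j)); this is the
   walk that vect_get_vec_def_for_stmt_copy relies on when it is
   handed the previous copy's vector def.  */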
3976 prev_stmt_info = NULL;
3977 for (j = 0; j < ncopies; j++)
3979 /* Handle uses. */
3980 if (j == 0)
3982 if (op_type == binary_op
3983 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
3985 /* Vector shl and shr insn patterns can be defined with scalar
3986 operand 2 (shift operand). In this case, use constant or loop
3987 invariant op1 directly, without extending it to vector mode
3988 first. */
3989 optab_op2_mode = insn_data[icode].operand[2].mode;
3990 if (!VECTOR_MODE_P (optab_op2_mode))
3992 if (vect_print_dump_info (REPORT_DETAILS))
3993 fprintf (vect_dump, "operand 1 using scalar mode.");
3994 vec_oprnd1 = op1;
3995 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
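/* E.g. (illustrative): for a shift like "x << 3" whose vshl insn
   pattern declares operand 2 with a scalar mode, the constant 3 is
   pushed here once and reused as vec_oprnd1 for every copy, instead
   of being broadcast into a vector; the analysis above already
   guaranteed that op1 is a constant or loop invariant in this
   case.  */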
3999 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4000 (a special case for certain kinds of vector shifts); otherwise,
4001 operand 1 should be of a vector type (the usual case). */
4002 if (op_type == binary_op && !vec_oprnd1)
4003 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4004 slp_node);
4005 else
4006 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4007 slp_node);
4009 else
4010 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4012 /* Arguments are ready. Create the new vector stmt. */
4013 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4015 if (op_type == binary_op)
4017 vop1 = VEC_index (tree, vec_oprnds1, i);
4018 new_stmt = build_gimple_modify_stmt (vec_dest,
4019 build2 (code, vectype, vop0, vop1));
4021 else
4022 new_stmt = build_gimple_modify_stmt (vec_dest,
4023 build1 (code, vectype, vop0));
4025 new_temp = make_ssa_name (vec_dest, new_stmt);
4026 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4027 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4028 if (slp_node)
4029 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4032 if (j == 0)
4033 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4034 else
4035 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4036 prev_stmt_info = vinfo_for_stmt (new_stmt);
4039 VEC_free (tree, heap, vec_oprnds0);
4040 if (vec_oprnds1)
4041 VEC_free (tree, heap, vec_oprnds1);
4043 return true;
4047 /* Function vectorizable_type_demotion
4049 Check if STMT performs a binary or unary operation that involves
4050 type demotion, and if it can be vectorized.
4051 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4052 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4053 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4055 bool
4056 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4057 tree *vec_stmt)
4059 tree vec_dest;
4060 tree scalar_dest;
4061 tree operation;
4062 tree op0;
4063 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4064 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4065 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4066 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4067 enum tree_code code, code1 = ERROR_MARK;
4068 tree new_temp;
4069 tree def, def_stmt;
4070 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4071 tree new_stmt;
4072 stmt_vec_info prev_stmt_info;
4073 int nunits_in;
4074 int nunits_out;
4075 tree vectype_out;
4076 int ncopies;
4077 int j;
4078 tree expr;
4079 tree vectype_in;
4081 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4082 return false;
4084 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4085 return false;
4087 /* Is STMT a vectorizable type-demotion operation? */
4088 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4089 return false;
4091 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4092 return false;
4094 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4095 code = TREE_CODE (operation);
4096 if (code != NOP_EXPR && code != CONVERT_EXPR)
4097 return false;
4099 op0 = TREE_OPERAND (operation, 0);
4100 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4101 if (!vectype_in)
4102 return false;
4103 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4105 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4106 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4107 if (!vectype_out)
4108 return false;
4109 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4110 if (nunits_in != nunits_out / 2) /* FORNOW */
4111 return false;
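/* E.g. (illustrative modes): demoting int to short with V4SI inputs
   and V8HI outputs gives nunits_in = 4, nunits_out = 8, which
   satisfies this check; each output vector is then packed from two
   input vectors by the narrowing stmts generated below.  */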
4113 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4114 gcc_assert (ncopies >= 1);
4115 /* FORNOW. This restriction should be relaxed. */
4116 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4118 if (vect_print_dump_info (REPORT_DETAILS))
4119 fprintf (vect_dump, "multiple types in nested loop.");
4120 return false;
4123 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4124 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4125 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4126 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4127 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4128 return false;
4130 /* Check the operands of the operation. */
4131 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4133 if (vect_print_dump_info (REPORT_DETAILS))
4134 fprintf (vect_dump, "use not simple.");
4135 return false;
4138 /* Supportable by target? */
4139 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4140 return false;
4142 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4144 if (!vec_stmt) /* transformation not required. */
4146 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4147 if (vect_print_dump_info (REPORT_DETAILS))
4148 fprintf (vect_dump, "=== vectorizable_demotion ===");
4149 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4150 return true;
4153 /** Transform. **/
4154 if (vect_print_dump_info (REPORT_DETAILS))
4155 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4156 ncopies);
4158 /* Handle def. */
4159 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4161 /* In case the vectorization factor (VF) is bigger than the number
4162 of elements that we can fit in a vectype (nunits), we have to generate
4163 more than one vector stmt - i.e - we need to "unroll" the
4164 vector stmt by a factor VF/nunits. */
4165 prev_stmt_info = NULL;
4166 for (j = 0; j < ncopies; j++)
4168 /* Handle uses. */
4169 if (j == 0)
4171 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4172 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4174 else
4176 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4177 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4180 /* Arguments are ready. Create the new vector stmt. */
4181 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4182 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4183 new_temp = make_ssa_name (vec_dest, new_stmt);
4184 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4185 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4187 if (j == 0)
4188 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4189 else
4190 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4192 prev_stmt_info = vinfo_for_stmt (new_stmt);
4195 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4196 return true;
4200 /* Function vectorizable_type_promotion
4202 Check if STMT performs a binary or unary operation that involves
4203 type promotion, and if it can be vectorized.
4204 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4205 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4206 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4208 bool
4209 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4210 tree *vec_stmt)
4212 tree vec_dest;
4213 tree scalar_dest;
4214 tree operation;
4215 tree op0, op1 = NULL;
4216 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4217 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4220 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4221 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4222 int op_type;
4223 tree def, def_stmt;
4224 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4225 tree new_stmt;
4226 stmt_vec_info prev_stmt_info;
4227 int nunits_in;
4228 int nunits_out;
4229 tree vectype_out;
4230 int ncopies;
4231 int j;
4232 tree vectype_in;
4234 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4235 return false;
4237 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4238 return false;
4240 /* Is STMT a vectorizable type-promotion operation? */
4241 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4242 return false;
4244 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4245 return false;
4247 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4248 code = TREE_CODE (operation);
4249 if (code != NOP_EXPR && code != CONVERT_EXPR
4250 && code != WIDEN_MULT_EXPR)
4251 return false;
4253 op0 = TREE_OPERAND (operation, 0);
4254 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4255 if (!vectype_in)
4256 return false;
4257 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4259 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4260 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4261 if (!vectype_out)
4262 return false;
4263 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4264 if (nunits_out != nunits_in / 2) /* FORNOW */
4265 return false;
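/* E.g. (illustrative modes): promoting short to int with V8HI inputs
   and V4SI outputs gives nunits_in = 8, nunits_out = 4, which
   satisfies this check; each input vector is then widened into two
   output vectors (hence the 2*ncopies cost below).  */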
4267 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4268 gcc_assert (ncopies >= 1);
4269 /* FORNOW. This restriction should be relaxed. */
4270 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4272 if (vect_print_dump_info (REPORT_DETAILS))
4273 fprintf (vect_dump, "multiple types in nested loop.");
4274 return false;
4277 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4278 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4279 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4280 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4281 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4282 return false;
4284 /* Check the operands of the operation. */
4285 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4287 if (vect_print_dump_info (REPORT_DETAILS))
4288 fprintf (vect_dump, "use not simple.");
4289 return false;
4292 op_type = TREE_CODE_LENGTH (code);
4293 if (op_type == binary_op)
4295 op1 = TREE_OPERAND (operation, 1);
4296 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4298 if (vect_print_dump_info (REPORT_DETAILS))
4299 fprintf (vect_dump, "use not simple.");
4300 return false;
4304 /* Supportable by target? */
4305 if (!supportable_widening_operation (code, stmt, vectype_in,
4306 &decl1, &decl2, &code1, &code2))
4307 return false;
4309 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4311 if (!vec_stmt) /* transformation not required. */
4313 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4314 if (vect_print_dump_info (REPORT_DETAILS))
4315 fprintf (vect_dump, "=== vectorizable_promotion ===");
4316 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4317 return true;
4320 /** Transform. **/
4322 if (vect_print_dump_info (REPORT_DETAILS))
4323 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4324 ncopies);
4326 /* Handle def. */
4327 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4329 /* In case the vectorization factor (VF) is bigger than the number
4330 of elements that we can fit in a vectype (nunits), we have to generate
4331 more than one vector stmt - i.e - we need to "unroll" the
4332 vector stmt by a factor VF/nunits. */
4334 prev_stmt_info = NULL;
4335 for (j = 0; j < ncopies; j++)
4337 /* Handle uses. */
4338 if (j == 0)
4340 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4341 if (op_type == binary_op)
4342 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4344 else
4346 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4347 if (op_type == binary_op)
4348 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4351 /* Arguments are ready. Create the new vector stmt. We are creating
4352 two vector defs because the widened result does not fit in one vector.
4353 The vectorized stmt can be expressed as a call to a target builtin,
4354 or using a tree-code. */
4355 /* Generate first half of the widened result: */
4356 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4357 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4358 if (j == 0)
4359 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4360 else
4361 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4362 prev_stmt_info = vinfo_for_stmt (new_stmt);
4364 /* Generate second half of the widened result: */
4365 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4366 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4367 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4368 prev_stmt_info = vinfo_for_stmt (new_stmt);
4372 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4373 return true;
4377 /* Function vect_strided_store_supported.
4379 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4380 and FALSE otherwise. */
4382 static bool
4383 vect_strided_store_supported (tree vectype)
4385 optab interleave_high_optab, interleave_low_optab;
4386 int mode;
4388 mode = (int) TYPE_MODE (vectype);
4390 /* Check that the operation is supported. */
4391 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4392 vectype);
4393 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4394 vectype);
4395 if (!interleave_high_optab || !interleave_low_optab)
4397 if (vect_print_dump_info (REPORT_DETAILS))
4398 fprintf (vect_dump, "no optab for interleave.");
4399 return false;
4402 if (optab_handler (interleave_high_optab, mode)->insn_code
4403 == CODE_FOR_nothing
4404 || optab_handler (interleave_low_optab, mode)->insn_code
4405 == CODE_FOR_nothing)
4407 if (vect_print_dump_info (REPORT_DETAILS))
4408 fprintf (vect_dump, "interleave op not supported by target.");
4409 return false;
4412 return true;
4416 /* Function vect_permute_store_chain.
4418 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4419 a power of 2, generate interleave_high/low stmts to reorder the data
4420 correctly for the stores. Return the final references for stores in
4421 RESULT_CHAIN.
4423 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4424 The input is 4 vectors each containing 8 elements. We assign a number to each
4425 element; the input sequence is:
4427 1st vec: 0 1 2 3 4 5 6 7
4428 2nd vec: 8 9 10 11 12 13 14 15
4429 3rd vec: 16 17 18 19 20 21 22 23
4430 4th vec: 24 25 26 27 28 29 30 31
4432 The output sequence should be:
4434 1st vec: 0 8 16 24 1 9 17 25
4435 2nd vec: 2 10 18 26 3 11 19 27
4436 3rd vec: 4 12 20 28 5 13 21 29
4437 4th vec: 6 14 22 30 7 15 23 31
4439 i.e., we interleave the contents of the four vectors in their order.
4441 We use interleave_high/low instructions to create such output. The input of
4442 each interleave_high/low operation is two vectors:
4443 1st vec 2nd vec
4444 0 1 2 3 4 5 6 7
4445 the even elements of the result vector are obtained left-to-right from the
4446 high/low elements of the first vector. The odd elements of the result are
4447 obtained left-to-right from the high/low elements of the second vector.
4448 The output of interleave_high will be: 0 4 1 5
4449 and of interleave_low: 2 6 3 7
4452 The permutation is done in log LENGTH stages. In each stage interleave_high
4453 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4454 where the first argument is taken from the first half of DR_CHAIN and the
4455 second argument from its second half.
4456 In our example,
4458 I1: interleave_high (1st vec, 3rd vec)
4459 I2: interleave_low (1st vec, 3rd vec)
4460 I3: interleave_high (2nd vec, 4th vec)
4461 I4: interleave_low (2nd vec, 4th vec)
4463 The output for the first stage is:
4465 I1: 0 16 1 17 2 18 3 19
4466 I2: 4 20 5 21 6 22 7 23
4467 I3: 8 24 9 25 10 26 11 27
4468 I4: 12 28 13 29 14 30 15 31
4470 The output of the second stage, i.e., the final result, is:
4472 I1: 0 8 16 24 1 9 17 25
4473 I2: 2 10 18 26 3 11 19 27
4474 I3: 4 12 20 28 5 13 21 29
4475 I4: 6 14 22 30 7 15 23 31. */
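/* A scalar model of the overall effect (an illustrative sketch only,
   not part of the vectorizer): for LENGTH input vectors of NUNITS
   elements each, element E of input vector V ends up at overall
   output position E * LENGTH + V:

   for (v = 0; v < length; v++)
     for (e = 0; e < nunits; e++)
       out[e * length + v] = in[v * nunits + e];

   The log2 (LENGTH) interleave stages above compute exactly this
   transposition.  */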
4477 static bool
4478 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4479 unsigned int length,
4480 tree stmt,
4481 block_stmt_iterator *bsi,
4482 VEC(tree,heap) **result_chain)
4484 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4485 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4486 tree scalar_dest, tmp;
4487 int i;
4488 unsigned int j;
4489 VEC(tree,heap) *first, *second;
4491 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4492 first = VEC_alloc (tree, heap, length/2);
4493 second = VEC_alloc (tree, heap, length/2);
4495 /* Check that the operation is supported. */
4496 if (!vect_strided_store_supported (vectype))
4497 return false;
4499 *result_chain = VEC_copy (tree, heap, dr_chain);
4501 for (i = 0; i < exact_log2 (length); i++)
4503 for (j = 0; j < length/2; j++)
4505 vect1 = VEC_index (tree, dr_chain, j);
4506 vect2 = VEC_index (tree, dr_chain, j+length/2);
4508 /* Create interleaving stmt:
4509 in the case of big endian:
4510 high = interleave_high (vect1, vect2)
4511 and in the case of little endian:
4512 high = interleave_low (vect1, vect2). */
4513 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4514 DECL_GIMPLE_REG_P (perm_dest) = 1;
4515 add_referenced_var (perm_dest);
4516 if (BYTES_BIG_ENDIAN)
4517 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4518 else
4519 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4520 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4521 high = make_ssa_name (perm_dest, perm_stmt);
4522 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4523 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4524 VEC_replace (tree, *result_chain, 2*j, high);
4526 /* Create interleaving stmt:
4527 in the case of big endian:
4528 low = interleave_low (vect1, vect2)
4529 and in the case of little endian:
4530 low = interleave_high (vect1, vect2). */
4531 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4532 DECL_GIMPLE_REG_P (perm_dest) = 1;
4533 add_referenced_var (perm_dest);
4534 if (BYTES_BIG_ENDIAN)
4535 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4536 else
4537 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4538 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4539 low = make_ssa_name (perm_dest, perm_stmt);
4540 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4541 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4542 VEC_replace (tree, *result_chain, 2*j+1, low);
4544 dr_chain = VEC_copy (tree, heap, *result_chain);
4546 return true;
4550 /* Function vectorizable_store.
4552 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4553 can be vectorized.
4554 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4555 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4556 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4558 bool
4559 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4560 slp_tree slp_node)
4562 tree scalar_dest;
4563 tree data_ref;
4564 tree op;
4565 tree vec_oprnd = NULL_TREE;
4566 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4567 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4568 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4569 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4570 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4571 enum machine_mode vec_mode;
4572 tree dummy;
4573 enum dr_alignment_support alignment_support_scheme;
4574 tree def, def_stmt;
4575 enum vect_def_type dt;
4576 stmt_vec_info prev_stmt_info = NULL;
4577 tree dataref_ptr = NULL_TREE;
4578 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4579 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4580 int j;
4581 tree next_stmt, first_stmt = NULL_TREE;
4582 bool strided_store = false;
4583 unsigned int group_size, i;
4584 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4585 bool inv_p;
4586 VEC(tree,heap) *vec_oprnds = NULL;
4587 bool slp = (slp_node != NULL);
4588 stmt_vec_info first_stmt_vinfo;
4589 unsigned int vec_num;
4591 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4592 this, so we can safely override NCOPIES with 1 here. */
4593 if (slp)
4594 ncopies = 1;
4596 gcc_assert (ncopies >= 1);
4598 /* FORNOW. This restriction should be relaxed. */
4599 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4601 if (vect_print_dump_info (REPORT_DETAILS))
4602 fprintf (vect_dump, "multiple types in nested loop.");
4603 return false;
4606 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4607 return false;
4609 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4610 return false;
4612 /* Is vectorizable store? */
4614 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4615 return false;
4617 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4618 if (TREE_CODE (scalar_dest) != ARRAY_REF
4619 && TREE_CODE (scalar_dest) != INDIRECT_REF
4620 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4621 return false;
4623 op = GIMPLE_STMT_OPERAND (stmt, 1);
4624 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4626 if (vect_print_dump_info (REPORT_DETAILS))
4627 fprintf (vect_dump, "use not simple.");
4628 return false;
4631 vec_mode = TYPE_MODE (vectype);
4632 /* FORNOW. In some cases we can vectorize even if the data-type is not supported
4633 (e.g. - array initialization with 0). */
4634 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4635 return false;
4637 if (!STMT_VINFO_DATA_REF (stmt_info))
4638 return false;
4640 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4642 strided_store = true;
4643 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4644 if (!vect_strided_store_supported (vectype)
4645 && !PURE_SLP_STMT (stmt_info) && !slp)
4646 return false;
4648 if (first_stmt == stmt)
4650 /* STMT is the leader of the group. Check the operands of all the
4651 stmts of the group. */
4652 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4653 while (next_stmt)
4655 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4656 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4658 if (vect_print_dump_info (REPORT_DETAILS))
4659 fprintf (vect_dump, "use not simple.");
4660 return false;
4662 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4667 if (!vec_stmt) /* transformation not required. */
4669 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4670 if (!PURE_SLP_STMT (stmt_info))
4671 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4672 return true;
4675 /** Transform. **/
4677 if (strided_store)
4679 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4680 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4682 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4684 /* FORNOW */
4685 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4687 /* We vectorize all the stmts of the interleaving group when we
4688 reach the last stmt in the group. */
4689 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4690 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4691 && !slp)
4693 *vec_stmt = NULL_TREE;
4694 return true;
4697 if (slp)
4698 strided_store = false;
4700 /* VEC_NUM is the number of vect stmts to be created for this group. */
4701 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4702 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4703 else
4704 vec_num = group_size;
4706 else
4708 first_stmt = stmt;
4709 first_dr = dr;
4710 group_size = vec_num = 1;
4711 first_stmt_vinfo = stmt_info;
4714 if (vect_print_dump_info (REPORT_DETAILS))
4715 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4717 dr_chain = VEC_alloc (tree, heap, group_size);
4718 oprnds = VEC_alloc (tree, heap, group_size);
4720 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4721 gcc_assert (alignment_support_scheme);
4722 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4724 /* In case the vectorization factor (VF) is bigger than the number
4725 of elements that we can fit in a vectype (nunits), we have to generate
4726 more than one vector stmt - i.e - we need to "unroll" the
4727 vector stmt by a factor VF/nunits. For more details see documentation in
4728 vect_get_vec_def_for_copy_stmt. */
4730 /* In case of interleaving (non-unit strided access):
4732 S1: &base + 2 = x2
4733 S2: &base = x0
4734 S3: &base + 1 = x1
4735 S4: &base + 3 = x3
4737 We create vectorized stores starting from base address (the access of the
4738 first stmt in the chain (S2 in the above example), when the last store stmt
4739 of the chain (S4) is reached:
4741 VS1: &base = vx2
4742 VS2: &base + vec_size*1 = vx0
4743 VS3: &base + vec_size*2 = vx1
4744 VS4: &base + vec_size*3 = vx3
4746 Then permutation statements are generated:
4748 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4749 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4752 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4753 (the order of the data-refs in the output of vect_permute_store_chain
4754 corresponds to the order of scalar stmts in the interleaving chain - see
4755 the documentation of vect_permute_store_chain()).
4757 In case of both multiple types and interleaving, above vector stores and
4758 permutation stmts are created for every copy. The result vector stmts are
4759 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4760 STMT_VINFO_RELATED_STMT for the next copies.
4763 prev_stmt_info = NULL;
4764 for (j = 0; j < ncopies; j++)
4766 tree new_stmt;
4767 tree ptr_incr;
4769 if (j == 0)
4771 if (slp)
4773 /* Get vectorized arguments for SLP_NODE. */
4774 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4776 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4778 else
4780 /* For interleaved stores we collect vectorized defs for all the
4781 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4782 used as an input to vect_permute_store_chain(), and OPRNDS as
4783 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4785 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4786 OPRNDS are of size 1. */
4787 next_stmt = first_stmt;
4788 for (i = 0; i < group_size; i++)
4790 /* Since gaps are not supported for interleaved stores,
4791 GROUP_SIZE is the exact number of stmts in the chain.
4792 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4793 there is no interleaving, GROUP_SIZE is 1, and only one
4794 iteration of the loop will be executed. */
4795 gcc_assert (next_stmt);
4796 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4798 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4799 NULL);
4800 VEC_quick_push(tree, dr_chain, vec_oprnd);
4801 VEC_quick_push(tree, oprnds, vec_oprnd);
4802 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4805 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4806 &dummy, &ptr_incr, false,
4807 TREE_TYPE (vec_oprnd), &inv_p);
4808 gcc_assert (!inv_p);
4810 else
4812 /* FORNOW SLP doesn't work for multiple types. */
4813 gcc_assert (!slp);
4815 /* For interleaved stores we created vectorized defs for all the
4816 defs stored in OPRNDS in the previous iteration (previous copy).
4817 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4818 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4819 next copy.
4820 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4821 OPRNDS are of size 1. */
4822 for (i = 0; i < group_size; i++)
4824 op = VEC_index (tree, oprnds, i);
4825 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4826 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4827 VEC_replace(tree, dr_chain, i, vec_oprnd);
4828 VEC_replace(tree, oprnds, i, vec_oprnd);
4830 dataref_ptr =
4831 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4834 if (strided_store)
4836 result_chain = VEC_alloc (tree, heap, group_size);
4837 /* Permute. */
4838 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4839 &result_chain))
4840 return false;
4843 next_stmt = first_stmt;
4844 for (i = 0; i < vec_num; i++)
4846 if (i > 0)
4847 /* Bump the vector pointer. */
4848 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4849 NULL_TREE);
4851 if (slp)
4852 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4853 else if (strided_store)
4854 /* For strided stores vectorized defs are interleaved in
4855 vect_permute_store_chain(). */
4856 vec_oprnd = VEC_index (tree, result_chain, i);
4858 data_ref = build_fold_indirect_ref (dataref_ptr);
4859 /* Arguments are ready. Create the new vector stmt. */
4860 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4861 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4862 mark_symbols_for_renaming (new_stmt);
4864 if (j == 0)
4865 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4866 else
4867 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4869 prev_stmt_info = vinfo_for_stmt (new_stmt);
4870 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4871 if (!next_stmt)
4872 break;
4876 return true;
4880 /* Function vect_setup_realignment
4882 This function is called when vectorizing an unaligned load using
4883 the dr_explicit_realign[_optimized] scheme.
4884 This function generates the following code at the loop prolog:
4886 p = initial_addr;
4887 x msq_init = *(floor(p)); # prolog load
4888 realignment_token = call target_builtin;
4889 loop:
4890 x msq = phi (msq_init, ---)
4892 The stmts marked with x are generated only for the case of
4893 dr_explicit_realign_optimized.
4895 The code above sets up a new (vector) pointer, pointing to the first
4896 location accessed by STMT, and a "floor-aligned" load using that pointer.
4897 It also generates code to compute the "realignment-token" (if the relevant
4898 target hook was defined), and creates a phi-node at the loop-header bb
4899 whose arguments are the result of the prolog-load (created by this
4900 function) and the result of a load that takes place in the loop (to be
4901 created by the caller to this function).
4903 For the case of dr_explicit_realign_optimized:
4904 The caller to this function uses the phi-result (msq) to create the
4905 realignment code inside the loop, and sets up the missing phi argument,
4906 as follows:
4907 loop:
4908 msq = phi (msq_init, lsq)
4909 lsq = *(floor(p')); # load in loop
4910 result = realign_load (msq, lsq, realignment_token);
4912 For the case of dr_explicit_realign:
4913 loop:
4914 msq = *(floor(p)); # load in loop
4915 p' = p + (VS-1);
4916 lsq = *(floor(p')); # load in loop
4917 result = realign_load (msq, lsq, realignment_token);
4919 Input:
4920 STMT - (scalar) load stmt to be vectorized. This load accesses
4921 a memory location that may be unaligned.
4922 BSI - place where new code is to be inserted.
4923 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4924 is used.
4926 Output:
4927 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4928 target hook, if defined.
4929 Return value - the result of the loop-header phi node. */
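/* E.g. (illustrative, 16-byte vectors): if the scalar access starts
   at addr with addr % 16 == 4, then floor(p) loads the 16 aligned
   bytes starting at addr - 4 (msq) and floor(p + VS - 1) loads the
   next 16 (lsq); realign_load combines msq and lsq, shifting by the
   4-byte misalignment encoded in realignment_token, to yield the
   desired unaligned vector.  */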
4931 static tree
4932 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4933 tree *realignment_token,
4934 enum dr_alignment_support alignment_support_scheme,
4935 tree init_addr,
4936 struct loop **at_loop)
4938 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4939 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4940 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4942 edge pe;
4943 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4944 tree vec_dest;
4945 tree inc;
4946 tree ptr;
4947 tree data_ref;
4948 tree new_stmt;
4949 basic_block new_bb;
4950 tree msq_init = NULL_TREE;
4951 tree new_temp;
4952 tree phi_stmt;
4953 tree msq = NULL_TREE;
4954 tree stmts = NULL_TREE;
4955 bool inv_p;
4956 bool compute_in_loop = false;
4957 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4958 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
4959 struct loop *loop_for_initial_load;
4961 gcc_assert (alignment_support_scheme == dr_explicit_realign
4962 || alignment_support_scheme == dr_explicit_realign_optimized);
4964 /* We need to generate three things:
4965 1. the misalignment computation
4966 2. the extra vector load (for the optimized realignment scheme).
4967 3. the phi node for the two vectors from which the realignment is
4968 done (for the optimized realignment scheme).
4971 /* 1. Determine where to generate the misalignment computation.
4973 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4974 calculation will be generated by this function, outside the loop (in the
4975 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4976 caller, inside the loop.
4978 Background: If the misalignment remains fixed throughout the iterations of
4979 the loop, then both realignment schemes are applicable, and also the
4980 misalignment computation can be done outside LOOP. This is because we are
4981 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4982 are a multiple of VS (the Vector Size), and therefore the misalignment in
4983 different vectorized LOOP iterations is always the same.
4984 The problem arises only if the memory access is in an inner-loop nested
4985 inside LOOP, which is now being vectorized using outer-loop vectorization.
4986 This is the only case when the misalignment of the memory access may not
4987 remain fixed throughout the iterations of the inner-loop (as explained in
4988 detail in vect_supportable_dr_alignment). In this case, not only is the
4989 optimized realignment scheme not applicable, but also the misalignment
4990 computation (and generation of the realignment token that is passed to
4991 REALIGN_LOAD) have to be done inside the loop.
4993 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4994 or not, which in turn determines if the misalignment is computed inside
4995 the inner-loop, or outside LOOP. */
4997 if (init_addr != NULL_TREE)
4999 compute_in_loop = true;
5000 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5004 /* 2. Determine where to generate the extra vector load.
5006 For the optimized realignment scheme, instead of generating two vector
5007 loads in each iteration, we generate a single extra vector load in the
5008 preheader of the loop, and in each iteration reuse the result of the
5009 vector load from the previous iteration. In case the memory access is in
5010 an inner-loop nested inside LOOP, which is now being vectorized using
5011 outer-loop vectorization, we need to determine whether this initial vector
5012 load should be generated at the preheader of the inner-loop, or can be
5013 generated at the preheader of LOOP. If the memory access has no evolution
5014 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5015 to be generated inside LOOP (in the preheader of the inner-loop). */
5017 if (nested_in_vect_loop)
5019 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5020 bool invariant_in_outerloop =
5021 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5022 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5024 else
5025 loop_for_initial_load = loop;
5026 if (at_loop)
5027 *at_loop = loop_for_initial_load;
5029 /* 3. For the case of the optimized realignment, create the first vector
5030 load at the loop preheader. */
5032 if (alignment_support_scheme == dr_explicit_realign_optimized)
5034 /* Create msq_init = *(floor(p1)) in the loop preheader */
5036 gcc_assert (!compute_in_loop);
5037 pe = loop_preheader_edge (loop_for_initial_load);
5038 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5039 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5040 &init_addr, &inc, true, NULL_TREE, &inv_p);
5041 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5042 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5043 new_temp = make_ssa_name (vec_dest, new_stmt);
5044 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5045 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5046 gcc_assert (!new_bb);
5047 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5050 /* 4. Create realignment token using a target builtin, if available.
5051 It is done either inside the containing loop, or before LOOP (as
5052 determined above). */
5054 if (targetm.vectorize.builtin_mask_for_load)
5056 tree builtin_decl;
5058 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5059 if (compute_in_loop)
5060 gcc_assert (init_addr); /* already computed by the caller. */
5061 else
5063 /* Generate the INIT_ADDR computation outside LOOP. */
5064 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5065 NULL_TREE, loop);
5066 pe = loop_preheader_edge (loop);
5067 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5068 gcc_assert (!new_bb);
5071 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5072 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5073 vec_dest = vect_create_destination_var (scalar_dest,
5074 TREE_TYPE (new_stmt));
5075 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5076 new_temp = make_ssa_name (vec_dest, new_stmt);
5077 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5079 if (compute_in_loop)
5080 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5081 else
5083 /* Generate the misalignment computation outside LOOP. */
5084 pe = loop_preheader_edge (loop);
5085 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5086 gcc_assert (!new_bb);
5089 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5091 /* The result of the CALL_EXPR to this builtin is determined from
5092 the value of the parameter and no global variables are touched,
5093 which makes the builtin a "const" function. Requiring the
5094 builtin to have the "const" attribute makes it unnecessary
5095 to call mark_call_clobbered. */
5096 gcc_assert (TREE_READONLY (builtin_decl));
5099 if (alignment_support_scheme == dr_explicit_realign)
5100 return msq;
5102 gcc_assert (!compute_in_loop);
5103 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5106 /* 5. Create msq = phi <msq_init, lsq> in loop */
5108 pe = loop_preheader_edge (containing_loop);
5109 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5110 msq = make_ssa_name (vec_dest, NULL_TREE);
5111 phi_stmt = create_phi_node (msq, containing_loop->header);
5112 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5113 add_phi_arg (phi_stmt, msq_init, pe);
5115 return msq;
5119 /* Function vect_strided_load_supported.
5121 Returns TRUE if the EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5122 and FALSE otherwise. */
5124 static bool
5125 vect_strided_load_supported (tree vectype)
5127 optab perm_even_optab, perm_odd_optab;
5128 int mode;
5130 mode = (int) TYPE_MODE (vectype);
5132 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5133 if (!perm_even_optab)
5135 if (vect_print_dump_info (REPORT_DETAILS))
5136 fprintf (vect_dump, "no optab for perm_even.");
5137 return false;
5140 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5142 if (vect_print_dump_info (REPORT_DETAILS))
5143 fprintf (vect_dump, "perm_even op not supported by target.");
5144 return false;
5147 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5148 if (!perm_odd_optab)
5150 if (vect_print_dump_info (REPORT_DETAILS))
5151 fprintf (vect_dump, "no optab for perm_odd.");
5152 return false;
5155 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5157 if (vect_print_dump_info (REPORT_DETAILS))
5158 fprintf (vect_dump, "perm_odd op not supported by target.");
5159 return false;
5161 return true;
5165 /* Function vect_permute_load_chain.
5167 Given a chain of interleaved loads in DR_CHAIN, whose LENGTH must be
5168 a power of 2, generate extract_even/odd stmts to reorder the input data
5169 correctly. Return the final references for loads in RESULT_CHAIN.
5171 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5172 The input is 4 vectors, each containing 8 elements. We assign a number to
5173 each element; the input sequence is:
5175 1st vec: 0 1 2 3 4 5 6 7
5176 2nd vec: 8 9 10 11 12 13 14 15
5177 3rd vec: 16 17 18 19 20 21 22 23
5178 4th vec: 24 25 26 27 28 29 30 31
5180 The output sequence should be:
5182 1st vec: 0 4 8 12 16 20 24 28
5183 2nd vec: 1 5 9 13 17 21 25 29
5184 3rd vec: 2 6 10 14 18 22 26 30
5185 4th vec: 3 7 11 15 19 23 27 31
5187 i.e., the first output vector should contain the first elements of each
5188 interleaving group, etc.
5190 We use extract_even/odd instructions to create such output. The input of each
5191 extract_even/odd operation is two vectors
5192 1st vec 2nd vec
5193 0 1 2 3 4 5 6 7
5195 and the output is the vector of extracted even/odd elements. The output of
5196 extract_even will be: 0 2 4 6
5197 and of extract_odd: 1 3 5 7
5200 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5201 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5202 order. In our example,
5204 E1: extract_even (1st vec, 2nd vec)
5205 E2: extract_odd (1st vec, 2nd vec)
5206 E3: extract_even (3rd vec, 4th vec)
5207 E4: extract_odd (3rd vec, 4th vec)
5209 The output for the first stage will be:
5211 E1: 0 2 4 6 8 10 12 14
5212 E2: 1 3 5 7 9 11 13 15
5213 E3: 16 18 20 22 24 26 28 30
5214 E4: 17 19 21 23 25 27 29 31
5216 In order to proceed and create the correct sequence for the next stage (or
5217 for the correct output, if the second stage is the last one, as in our
5218 example), we first put the output of extract_even operation and then the
5219 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5220 The input for the second stage is:
5222 1st vec (E1): 0 2 4 6 8 10 12 14
5223 2nd vec (E3): 16 18 20 22 24 26 28 30
5224 3rd vec (E2): 1 3 5 7 9 11 13 15
5225 4th vec (E4): 17 19 21 23 25 27 29 31
5227 The output of the second stage:
5229 E1: 0 4 8 12 16 20 24 28
5230 E2: 2 6 10 14 18 22 26 30
5231 E3: 1 5 9 13 17 21 25 29
5232 E4: 3 7 11 15 19 23 27 31
5234 And RESULT_CHAIN after reordering:
5236 1st vec (E1): 0 4 8 12 16 20 24 28
5237 2nd vec (E3): 1 5 9 13 17 21 25 29
5238 3rd vec (E2): 2 6 10 14 18 22 26 30
5239 4th vec (E4): 3 7 11 15 19 23 27 31. */
5241 static bool
5242 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5243 unsigned int length,
5244 tree stmt,
5245 block_stmt_iterator *bsi,
5246 VEC(tree,heap) **result_chain)
5248 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5249 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5250 tree tmp;
5251 int i;
5252 unsigned int j;
5254 /* Check that the operation is supported. */
5255 if (!vect_strided_load_supported (vectype))
5256 return false;
5258 *result_chain = VEC_copy (tree, heap, dr_chain);
5259 for (i = 0; i < exact_log2 (length); i++)
5261 for (j = 0; j < length; j += 2)
5263 first_vect = VEC_index (tree, dr_chain, j);
5264 second_vect = VEC_index (tree, dr_chain, j+1);
5266 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5267 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5268 DECL_GIMPLE_REG_P (perm_dest) = 1;
5269 add_referenced_var (perm_dest);
5271 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5272 first_vect, second_vect);
5273 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5275 data_ref = make_ssa_name (perm_dest, perm_stmt);
5276 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5277 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5278 mark_symbols_for_renaming (perm_stmt);
5280 VEC_replace (tree, *result_chain, j/2, data_ref);
5282 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5283 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5284 DECL_GIMPLE_REG_P (perm_dest) = 1;
5285 add_referenced_var (perm_dest);
5287 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5288 first_vect, second_vect);
5289 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5290 data_ref = make_ssa_name (perm_dest, perm_stmt);
5291 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5292 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5293 mark_symbols_for_renaming (perm_stmt);
5295 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5297 dr_chain = VEC_copy (tree, heap, *result_chain);
5299 return true;
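/* A minimal illustration of the extract_even/odd reordering documented
   above, written as plain C over scalar arrays.  This is a sketch only,
   not compiled into the vectorizer; the helper name, the fixed width of
   4, and the use of 'int' elements are ours.  */
#if 0
static void
example_extract_even_odd (const int a[4], const int b[4],
                          int even[4], int odd[4])
{
  int k;
  /* Conceptually concatenate A and B, then take elements 2k (even)
     and 2k+1 (odd) of the concatenation.  */
  for (k = 0; k < 4; k++)
    {
      even[k] = (2 * k < 4) ? a[2 * k] : b[2 * k - 4];
      odd[k] = (2 * k + 1 < 4) ? a[2 * k + 1] : b[2 * k + 1 - 4];
    }
}
#endif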
5303 /* Function vect_transform_strided_load.
5305 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5306 to perform their permutation and record the resulting vectorized statements
5307 in the scalar statements.
5310 static bool
5311 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5312 block_stmt_iterator *bsi)
5314 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5315 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5316 tree next_stmt, new_stmt;
5317 VEC(tree,heap) *result_chain = NULL;
5318 unsigned int i, gap_count;
5319 tree tmp_data_ref;
5321 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5322 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5323 vectors that are ready for vector computation. */
5324 result_chain = VEC_alloc (tree, heap, size);
5325 /* Permute. */
5326 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5327 return false;
5329 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5330 Since we scan the chain starting from its first node, their order
5331 corresponds to the order of data-refs in RESULT_CHAIN. */
5332 next_stmt = first_stmt;
5333 gap_count = 1;
5334 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5336 if (!next_stmt)
5337 break;
5339 /* Skip the gaps. Loads created for the gaps will be removed later by
5340 the dead code elimination pass.
5341 DR_GROUP_GAP is the number of steps in elements from the previous
5342 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5343 correspond to the gaps.
5345 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5347 gap_count++;
5348 continue;
5351 while (next_stmt)
5353 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5354 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5355 copies, and we put the new vector statement in the first available
5356 RELATED_STMT. */
5357 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5358 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5359 else
5361 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5362 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5363 vinfo_for_stmt (prev_stmt));
5364 while (rel_stmt)
5366 prev_stmt = rel_stmt;
5367 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5369 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5371 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5372 gap_count = 1;
5373 /* If NEXT_STMT accesses the same DR as the previous statement,
5374 put the same TMP_DATA_REF as its vectorized statement; otherwise
5375 get the next data-ref from RESULT_CHAIN. */
5376 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5377 break;
5380 return true;
5384 /* vectorizable_load.
5386 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5387 can be vectorized.
5388 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5389 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5390 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5392 bool
5393 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5394 slp_tree slp_node)
5396 tree scalar_dest;
5397 tree vec_dest = NULL;
5398 tree data_ref = NULL;
5399 tree op;
5400 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5401 stmt_vec_info prev_stmt_info;
5402 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5404 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5405 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5406 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5407 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5408 tree new_temp;
5409 int mode;
5410 tree new_stmt = NULL_TREE;
5411 tree dummy;
5412 enum dr_alignment_support alignment_support_scheme;
5413 tree dataref_ptr = NULL_TREE;
5414 tree ptr_incr;
5415 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5416 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5417 int i, j, group_size;
5418 tree msq = NULL_TREE, lsq;
5419 tree offset = NULL_TREE;
5420 tree realignment_token = NULL_TREE;
5421 tree phi = NULL_TREE;
5422 VEC(tree,heap) *dr_chain = NULL;
5423 bool strided_load = false;
5424 tree first_stmt;
5425 tree scalar_type;
5426 bool inv_p;
5427 bool compute_in_loop = false;
5428 struct loop *at_loop;
5429 int vec_num;
5430 bool slp = (slp_node != NULL);
5432 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5433 this, so we can safely override NCOPIES with 1 here. */
5434 if (slp)
5435 ncopies = 1;
5437 gcc_assert (ncopies >= 1);
5439 /* FORNOW. This restriction should be relaxed. */
5440 if (nested_in_vect_loop && ncopies > 1)
5442 if (vect_print_dump_info (REPORT_DETAILS))
5443 fprintf (vect_dump, "multiple types in nested loop.");
5444 return false;
5447 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5448 return false;
5450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5451 return false;
5453 /* Is vectorizable load? */
5454 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5455 return false;
5457 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5458 if (TREE_CODE (scalar_dest) != SSA_NAME)
5459 return false;
5461 op = GIMPLE_STMT_OPERAND (stmt, 1);
5462 if (TREE_CODE (op) != ARRAY_REF
5463 && TREE_CODE (op) != INDIRECT_REF
5464 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5465 return false;
5467 if (!STMT_VINFO_DATA_REF (stmt_info))
5468 return false;
5470 scalar_type = TREE_TYPE (DR_REF (dr));
5471 mode = (int) TYPE_MODE (vectype);
5473 /* FORNOW. In some cases we can vectorize even if the data-type is not
5474 supported (e.g., data copies). */
5475 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5477 if (vect_print_dump_info (REPORT_DETAILS))
5478 fprintf (vect_dump, "Aligned load, but unsupported type.");
5479 return false;
5482 /* Check if the load is a part of an interleaving chain. */
5483 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5485 strided_load = true;
5486 /* FORNOW */
5487 gcc_assert (! nested_in_vect_loop);
5489 /* Check if interleaving is supported. */
5490 if (!vect_strided_load_supported (vectype)
5491 && !PURE_SLP_STMT (stmt_info) && !slp)
5492 return false;
5495 if (!vec_stmt) /* transformation not required. */
5497 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5498 vect_model_load_cost (stmt_info, ncopies, NULL);
5499 return true;
5502 if (vect_print_dump_info (REPORT_DETAILS))
5503 fprintf (vect_dump, "transform load.");
5505 /** Transform. **/
5507 if (strided_load)
5509 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5510 /* Check if the chain of loads is already vectorized. */
5511 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5513 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5514 return true;
5516 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5517 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5518 dr_chain = VEC_alloc (tree, heap, group_size);
5520 /* VEC_NUM is the number of vect stmts to be created for this group. */
5521 if (slp)
5523 strided_load = false;
5524 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5526 else
5527 vec_num = group_size;
5529 else
5531 first_stmt = stmt;
5532 first_dr = dr;
5533 group_size = vec_num = 1;
5536 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5537 gcc_assert (alignment_support_scheme);
5539 /* In case the vectorization factor (VF) is bigger than the number
5540 of elements that we can fit in a vectype (nunits), we have to generate
5541 more than one vector stmt - i.e - we need to "unroll" the
5542 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5543 from one copy of the vector stmt to the next, in the field
5544 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5545 stages to find the correct vector defs to be used when vectorizing
5546 stmts that use the defs of the current stmt. The example below illustrates
5547 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
5548 4 vectorized stmts):
5550 before vectorization:
5551 RELATED_STMT VEC_STMT
5552 S1: x = memref - -
5553 S2: z = x + 1 - -
5555 step 1: vectorize stmt S1:
5556 We first create the vector stmt VS1_0, and, as usual, record a
5557 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5558 Next, we create the vector stmt VS1_1, and record a pointer to
5559 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5560 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5561 stmts and pointers:
5562 RELATED_STMT VEC_STMT
5563 VS1_0: vx0 = memref0 VS1_1 -
5564 VS1_1: vx1 = memref1 VS1_2 -
5565 VS1_2: vx2 = memref2 VS1_3 -
5566 VS1_3: vx3 = memref3 - -
5567 S1: x = load - VS1_0
5568 S2: z = x + 1 - -
5570 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5571 information we recorded in RELATED_STMT field is used to vectorize
5572 stmt S2. */
5574 /* In case of interleaving (non-unit strided access):
5576 S1: x2 = &base + 2
5577 S2: x0 = &base
5578 S3: x1 = &base + 1
5579 S4: x3 = &base + 3
5581 Vectorized loads are created in the order of memory accesses
5582 starting from the access of the first stmt of the chain:
5584 VS1: vx0 = &base
5585 VS2: vx1 = &base + vec_size*1
5586 VS3: vx3 = &base + vec_size*2
5587 VS4: vx4 = &base + vec_size*3
5589 Then permutation statements are generated:
5591 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5592 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5595 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5596 (the order of the data-refs in the output of vect_permute_load_chain
5597 corresponds to the order of scalar stmts in the interleaving chain - see
5598 the documentation of vect_permute_load_chain()).
5599 The generation of permutation stmts and recording them in
5600 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5602 In case of both multiple types and interleaving, the vector loads and
5603 permutation stmts above are created for every copy. The result vector stmts
5604 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5605 STMT_VINFO_RELATED_STMT for the next copies. */
5607 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5608 on a target that supports unaligned accesses (dr_unaligned_supported)
5609 we generate the following code:
5610 p = initial_addr;
5611 indx = 0;
5612 loop {
5613 p = p + indx * vectype_size;
5614 vec_dest = *(p);
5615 indx = indx + 1;
5618 Otherwise, the data reference is potentially unaligned on a target that
5619 does not support unaligned accesses (dr_explicit_realign_optimized) -
5620 then generate the following code, in which the data in each iteration is
5621 obtained by two vector loads, one from the previous iteration, and one
5622 from the current iteration:
5623 p1 = initial_addr;
5624 msq_init = *(floor(p1))
5625 p2 = initial_addr + VS - 1;
5626 realignment_token = call target_builtin;
5627 indx = 0;
5628 loop {
5629 p2 = p2 + indx * vectype_size
5630 lsq = *(floor(p2))
5631 vec_dest = realign_load (msq, lsq, realignment_token)
5632 indx = indx + 1;
5633 msq = lsq;
5634 } */
5636 /* If the misalignment remains the same throughout the execution of the
5637 loop, we can create the init_addr and permutation mask at the loop
5638 preheader. Otherwise, it needs to be created inside the loop.
5639 This can only occur when vectorizing memory accesses in the inner-loop
5640 nested within an outer-loop that is being vectorized. */
5642 if (nested_in_vect_loop_p (loop, stmt)
5643 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5645 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5646 compute_in_loop = true;
5649 if ((alignment_support_scheme == dr_explicit_realign_optimized
5650 || alignment_support_scheme == dr_explicit_realign)
5651 && !compute_in_loop)
5653 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5654 alignment_support_scheme, NULL_TREE,
5655 &at_loop);
5656 if (alignment_support_scheme == dr_explicit_realign_optimized)
5658 phi = SSA_NAME_DEF_STMT (msq);
5659 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5662 else
5663 at_loop = loop;
5665 prev_stmt_info = NULL;
5666 for (j = 0; j < ncopies; j++)
5668 /* 1. Create the vector pointer update chain. */
5669 if (j == 0)
5670 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5671 at_loop, offset,
5672 &dummy, &ptr_incr, false,
5673 NULL_TREE, &inv_p);
5674 else
5675 dataref_ptr =
5676 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5678 for (i = 0; i < vec_num; i++)
5680 if (i > 0)
5681 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5682 NULL_TREE);
5684 /* 2. Create the vector-load in the loop. */
5685 switch (alignment_support_scheme)
5687 case dr_aligned:
5688 gcc_assert (aligned_access_p (first_dr));
5689 data_ref = build_fold_indirect_ref (dataref_ptr);
5690 break;
5691 case dr_unaligned_supported:
5693 int mis = DR_MISALIGNMENT (first_dr);
5694 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5696 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
5697 data_ref =
5698 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5699 break;
5701 case dr_explicit_realign:
5703 tree ptr, bump;
5704 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5706 if (compute_in_loop)
5707 msq = vect_setup_realignment (first_stmt, bsi,
5708 &realignment_token,
5709 dr_explicit_realign,
5710 dataref_ptr, NULL);
5712 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5713 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5714 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5715 new_temp = make_ssa_name (vec_dest, new_stmt);
5716 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5717 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5718 copy_virtual_operands (new_stmt, stmt);
5719 mark_symbols_for_renaming (new_stmt);
5720 msq = new_temp;
5722 bump = size_binop (MULT_EXPR, vs_minus_1,
5723 TYPE_SIZE_UNIT (scalar_type));
5724 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5725 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5726 break;
5728 case dr_explicit_realign_optimized:
5729 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5730 break;
5731 default:
5732 gcc_unreachable ();
5734 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5735 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5736 new_temp = make_ssa_name (vec_dest, new_stmt);
5737 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5738 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5739 mark_symbols_for_renaming (new_stmt);
5741 /* 3. Handle explicit realignment if necessary/supported. Create in
5742 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5743 if (alignment_support_scheme == dr_explicit_realign_optimized
5744 || alignment_support_scheme == dr_explicit_realign)
5746 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5747 if (!realignment_token)
5748 realignment_token = dataref_ptr;
5749 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5750 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5751 realignment_token);
5752 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5753 new_temp = make_ssa_name (vec_dest, new_stmt);
5754 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5755 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5757 if (alignment_support_scheme == dr_explicit_realign_optimized)
5759 if (i == vec_num - 1 && j == ncopies - 1)
5760 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5761 msq = lsq;
5765 /* 4. Handle invariant-load. */
5766 if (inv_p)
5768 gcc_assert (!strided_load);
5769 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5770 if (j == 0)
5772 int k;
5773 tree t = NULL_TREE;
5774 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5776 /* CHECKME: bitpos depends on endianness? */
5777 bitpos = bitsize_zero_node;
5778 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5779 bitsize, bitpos);
5780 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5781 TYPE_UNSIGNED (scalar_type);
5782 vec_dest =
5783 vect_create_destination_var (scalar_dest, NULL_TREE);
5784 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5785 new_temp = make_ssa_name (vec_dest, new_stmt);
5786 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5787 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5789 for (k = nunits - 1; k >= 0; --k)
5790 t = tree_cons (NULL_TREE, new_temp, t);
5791 /* FIXME: use build_constructor directly. */
5792 vec_inv = build_constructor_from_list (vectype, t);
5793 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5794 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5796 else
5797 gcc_unreachable (); /* FORNOW. */
5800 /* Collect vector loads and later create their permutation in
5801 vect_transform_strided_load (). */
5802 if (strided_load)
5803 VEC_quick_push (tree, dr_chain, new_temp);
5805 /* Store vector loads in the corresponding SLP_NODE. */
5806 if (slp)
5807 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5810 /* FORNOW: SLP with multiple types is unsupported. */
5811 if (slp)
5812 return true;
5814 if (strided_load)
5816 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5817 return false;
5818 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5819 dr_chain = VEC_alloc (tree, heap, group_size);
5821 else
5823 if (j == 0)
5824 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5825 else
5826 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5827 prev_stmt_info = vinfo_for_stmt (new_stmt);
5831 return true;
5835 /* Function vectorizable_live_operation.
5837 STMT computes a value that is used outside the loop. Check if
5838 it can be supported. */
5840 bool
5841 vectorizable_live_operation (tree stmt,
5842 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5843 tree *vec_stmt ATTRIBUTE_UNUSED)
5845 tree operation;
5846 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5847 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5848 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5849 int i;
5850 int op_type;
5851 tree op;
5852 tree def, def_stmt;
5853 enum vect_def_type dt;
5855 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5857 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5858 return false;
5860 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5861 return false;
5863 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5864 return false;
5866 /* FORNOW. CHECKME. */
5867 if (nested_in_vect_loop_p (loop, stmt))
5868 return false;
5870 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5871 op_type = TREE_OPERAND_LENGTH (operation);
5873 /* FORNOW: support only if all uses are invariant. This means
5874 that the scalar operations can remain in place, unvectorized.
5875 The original last scalar value that they compute will be used. */
5877 for (i = 0; i < op_type; i++)
5879 op = TREE_OPERAND (operation, i);
5880 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5882 if (vect_print_dump_info (REPORT_DETAILS))
5883 fprintf (vect_dump, "use not simple.");
5884 return false;
5887 if (dt != vect_invariant_def && dt != vect_constant_def)
5888 return false;
5891 /* No transformation is required for the cases we currently support. */
5892 return true;
5896 /* Function vect_is_simple_cond.
5898 Input:
5899 LOOP - the loop that is being vectorized.
5900 COND - Condition that is checked for simple use.
5902 Returns whether a COND can be vectorized. Checks whether
5903 condition operands are supportable using vect_is_simple_use. */
5905 static bool
5906 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5908 tree lhs, rhs;
5909 tree def;
5910 enum vect_def_type dt;
5912 if (!COMPARISON_CLASS_P (cond))
5913 return false;
5915 lhs = TREE_OPERAND (cond, 0);
5916 rhs = TREE_OPERAND (cond, 1);
5918 if (TREE_CODE (lhs) == SSA_NAME)
5920 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5921 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5922 return false;
5924 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5925 && TREE_CODE (lhs) != FIXED_CST)
5926 return false;
5928 if (TREE_CODE (rhs) == SSA_NAME)
5930 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5931 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5932 return false;
5934 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5935 && TREE_CODE (rhs) != FIXED_CST)
5936 return false;
5938 return true;
5941 /* vectorizable_condition.
5943 Check if STMT is a conditional modify expression that can be vectorized.
5944 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5945 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5946 at BSI.
5948 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5950 bool
5951 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5953 tree scalar_dest = NULL_TREE;
5954 tree vec_dest = NULL_TREE;
5955 tree op = NULL_TREE;
5956 tree cond_expr, then_clause, else_clause;
5957 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5958 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5959 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
5960 tree vec_compare, vec_cond_expr;
5961 tree new_temp;
5962 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5963 enum machine_mode vec_mode;
5964 tree def;
5965 enum vect_def_type dt;
5966 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5967 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5969 gcc_assert (ncopies >= 1);
5970 if (ncopies > 1)
5971 return false; /* FORNOW */
5973 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5974 return false;
5976 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5977 return false;
5979 /* FORNOW: SLP not supported. */
5980 if (STMT_SLP_TYPE (stmt_info))
5981 return false;
5983 /* FORNOW: not yet supported. */
5984 if (STMT_VINFO_LIVE_P (stmt_info))
5986 if (vect_print_dump_info (REPORT_DETAILS))
5987 fprintf (vect_dump, "value used after loop.");
5988 return false;
5991 /* Is vectorizable conditional operation? */
5992 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5993 return false;
5995 op = GIMPLE_STMT_OPERAND (stmt, 1);
5997 if (TREE_CODE (op) != COND_EXPR)
5998 return false;
6000 cond_expr = TREE_OPERAND (op, 0);
6001 then_clause = TREE_OPERAND (op, 1);
6002 else_clause = TREE_OPERAND (op, 2);
6004 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6005 return false;
6007 /* We do not handle two different vector types for the condition
6008 and the values. */
6009 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6010 return false;
6012 if (TREE_CODE (then_clause) == SSA_NAME)
6014 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6015 if (!vect_is_simple_use (then_clause, loop_vinfo,
6016 &then_def_stmt, &def, &dt))
6017 return false;
6019 else if (TREE_CODE (then_clause) != INTEGER_CST
6020 && TREE_CODE (then_clause) != REAL_CST
6021 && TREE_CODE (then_clause) != FIXED_CST)
6022 return false;
6024 if (TREE_CODE (else_clause) == SSA_NAME)
6026 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6027 if (!vect_is_simple_use (else_clause, loop_vinfo,
6028 &else_def_stmt, &def, &dt))
6029 return false;
6031 else if (TREE_CODE (else_clause) != INTEGER_CST
6032 && TREE_CODE (else_clause) != REAL_CST
6033 && TREE_CODE (else_clause) != FIXED_CST)
6034 return false;
6037 vec_mode = TYPE_MODE (vectype);
6039 if (!vec_stmt)
6041 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6042 return expand_vec_cond_expr_p (op, vec_mode);
6045 /* Transform */
6047 /* Handle def. */
6048 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6049 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6051 /* Handle cond expr. */
6052 vec_cond_lhs =
6053 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6054 vec_cond_rhs =
6055 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6056 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6057 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6059 /* Arguments are ready. Create the new vector stmt. */
6060 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6061 vec_cond_lhs, vec_cond_rhs);
6062 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6063 vec_compare, vec_then_clause, vec_else_clause);
6065 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6066 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6067 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6068 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6070 return true;
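/* A sketch of the elementwise semantics of the VEC_COND_EXPR built above,
   in plain C.  This is an illustrative helper of our own, not compiled
   here; the '<' comparison stands in for whatever TREE_CODE (cond_expr)
   actually is.  */
#if 0
static void
example_vec_cond (const int *lhs, const int *rhs, const int *then_v,
                  const int *else_v, int *dest, int nelts)
{
  int k;
  for (k = 0; k < nelts; k++)
    dest[k] = (lhs[k] < rhs[k]) ? then_v[k] : else_v[k];
}
#endif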
6074 /* Function vect_transform_stmt.
6076 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6078 static bool
6079 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6080 slp_tree slp_node)
6082 bool is_store = false;
6083 tree vec_stmt = NULL_TREE;
6084 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6085 tree orig_stmt_in_pattern;
6086 bool done;
6088 switch (STMT_VINFO_TYPE (stmt_info))
6090 case type_demotion_vec_info_type:
6091 gcc_assert (!slp_node);
6092 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6093 gcc_assert (done);
6094 break;
6096 case type_promotion_vec_info_type:
6097 gcc_assert (!slp_node);
6098 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6099 gcc_assert (done);
6100 break;
6102 case type_conversion_vec_info_type:
6103 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6104 gcc_assert (done);
6105 break;
6107 case induc_vec_info_type:
6108 gcc_assert (!slp_node);
6109 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6110 gcc_assert (done);
6111 break;
6113 case op_vec_info_type:
6114 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6115 gcc_assert (done);
6116 break;
6118 case assignment_vec_info_type:
6119 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6120 gcc_assert (done);
6121 break;
6123 case load_vec_info_type:
6124 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6125 gcc_assert (done);
6126 break;
6128 case store_vec_info_type:
6129 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6130 gcc_assert (done);
6131 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6133 /* In case of interleaving, the whole chain is vectorized when the
6134 last store in the chain is reached. Store stmts before the last
6135 one are skipped, and their vec_stmt_info shouldn't be freed
6136 meanwhile. */
6137 *strided_store = true;
6138 if (STMT_VINFO_VEC_STMT (stmt_info))
6139 is_store = true;
6141 else
6142 is_store = true;
6143 break;
6145 case condition_vec_info_type:
6146 gcc_assert (!slp_node);
6147 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6148 gcc_assert (done);
6149 break;
6151 case call_vec_info_type:
6152 gcc_assert (!slp_node);
6153 done = vectorizable_call (stmt, bsi, &vec_stmt);
6154 break;
6156 case reduc_vec_info_type:
6157 gcc_assert (!slp_node);
6158 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6159 gcc_assert (done);
6160 break;
6162 default:
6163 if (!STMT_VINFO_LIVE_P (stmt_info))
6165 if (vect_print_dump_info (REPORT_DETAILS))
6166 fprintf (vect_dump, "stmt not supported.");
6167 gcc_unreachable ();
6171 if (STMT_VINFO_LIVE_P (stmt_info)
6172 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6174 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6175 gcc_assert (done);
6178 if (vec_stmt)
6180 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6181 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6182 if (orig_stmt_in_pattern)
6184 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6185 /* STMT was inserted by the vectorizer to replace a computation idiom.
6186 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6187 computed this idiom. We need to record a pointer to VEC_STMT in
6188 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6189 documentation of vect_pattern_recog. */
6190 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6192 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6193 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6198 return is_store;
6202 /* This function builds ni_name = number of iterations the loop executes,
6203 placing the computation on the loop preheader. */
6205 static tree
6206 vect_build_loop_niters (loop_vec_info loop_vinfo)
6208 tree ni_name, stmt, var;
6209 edge pe;
6210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6211 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6213 var = create_tmp_var (TREE_TYPE (ni), "niters");
6214 add_referenced_var (var);
6215 ni_name = force_gimple_operand (ni, &stmt, false, var);
6217 pe = loop_preheader_edge (loop);
6218 if (stmt)
6220 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6221 gcc_assert (!new_bb);
6224 return ni_name;
6228 /* This function generates the following statements:
6230 ni_name = number of iterations loop executes
6231 ratio = ni_name / vf
6232 ratio_mult_vf_name = ratio * vf
6234 and places them at the loop preheader edge. */
6236 static void
6237 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6238 tree *ni_name_ptr,
6239 tree *ratio_mult_vf_name_ptr,
6240 tree *ratio_name_ptr)
6243 edge pe;
6244 basic_block new_bb;
6245 tree stmt, ni_name;
6246 tree var;
6247 tree ratio_name;
6248 tree ratio_mult_vf_name;
6249 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6250 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6251 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6252 tree log_vf;
6254 pe = loop_preheader_edge (loop);
6256 /* Generate temporary variable that contains
6257 number of iterations loop executes. */
6259 ni_name = vect_build_loop_niters (loop_vinfo);
6260 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6262 /* Create: ratio = ni >> log2(vf) */
6264 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6265 if (!is_gimple_val (ratio_name))
6267 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6268 add_referenced_var (var);
6270 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6271 pe = loop_preheader_edge (loop);
6272 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6273 gcc_assert (!new_bb);
6276 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6278 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6279 ratio_name, log_vf);
6280 if (!is_gimple_val (ratio_mult_vf_name))
6282 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6283 add_referenced_var (var);
6285 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6286 true, var);
6287 pe = loop_preheader_edge (loop);
6288 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6289 gcc_assert (!new_bb);
6292 *ni_name_ptr = ni_name;
6293 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6294 *ratio_name_ptr = ratio_name;
6296 return;
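/* The two shifts above implement the following scalar arithmetic for a
   power-of-two VF.  Illustrative helper in plain C; the names are ours
   and it is not part of the build.  */
#if 0
static void
example_split_niters (unsigned ni, unsigned log2_vf,
                      unsigned *ratio, unsigned *ratio_mult_vf)
{
  *ratio = ni >> log2_vf;               /* ni / vf */
  *ratio_mult_vf = *ratio << log2_vf;   /* largest multiple of vf <= ni */
}
#endif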
6300 /* Function vect_update_ivs_after_vectorizer.
6302 "Advance" the induction variables of LOOP to the value they should take
6303 after the execution of LOOP. This is currently necessary because the
6304 vectorizer does not handle induction variables that are used after the
6305 loop. Such a situation occurs when the last iterations of LOOP are
6306 peeled, because:
6307 1. We introduced new uses after LOOP for IVs that were not originally used
6308 after LOOP: the IVs of LOOP are now used by an epilog loop.
6309 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6310 times, whereas the loop IVs should be bumped N times.
6312 Input:
6313 - LOOP - a loop that is going to be vectorized. The last few iterations
6314 of LOOP were peeled.
6315 - NITERS - the number of iterations that LOOP executes (before it is
6316 vectorized). i.e, the number of times the ivs should be bumped.
6317 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6318 coming out from LOOP on which there are uses of the LOOP ivs
6319 (this is the path from LOOP->exit to epilog_loop->preheader).
6321 The new definitions of the ivs are placed in LOOP->exit.
6322 The phi args associated with the edge UPDATE_E in the bb
6323 UPDATE_E->dest are updated accordingly.
6325 Assumption 1: Like the rest of the vectorizer, this function assumes
6326 a single loop exit that has a single predecessor.
6328 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6329 organized in the same order.
6331 Assumption 3: The access function of the ivs is simple enough (see
6332 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6334 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6335 coming out of LOOP on which the ivs of LOOP are used (this is the path
6336 that leads to the epilog loop; other paths skip the epilog loop). This
6337 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6338 needs to have its phis updated.
6341 static void
6342 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6343 edge update_e)
6345 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6346 basic_block exit_bb = single_exit (loop)->dest;
6347 tree phi, phi1;
6348 basic_block update_bb = update_e->dest;
6350 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6352 /* Make sure there exists a single-predecessor exit bb: */
6353 gcc_assert (single_pred_p (exit_bb));
6355 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6356 phi && phi1;
6357 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6359 tree access_fn = NULL;
6360 tree evolution_part;
6361 tree init_expr;
6362 tree step_expr;
6363 tree var, ni, ni_name;
6364 block_stmt_iterator last_bsi;
6366 if (vect_print_dump_info (REPORT_DETAILS))
6368 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6369 print_generic_expr (vect_dump, phi, TDF_SLIM);
6372 /* Skip virtual phis. */
6373 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6375 if (vect_print_dump_info (REPORT_DETAILS))
6376 fprintf (vect_dump, "virtual phi. skip.");
6377 continue;
6380 /* Skip reduction phis. */
6381 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6383 if (vect_print_dump_info (REPORT_DETAILS))
6384 fprintf (vect_dump, "reduc phi. skip.");
6385 continue;
6388 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6389 gcc_assert (access_fn);
6390 evolution_part =
6391 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6392 gcc_assert (evolution_part != NULL_TREE);
6394 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6395 of degree >= 2 or exponential. */
6396 gcc_assert (!tree_is_chrec (evolution_part));
6398 step_expr = evolution_part;
6399 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6400 loop->num));
6402 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6403 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6404 init_expr,
6405 fold_convert (sizetype,
6406 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6407 niters, step_expr)));
6408 else
6409 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6410 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6411 fold_convert (TREE_TYPE (init_expr),
6412 niters),
6413 step_expr),
6414 init_expr);
6418 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6419 add_referenced_var (var);
6421 last_bsi = bsi_last (exit_bb);
6422 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6423 true, BSI_SAME_STMT);
6425 /* Fix phi expressions in the successor bb. */
6426 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6431 /* Function vect_do_peeling_for_loop_bound
6433 Peel the last iterations of the loop represented by LOOP_VINFO.
6434 The peeled iterations form a new epilog loop. Given that the loop now
6435 iterates NITERS times, the new epilog loop iterates
6436 NITERS % VECTORIZATION_FACTOR times.
6438 The original loop will later be made to iterate
6439 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
6441 static void
6442 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6444 tree ni_name, ratio_mult_vf_name;
6445 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6446 struct loop *new_loop;
6447 edge update_e;
6448 basic_block preheader;
6449 int loop_num;
6450 unsigned int th;
6451 int min_scalar_loop_bound;
6452 int min_profitable_iters;
6454 if (vect_print_dump_info (REPORT_DETAILS))
6455 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6457 initialize_original_copy_tables ();
6459 /* Generate the following variables on the preheader of original loop:
6461 ni_name = number of iterations the original loop executes
6462 ratio = ni_name / vf
6463 ratio_mult_vf_name = ratio * vf */
6464 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6465 &ratio_mult_vf_name, ratio);
6467 loop_num = loop->num;
6469 /* Analyze cost to set the threshold for the vectorized loop. */
6470 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6471 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6472 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6474 /* Use the cost model only if it is more conservative than the
6475 user-specified threshold. */
6477 th = (unsigned) min_scalar_loop_bound;
6478 if (min_profitable_iters
6479 && (!min_scalar_loop_bound
6480 || min_profitable_iters > min_scalar_loop_bound))
6481 th = (unsigned) min_profitable_iters;
6483 if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
6484 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6485 && vect_print_dump_info (REPORT_DETAILS))
6486 fprintf (vect_dump, "vectorization may not be profitable.");
6488 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6489 ratio_mult_vf_name, ni_name, false,
6490 th);
6491 gcc_assert (new_loop);
6492 gcc_assert (loop_num == loop->num);
6493 #ifdef ENABLE_CHECKING
6494 slpeel_verify_cfg_after_peeling (loop, new_loop);
6495 #endif
6497 /* A guard that controls whether the new_loop is to be executed or skipped
6498 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6499 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6500 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6501 is on the path where the LOOP IVs are used and need to be updated. */
6503 preheader = loop_preheader_edge (new_loop)->src;
6504 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6505 update_e = EDGE_PRED (preheader, 0);
6506 else
6507 update_e = EDGE_PRED (preheader, 1);
6509 /* Update IVs of original loop as if they were advanced
6510 by ratio_mult_vf_name steps. */
6511 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6513 /* After peeling we have to reset scalar evolution analyzer. */
6514 scev_reset ();
6516 free_original_copy_tables ();
6520 /* Function vect_gen_niters_for_prolog_loop
6522 Set the number of iterations for the loop represented by LOOP_VINFO
6523 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6524 and the misalignment of DR - the data reference recorded in
6525 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6526 this loop, the data reference DR will refer to an aligned location.
6528 The following computation is generated:
6530 If the misalignment of DR is known at compile time:
6531 addr_mis = DR_MISALIGNMENT (dr);
6532 Else, compute address misalignment in bytes:
6533 addr_mis = addr & (vectype_size - 1)
6535 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6537 (elem_size = element type size; an element is the scalar element
6538 whose type is the inner type of the vectype)
6540 For interleaving,
6542 prolog_niters = min ( LOOP_NITERS ,
6543 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6544 where group_size is the size of the interleaved group.
6546 The above formulas assume that VF == number of elements in the vector. This
6547 may not hold when there are multiple types in the loop.
6548 In this case, for some data-references in the loop the VF does not represent
6549 the number of elements that fit in the vector. Therefore, instead of VF we
6550 use TYPE_VECTOR_SUBPARTS. */
6552 static tree
6553 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6555 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6556 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6557 tree var, stmt;
6558 tree iters, iters_name;
6559 edge pe;
6560 basic_block new_bb;
6561 tree dr_stmt = DR_STMT (dr);
6562 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6563 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6564 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6565 tree niters_type = TREE_TYPE (loop_niters);
6566 int group_size = 1;
6567 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6568 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6570 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6572 /* For interleaved access element size must be multiplied by the size of
6573 the interleaved group. */
6574 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6575 DR_GROUP_FIRST_DR (stmt_info)));
6576 element_size *= group_size;
6579 pe = loop_preheader_edge (loop);
6581 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6583 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6584 int elem_misalign = byte_misalign / element_size;
6586 if (vect_print_dump_info (REPORT_DETAILS))
6587 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6588 iters = build_int_cst (niters_type,
6589 (nelements - elem_misalign) & (nelements / group_size - 1));
6591 else
6593 tree new_stmts = NULL_TREE;
6594 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6595 &new_stmts, NULL_TREE, loop);
6596 tree ptr_type = TREE_TYPE (start_addr);
6597 tree size = TYPE_SIZE (ptr_type);
6598 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6599 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6600 tree elem_size_log =
6601 build_int_cst (type, exact_log2 (vectype_align/nelements));
6602 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6603 tree nelements_tree = build_int_cst (type, nelements);
6604 tree byte_misalign;
6605 tree elem_misalign;
6607 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6608 gcc_assert (!new_bb);
6610 /* Create: byte_misalign = addr & (vectype_size - 1) */
6611 byte_misalign =
6612 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6614 /* Create: elem_misalign = byte_misalign / element_size */
6615 elem_misalign =
6616 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6618 /* Create: (niters_type) (nelements - elem_misalign) & (nelements - 1) */
6619 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6620 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6621 iters = fold_convert (niters_type, iters);
6624 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6625 /* If the loop bound is known at compile time we already verified that it is
6626 greater than vf; since the misalignment ('iters') is at most vf, there's
6627 no need to generate the MIN_EXPR in this case. */
6628 if (TREE_CODE (loop_niters) != INTEGER_CST)
6629 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6631 if (vect_print_dump_info (REPORT_DETAILS))
6633 fprintf (vect_dump, "niters for prolog loop: ");
6634 print_generic_expr (vect_dump, iters, TDF_SLIM);
6637 var = create_tmp_var (niters_type, "prolog_loop_niters");
6638 add_referenced_var (var);
6639 iters_name = force_gimple_operand (iters, &stmt, false, var);
6641 /* Insert stmt on loop preheader edge. */
6642 if (stmt)
6644 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6645 gcc_assert (!new_bb);
6648 return iters_name;
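/* The known-misalignment case of the formula above, for a single
   (non-interleaved) access with VF elements per vector.  Illustrative
   helper in plain C; the names are ours and it is not part of the
   build.  */
#if 0
static unsigned
example_prolog_niters (unsigned byte_misalign, unsigned elem_size,
                       unsigned vf)
{
  unsigned elem_misalign = byte_misalign / elem_size;
  /* 0 when the access is already aligned; at most VF - 1 otherwise.  */
  return (vf - elem_misalign) & (vf - 1);
}
#endif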
6652 /* Function vect_update_init_of_dr
6654 NITERS iterations were peeled from LOOP. DR represents a data reference
6655 in LOOP. This function updates the information recorded in DR to
6656 account for the fact that the first NITERS iterations had already been
6657 executed. Specifically, it updates the OFFSET field of DR. */
6659 static void
6660 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6662 tree offset = DR_OFFSET (dr);
6664 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6665 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6666 DR_OFFSET (dr) = offset;
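/* After peeling, the reference starts NITERS * STEP bytes further along,
   so the adjustment above amounts to (illustrative plain-C sketch, names
   ours):  */
#if 0
static long
example_updated_offset (long offset, long niters, long step)
{
  return offset + niters * step;
}
#endif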
6670 /* Function vect_update_inits_of_drs
6672 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6673 This function updates the information recorded for the data references in
6674 the loop to account for the fact that the first NITERS iterations had
6675 already been executed. Specifically, it updates the initial_condition of
6676 the access_function of all the data_references in the loop. */
6678 static void
6679 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6681 unsigned int i;
6682 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6683 struct data_reference *dr;
6685 if (vect_print_dump_info (REPORT_DETAILS))
6686 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6688 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6689 vect_update_init_of_dr (dr, niters);
6693 /* Function vect_do_peeling_for_alignment
6695 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6696 'niters' is set to the misalignment of one of the data references in the
6697 loop, thereby forcing it to refer to an aligned location at the beginning
6698 of the execution of this loop. The data reference for which we are
6699 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6701 static void
6702 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6704 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6705 tree niters_of_prolog_loop, ni_name;
6706 tree n_iters;
6707 struct loop *new_loop;
6709 if (vect_print_dump_info (REPORT_DETAILS))
6710 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6712 initialize_original_copy_tables ();
6714 ni_name = vect_build_loop_niters (loop_vinfo);
6715 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6717 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
6718 new_loop =
6719 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6720 niters_of_prolog_loop, ni_name, true, 0);
6721 gcc_assert (new_loop);
6722 #ifdef ENABLE_CHECKING
6723 slpeel_verify_cfg_after_peeling (new_loop, loop);
6724 #endif
6726 /* Update number of times loop executes. */
6727 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6728 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6729 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6731 /* Update the init conditions of the access functions of all data refs. */
6732 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6734 /* After peeling we have to reset scalar evolution analyzer. */
6735 scev_reset ();
6737 free_original_copy_tables ();
6741 /* Function vect_create_cond_for_align_checks.
6743 Create a conditional expression that represents the alignment checks for
6744 all of data references (array element references) whose alignment must be
6745 checked at runtime.
6747 Input:
6748 LOOP_VINFO - two fields of the loop information are used.
6749 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6750 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6752 Output:
6753 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6754 expression.
6755 The returned value is the conditional expression to be used in the if
6756 statement that controls which version of the loop gets executed at runtime.
6758 The algorithm makes two assumptions:
6759 1) The number of bytes "n" in a vector is a power of 2.
6760 2) An address "a" is aligned if a%n is zero, and this
6761 test can be done as a&(n-1) == 0. For example, for 16
6762 byte vectors the test is a&0xf == 0. */
6764 static tree
6765 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6766 tree *cond_expr_stmt_list)
6768 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6769 VEC(tree,heap) *may_misalign_stmts
6770 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6771 tree ref_stmt, tmp;
6772 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6773 tree mask_cst;
6774 unsigned int i;
6775 tree psize;
6776 tree int_ptrsize_type;
6777 char tmp_name[20];
6778 tree or_tmp_name = NULL_TREE;
6779 tree and_tmp, and_tmp_name, and_stmt;
6780 tree ptrsize_zero;
6782 /* Check that mask is one less than a power of 2, i.e., mask is
6783 all zeros followed by all ones. */
6784 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6786 /* CHECKME: what is the best integer or unsigned type to use to hold a
6787 cast from a pointer value? */
6788 psize = TYPE_SIZE (ptr_type_node);
6789 int_ptrsize_type
6790 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6792 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6793 of the first vector of the i'th data reference. */
6795 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6797 tree new_stmt_list = NULL_TREE;
6798 tree addr_base;
6799 tree addr_tmp, addr_tmp_name, addr_stmt;
6800 tree or_tmp, new_or_tmp_name, or_stmt;
6802 /* create: addr_tmp = (int)(address_of_first_vector) */
6803 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6804 &new_stmt_list, NULL_TREE, loop);
6806 if (new_stmt_list != NULL_TREE)
6807 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6809 sprintf (tmp_name, "%s%d", "addr2int", i);
6810 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6811 add_referenced_var (addr_tmp);
6812 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6813 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6814 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6815 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6816 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6818 /* The addresses are ORed together. */
6820 if (or_tmp_name != NULL_TREE)
6822 /* create: or_tmp = or_tmp | addr_tmp */
6823 sprintf (tmp_name, "%s%d", "orptrs", i);
6824 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6825 add_referenced_var (or_tmp);
6826 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6827 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6828 or_tmp_name, addr_tmp_name);
6829 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6830 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6831 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6832 or_tmp_name = new_or_tmp_name;
6834 else
6835 or_tmp_name = addr_tmp_name;
6837 } /* end for i */
6839 mask_cst = build_int_cst (int_ptrsize_type, mask);
6841 /* create: and_tmp = or_tmp & mask */
6842 and_tmp = create_tmp_var (int_ptrsize_type, "andmask");
6843 add_referenced_var (and_tmp);
6844 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6846 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6847 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6848 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6849 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6851 /* Make and_tmp the left operand of the conditional test against zero.
6852 If and_tmp has a nonzero bit then some address is unaligned. */
6853 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6854 return build2 (EQ_EXPR, boolean_type_node,
6855 and_tmp_name, ptrsize_zero);
6858 /* Function vect_vfa_segment_size.
6860 Create an expression that computes the size of the segment
6861 that will be accessed for a data reference. The function takes into
6862 account that realignment loads may access one more vector.
6864 Input:
6865 DR: The data reference.
6866 VECT_FACTOR: vectorization factor.
6868 Return an expression whose value is the size of the segment that will be
6869 accessed by DR. */
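   /* Illustrative numbers: for a contiguous access to an int array,
      DR_STEP is 4, so with VECT_FACTOR 4 the segment is 4 * 4 == 16
      bytes; under dr_explicit_realign_optimized one extra vector is
      added, e.g. 16 + 16 == 32 bytes for a 16-byte vectype.  */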
6871 static tree
6872 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
6874 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
6875 DR_STEP (dr), vect_factor);
6877 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
6879 tree vector_size = TYPE_SIZE_UNIT
6880 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
6882 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
6883 segment_length, vector_size);
6885 return fold_convert (sizetype, segment_length);
6888 /* Function vect_create_cond_for_alias_checks.
6890 Create a conditional expression that represents the run-time checks for
6891 overlap of the address ranges represented by a list of data
6892 reference relations passed as input.
6894 Input:
6895 COND_EXPR - input conditional expression. New conditions will be chained
6896 with a logical AND operation.
6897 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
6898 to be checked.
6900 Output:
6901 COND_EXPR - conditional expression.
6902 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6903 expression.
6906 *COND_EXPR holds the conditional expression to be used in the if
6907 statement that controls which version of the loop gets executed at runtime. */
6910 static void
6911 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
6912 tree * cond_expr,
6913 tree * cond_expr_stmt_list)
6915 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6916 VEC (ddr_p, heap) * may_alias_ddrs =
6917 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
6918 tree vect_factor =
6919 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
6921 ddr_p ddr;
6922 unsigned int i;
6923 tree part_cond_expr;
6925 /* Create expression
6926 ((store_ptr_0 + store_segment_length_0 < load_ptr_0)
6927 || (load_ptr_0 + load_segment_length_0 < store_ptr_0))
6928 &&
6929 ...
6930 &&
6931 ((store_ptr_n + store_segment_length_n < load_ptr_n)
6932 || (load_ptr_n + load_segment_length_n < store_ptr_n)) */
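   /* Illustrative instance (made-up loop): for
        for (i = 0; i < n; i++)
          a[i] = b[i] + 1;
      where a and b may alias, with int elements and VF == 4 both
      segment lengths are 16 bytes, and the condition is conceptually
        (&a[0] + 16 < &b[0]) || (&b[0] + 16 < &a[0]).  */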
6934 if (VEC_empty (ddr_p, may_alias_ddrs))
6935 return;
6937 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
6939 struct data_reference *dr_a, *dr_b;
6940 tree dr_group_first_a, dr_group_first_b;
6941 tree addr_base_a, addr_base_b;
6942 tree segment_length_a, segment_length_b;
6943 tree stmt_a, stmt_b;
6945 dr_a = DDR_A (ddr);
6946 stmt_a = DR_STMT (DDR_A (ddr));
6947 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
6948 if (dr_group_first_a)
6950 stmt_a = dr_group_first_a;
6951 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
6954 dr_b = DDR_B (ddr);
6955 stmt_b = DR_STMT (DDR_B (ddr));
6956 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
6957 if (dr_group_first_b)
6959 stmt_b = dr_group_first_b;
6960 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
6963 addr_base_a =
6964 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
6965 NULL_TREE, loop);
6966 addr_base_b =
6967 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
6968 NULL_TREE, loop);
6970 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
6971 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
6973 if (vect_print_dump_info (REPORT_DR_DETAILS))
6975 fprintf (vect_dump,
6976 "create runtime check for data references ");
6977 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
6978 fprintf (vect_dump, " and ");
6979 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
6983 part_cond_expr =
6984 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
6985 fold_build2 (LT_EXPR, boolean_type_node,
6986 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
6987 addr_base_a,
6988 segment_length_a),
6989 addr_base_b),
6990 fold_build2 (LT_EXPR, boolean_type_node,
6991 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
6992 addr_base_b,
6993 segment_length_b),
6994 addr_base_a));
6996 if (*cond_expr)
6997 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
6998 *cond_expr, part_cond_expr);
6999 else
7000 *cond_expr = part_cond_expr;
7002 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7003 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7004 VEC_length (ddr_p, may_alias_ddrs));
7008 /* Function vect_loop_versioning.
7010 If the loop has data references that may or may not be aligned and/or
7011 has data reference relations whose independence was not proven, then
7012 two versions of the loop need to be generated, one which is vectorized
7013 and one which isn't. A test is then generated to control which of the
7014 loops is executed. The test checks for the alignment of all of the
7015 data references that may or may not be aligned. An additional
7016 sequence of runtime tests is generated for each pair of DDRs whose
7017 independence was not proven. The vectorized version of the loop is
7018 executed only if both alias and alignment tests are passed. */
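   /* Conceptually, the structure emitted by this function is:

        if (cond)        <-- all alignment and alias checks pass
          vectorized loop;
        else
          scalar (original) loop;

      where COND is built by the two helper functions above.  */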
7020 static void
7021 vect_loop_versioning (loop_vec_info loop_vinfo)
7023 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7024 struct loop *nloop;
7025 tree cond_expr = NULL_TREE;
7026 tree cond_expr_stmt_list = NULL_TREE;
7027 basic_block condition_bb;
7028 block_stmt_iterator cond_exp_bsi;
7029 basic_block merge_bb;
7030 basic_block new_exit_bb;
7031 edge new_exit_e, e;
7032 tree orig_phi, new_phi, arg;
7033 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7034 tree gimplify_stmt_list;
7036 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7037 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7038 return;
7040 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7041 cond_expr =
7042 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
7044 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7045 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, &cond_expr_stmt_list);
7047 cond_expr =
7048 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7049 cond_expr =
7050 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7051 NULL_TREE);
7052 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7054 initialize_original_copy_tables ();
7055 nloop = loop_version (loop, cond_expr, &condition_bb,
7056 prob, prob, REG_BR_PROB_BASE - prob, true);
7057 free_original_copy_tables ();
7059 /* Loop versioning violates an assumption we try to maintain during
7060 vectorization - that the loop exit block has a single predecessor.
7061 After versioning, the exit block of both loop versions is the same
7062 basic block (i.e. it has two predecessors). In order to simplify
7063 subsequent transformations in the vectorizer, we fix this situation
7064 here by adding a new (empty) block on the exit-edge of the loop,
7065 with the proper loop-exit phis to maintain loop-closed-form. */
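   /* Sketch:
        before:  loop -> merge_bb <- nloop
        after:   loop -> new_exit_bb -> merge_bb <- nloop
      where new_exit_bb receives fresh loop-closed-SSA phis.  */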
7067 merge_bb = single_exit (loop)->dest;
7068 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7069 new_exit_bb = split_edge (single_exit (loop));
7070 new_exit_e = single_exit (loop);
7071 e = EDGE_SUCC (new_exit_bb, 0);
7073 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7074 orig_phi = PHI_CHAIN (orig_phi))
7076 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7077 new_exit_bb);
7078 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7079 add_phi_arg (new_phi, arg, new_exit_e);
7080 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7083 /* End loop-exit-fixes after versioning. */
7085 update_ssa (TODO_update_ssa);
7086 if (cond_expr_stmt_list)
7088 cond_exp_bsi = bsi_last (condition_bb);
7089 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
7093 /* Remove a group of stores (for SLP or interleaving), free their
7094 stmt_vec_info. */
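/* The group is walked through the DR_GROUP_NEXT_DR links:
     first_stmt -> next -> ... -> NULL_TREE
   removing each store from its basic block and freeing its
   stmt_vec_info.  */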
7096 static void
7097 vect_remove_stores (tree first_stmt)
7099 stmt_ann_t ann;
7100 tree next = first_stmt;
7101 tree tmp;
7102 stmt_vec_info next_stmt_info;
7103 block_stmt_iterator next_si;
7105 while (next)
7107 /* Free the attached stmt_vec_info and remove the stmt. */
7108 next_si = bsi_for_stmt (next);
7109 bsi_remove (&next_si, true);
7110 next_stmt_info = vinfo_for_stmt (next);
7111 ann = stmt_ann (next);
7112 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7113 free (next_stmt_info);
7114 set_stmt_info (ann, NULL);
7115 next = tmp;
7120 /* Vectorize SLP instance tree in postorder. */
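/* Illustration (tree shape assumed): for an SLP tree

       node0 (group of strided stores)
       /                          \
    node1 (left operands)    node2 (right operands)

   node1 and node2 are scheduled first, then node0, so the vector defs
   of the operands exist before the vector stores that use them.  */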
7122 static bool
7123 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7125 tree stmt;
7126 bool strided_store, is_store;
7127 block_stmt_iterator si;
7128 stmt_vec_info stmt_info;
7130 if (!node)
7131 return false;
7133 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7134 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7136 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7137 stmt_info = vinfo_for_stmt (stmt);
7138 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7139 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7141 if (vect_print_dump_info (REPORT_DETAILS))
7143 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7144 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7147 si = bsi_for_stmt (stmt);
7148 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7149 if (is_store)
7151 if (DR_GROUP_FIRST_DR (stmt_info))
7152 /* If IS_STORE is TRUE, the vectorization of the
7153 interleaving chain was completed - free all the stores in
7154 the chain. */
7155 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7156 else
7157 /* FORNOW: SLP originates only from strided stores. */
7158 gcc_unreachable ();
7160 return true;
7163 /* FORNOW: SLP originates only from strided stores. */
7164 return false;
7168 static bool
7169 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7171 VEC (slp_instance, heap) *slp_instances =
7172 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7173 slp_instance instance;
7174 unsigned int vec_stmts_size;
7175 unsigned int group_size, i;
7176 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7177 bool is_store = false;
7179 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7181 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7182 /* For each SLP instance calculate the number of vector stmts to be
7183 created for the scalar stmts in each node of the SLP tree. The number
7184 of vector elements in one vector iteration is the number of scalar
7185 elements in one scalar iteration (GROUP_SIZE) multiplied by VF and
7186 divided by the vector size. */
7187 vec_stmts_size = vectorization_factor * group_size / nunits;
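      /* E.g. GROUP_SIZE == 2, vectorization factor == 4, 4 elements per
         vector: vec_stmts_size == 4 * 2 / 4 == 2 vector stmts per node.  */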
7189 /* Schedule the tree of INSTANCE. */
7190 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7191 vec_stmts_size);
7193 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7194 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7195 fprintf (vect_dump, "vectorizing stmts using SLP.");
7198 return is_store;
7201 /* Function vect_transform_loop.
7203 The analysis phase has determined that the loop is vectorizable.
7204 Vectorize the loop - create vectorized stmts to replace the scalar
7205 stmts in the loop, and update the loop exit condition. */
7207 void
7208 vect_transform_loop (loop_vec_info loop_vinfo)
7210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7211 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7212 int nbbs = loop->num_nodes;
7213 block_stmt_iterator si, next_si;
7214 int i;
7215 tree ratio = NULL;
7216 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7217 bool strided_store;
7218 bool slp_scheduled = false;
7219 unsigned int nunits;
7221 if (vect_print_dump_info (REPORT_DETAILS))
7222 fprintf (vect_dump, "=== vect_transform_loop ===");
7223 vect_loop_versioning (loop_vinfo);
7225 /* CHECKME: we wouldn't need this if we called update_ssa once
7226 for all loops. */
7227 bitmap_zero (vect_memsyms_to_rename);
7229 /* Peel the loop if there are data refs with unknown alignment.
7230 Only one data ref with unknown alignment is allowed.  */
7232 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7233 vect_do_peeling_for_alignment (loop_vinfo);
7235 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7236 compile-time constant), or it is a constant that is not divisible by the
7237 vectorization factor, then an epilog loop needs to be created.
7238 We therefore duplicate the loop: the original loop will be vectorized,
7239 and will compute the first (n/VF) iterations. The second copy of the loop
7240 will remain scalar and will compute the remaining (n%VF) iterations.
7241 (VF is the vectorization factor). */
7243 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7244 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7245 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7246 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7247 else
7248 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7249 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
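   /* E.g. with VF == 4: n == 100 gives ratio == 25 and no epilog loop
      is needed; n == 103, or a symbolic n, goes through
      vect_do_peeling_for_loop_bound above instead.  */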
7251 /* 1) Make sure the loop header has exactly two entries
7252 2) Make sure we have a preheader basic block. */
7254 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7256 split_edge (loop_preheader_edge (loop));
7258 /* FORNOW: the vectorizer supports only loops whose body consists
7259 of one basic block (header + empty latch). When the vectorizer
7260 supports more involved loop forms, the order in which the BBs are
7261 traversed will need to be reconsidered. */
7263 for (i = 0; i < nbbs; i++)
7265 basic_block bb = bbs[i];
7266 stmt_vec_info stmt_info;
7267 tree phi;
7269 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7271 if (vect_print_dump_info (REPORT_DETAILS))
7273 fprintf (vect_dump, "------>vectorizing phi: ");
7274 print_generic_expr (vect_dump, phi, TDF_SLIM);
7276 stmt_info = vinfo_for_stmt (phi);
7277 if (!stmt_info)
7278 continue;
7280 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7281 && !STMT_VINFO_LIVE_P (stmt_info))
7282 continue;
7284 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7285 != (unsigned HOST_WIDE_INT) vectorization_factor)
7286 && vect_print_dump_info (REPORT_DETAILS))
7287 fprintf (vect_dump, "multiple-types.");
7289 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7291 if (vect_print_dump_info (REPORT_DETAILS))
7292 fprintf (vect_dump, "transform phi.");
7293 vect_transform_stmt (phi, NULL, NULL, NULL);
7297 for (si = bsi_start (bb); !bsi_end_p (si);)
7299 tree stmt = bsi_stmt (si);
7300 bool is_store;
7302 if (vect_print_dump_info (REPORT_DETAILS))
7304 fprintf (vect_dump, "------>vectorizing statement: ");
7305 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7308 stmt_info = vinfo_for_stmt (stmt);
7310 /* vector stmts created in the outer-loop during vectorization of
7311 stmts in an inner-loop may not have a stmt_info, and do not
7312 need to be vectorized. */
7313 if (!stmt_info)
7315 bsi_next (&si);
7316 continue;
7319 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7320 && !STMT_VINFO_LIVE_P (stmt_info))
7322 bsi_next (&si);
7323 continue;
7326 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7327 nunits =
7328 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7329 if (!STMT_SLP_TYPE (stmt_info)
7330 && nunits != (unsigned int) vectorization_factor
7331 && vect_print_dump_info (REPORT_DETAILS))
7332 /* For SLP, VF is set according to the unrolling factor, and not to
7333 the vector size; hence for SLP this message is not valid. */
7334 fprintf (vect_dump, "multiple-types.");
7336 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7337 reached. */
7338 if (STMT_SLP_TYPE (stmt_info))
7340 if (!slp_scheduled)
7342 slp_scheduled = true;
7344 if (vect_print_dump_info (REPORT_DETAILS))
7345 fprintf (vect_dump, "=== scheduling SLP instances ===");
7347 is_store = vect_schedule_slp (loop_vinfo, nunits);
7349 /* IS_STORE is true if STMT is a store. Stores cannot be of
7350 hybrid SLP type. They are removed in
7351 vect_schedule_slp_instance and their vinfo is destroyed. */
7352 if (is_store)
7354 bsi_next (&si);
7355 continue;
7359 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7360 if (PURE_SLP_STMT (stmt_info))
7362 bsi_next (&si);
7363 continue;
7367 /* -------- vectorize statement ------------ */
7368 if (vect_print_dump_info (REPORT_DETAILS))
7369 fprintf (vect_dump, "transform statement.");
7371 strided_store = false;
7372 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7373 if (is_store)
7375 stmt_ann_t ann;
7376 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7378 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7379 interleaving chain was completed - free all the stores in
7380 the chain. */
7381 tree next = DR_GROUP_FIRST_DR (stmt_info);
7382 tree tmp;
7383 stmt_vec_info next_stmt_info;
7385 while (next)
7387 next_si = bsi_for_stmt (next);
7388 next_stmt_info = vinfo_for_stmt (next);
7389 /* Free the attached stmt_vec_info and remove the stmt. */
7390 ann = stmt_ann (next);
7391 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7392 free (next_stmt_info);
7393 set_stmt_info (ann, NULL);
7394 bsi_remove (&next_si, true);
7395 next = tmp;
7397 bsi_remove (&si, true);
7398 continue;
7400 else
7402 /* Free the attached stmt_vec_info and remove the stmt. */
7403 ann = stmt_ann (stmt);
7404 free (stmt_info);
7405 set_stmt_info (ann, NULL);
7406 bsi_remove (&si, true);
7407 continue;
7410 bsi_next (&si);
7411 } /* stmts in BB */
7412 } /* BBs in loop */
7414 slpeel_make_loop_iterate_ntimes (loop, ratio);
7416 mark_set_for_renaming (vect_memsyms_to_rename);
7418 /* The memory tags and pointers in vectorized statements need to
7419 have their SSA forms updated. FIXME, why can't this be delayed
7420 until all the loops have been transformed? */
7421 update_ssa (TODO_update_ssa);
7423 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7424 fprintf (vect_dump, "LOOP VECTORIZED.");
7425 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7426 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");