[official-gcc.git] / gcc / tree-vect-transform.c
/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign;
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }
  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
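
  /* For example, when both kinds of checks are pending, the generated guard
     has roughly the shape below (an illustrative sketch only; the actual
     guard expressions are built by the loop-versioning code, not here):

       if (addr % VECTOR_ALIGNMENT == 0 && !ranges_overlap (dr_a, dr_b))
         vectorized_loop;
       else
         scalar_loop;

     Each pending check contributes one unit to VEC_OUTSIDE_COST above, and
     the guard itself is charged one taken-branch cost.  */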
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }
  /* Add additional cost for the peeled instructions in the prologue and
     epilogue loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but the number of scalar loop
             iterations is unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
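
  /* Worked example with illustrative numbers: for a V8HI access (eight
     2-byte elements) with BYTE_MISALIGN == 6, the prologue peels
     8 - 6/2 == 5 scalar iterations.  If the loop additionally has
     NITERS == 100, the epilogue peels (100 - 5) % 8 == 7 iterations,
     leaving 88 iterations (11 vector iterations) for the main loop.  */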
  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* Allow targets to add additional (outside-of-loop) costs.  FORNOW, the
     only information we provide for the target is whether testing against
     the threshold involves a runtime test.  */
  if (targetm.vectorize.builtin_vectorization_cost)
    {
      bool runtime_test = false;

      /* If the number of iterations is unknown, or the
         peeling-for-misalignment amount is unknown, we will have to generate
         a runtime test to test the loop count against the threshold.  */
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          || (byte_misalign < 0))
        runtime_test = true;
      vec_outside_cost +=
        targetm.vectorize.builtin_vectorization_cost (runtime_test);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding target out-of-loop cost = %d",
                 targetm.vectorize.builtin_vectorization_cost (runtime_test));
    }
  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }
  /* Calculate the number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost and VF = vectorization factor.  */
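
  /* Solving that condition for NITERS, and folding in the iterations already
     spent in the peeled loops, yields the expression computed below:

       niters > (VOC * VF - VIC * (peel_prologue + peel_epilogue))
                / (SIC * VF - VIC)

     Worked example with illustrative numbers: SIC == 4, VIC == 6, VOC == 20,
     VF == 4 and no peeling give (20*4) / (4*4 - 6) == 8; since
     4*4*8 == 6*8 + 20*4 == 128, the guard below bumps the result to 9.  */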
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = (vec_outside_cost * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + (vec_outside_cost * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: the vector iteration cost = %d "
                 "divided by the scalar iteration cost = %d "
                 "is greater or equal to the vectorization factor = %d.",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar cost: %d\n", scalar_single_iter_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
    min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }
  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}
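
/* Worked example with illustrative numbers: reducing a V4SI vector with
   whole-vector shifts costs exact_log2 (4) * 2 == 4 vector statements plus
   one vector-to-scalar extract, whereas the extract-based fallback for the
   same vector costs 4 + 4 - 1 == 7 vector statements.  */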
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmts.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
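
/* Worked example with illustrative numbers: a store group of size 4 with
   NCOPIES == 1 is charged exact_log2 (4) * 4 == 8 high/low interleave
   statements for the permutes, on top of the NCOPIES vector stores
   themselves.  */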
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
  int group_size;
  int alignment_support_scheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses an even and odd extract operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}
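
/* Worked example with illustrative numbers: under
   dr_explicit_realign_optimized with NCOPIES == 2, the loop body is charged
   2 * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST) for the load and the
   realignment, while the address computation, the priming load and (when
   the target provides builtin_mask_for_load) the mask creation are charged
   only once, to OUTSIDE_COST.  */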
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   vectorizer generated variables, using NAME as the rest of the name if
   provided.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char* tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP:   Specify relative to which loop-nest the address should be computed.
           For example, when the dataref is in an inner-loop nested in an
           outer-loop that is now being vectorized, LOOP can be either the
           outer-loop, or the inner-loop.  The first memory location accessed
           by the following dataref ('in' points to short):

                for (i=0; i<N; i++)
                   for (j=0; j<M; j++)
                     s += in[i+j]

           is as follows:
           if LOOP=i_loop:   &in                (relative to i_loop)
           if LOOP=j_loop:   &in+i*2B           (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree new_base_stmt;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
                                        true, data_ref_base_var);
  append_to_statement_list_force (new_base_stmt, new_stmt_list);

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  block_stmt_iterator incr_bsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  tree incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }
  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: one relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

        vp0 = &base_addr;
        LOOP:   vp1 = phi(vp0,vp2)
                |
                inner:     vp3 = phi(vp1,vp4)
                           vp4 = vp3 + inner_step
                           if () goto inner
                |
                vp2 = vp1 + step
                if () goto LOOP   */
  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);
  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop
          vectorization).  **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;
  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_bsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR-INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (POINTER_PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type,
                  block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  if (bsi)
    vect_finish_stmt_generation (stmt, init_stmt, bsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* For constant and loop invariant defs of SLP_NODE this function returns
   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
   OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
   stmts.  */

static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
                           unsigned int op_num)
{
  VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  tree stmt = VEC_index (tree, stmts, 0);
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  tree vec_cst;
  tree t = NULL_TREE;
  int j, number_of_places_left_in_vector;
  tree vector_type;
  tree op, vop, operation;
  int group_size = VEC_length (tree, stmts);
  unsigned int vec_num, i;
  int number_of_copies = 1;
  bool is_store = false;
  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
  bool constant_p;
  if (STMT_VINFO_DATA_REF (stmt_vinfo))
    is_store = true;

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
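
  /* E.g., with NUNITS == 4: GROUP_SIZE == 2 gives
     least_common_multiple (4, 2) / 2 == 2 copies of each scalar per vector,
     while GROUP_SIZE == 8 gives least_common_multiple (4, 8) / 8 == 1 copy,
     spread over 8/4 == 2 vectors, matching the two examples above.  */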
  number_of_places_left_in_vector = nunits;
  constant_p = true;
  for (j = 0; j < number_of_copies; j++)
    {
      for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
        {
          operation = GIMPLE_STMT_OPERAND (stmt, 1);
          if (is_store)
            op = operation;
          else
            op = TREE_OPERAND (operation, op_num);
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;

          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          t = tree_cons (NULL_TREE, op, t);

          number_of_places_left_in_vector--;

          if (number_of_places_left_in_vector == 0)
            {
              number_of_places_left_in_vector = nunits;

              vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
              gcc_assert (vector_type);
              if (constant_p)
                vec_cst = build_vector (vector_type, t);
              else
                vec_cst = build_constructor_from_list (vector_type, t);
              constant_p = true;
              VEC_quick_push (tree, voprnds,
                              vect_init_vector (stmt, vec_cst, vector_type,
                                                NULL));
              t = NULL_TREE;
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = VEC_length (tree, voprnds);
  for (j = vec_num - 1; j >= 0; j--)
    {
      vop = VEC_index (tree, voprnds, j);
      VEC_quick_push (tree, *vec_oprnds, vop);
    }

  VEC_free (tree, heap, voprnds);

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > VEC_length (tree, *vec_oprnds))
    {
      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
        VEC_quick_push (tree, *vec_oprnds, vop);
    }
}
/* Get vectorized definitions from SLP_NODE that contains corresponding
   vectorized def-stmts.  */

static void
vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
{
  tree vec_oprnd;
  tree vec_def_stmt;
  unsigned int i;

  gcc_assert (SLP_TREE_VEC_STMTS (slp_node));

  for (i = 0;
       VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
       i++)
    {
      gcc_assert (vec_def_stmt);
      vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
      VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
    }
}
/* Get vectorized definitions for SLP_NODE.
   If the scalar definitions are loop invariants or constants, collect them and
   call vect_get_constant_vectors() to create vector stmts.
   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
   must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
   vect_get_slp_vect_defs() to retrieve them.
   If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
   the right node).  This is used when the second operand must remain scalar.  */

static void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
                   VEC (tree,heap) **vec_oprnds1)
{
  tree operation, first_stmt;

  /* Allocate memory for vectorized defs.  */
  *vec_oprnds0 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  /* SLP_NODE corresponds either to a group of stores or to a group of
     unary/binary operations.  We don't call this function for loads.  */
  if (SLP_TREE_LEFT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds0, 0);

  first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
    /* Since we don't call this function with loads, this is a group of
       stores.  */
    return;

  operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
  if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
    return;

  *vec_oprnds1 = VEC_alloc (tree, heap,
                            SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));

  if (SLP_TREE_RIGHT (slp_node))
    /* The defs are already vectorized.  */
    vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
  else
    /* Build vectors from scalar defs.  */
    vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
static tree
get_initial_def_for_induction (tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  tree stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  tree exit_phi;
  edge latch_e;
  tree loop_arg;
  block_stmt_iterator si;
  basic_block bb = bb_for_stmt (iv_phi);

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = bsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);
  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; we obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = bsi_insert_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          tree tmp;

          /* Create: new_name_i = new_name + step_expr  */
          tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
          init_stmt = build_gimple_modify_stmt (new_var, tmp);
          new_name = make_ssa_name (new_var, init_stmt);
          GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

          new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }
  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  vec = build_vector (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
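
  /* Worked example with illustrative numbers: with X == 0, S == 1,
     NUNITS == 4 and VF == 4, the code above builds vec_init == {0, 1, 2, 3}
     and vec_step == {4, 4, 4, 4}, so each vector iteration advances every
     lane by VF scalar steps.  */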
  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (new_stmt),
                 new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          tree tmp;

          /* vec_i = vec_prev + vec_step  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
          set_stmt_info (get_stmt_ann (new_stmt),
                         new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }
1708 if (nested_in_vect_loop)
1710 /* Find the loop-closed exit-phi of the induction, and record
1711 the final vector of induction results: */
1712 exit_phi = NULL;
1713 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1715 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1717 exit_phi = USE_STMT (use_p);
1718 break;
1721 if (exit_phi)
1723 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1724 /* FORNOW. We do not yet support the case in which an inner-loop induction
1725 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
1726 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1727 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1729 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1730 if (vect_print_dump_info (REPORT_DETAILS))
1732 fprintf (vect_dump, "vector of inductions after inner-loop:");
1733 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1739 if (vect_print_dump_info (REPORT_DETAILS))
1741 fprintf (vect_dump, "transform induction: created def-use cycle:");
1742 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1743 fprintf (vect_dump, "\n");
1744 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1747 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1748 return induc_def;
1752 /* Function vect_get_vec_def_for_operand.
1754 OP is an operand in STMT. This function returns a (vector) def that will be
1755 used in the vectorized stmt for STMT.
1757 In the case that OP is an SSA_NAME which is defined in the loop, then
1758 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1760 In case OP is an invariant or constant, a new stmt that creates a vector def
1761 needs to be introduced. */
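/* Illustrative example (assuming V4SI): if OP is the constant 5, case 1
   below returns a def for vect_cst_ = {5,5,5,5}; if OP is defined by a
   stmt inside the loop, case 3 simply reuses the vector def recorded in
   the STMT_VINFO_VEC_STMT of that defining stmt. */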
1763 static tree
1764 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1766 tree vec_oprnd;
1767 tree vec_stmt;
1768 tree def_stmt;
1769 stmt_vec_info def_stmt_info = NULL;
1770 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1771 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1772 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1773 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1774 tree vec_inv;
1775 tree vec_cst;
1776 tree t = NULL_TREE;
1777 tree def;
1778 int i;
1779 enum vect_def_type dt;
1780 bool is_simple_use;
1781 tree vector_type;
1783 if (vect_print_dump_info (REPORT_DETAILS))
1785 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1786 print_generic_expr (vect_dump, op, TDF_SLIM);
1789 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1790 gcc_assert (is_simple_use);
1791 if (vect_print_dump_info (REPORT_DETAILS))
1793 if (def)
1795 fprintf (vect_dump, "def = ");
1796 print_generic_expr (vect_dump, def, TDF_SLIM);
1798 if (def_stmt)
1800 fprintf (vect_dump, " def_stmt = ");
1801 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1805 switch (dt)
1807 /* Case 1: operand is a constant. */
1808 case vect_constant_def:
1810 if (scalar_def)
1811 *scalar_def = op;
1813 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1814 if (vect_print_dump_info (REPORT_DETAILS))
1815 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1817 for (i = nunits - 1; i >= 0; --i)
1819 t = tree_cons (NULL_TREE, op, t);
1821 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1822 gcc_assert (vector_type);
1823 vec_cst = build_vector (vector_type, t);
1825 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1828 /* Case 2: operand is defined outside the loop - loop invariant. */
1829 case vect_invariant_def:
1831 if (scalar_def)
1832 *scalar_def = def;
1834 /* Create 'vec_inv = {inv,inv,..,inv}' */
1835 if (vect_print_dump_info (REPORT_DETAILS))
1836 fprintf (vect_dump, "Create vector_inv.");
1838 for (i = nunits - 1; i >= 0; --i)
1840 t = tree_cons (NULL_TREE, def, t);
1843 /* FIXME: use build_constructor directly. */
1844 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1845 gcc_assert (vector_type);
1846 vec_inv = build_constructor_from_list (vector_type, t);
1847 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1850 /* Case 3: operand is defined inside the loop. */
1851 case vect_loop_def:
1853 if (scalar_def)
1854 *scalar_def = def_stmt;
1856 /* Get the def from the vectorized stmt. */
1857 def_stmt_info = vinfo_for_stmt (def_stmt);
1858 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1859 gcc_assert (vec_stmt);
1860 if (TREE_CODE (vec_stmt) == PHI_NODE)
1861 vec_oprnd = PHI_RESULT (vec_stmt);
1862 else
1863 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1864 return vec_oprnd;
1867 /* Case 4: operand is defined by a loop header phi - reduction */
1868 case vect_reduction_def:
1870 struct loop *loop;
1872 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1873 loop = (bb_for_stmt (def_stmt))->loop_father;
1875 /* Get the def before the loop */
1876 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1877 return get_initial_def_for_reduction (stmt, op, scalar_def);
1880 /* Case 5: operand is defined by loop-header phi - induction. */
1881 case vect_induction_def:
1883 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1885 /* Get the def from the vectorized stmt. */
1886 def_stmt_info = vinfo_for_stmt (def_stmt);
1887 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1888 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1889 vec_oprnd = PHI_RESULT (vec_stmt);
1890 return vec_oprnd;
1893 default:
1894 gcc_unreachable ();
1899 /* Function vect_get_vec_def_for_stmt_copy
1901 Return a vector-def for an operand. This function is used when the
1902 vectorized stmt to be created (by the caller to this function) is a "copy"
1903 created in case the vectorized result cannot fit in one vector, and several
1904 copies of the vector-stmt are required. In this case the vector-def is
1905 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1906 of the stmt that defines VEC_OPRND.
1907 DT is the type of the vector def VEC_OPRND.
1909 Context:
1910 In case the vectorization factor (VF) is bigger than the number
1911 of elements that can fit in a vectype (nunits), we have to generate
1912 more than one vector stmt to vectorize the scalar stmt. This situation
1913 arises when there are multiple data-types operated upon in the loop; the
1914 smallest data-type determines the VF, and as a result, when vectorizing
1915 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1916 vector stmt (each computing a vector of 'nunits' results, and together
1917 computing 'VF' results in each iteration). This function is called when
1918 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1919 which VF=16 and nunits=4, so the number of copies required is 4):
1921 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1923 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1924 VS1.1: vx.1 = memref1 VS1.2
1925 VS1.2: vx.2 = memref2 VS1.3
1926 VS1.3: vx.3 = memref3
1928 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1929 VSnew.1: vz1 = vx.1 + ... VSnew.2
1930 VSnew.2: vz2 = vx.2 + ... VSnew.3
1931 VSnew.3: vz3 = vx.3 + ...
1933 The vectorization of S1 is explained in vectorizable_load.
1934 The vectorization of S2:
1935 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1936 the function 'vect_get_vec_def_for_operand' is called to
1937 get the relevant vector-def for each operand of S2. For operand x it
1938 returns the vector-def 'vx.0'.
1940 To create the remaining copies of the vector-stmt (VSnew.j), this
1941 function is called to get the relevant vector-def for each operand. It is
1942 obtained from the respective VS1.j stmt, which is recorded in the
1943 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1945 For example, to obtain the vector-def 'vx.1' in order to create the
1946 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1947 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
1948 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1949 and return its def ('vx.1').
1950 Overall, to create the above sequence this function will be called 3 times:
1951 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1952 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1953 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1955 static tree
1956 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1958 tree vec_stmt_for_operand;
1959 stmt_vec_info def_stmt_info;
1961 /* Do nothing; can reuse same def. */
1962 if (dt == vect_invariant_def || dt == vect_constant_def)
1963 return vec_oprnd;
1965 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1966 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1967 gcc_assert (def_stmt_info);
1968 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1969 gcc_assert (vec_stmt_for_operand);
1970 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1971 return vec_oprnd;
1975 /* Get vectorized definitions for the operands to create a copy of an original
1976 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
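/* For example, if *VEC_OPRNDS0 currently holds vx.0, this pops vx.0,
   follows the STMT_VINFO_RELATED_STMT chain of its defining stmt, and
   pushes the next copy's def vx.1 in its place (likewise for
   *VEC_OPRNDS1, when present). */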
1978 static void
1979 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
1980 VEC(tree,heap) **vec_oprnds0,
1981 VEC(tree,heap) **vec_oprnds1)
1983 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
1985 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
1986 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
1988 if (vec_oprnds1 && *vec_oprnds1)
1990 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
1991 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
1992 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
1997 /* Get vectorized definitions for OP0 and OP1, or from SLP_NODE if it is not NULL. */
1999 static void
2000 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2001 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2003 if (slp_node)
2004 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2005 else
2007 tree vec_oprnd;
2009 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2010 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2011 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2013 if (op1)
2015 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2016 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2017 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2023 /* Function vect_finish_stmt_generation.
2025 Insert a new stmt. */
2027 static void
2028 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2029 block_stmt_iterator *bsi)
2031 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2032 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2034 gcc_assert (stmt == bsi_stmt (*bsi));
2035 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2037 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2039 set_stmt_info (get_stmt_ann (vec_stmt),
2040 new_stmt_vec_info (vec_stmt, loop_vinfo));
2042 if (vect_print_dump_info (REPORT_DETAILS))
2044 fprintf (vect_dump, "add new stmt: ");
2045 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2048 /* Make sure bsi points to the stmt that is being vectorized. */
2049 gcc_assert (stmt == bsi_stmt (*bsi));
2051 #ifdef USE_MAPPED_LOCATION
2052 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2053 #else
2054 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2055 #endif
2059 /* Function get_initial_def_for_reduction
2061 Input:
2062 STMT - a stmt that performs a reduction operation in the loop.
2063 INIT_VAL - the initial value of the reduction variable
2065 Output:
2066 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2067 of the reduction (used for adjusting the epilog - see below).
2068 Return a vector variable, initialized according to the operation that STMT
2069 performs. This vector will be used as the initial value of the
2070 vector of partial results.
2072 Option1 (adjust in epilog): Initialize the vector as follows:
2073 add: [0,0,...,0,0]
2074 mult: [1,1,...,1,1]
2075 min/max: [init_val,init_val,..,init_val,init_val]
2076 bit and/or: [init_val,init_val,..,init_val,init_val]
2077 and when necessary (e.g. add/mult case) let the caller know
2078 that it needs to adjust the result by init_val.
2080 Option2: Initialize the vector as follows:
2081 add: [0,0,...,0,init_val]
2082 mult: [1,1,...,1,init_val]
2083 min/max: [init_val,init_val,...,init_val]
2084 bit and/or: [init_val,init_val,...,init_val]
2085 and no adjustments are needed.
2087 For example, for the following code:
2089 s = init_val;
2090 for (i=0;i<n;i++)
2091 s = s + a[i];
2093 STMT is 's = s + a[i]', and the reduction variable is 's'.
2094 For a vector of 4 units, we want to return either [0,0,0,init_val],
2095 or [0,0,0,0] and let the caller know that it needs to adjust
2096 the result at the end by 'init_val'.
2098 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2099 initialization vector is simpler (same element in all entries).
2100 A cost model should help decide between these two schemes. */
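/* Illustrative example (assuming a 4-element vectype): for the sum above
   with init_val = 10, the 'adjust in epilog' scheme returns {0,0,0,0}
   and sets *ADJUSTMENT_DEF to 10, which the epilog later adds to the
   final scalar result; option 2 would instead return {0,0,0,10} with no
   adjustment needed. */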
2102 static tree
2103 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2105 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2106 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2107 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2108 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2109 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2110 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2111 tree type = TREE_TYPE (init_val);
2112 tree vecdef;
2113 tree def_for_init;
2114 tree init_def;
2115 tree t = NULL_TREE;
2116 int i;
2117 tree vector_type;
2118 bool nested_in_vect_loop = false;
2120 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2121 if (nested_in_vect_loop_p (loop, stmt))
2122 nested_in_vect_loop = true;
2123 else
2124 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2126 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2128 switch (code)
2130 case WIDEN_SUM_EXPR:
2131 case DOT_PROD_EXPR:
2132 case PLUS_EXPR:
2133 if (nested_in_vect_loop)
2134 *adjustment_def = vecdef;
2135 else
2136 *adjustment_def = init_val;
2137 /* Create a vector of zeros for init_def. */
2138 if (SCALAR_FLOAT_TYPE_P (type))
2139 def_for_init = build_real (type, dconst0);
2140 else
2141 def_for_init = build_int_cst (type, 0);
2142 for (i = nunits - 1; i >= 0; --i)
2143 t = tree_cons (NULL_TREE, def_for_init, t);
2144 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2145 gcc_assert (vector_type);
2146 init_def = build_vector (vector_type, t);
2147 break;
2149 case MIN_EXPR:
2150 case MAX_EXPR:
2151 *adjustment_def = NULL_TREE;
2152 init_def = vecdef;
2153 break;
2155 default:
2156 gcc_unreachable ();
2159 return init_def;
2163 /* Function vect_create_epilog_for_reduction
2165 Create code at the loop-epilog to finalize the result of a reduction
2166 computation.
2168 VECT_DEF is a vector of partial results.
2169 REDUC_CODE is the tree-code for the epilog reduction.
2170 STMT is the scalar reduction stmt that is being vectorized.
2171 REDUCTION_PHI is the phi-node that carries the reduction computation.
2173 This function:
2174 1. Creates the reduction def-use cycle: sets the arguments for
2175 REDUCTION_PHI:
2176 The loop-entry argument is the vectorized initial-value of the reduction.
2177 The loop-latch argument is VECT_DEF - the vector of partial sums.
2178 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2179 by applying the operation specified by REDUC_CODE if available, or by
2180 other means (whole-vector shifts or a scalar loop).
2181 The function also creates a new phi node at the loop exit to preserve
2182 loop-closed form, as illustrated below.
2184 The flow at the entry to this function:
2186 loop:
2187 vec_def = phi <null, null> # REDUCTION_PHI
2188 VECT_DEF = vector_stmt # vectorized form of STMT
2189 s_loop = scalar_stmt # (scalar) STMT
2190 loop_exit:
2191 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2192 use <s_out0>
2193 use <s_out0>
2195 The above is transformed by this function into:
2197 loop:
2198 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2199 VECT_DEF = vector_stmt # vectorized form of STMT
2200 s_loop = scalar_stmt # (scalar) STMT
2201 loop_exit:
2202 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2203 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2204 v_out2 = reduce <v_out1>
2205 s_out3 = extract_field <v_out2, 0>
2206 s_out4 = adjust_result <s_out3>
2207 use <s_out4>
2208 use <s_out4> */
2211 static void
2212 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2213 enum tree_code reduc_code, tree reduction_phi)
2215 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2216 tree vectype;
2217 enum machine_mode mode;
2218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2220 basic_block exit_bb;
2221 tree scalar_dest;
2222 tree scalar_type;
2223 tree new_phi;
2224 block_stmt_iterator exit_bsi;
2225 tree vec_dest;
2226 tree new_temp = NULL_TREE;
2227 tree new_name;
2228 tree epilog_stmt = NULL_TREE;
2229 tree new_scalar_dest, exit_phi, new_dest;
2230 tree bitsize, bitpos, bytesize;
2231 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2232 tree adjustment_def;
2233 tree vec_initial_def;
2234 tree orig_name;
2235 imm_use_iterator imm_iter;
2236 use_operand_p use_p;
2237 bool extract_scalar_result = false;
2238 tree reduction_op, expr;
2239 tree orig_stmt;
2240 tree use_stmt;
2241 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2242 bool nested_in_vect_loop = false;
2243 int op_type;
2244 VEC(tree,heap) *phis = NULL;
2245 int i;
2247 if (nested_in_vect_loop_p (loop, stmt))
2249 loop = loop->inner;
2250 nested_in_vect_loop = true;
2253 op_type = TREE_OPERAND_LENGTH (operation);
2254 reduction_op = TREE_OPERAND (operation, op_type-1);
2255 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2256 gcc_assert (vectype);
2257 mode = TYPE_MODE (vectype);
2259 /*** 1. Create the reduction def-use cycle ***/
2261 /* 1.1 set the loop-entry arg of the reduction-phi: */
2262 /* For the case of reduction, vect_get_vec_def_for_operand returns
2263 the scalar def before the loop, which defines the initial value
2264 of the reduction variable. */
2265 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2266 &adjustment_def);
2267 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2269 /* 1.2 set the loop-latch arg for the reduction-phi: */
2270 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2272 if (vect_print_dump_info (REPORT_DETAILS))
2274 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2275 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2276 fprintf (vect_dump, "\n");
2277 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2281 /*** 2. Create epilog code
2282 The reduction epilog code operates across the elements of the vector
2283 of partial results computed by the vectorized loop.
2284 The reduction epilog code consists of:
2285 step 1: compute the scalar result in a vector (v_out2)
2286 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2287 step 3: adjust the scalar result (s_out3) if needed.
2289 Step 1 can be accomplished using one of the following three schemes:
2290 (scheme 1) using reduc_code, if available.
2291 (scheme 2) using whole-vector shifts, if available.
2292 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2293 combined.
2295 The overall epilog code looks like this:
2297 s_out0 = phi <s_loop> # original EXIT_PHI
2298 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2299 v_out2 = reduce <v_out1> # step 1
2300 s_out3 = extract_field <v_out2, 0> # step 2
2301 s_out4 = adjust_result <s_out3> # step 3
2303 (step 3 is optional, and steps 1 and 2 may be combined).
2304 Lastly, the uses of s_out0 are replaced by s_out4.
2306 ***/
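/* Illustrative example of scheme 2 (assuming a 16-byte vector of four
   partial sums {a0,a1,a2,a3}): shift the whole vector by 8 bytes and add,
   giving {a0+a2, a1+a3, ...}; shift by 4 bytes and add again, leaving
   a0+a1+a2+a3 in a single element, which step 2 then extracts as the
   scalar result. */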
2308 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2309 v_out1 = phi <v_loop> */
2311 exit_bb = single_exit (loop)->dest;
2312 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2313 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2314 exit_bsi = bsi_after_labels (exit_bb);
2316 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2317 (i.e. when reduc_code is not available) and in the final adjustment
2318 code (if needed). Also get the original scalar reduction variable as
2319 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
2320 represents a reduction pattern), the tree-code and scalar-def are
2321 taken from the original stmt that the pattern-stmt (STMT) replaces.
2322 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2323 are taken from STMT. */
2325 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2326 if (!orig_stmt)
2328 /* Regular reduction */
2329 orig_stmt = stmt;
2331 else
2333 /* Reduction pattern */
2334 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2335 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2336 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2338 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2339 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2340 scalar_type = TREE_TYPE (scalar_dest);
2341 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2342 bitsize = TYPE_SIZE (scalar_type);
2343 bytesize = TYPE_SIZE_UNIT (scalar_type);
2346 /* In case this is a reduction in an inner-loop while vectorizing an outer
2347 loop - we don't need to extract a single scalar result at the end of the
2348 inner-loop. The final vector of partial results will be used in the
2349 vectorized outer-loop, or reduced to a scalar result at the end of the
2350 outer-loop. */
2351 if (nested_in_vect_loop)
2352 goto vect_finalize_reduction;
2354 /* 2.3 Create the reduction code, using one of the three schemes described
2355 above. */
2357 if (reduc_code < NUM_TREE_CODES)
2359 tree tmp;
2361 /*** Case 1: Create:
2362 v_out2 = reduc_expr <v_out1> */
2364 if (vect_print_dump_info (REPORT_DETAILS))
2365 fprintf (vect_dump, "Reduce using direct vector reduction.");
2367 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2368 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2369 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2370 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2371 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2372 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2374 extract_scalar_result = true;
2376 else
2378 enum tree_code shift_code = 0;
2379 bool have_whole_vector_shift = true;
2380 int bit_offset;
2381 int element_bitsize = tree_low_cst (bitsize, 1);
2382 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2383 tree vec_temp;
2385 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2386 shift_code = VEC_RSHIFT_EXPR;
2387 else
2388 have_whole_vector_shift = false;
2390 /* Regardless of whether we have a whole vector shift, if we're
2391 emulating the operation via tree-vect-generic, we don't want
2392 to use it. Only the first round of the reduction is likely
2393 to still be profitable via emulation. */
2394 /* ??? It might be better to emit a reduction tree code here, so that
2395 tree-vect-generic can expand the first round via bit tricks. */
2396 if (!VECTOR_MODE_P (mode))
2397 have_whole_vector_shift = false;
2398 else
2400 optab optab = optab_for_tree_code (code, vectype);
2401 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2402 have_whole_vector_shift = false;
2405 if (have_whole_vector_shift)
2407 /*** Case 2: Create:
2408 for (offset = VS/2; offset >= element_size; offset/=2)
2410 Create: va' = vec_shift <va, offset>
2411 Create: va = vop <va, va'>
2412 } */
2414 if (vect_print_dump_info (REPORT_DETAILS))
2415 fprintf (vect_dump, "Reduce using vector shifts");
2417 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2418 new_temp = PHI_RESULT (new_phi);
2420 for (bit_offset = vec_size_in_bits/2;
2421 bit_offset >= element_bitsize;
2422 bit_offset /= 2)
2424 tree bitpos = size_int (bit_offset);
2425 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2426 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2427 new_name = make_ssa_name (vec_dest, epilog_stmt);
2428 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2429 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2431 tmp = build2 (code, vectype, new_name, new_temp);
2432 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2433 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2434 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2435 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2438 extract_scalar_result = true;
2440 else
2442 tree rhs;
2444 /*** Case 3: Create:
2445 s = extract_field <v_out2, 0>
2446 for (offset = element_size;
2447 offset < vector_size;
2448 offset += element_size;)
2450 Create: s' = extract_field <v_out2, offset>
2451 Create: s = op <s, s'>
2452 } */
2454 if (vect_print_dump_info (REPORT_DETAILS))
2455 fprintf (vect_dump, "Reduce using scalar code. ");
2457 vec_temp = PHI_RESULT (new_phi);
2458 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2459 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2460 bitsize_zero_node);
2461 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2462 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2463 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2464 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2465 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2467 for (bit_offset = element_bitsize;
2468 bit_offset < vec_size_in_bits;
2469 bit_offset += element_bitsize)
2471 tree tmp;
2472 tree bitpos = bitsize_int (bit_offset);
2473 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2474 bitpos);
2476 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2477 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2478 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2479 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2480 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2482 tmp = build2 (code, scalar_type, new_name, new_temp);
2483 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2484 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2485 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2486 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2489 extract_scalar_result = false;
2493 /* 2.4 Extract the final scalar result. Create:
2494 s_out3 = extract_field <v_out2, bitpos> */
2496 if (extract_scalar_result)
2498 tree rhs;
2500 gcc_assert (!nested_in_vect_loop);
2501 if (vect_print_dump_info (REPORT_DETAILS))
2502 fprintf (vect_dump, "extract scalar result");
2504 if (BYTES_BIG_ENDIAN)
2505 bitpos = size_binop (MULT_EXPR,
2506 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2507 TYPE_SIZE (scalar_type));
2508 else
2509 bitpos = bitsize_zero_node;
2511 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2512 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2513 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2514 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2515 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2516 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2519 vect_finalize_reduction:
2521 /* 2.5 Adjust the final result by the initial value of the reduction
2522 variable. (When such adjustment is not needed, then
2523 'adjustment_def' is zero). For example, if code is PLUS we create:
2524 new_temp = loop_exit_def + adjustment_def */
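/* Illustrative example: for a sum reduction with initial value 10, the
   loop accumulated its partial sums starting from {0,0,0,0} (see
   get_initial_def_for_reduction), so here we emit s_out4 = s_out3 + 10. */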
2526 if (adjustment_def)
2528 if (nested_in_vect_loop)
2530 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2531 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2532 new_dest = vect_create_destination_var (scalar_dest, vectype);
2534 else
2536 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2537 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2538 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2540 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2541 new_temp = make_ssa_name (new_dest, epilog_stmt);
2542 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2543 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2547 /* 2.6 Handle the loop-exit phi */
2549 /* Replace uses of s_out0 with uses of s_out3:
2550 Find the loop-closed-use at the loop exit of the original scalar result.
2551 (The reduction result is expected to have two immediate uses - one at the
2552 latch block, and one at the loop exit). */
2553 phis = VEC_alloc (tree, heap, 10);
2554 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2556 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2558 exit_phi = USE_STMT (use_p);
2559 VEC_quick_push (tree, phis, exit_phi);
2562 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2563 gcc_assert (!VEC_empty (tree, phis));
2565 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2567 if (nested_in_vect_loop)
2569 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2571 /* FORNOW. We do not yet support the case in which an inner-loop reduction
2572 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
2573 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2574 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2576 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2577 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2578 set_stmt_info (get_stmt_ann (epilog_stmt),
2579 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2580 continue;
2583 /* Replace the uses: */
2584 orig_name = PHI_RESULT (exit_phi);
2585 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2586 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2587 SET_USE (use_p, new_temp);
2589 VEC_free (tree, heap, phis);
2593 /* Function vectorizable_reduction.
2595 Check if STMT performs a reduction operation that can be vectorized.
2596 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2597 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2598 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2600 This function also handles reduction idioms (patterns) that have been
2601 recognized in advance during vect_pattern_recog. In this case, STMT may be
2602 of this form:
2603 X = pattern_expr (arg0, arg1, ..., X)
2604 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2605 sequence that had been detected and replaced by the pattern-stmt (STMT).
2607 In some cases of reduction patterns, the type of the reduction variable X is
2608 different than the type of the other arguments of STMT.
2609 In such cases, the vectype that is used when transforming STMT into a vector
2610 stmt is different than the vectype that is used to determine the
2611 vectorization factor, because it consists of a different number of elements
2612 than the actual number of elements that are being operated upon in parallel.
2614 For example, consider an accumulation of shorts into an int accumulator.
2615 On some targets it's possible to vectorize this pattern operating on 8
2616 shorts at a time (hence, the vectype for purposes of determining the
2617 vectorization factor should be V8HI); on the other hand, the vectype that
2618 is used to create the vector form is actually V4SI (the type of the result).
2620 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2621 indicates what is the actual level of parallelism (V8HI in the example), so
2622 that the right vectorization factor would be derived. This vectype
2623 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2624 be used to create the vectorized stmt. The right vectype for the vectorized
2625 stmt is obtained from the type of the result X:
2626 get_vectype_for_scalar_type (TREE_TYPE (X))
2628 This means that, contrary to "regular" reductions (or "regular" stmts in
2629 general), the following equation:
2630 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2631 does *NOT* necessarily hold for reduction patterns. */
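/* Illustrative example (assuming 16-byte vectors): for the widen_sum
   pattern above, STMT_VINFO_VECTYPE is V8HI (eight shorts determine the
   vectorization factor), while the vectorized stmt itself is created
   with get_vectype_for_scalar_type (TREE_TYPE (X)) == V4SI, the vectype
   of the int accumulator X. */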
2633 bool
2634 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2636 tree vec_dest;
2637 tree scalar_dest;
2638 tree op;
2639 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2640 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2641 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2642 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2643 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2644 tree operation;
2645 enum tree_code code, orig_code, epilog_reduc_code = 0;
2646 enum machine_mode vec_mode;
2647 int op_type;
2648 optab optab, reduc_optab;
2649 tree new_temp = NULL_TREE;
2650 tree def, def_stmt;
2651 enum vect_def_type dt;
2652 tree new_phi;
2653 tree scalar_type;
2654 bool is_simple_use;
2655 tree orig_stmt;
2656 stmt_vec_info orig_stmt_info;
2657 tree expr = NULL_TREE;
2658 int i;
2659 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2660 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2661 stmt_vec_info prev_stmt_info;
2662 tree reduc_def;
2663 tree new_stmt = NULL_TREE;
2664 int j;
2666 if (nested_in_vect_loop_p (loop, stmt))
2668 loop = loop->inner;
2669 /* FORNOW. This restriction should be relaxed. */
2670 if (ncopies > 1)
2672 if (vect_print_dump_info (REPORT_DETAILS))
2673 fprintf (vect_dump, "multiple types in nested loop.");
2674 return false;
2678 gcc_assert (ncopies >= 1);
2680 /* FORNOW: SLP not supported. */
2681 if (STMT_SLP_TYPE (stmt_info))
2682 return false;
2684 /* 1. Is vectorizable reduction? */
2686 /* Not supportable if the reduction variable is used in the loop. */
2687 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2688 return false;
2690 /* Reductions that are not used even in an enclosing outer-loop
2691 are expected to be "live" (used out of the loop). */
2692 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2693 && !STMT_VINFO_LIVE_P (stmt_info))
2694 return false;
2696 /* Make sure it was already recognized as a reduction computation. */
2697 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2698 return false;
2700 /* 2. Has this been recognized as a reduction pattern?
2702 Check if STMT represents a pattern that has been recognized
2703 in earlier analysis stages. For stmts that represent a pattern,
2704 the STMT_VINFO_RELATED_STMT field records the last stmt in
2705 the original sequence that constitutes the pattern. */
2707 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2708 if (orig_stmt)
2710 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2711 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2712 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2713 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2716 /* 3. Check the operands of the operation. The first operands are defined
2717 inside the loop body. The last operand is the reduction variable,
2718 which is defined by the loop-header-phi. */
2720 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2722 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2723 code = TREE_CODE (operation);
2724 op_type = TREE_OPERAND_LENGTH (operation);
2725 if (op_type != binary_op && op_type != ternary_op)
2726 return false;
2727 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2728 scalar_type = TREE_TYPE (scalar_dest);
2729 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2730 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2731 return false;
2733 /* All uses but the last are expected to be defined in the loop.
2734 The last use is the reduction variable. */
2735 for (i = 0; i < op_type-1; i++)
2737 op = TREE_OPERAND (operation, i);
2738 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2739 gcc_assert (is_simple_use);
2740 if (dt != vect_loop_def
2741 && dt != vect_invariant_def
2742 && dt != vect_constant_def
2743 && dt != vect_induction_def)
2744 return false;
2747 op = TREE_OPERAND (operation, i);
2748 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2749 gcc_assert (is_simple_use);
2750 gcc_assert (dt == vect_reduction_def);
2751 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2752 if (orig_stmt)
2753 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2754 else
2755 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2757 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2758 return false;
2760 /* 4. Supportable by target? */
2762 /* 4.1. check support for the operation in the loop */
2763 optab = optab_for_tree_code (code, vectype);
2764 if (!optab)
2766 if (vect_print_dump_info (REPORT_DETAILS))
2767 fprintf (vect_dump, "no optab.");
2768 return false;
2770 vec_mode = TYPE_MODE (vectype);
2771 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2773 if (vect_print_dump_info (REPORT_DETAILS))
2774 fprintf (vect_dump, "op not supported by target.");
2775 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2776 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2777 < vect_min_worthwhile_factor (code))
2778 return false;
2779 if (vect_print_dump_info (REPORT_DETAILS))
2780 fprintf (vect_dump, "proceeding using word mode.");
2783 /* Worthwhile without SIMD support? */
2784 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2785 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2786 < vect_min_worthwhile_factor (code))
2788 if (vect_print_dump_info (REPORT_DETAILS))
2789 fprintf (vect_dump, "not worthwhile without SIMD support.");
2790 return false;
2793 /* 4.2. Check support for the epilog operation.
2795 If STMT represents a reduction pattern, then the type of the
2796 reduction variable may be different than the type of the rest
2797 of the arguments. For example, consider the case of accumulation
2798 of shorts into an int accumulator; the original code:
2799 S1: int_a = (int) short_a;
2800 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2802 was replaced with:
2803 STMT: int_acc = widen_sum <short_a, int_acc>
2805 This means that:
2806 1. The tree-code that is used to create the vector operation in the
2807 epilog code (that reduces the partial results) is not the
2808 tree-code of STMT, but is rather the tree-code of the original
2809 stmt from the pattern that STMT is replacing. I.e, in the example
2810 above we want to use 'widen_sum' in the loop, but 'plus' in the
2811 epilog.
2812 2. The type (mode) we use to check available target support
2813 for the vector operation to be created in the *epilog*, is
2814 determined by the type of the reduction variable (in the example
2815 above we'd check this: plus_optab[vect_int_mode]).
2816 However the type (mode) we use to check available target support
2817 for the vector operation to be created *inside the loop*, is
2818 determined by the type of the other arguments to STMT (in the
2819 example we'd check this: widen_sum_optab[vect_short_mode]).
2821 This is contrary to "regular" reductions, in which the types of all
2822 the arguments are the same as the type of the reduction variable.
2823 For "regular" reductions we can therefore use the same vector type
2824 (and also the same tree-code) when generating the epilog code and
2825 when generating the code inside the loop. */
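/* Illustrative example: for the widen_sum case above, step 4.1 already
   checked the loop operation against widen_sum_optab in the V8HI mode;
   the code below additionally checks the epilog reduction (a plain sum)
   in the mode of the accumulator's vectype, V4SI. */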
2827 if (orig_stmt)
2829 /* This is a reduction pattern: get the vectype from the type of the
2830 reduction variable, and get the tree-code from orig_stmt. */
2831 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2832 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2833 if (!vectype)
2835 if (vect_print_dump_info (REPORT_DETAILS))
2837 fprintf (vect_dump, "unsupported data-type ");
2838 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2840 return false;
2843 vec_mode = TYPE_MODE (vectype);
2845 else
2847 /* Regular reduction: the same vectype and tree-code that are used for
2848 the vector code inside the loop can also be used for the epilog code. */
2849 orig_code = code;
2852 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2853 return false;
2854 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2855 if (!reduc_optab)
2857 if (vect_print_dump_info (REPORT_DETAILS))
2858 fprintf (vect_dump, "no optab for reduction.");
2859 epilog_reduc_code = NUM_TREE_CODES;
2861 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2863 if (vect_print_dump_info (REPORT_DETAILS))
2864 fprintf (vect_dump, "reduc op not supported by target.");
2865 epilog_reduc_code = NUM_TREE_CODES;
2868 if (!vec_stmt) /* transformation not required. */
2870 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2871 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2872 return false;
2873 return true;
2876 /** Transform. **/
2878 if (vect_print_dump_info (REPORT_DETAILS))
2879 fprintf (vect_dump, "transform reduction.");
2881 /* Create the destination vector */
2882 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2884 /* Create the reduction-phi that defines the reduction-operand. */
2885 new_phi = create_phi_node (vec_dest, loop->header);
2887 /* In case the vectorization factor (VF) is bigger than the number
2888 of elements that we can fit in a vectype (nunits), we have to generate
2889 more than one vector stmt - i.e., we need to "unroll" the
2890 vector stmt by a factor VF/nunits. For more details see documentation
2891 in vectorizable_operation. */
2893 prev_stmt_info = NULL;
2894 for (j = 0; j < ncopies; j++)
2896 /* Handle uses. */
2897 if (j == 0)
2899 op = TREE_OPERAND (operation, 0);
2900 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2901 if (op_type == ternary_op)
2903 op = TREE_OPERAND (operation, 1);
2904 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2907 /* Get the vector def for the reduction variable from the phi node */
2908 reduc_def = PHI_RESULT (new_phi);
2910 else
2912 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2913 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2914 if (op_type == ternary_op)
2915 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2917 /* Get the vector def for the reduction variable from the vectorized
2918 reduction operation generated in the previous iteration (j-1) */
2919 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2922 /* Arguments are ready. Create the new vector stmt. */
2923 if (op_type == binary_op)
2924 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2925 else
2926 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2927 reduc_def);
2928 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2929 new_temp = make_ssa_name (vec_dest, new_stmt);
2930 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2931 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2933 if (j == 0)
2934 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2935 else
2936 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2937 prev_stmt_info = vinfo_for_stmt (new_stmt);
2940 /* Finalize the reduction-phi (set its arguments) and create the
2941 epilog reduction code. */
2942 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2943 return true;
2946 /* Checks if CALL can be vectorized in type VECTYPE. Returns
2947 a function declaration if the target has a vectorized version
2948 of the function, or NULL_TREE if the function cannot be vectorized. */
2950 tree
2951 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2953 tree fndecl = get_callee_fndecl (call);
2954 enum built_in_function code;
2956 /* We only handle functions that do not read or clobber memory -- i.e.
2957 const or novops ones. */
2958 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
2959 return NULL_TREE;
2961 if (!fndecl
2962 || TREE_CODE (fndecl) != FUNCTION_DECL
2963 || !DECL_BUILT_IN (fndecl))
2964 return NULL_TREE;
2966 code = DECL_FUNCTION_CODE (fndecl);
2967 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2968 vectype_in);
2971 /* Function vectorizable_call.
2973 Check if STMT performs a function call that can be vectorized.
2974 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2975 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2976 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2978 bool
2979 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2981 tree vec_dest;
2982 tree scalar_dest;
2983 tree operation;
2984 tree op, type;
2985 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2986 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2987 tree vectype_out, vectype_in;
2988 int nunits_in;
2989 int nunits_out;
2990 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2991 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2992 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2993 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2994 tree new_stmt;
2995 int ncopies, j, nargs;
2996 call_expr_arg_iterator iter;
2997 tree vargs;
2998 enum { NARROW, NONE, WIDEN } modifier;
3000 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3001 return false;
3003 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3004 return false;
3006 /* FORNOW: SLP not supported. */
3007 if (STMT_SLP_TYPE (stmt_info))
3008 return false;
3010 /* Is STMT a vectorizable call? */
3011 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3012 return false;
3014 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3015 return false;
3017 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3018 if (TREE_CODE (operation) != CALL_EXPR)
3019 return false;
3021 /* Process function arguments. */
3022 rhs_type = NULL_TREE;
3023 nargs = 0;
3024 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3026 /* Bail out if the function has more than two arguments; we
3027 do not have interesting builtin functions to vectorize with
3028 more than two arguments. */
3029 if (nargs >= 2)
3030 return false;
3032 /* We can only handle calls with arguments of the same type. */
3033 if (rhs_type
3034 && rhs_type != TREE_TYPE (op))
3036 if (vect_print_dump_info (REPORT_DETAILS))
3037 fprintf (vect_dump, "argument types differ.");
3038 return false;
3040 rhs_type = TREE_TYPE (op);
3042 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3044 if (vect_print_dump_info (REPORT_DETAILS))
3045 fprintf (vect_dump, "use not simple.");
3046 return false;
3049 ++nargs;
3052 /* No arguments is also not good. */
3053 if (nargs == 0)
3054 return false;
3056 vectype_in = get_vectype_for_scalar_type (rhs_type);
3057 if (!vectype_in)
3058 return false;
3059 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3061 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3062 vectype_out = get_vectype_for_scalar_type (lhs_type);
3063 if (!vectype_out)
3064 return false;
3065 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3067 /* FORNOW */
3068 if (nunits_in == nunits_out / 2)
3069 modifier = NARROW;
3070 else if (nunits_out == nunits_in)
3071 modifier = NONE;
3072 else if (nunits_out == nunits_in / 2)
3073 modifier = WIDEN;
3074 else
3075 return false;
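/* Illustrative example: a call taking V2DF arguments and producing a
   V4SF result has nunits_in = 2 and nunits_out = 4, hence
   modifier = NARROW; the NARROW case below then consumes two input
   vectors per argument for each vectorized call. */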
3077 /* For now, we only vectorize functions if a target specific builtin
3078 is available. TODO -- in some cases, it might be profitable to
3079 insert the calls for pieces of the vector, in order to be able
3080 to vectorize other operations in the loop. */
3081 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3082 if (fndecl == NULL_TREE)
3084 if (vect_print_dump_info (REPORT_DETAILS))
3085 fprintf (vect_dump, "function is not vectorizable.");
3087 return false;
3090 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3092 if (modifier == NARROW)
3093 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3094 else
3095 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3097 /* Sanity check: make sure that at least one copy of the vectorized stmt
3098 needs to be generated. */
3099 gcc_assert (ncopies >= 1);
3101 /* FORNOW. This restriction should be relaxed. */
3102 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3104 if (vect_print_dump_info (REPORT_DETAILS))
3105 fprintf (vect_dump, "multiple types in nested loop.");
3106 return false;
3109 if (!vec_stmt) /* transformation not required. */
3111 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3112 if (vect_print_dump_info (REPORT_DETAILS))
3113 fprintf (vect_dump, "=== vectorizable_call ===");
3114 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3115 return true;
3118 /** Transform. **/
3120 if (vect_print_dump_info (REPORT_DETAILS))
3121 fprintf (vect_dump, "transform operation.");
3123 /* FORNOW. This restriction should be relaxed. */
3124 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3126 if (vect_print_dump_info (REPORT_DETAILS))
3127 fprintf (vect_dump, "multiple types in nested loop.");
3128 return false;
3131 /* Handle def. */
3132 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3133 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3135 prev_stmt_info = NULL;
3136 switch (modifier)
3138 case NONE:
3139 for (j = 0; j < ncopies; ++j)
3141 /* Build argument list for the vectorized call. */
3142 /* FIXME: Rewrite this so that it doesn't
3143 construct a temporary list. */
3144 vargs = NULL_TREE;
3145 nargs = 0;
3146 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3148 if (j == 0)
3149 vec_oprnd0
3150 = vect_get_vec_def_for_operand (op, stmt, NULL);
3151 else
3152 vec_oprnd0
3153 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3155 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3157 ++nargs;
3159 vargs = nreverse (vargs);
3161 rhs = build_function_call_expr (fndecl, vargs);
3162 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3163 new_temp = make_ssa_name (vec_dest, new_stmt);
3164 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3166 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3168 if (j == 0)
3169 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3170 else
3171 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3173 prev_stmt_info = vinfo_for_stmt (new_stmt);
3176 break;
3178 case NARROW:
3179 for (j = 0; j < ncopies; ++j)
3181 /* Build argument list for the vectorized call. */
3182 /* FIXME: Rewrite this so that it doesn't
3183 construct a temporary list. */
3184 vargs = NULL_TREE;
3185 nargs = 0;
3186 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3188 if (j == 0)
3190 vec_oprnd0
3191 = vect_get_vec_def_for_operand (op, stmt, NULL);
3192 vec_oprnd1
3193 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3195 else
3197 vec_oprnd0
3198 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3199 vec_oprnd1
3200 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3203 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3204 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3206 ++nargs;
3208 vargs = nreverse (vargs);
3210 rhs = build_function_call_expr (fndecl, vargs);
3211 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3212 new_temp = make_ssa_name (vec_dest, new_stmt);
3213 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3215 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3217 if (j == 0)
3218 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3219 else
3220 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3222 prev_stmt_info = vinfo_for_stmt (new_stmt);
3225 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3227 break;
3229 case WIDEN:
3230 /* No current target implements this case. */
3231 return false;
3234 /* The call in STMT might prevent it from being removed in dce.
3235 However, we cannot remove it here, because of the way the ssa name
3236 it defines is mapped to the new definition. So just replace
3237 rhs of the statement with something harmless. */
3238 type = TREE_TYPE (scalar_dest);
3239 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3240 update_stmt (stmt);
3242 return true;
3246 /* Function vect_gen_widened_results_half
3248 Create a vector stmt whose code, type, number of arguments, and result
3249 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3250 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3251 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3252 needs to be created (DECL is a function-decl of a target-builtin).
3253 STMT is the original scalar stmt that we are vectorizing. */
3255 static tree
3256 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3257 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3258 tree vec_dest, block_stmt_iterator *bsi,
3259 tree stmt)
3261 tree expr;
3262 tree new_stmt;
3263 tree new_temp;
3264 tree sym;
3265 ssa_op_iter iter;
3267 /* Generate half of the widened result: */
3268 if (code == CALL_EXPR)
3270 /* Target specific support */
3271 if (op_type == binary_op)
3272 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3273 else
3274 expr = build_call_expr (decl, 1, vec_oprnd0);
3276 else
3278 /* Generic support */
3279 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3280 if (op_type == binary_op)
3281 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3282 else
3283 expr = build1 (code, vectype, vec_oprnd0);
3285 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3286 new_temp = make_ssa_name (vec_dest, new_stmt);
3287 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3288 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3290 if (code == CALL_EXPR)
3292 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3294 if (TREE_CODE (sym) == SSA_NAME)
3295 sym = SSA_NAME_VAR (sym);
3296 mark_sym_for_renaming (sym);
3300 return new_stmt;
3304 /* Check if STMT performs a conversion operation that can be vectorized.
3305 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3306 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3307 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3309 bool
3310 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3311 tree *vec_stmt, slp_tree slp_node)
3313 tree vec_dest;
3314 tree scalar_dest;
3315 tree operation;
3316 tree op0;
3317 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3318 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3319 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3320 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3321 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3322 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3323 tree new_temp;
3324 tree def, def_stmt;
3325 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3326 tree new_stmt = NULL_TREE;
3327 stmt_vec_info prev_stmt_info;
3328 int nunits_in;
3329 int nunits_out;
3330 tree vectype_out, vectype_in;
3331 int ncopies, j;
3332 tree expr;
3333 tree rhs_type, lhs_type;
3334 tree builtin_decl;
3335 enum { NARROW, NONE, WIDEN } modifier;
3336 int i;
3337 VEC(tree,heap) *vec_oprnds0 = NULL;
3338 tree vop0;
3340 /* Is STMT a vectorizable conversion? */
3342 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3343 return false;
3345 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3346 return false;
3348 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3349 return false;
3351 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3352 return false;
3354 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3355 code = TREE_CODE (operation);
3356 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3357 return false;
3359 /* Check types of lhs and rhs. */
3360 op0 = TREE_OPERAND (operation, 0);
3361 rhs_type = TREE_TYPE (op0);
3362 vectype_in = get_vectype_for_scalar_type (rhs_type);
3363 if (!vectype_in)
3364 return false;
3365 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3367 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3368 lhs_type = TREE_TYPE (scalar_dest);
3369 vectype_out = get_vectype_for_scalar_type (lhs_type);
3370 if (!vectype_out)
3371 return false;
3372 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3374 /* FORNOW */
3375 if (nunits_in == nunits_out / 2)
3376 modifier = NARROW;
3377 else if (nunits_out == nunits_in)
3378 modifier = NONE;
3379 else if (nunits_out == nunits_in / 2)
3380 modifier = WIDEN;
3381 else
3382 return false;
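/* Illustrative example: FLOAT_EXPR from V4SI to V4SF has
   nunits_out == nunits_in, hence modifier = NONE, whereas FLOAT_EXPR
   from V4SI to V2DF has nunits_out == nunits_in / 2, hence
   modifier = WIDEN (each input vector yields two output vectors). */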
3384 if (modifier == NONE)
3385 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3387 /* Bail out if the types are both integral or non-integral. */
3388 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3389 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3390 return false;
3392 if (modifier == NARROW)
3393 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3394 else
3395 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
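/* E.g. (illustrative): with a vectorization factor of 8, a NONE
   conversion on V4SI (nunits_in == 4) needs ncopies == 2, and a
   NARROW conversion producing V4SI (nunits_out == 4) also needs
   ncopies == 2, each copy consuming two input vectors.  */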
3397 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3398 this, so we can safely override NCOPIES with 1 here. */
3399 if (slp_node)
3400 ncopies = 1;
3402 /* Sanity check: make sure that at least one copy of the vectorized stmt
3403 needs to be generated. */
3404 gcc_assert (ncopies >= 1);
3406 /* FORNOW. This restriction should be relaxed. */
3407 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3409 if (vect_print_dump_info (REPORT_DETAILS))
3410 fprintf (vect_dump, "multiple types in nested loop.");
3411 return false;
3414 /* Check the operands of the operation. */
3415 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3417 if (vect_print_dump_info (REPORT_DETAILS))
3418 fprintf (vect_dump, "use not simple.");
3419 return false;
3422 /* Supportable by target? */
3423 if ((modifier == NONE
3424 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3425 || (modifier == WIDEN
3426 && !supportable_widening_operation (code, stmt, vectype_in,
3427 &decl1, &decl2,
3428 &code1, &code2))
3429 || (modifier == NARROW
3430 && !supportable_narrowing_operation (code, stmt, vectype_in,
3431 &code1)))
3433 if (vect_print_dump_info (REPORT_DETAILS))
3434 fprintf (vect_dump, "op not supported by target.");
3435 return false;
3438 if (modifier != NONE)
3440 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3441 /* FORNOW: SLP not supported. */
3442 if (STMT_SLP_TYPE (stmt_info))
3443 return false;
3446 if (!vec_stmt) /* transformation not required. */
3448 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3449 return true;
3452 /** Transform. **/
3453 if (vect_print_dump_info (REPORT_DETAILS))
3454 fprintf (vect_dump, "transform conversion.");
3456 /* Handle def. */
3457 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3459 if (modifier == NONE && !slp_node)
3460 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3462 prev_stmt_info = NULL;
3463 switch (modifier)
3465 case NONE:
3466 for (j = 0; j < ncopies; j++)
3468 tree sym;
3469 ssa_op_iter iter;
3471 if (j == 0)
3472 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3473 else
3474 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3476 builtin_decl =
3477 targetm.vectorize.builtin_conversion (code, vectype_in);
3478 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3480 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3482 /* Arguments are ready. Create the new vector stmt. */
3483 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3484 new_temp = make_ssa_name (vec_dest, new_stmt);
3485 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3486 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3487 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3488 SSA_OP_ALL_VIRTUALS)
3490 if (TREE_CODE (sym) == SSA_NAME)
3491 sym = SSA_NAME_VAR (sym);
3492 mark_sym_for_renaming (sym);
3494 if (slp_node)
3495 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3498 if (j == 0)
3499 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3500 else
3501 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3502 prev_stmt_info = vinfo_for_stmt (new_stmt);
3504 break;
3506 case WIDEN:
3507 /* In case the vectorization factor (VF) is bigger than the number
3508 of elements that we can fit in a vectype (nunits), we have to
3509 generate more than one vector stmt, i.e., we need to "unroll"
3510 the vector stmt by a factor VF/nunits. */
3511 for (j = 0; j < ncopies; j++)
3513 if (j == 0)
3514 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3515 else
3516 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3518 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3520 /* Generate first half of the widened result: */
3521 new_stmt
3522 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3523 vec_oprnd0, vec_oprnd1,
3524 unary_op, vec_dest, bsi, stmt);
3525 if (j == 0)
3526 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3527 else
3528 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3529 prev_stmt_info = vinfo_for_stmt (new_stmt);
3531 /* Generate second half of the widened result: */
3532 new_stmt
3533 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3534 vec_oprnd0, vec_oprnd1,
3535 unary_op, vec_dest, bsi, stmt);
3536 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3537 prev_stmt_info = vinfo_for_stmt (new_stmt);
3539 break;
3541 case NARROW:
3542 /* In case the vectorization factor (VF) is bigger than the number
3543 of elements that we can fit in a vectype (nunits), we have to
3544 generate more than one vector stmt, i.e., we need to "unroll"
3545 the vector stmt by a factor VF/nunits. */
3546 for (j = 0; j < ncopies; j++)
3548 /* Handle uses. */
3549 if (j == 0)
3551 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3552 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3554 else
3556 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3557 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3560 /* Arguments are ready. Create the new vector stmt. */
3561 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3562 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3563 new_temp = make_ssa_name (vec_dest, new_stmt);
3564 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3565 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3567 if (j == 0)
3568 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3569 else
3570 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3572 prev_stmt_info = vinfo_for_stmt (new_stmt);
3575 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3578 return true;
3582 /* Function vectorizable_assignment.
3584 Check if STMT performs an assignment (copy) that can be vectorized.
3585 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3586 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3587 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3589 bool
3590 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3591 slp_tree slp_node)
3593 tree vec_dest;
3594 tree scalar_dest;
3595 tree op;
3596 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3597 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3598 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3599 tree new_temp;
3600 tree def, def_stmt;
3601 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3602 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3603 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3604 int i;
3605 VEC(tree,heap) *vec_oprnds = NULL;
3606 tree vop;
3608 gcc_assert (ncopies >= 1);
3609 if (ncopies > 1)
3610 return false; /* FORNOW */
3612 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3613 return false;
3615 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3616 return false;
3618 /* Is STMT a vectorizable assignment? */
3619 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3620 return false;
3622 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3623 if (TREE_CODE (scalar_dest) != SSA_NAME)
3624 return false;
3626 op = GIMPLE_STMT_OPERAND (stmt, 1);
3627 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3629 if (vect_print_dump_info (REPORT_DETAILS))
3630 fprintf (vect_dump, "use not simple.");
3631 return false;
3634 if (!vec_stmt) /* transformation not required. */
3636 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3637 if (vect_print_dump_info (REPORT_DETAILS))
3638 fprintf (vect_dump, "=== vectorizable_assignment ===");
3639 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3640 return true;
3643 /** Transform. **/
3644 if (vect_print_dump_info (REPORT_DETAILS))
3645 fprintf (vect_dump, "transform assignment.");
3647 /* Handle def. */
3648 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3650 /* Handle use. */
3651 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3653 /* Arguments are ready. Create the new vector stmt. */
3654 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3656 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3657 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3658 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3659 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3660 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3662 if (slp_node)
3663 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3666 VEC_free (tree, heap, vec_oprnds);
3667 return true;
3671 /* Function vect_min_worthwhile_factor.
3673 For a loop where we could vectorize the operation indicated by CODE,
3674 return the minimum vectorization factor that makes it worthwhile
3675 to use generic vectors. */
3676 static int
3677 vect_min_worthwhile_factor (enum tree_code code)
3679 switch (code)
3681 case PLUS_EXPR:
3682 case MINUS_EXPR:
3683 case NEGATE_EXPR:
3684 return 4;
3686 case BIT_AND_EXPR:
3687 case BIT_IOR_EXPR:
3688 case BIT_XOR_EXPR:
3689 case BIT_NOT_EXPR:
3690 return 2;
3692 default:
3693 return INT_MAX;
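/* E.g. (illustrative): with a vectorization factor of 2, a PLUS_EXPR
   is not considered worthwhile without hardware SIMD support (2 < 4),
   whereas a BIT_AND_EXPR is (2 >= 2), bitwise ops needing no handling
   at element boundaries and so being cheap to emulate in word mode.  */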
3698 /* Function vectorizable_induction
3700 Check if PHI performs an induction computation that can be vectorized.
3701 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3702 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3703 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3705 bool
3706 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3707 tree *vec_stmt)
3709 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3710 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3711 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3712 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3713 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3714 tree vec_def;
3716 gcc_assert (ncopies >= 1);
3718 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3719 return false;
3721 /* FORNOW: SLP not supported. */
3722 if (STMT_SLP_TYPE (stmt_info))
3723 return false;
3725 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3727 if (TREE_CODE (phi) != PHI_NODE)
3728 return false;
3730 if (!vec_stmt) /* transformation not required. */
3732 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3733 if (vect_print_dump_info (REPORT_DETAILS))
3734 fprintf (vect_dump, "=== vectorizable_induction ===");
3735 vect_model_induction_cost (stmt_info, ncopies);
3736 return true;
3739 /** Transform. **/
3741 if (vect_print_dump_info (REPORT_DETAILS))
3742 fprintf (vect_dump, "transform induction phi.");
3744 vec_def = get_initial_def_for_induction (phi);
3745 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3746 return true;
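/* E.g. (illustrative): for an induction i with initial value 0 and
   step 1, vectorized with V4SI, the initial vector def is {0, 1, 2, 3}
   and each vector iteration adds {4, 4, 4, 4} (the step times the
   number of vector elements).  */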
3750 /* Function vectorizable_operation.
3752 Check if STMT performs a binary or unary operation that can be vectorized.
3753 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3754 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3755 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3757 bool
3758 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3759 slp_tree slp_node)
3761 tree vec_dest;
3762 tree scalar_dest;
3763 tree operation;
3764 tree op0, op1 = NULL;
3765 tree vec_oprnd1 = NULL_TREE;
3766 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3767 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3768 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3769 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3770 enum tree_code code;
3771 enum machine_mode vec_mode;
3772 tree new_temp;
3773 int op_type;
3774 optab optab;
3775 int icode;
3776 enum machine_mode optab_op2_mode;
3777 tree def, def_stmt;
3778 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3779 tree new_stmt = NULL_TREE;
3780 stmt_vec_info prev_stmt_info;
3781 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3782 int nunits_out;
3783 tree vectype_out;
3784 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3785 int j, i;
3786 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3787 tree vop0, vop1;
3788 unsigned int k;
3789 bool scalar_shift_arg = false;
3791 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3792 this, so we can safely override NCOPIES with 1 here. */
3793 if (slp_node)
3794 ncopies = 1;
3795 gcc_assert (ncopies >= 1);
3796 /* FORNOW. This restriction should be relaxed. */
3797 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3799 if (vect_print_dump_info (REPORT_DETAILS))
3800 fprintf (vect_dump, "multiple types in nested loop.");
3801 return false;
3804 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3805 return false;
3807 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3808 return false;
3810 /* Is STMT a vectorizable binary/unary operation? */
3811 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3812 return false;
3814 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3815 return false;
3817 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3818 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3819 if (!vectype_out)
3820 return false;
3821 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3822 if (nunits_out != nunits_in)
3823 return false;
3825 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3826 code = TREE_CODE (operation);
3828 /* For pointer addition, we should use the normal plus for
3829 the vector addition. */
3830 if (code == POINTER_PLUS_EXPR)
3831 code = PLUS_EXPR;
3833 optab = optab_for_tree_code (code, vectype);
3835 /* Support only unary or binary operations. */
3836 op_type = TREE_OPERAND_LENGTH (operation);
3837 if (op_type != unary_op && op_type != binary_op)
3839 if (vect_print_dump_info (REPORT_DETAILS))
3840 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3841 return false;
3844 op0 = TREE_OPERAND (operation, 0);
3845 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3847 if (vect_print_dump_info (REPORT_DETAILS))
3848 fprintf (vect_dump, "use not simple.");
3849 return false;
3852 if (op_type == binary_op)
3854 op1 = TREE_OPERAND (operation, 1);
3855 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3857 if (vect_print_dump_info (REPORT_DETAILS))
3858 fprintf (vect_dump, "use not simple.");
3859 return false;
3863 /* Supportable by target? */
3864 if (!optab)
3866 if (vect_print_dump_info (REPORT_DETAILS))
3867 fprintf (vect_dump, "no optab.");
3868 return false;
3870 vec_mode = TYPE_MODE (vectype);
3871 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3872 if (icode == CODE_FOR_nothing)
3874 if (vect_print_dump_info (REPORT_DETAILS))
3875 fprintf (vect_dump, "op not supported by target.");
3876 /* Check only during analysis. */
3877 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3878 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3879 < vect_min_worthwhile_factor (code)
3880 && !vec_stmt))
3881 return false;
3882 if (vect_print_dump_info (REPORT_DETAILS))
3883 fprintf (vect_dump, "proceeding using word mode.");
3886 /* Worthwhile without SIMD support? Check only during analysis. */
3887 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3888 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3889 < vect_min_worthwhile_factor (code)
3890 && !vec_stmt)
3892 if (vect_print_dump_info (REPORT_DETAILS))
3893 fprintf (vect_dump, "not worthwhile without SIMD support.");
3894 return false;
3897 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3899 /* FORNOW: not yet supported. */
3900 if (!VECTOR_MODE_P (vec_mode))
3901 return false;
3903 /* Invariant argument is needed for a vector shift
3904 by a scalar shift operand. */
3905 optab_op2_mode = insn_data[icode].operand[2].mode;
3906 if (!VECTOR_MODE_P (optab_op2_mode))
3908 if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3910 if (vect_print_dump_info (REPORT_DETAILS))
3911 fprintf (vect_dump, "operand mode requires invariant"
3912 " argument.");
3913 return false;
3916 scalar_shift_arg = true;
3920 if (!vec_stmt) /* transformation not required. */
3922 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3923 if (vect_print_dump_info (REPORT_DETAILS))
3924 fprintf (vect_dump, "=== vectorizable_operation ===");
3925 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3926 return true;
3929 /** Transform. **/
3931 if (vect_print_dump_info (REPORT_DETAILS))
3932 fprintf (vect_dump, "transform binary/unary operation.");
3934 /* Handle def. */
3935 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3937 /* Allocate VECs for vector operands. In case of SLP, vector operands are
3938 created in the previous stages of the recursion, so no allocation is
3939 needed, except for the case of shift with scalar shift argument. In that
3940 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
3941 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
3942 In case of loop-based vectorization we allocate VECs of size 1. We
3943 allocate VEC_OPRNDS1 only in case of binary operation. */
3944 if (!slp_node)
3946 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3947 if (op_type == binary_op)
3948 vec_oprnds1 = VEC_alloc (tree, heap, 1);
3950 else if (scalar_shift_arg)
3951 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
3953 /* In case the vectorization factor (VF) is bigger than the number
3954 of elements that we can fit in a vectype (nunits), we have to generate
3955 more than one vector stmt, i.e., we need to "unroll" the
3956 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3957 from one copy of the vector stmt to the next, in the field
3958 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3959 stages to find the correct vector defs to be used when vectorizing
3960 stmts that use the defs of the current stmt. The example below illustrates
3961 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
3962 4 vectorized stmts):
3964 before vectorization:
3965 RELATED_STMT VEC_STMT
3966 S1: x = memref - -
3967 S2: z = x + 1 - -
3969 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3970 there):
3971 RELATED_STMT VEC_STMT
3972 VS1_0: vx0 = memref0 VS1_1 -
3973 VS1_1: vx1 = memref1 VS1_2 -
3974 VS1_2: vx2 = memref2 VS1_3 -
3975 VS1_3: vx3 = memref3 - -
3976 S1: x = load - VS1_0
3977 S2: z = x + 1 - -
3979 step2: vectorize stmt S2 (done here):
3980 To vectorize stmt S2 we first need to find the relevant vector
3981 def for the first operand 'x'. This is, as usual, obtained from
3982 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3983 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3984 relevant vector def 'vx0'. Having found 'vx0' we can generate
3985 the vector stmt VS2_0, and as usual, record it in the
3986 STMT_VINFO_VEC_STMT of stmt S2.
3987 When creating the second copy (VS2_1), we obtain the relevant vector
3988 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3989 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3990 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3991 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3992 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3993 chain of stmts and pointers:
3994 RELATED_STMT VEC_STMT
3995 VS1_0: vx0 = memref0 VS1_1 -
3996 VS1_1: vx1 = memref1 VS1_2 -
3997 VS1_2: vx2 = memref2 VS1_3 -
3998 VS1_3: vx3 = memref3 - -
3999 S1: x = load - VS1_0
4000 VS2_0: vz0 = vx0 + v1 VS2_1 -
4001 VS2_1: vz1 = vx1 + v1 VS2_2 -
4002 VS2_2: vz2 = vx2 + v1 VS2_3 -
4003 VS2_3: vz3 = vx3 + v1 - -
4004 S2: z = x + 1 - VS2_0 */
4006 prev_stmt_info = NULL;
4007 for (j = 0; j < ncopies; j++)
4009 /* Handle uses. */
4010 if (j == 0)
4012 if (op_type == binary_op
4013 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4015 /* Vector shl and shr insn patterns can be defined with scalar
4016 operand 2 (shift operand). In this case, use constant or loop
4017 invariant op1 directly, without extending it to vector mode
4018 first. */
4019 optab_op2_mode = insn_data[icode].operand[2].mode;
4020 if (!VECTOR_MODE_P (optab_op2_mode))
4022 if (vect_print_dump_info (REPORT_DETAILS))
4023 fprintf (vect_dump, "operand 1 using scalar mode.");
4024 vec_oprnd1 = op1;
4025 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4026 if (slp_node)
4028 /* Store vec_oprnd1 for every vector stmt to be created
4029 for SLP_NODE. We check during the analysis that all the
4030 shift arguments are the same.
4031 TODO: Allow different constants for different vector
4032 stmts generated for an SLP instance. */
4033 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4034 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4039 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4040 (a special case for certain kinds of vector shifts); otherwise,
4041 operand 1 should be of a vector type (the usual case). */
4042 if (op_type == binary_op && !vec_oprnd1)
4043 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4044 slp_node);
4045 else
4046 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4047 slp_node);
4049 else
4050 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4052 /* Arguments are ready. Create the new vector stmt. */
4053 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4055 if (op_type == binary_op)
4057 vop1 = VEC_index (tree, vec_oprnds1, i);
4058 new_stmt = build_gimple_modify_stmt (vec_dest,
4059 build2 (code, vectype, vop0, vop1));
4061 else
4062 new_stmt = build_gimple_modify_stmt (vec_dest,
4063 build1 (code, vectype, vop0));
4065 new_temp = make_ssa_name (vec_dest, new_stmt);
4066 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4067 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4068 if (slp_node)
4069 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4072 if (j == 0)
4073 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4074 else
4075 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4076 prev_stmt_info = vinfo_for_stmt (new_stmt);
4079 VEC_free (tree, heap, vec_oprnds0);
4080 if (vec_oprnds1)
4081 VEC_free (tree, heap, vec_oprnds1);
4083 return true;
4087 /* Function vectorizable_type_demotion
4089 Check if STMT performs a binary or unary operation that involves
4090 type demotion, and if it can be vectorized.
4091 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4092 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4093 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4095 bool
4096 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4097 tree *vec_stmt)
4099 tree vec_dest;
4100 tree scalar_dest;
4101 tree operation;
4102 tree op0;
4103 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4104 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4105 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4106 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4107 enum tree_code code, code1 = ERROR_MARK;
4108 tree new_temp;
4109 tree def, def_stmt;
4110 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4111 tree new_stmt;
4112 stmt_vec_info prev_stmt_info;
4113 int nunits_in;
4114 int nunits_out;
4115 tree vectype_out;
4116 int ncopies;
4117 int j;
4118 tree expr;
4119 tree vectype_in;
4121 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4122 return false;
4124 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4125 return false;
4127 /* Is STMT a vectorizable type-demotion operation? */
4128 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4129 return false;
4131 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4132 return false;
4134 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4135 code = TREE_CODE (operation);
4136 if (code != NOP_EXPR && code != CONVERT_EXPR)
4137 return false;
4139 op0 = TREE_OPERAND (operation, 0);
4140 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4141 if (!vectype_in)
4142 return false;
4143 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4145 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4146 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4147 if (!vectype_out)
4148 return false;
4149 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4150 if (nunits_in != nunits_out / 2) /* FORNOW */
4151 return false;
4153 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4154 gcc_assert (ncopies >= 1);
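/* E.g. (illustrative): demoting int to short from V4SI to V8HI packs
   two input vectors into one output vector per copy (which is why each
   copy below consumes both VEC_OPRND0 and VEC_OPRND1); with a
   vectorization factor of 16, nunits_out == 8 gives ncopies == 2.  */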
4155 /* FORNOW. This restriction should be relaxed. */
4156 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4158 if (vect_print_dump_info (REPORT_DETAILS))
4159 fprintf (vect_dump, "multiple types in nested loop.");
4160 return false;
4163 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4164 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4165 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4166 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4167 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4168 return false;
4170 /* Check the operands of the operation. */
4171 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4173 if (vect_print_dump_info (REPORT_DETAILS))
4174 fprintf (vect_dump, "use not simple.");
4175 return false;
4178 /* Supportable by target? */
4179 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4180 return false;
4182 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4184 if (!vec_stmt) /* transformation not required. */
4186 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4187 if (vect_print_dump_info (REPORT_DETAILS))
4188 fprintf (vect_dump, "=== vectorizable_demotion ===");
4189 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4190 return true;
4193 /** Transform. **/
4194 if (vect_print_dump_info (REPORT_DETAILS))
4195 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4196 ncopies);
4198 /* Handle def. */
4199 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4201 /* In case the vectorization factor (VF) is bigger than the number
4202 of elements that we can fit in a vectype (nunits), we have to generate
4203 more than one vector stmt, i.e., we need to "unroll" the
4204 vector stmt by a factor VF/nunits. */
4205 prev_stmt_info = NULL;
4206 for (j = 0; j < ncopies; j++)
4208 /* Handle uses. */
4209 if (j == 0)
4211 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4212 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4214 else
4216 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4217 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4220 /* Arguments are ready. Create the new vector stmt. */
4221 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4222 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4223 new_temp = make_ssa_name (vec_dest, new_stmt);
4224 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4225 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4227 if (j == 0)
4228 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4229 else
4230 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4232 prev_stmt_info = vinfo_for_stmt (new_stmt);
4235 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4236 return true;
4240 /* Function vectorizable_type_promotion
4242 Check if STMT performs a binary or unary operation that involves
4243 type promotion, and if it can be vectorized.
4244 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4245 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4246 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4248 bool
4249 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4250 tree *vec_stmt)
4252 tree vec_dest;
4253 tree scalar_dest;
4254 tree operation;
4255 tree op0, op1 = NULL;
4256 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
4257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4260 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4261 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4262 int op_type;
4263 tree def, def_stmt;
4264 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4265 tree new_stmt;
4266 stmt_vec_info prev_stmt_info;
4267 int nunits_in;
4268 int nunits_out;
4269 tree vectype_out;
4270 int ncopies;
4271 int j;
4272 tree vectype_in;
4274 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4275 return false;
4277 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4278 return false;
4280 /* Is STMT a vectorizable type-promotion operation? */
4281 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4282 return false;
4284 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4285 return false;
4287 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4288 code = TREE_CODE (operation);
4289 if (code != NOP_EXPR && code != CONVERT_EXPR
4290 && code != WIDEN_MULT_EXPR)
4291 return false;
4293 op0 = TREE_OPERAND (operation, 0);
4294 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4295 if (!vectype_in)
4296 return false;
4297 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4299 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4300 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4301 if (!vectype_out)
4302 return false;
4303 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4304 if (nunits_out != nunits_in / 2) /* FORNOW */
4305 return false;
4307 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4308 gcc_assert (ncopies >= 1);
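/* E.g. (illustrative): promoting short to int from V8HI to V4SI
   unpacks one input vector into two output vectors per copy (the two
   "halves" generated below); with a vectorization factor of 16,
   nunits_in == 8 gives ncopies == 2, for four V4SI results in all.  */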
4309 /* FORNOW. This restriction should be relaxed. */
4310 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4312 if (vect_print_dump_info (REPORT_DETAILS))
4313 fprintf (vect_dump, "multiple types in nested loop.");
4314 return false;
4317 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4318 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4319 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4320 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4321 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4322 return false;
4324 /* Check the operands of the operation. */
4325 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4327 if (vect_print_dump_info (REPORT_DETAILS))
4328 fprintf (vect_dump, "use not simple.");
4329 return false;
4332 op_type = TREE_CODE_LENGTH (code);
4333 if (op_type == binary_op)
4335 op1 = TREE_OPERAND (operation, 1);
4336 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4338 if (vect_print_dump_info (REPORT_DETAILS))
4339 fprintf (vect_dump, "use not simple.");
4340 return false;
4344 /* Supportable by target? */
4345 if (!supportable_widening_operation (code, stmt, vectype_in,
4346 &decl1, &decl2, &code1, &code2))
4347 return false;
4349 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4351 if (!vec_stmt) /* transformation not required. */
4353 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4354 if (vect_print_dump_info (REPORT_DETAILS))
4355 fprintf (vect_dump, "=== vectorizable_promotion ===");
4356 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4357 return true;
4360 /** Transform. **/
4362 if (vect_print_dump_info (REPORT_DETAILS))
4363 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4364 ncopies);
4366 /* Handle def. */
4367 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4369 /* In case the vectorization factor (VF) is bigger than the number
4370 of elements that we can fit in a vectype (nunits), we have to generate
4371 more than one vector stmt, i.e., we need to "unroll" the
4372 vector stmt by a factor VF/nunits. */
4374 prev_stmt_info = NULL;
4375 for (j = 0; j < ncopies; j++)
4377 /* Handle uses. */
4378 if (j == 0)
4380 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4381 if (op_type == binary_op)
4382 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4384 else
4386 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4387 if (op_type == binary_op)
4388 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4391 /* Arguments are ready. Create the new vector stmt. We are creating
4392 two vector defs because the widened result does not fit in one vector.
4393 The vectorized stmt can be expressed as a call to a target builtin,
4394 or by using a tree-code. */
4395 /* Generate first half of the widened result: */
4396 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4397 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4398 if (j == 0)
4399 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4400 else
4401 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4402 prev_stmt_info = vinfo_for_stmt (new_stmt);
4404 /* Generate second half of the widened result: */
4405 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4406 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4407 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4408 prev_stmt_info = vinfo_for_stmt (new_stmt);
4412 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4413 return true;
4417 /* Function vect_strided_store_supported.
4419 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4420 and FALSE otherwise. */
4422 static bool
4423 vect_strided_store_supported (tree vectype)
4425 optab interleave_high_optab, interleave_low_optab;
4426 int mode;
4428 mode = (int) TYPE_MODE (vectype);
4430 /* Check that the operation is supported. */
4431 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4432 vectype);
4433 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4434 vectype);
4435 if (!interleave_high_optab || !interleave_low_optab)
4437 if (vect_print_dump_info (REPORT_DETAILS))
4438 fprintf (vect_dump, "no optab for interleave.");
4439 return false;
4442 if (optab_handler (interleave_high_optab, mode)->insn_code
4443 == CODE_FOR_nothing
4444 || optab_handler (interleave_low_optab, mode)->insn_code
4445 == CODE_FOR_nothing)
4447 if (vect_print_dump_info (REPORT_DETAILS))
4448 fprintf (vect_dump, "interleave op not supported by target.");
4449 return false;
4452 return true;
4456 /* Function vect_permute_store_chain.
4458 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4459 a power of 2, generate interleave_high/low stmts to reorder the data
4460 correctly for the stores. Return the final references for stores in
4461 RESULT_CHAIN.
4463 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4464 The input is 4 vectors each containing 8 elements. We assign a number to each
4465 element, the input sequence is:
4467 1st vec: 0 1 2 3 4 5 6 7
4468 2nd vec: 8 9 10 11 12 13 14 15
4469 3rd vec: 16 17 18 19 20 21 22 23
4470 4th vec: 24 25 26 27 28 29 30 31
4472 The output sequence should be:
4474 1st vec: 0 8 16 24 1 9 17 25
4475 2nd vec: 2 10 18 26 3 11 19 27
4476 3rd vec: 4 12 20 28 5 13 21 29
4477 4th vec: 6 14 22 30 7 15 23 31
4479 i.e., we interleave the contents of the four vectors in their order.
4481 We use interleave_high/low instructions to create such output. The input of
4482 each interleave_high/low operation is two vectors:
4483 1st vec: 0 1 2 3
4484 2nd vec: 4 5 6 7
4485 the even elements of the result vector are obtained left-to-right from the
4486 high/low elements of the first vector. The odd elements of the result are
4487 obtained left-to-right from the high/low elements of the second vector.
4488 The output of interleave_high will be: 0 4 1 5
4489 and of interleave_low: 2 6 3 7
4492 The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
4493 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4494 where the first argument is taken from the first half of DR_CHAIN and the
4495 second argument from its second half.
4496 In our example,
4498 I1: interleave_high (1st vec, 3rd vec)
4499 I2: interleave_low (1st vec, 3rd vec)
4500 I3: interleave_high (2nd vec, 4th vec)
4501 I4: interleave_low (2nd vec, 4th vec)
4503 The output for the first stage is:
4505 I1: 0 16 1 17 2 18 3 19
4506 I2: 4 20 5 21 6 22 7 23
4507 I3: 8 24 9 25 10 26 11 27
4508 I4: 12 28 13 29 14 30 15 31
4510 The output of the second stage, i.e., the final result, is:
4512 I1: 0 8 16 24 1 9 17 25
4513 I2: 2 10 18 26 3 11 19 27
4514 I3: 4 12 20 28 5 13 21 29
4515 I4: 6 14 22 30 7 15 23 31. */
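/* E.g. (illustrative): for LENGTH == 2 with V4SI inputs
   1st vec: 0 1 2 3
   2nd vec: 4 5 6 7
   a single stage suffices: interleave_high gives 0 4 1 5 and
   interleave_low gives 2 6 3 7, which together are the required store
   order (swapped on little-endian, as the code below arranges).  */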
4517 static bool
4518 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4519 unsigned int length,
4520 tree stmt,
4521 block_stmt_iterator *bsi,
4522 VEC(tree,heap) **result_chain)
4524 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4525 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4526 tree scalar_dest, tmp;
4527 int i;
4528 unsigned int j;
4529 VEC(tree,heap) *first, *second;
4531 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4532 first = VEC_alloc (tree, heap, length/2);
4533 second = VEC_alloc (tree, heap, length/2);
4535 /* Check that the operation is supported. */
4536 if (!vect_strided_store_supported (vectype))
4537 return false;
4539 *result_chain = VEC_copy (tree, heap, dr_chain);
4541 for (i = 0; i < exact_log2 (length); i++)
4543 for (j = 0; j < length/2; j++)
4545 vect1 = VEC_index (tree, dr_chain, j);
4546 vect2 = VEC_index (tree, dr_chain, j+length/2);
4548 /* Create interleaving stmt:
4549 in the case of big endian:
4550 high = interleave_high (vect1, vect2)
4551 and in the case of little endian:
4552 high = interleave_low (vect1, vect2). */
4553 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4554 DECL_GIMPLE_REG_P (perm_dest) = 1;
4555 add_referenced_var (perm_dest);
4556 if (BYTES_BIG_ENDIAN)
4557 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4558 else
4559 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4560 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4561 high = make_ssa_name (perm_dest, perm_stmt);
4562 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4563 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4564 VEC_replace (tree, *result_chain, 2*j, high);
4566 /* Create interleaving stmt:
4567 in the case of big endian:
4568 low = interleave_low (vect1, vect2)
4569 and in the case of little endian:
4570 low = interleave_high (vect1, vect2). */
4571 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4572 DECL_GIMPLE_REG_P (perm_dest) = 1;
4573 add_referenced_var (perm_dest);
4574 if (BYTES_BIG_ENDIAN)
4575 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4576 else
4577 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4578 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4579 low = make_ssa_name (perm_dest, perm_stmt);
4580 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4581 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4582 VEC_replace (tree, *result_chain, 2*j+1, low);
4584 dr_chain = VEC_copy (tree, heap, *result_chain);
4586 return true;
4590 /* Function vectorizable_store.
4592 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4593 can be vectorized.
4594 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4595 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4596 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4598 bool
4599 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4600 slp_tree slp_node)
4602 tree scalar_dest;
4603 tree data_ref;
4604 tree op;
4605 tree vec_oprnd = NULL_TREE;
4606 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4607 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4608 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4609 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4610 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4611 enum machine_mode vec_mode;
4612 tree dummy;
4613 enum dr_alignment_support alignment_support_scheme;
4614 tree def, def_stmt;
4615 enum vect_def_type dt;
4616 stmt_vec_info prev_stmt_info = NULL;
4617 tree dataref_ptr = NULL_TREE;
4618 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4619 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4620 int j;
4621 tree next_stmt, first_stmt = NULL_TREE;
4622 bool strided_store = false;
4623 unsigned int group_size, i;
4624 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4625 bool inv_p;
4626 VEC(tree,heap) *vec_oprnds = NULL;
4627 bool slp = (slp_node != NULL);
4628 stmt_vec_info first_stmt_vinfo;
4629 unsigned int vec_num;
4631 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4632 this, so we can safely override NCOPIES with 1 here. */
4633 if (slp)
4634 ncopies = 1;
4636 gcc_assert (ncopies >= 1);
4638 /* FORNOW. This restriction should be relaxed. */
4639 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4641 if (vect_print_dump_info (REPORT_DETAILS))
4642 fprintf (vect_dump, "multiple types in nested loop.");
4643 return false;
4646 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4647 return false;
4649 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4650 return false;
4652 /* Is STMT a vectorizable store? */
4654 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4655 return false;
4657 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4658 if (TREE_CODE (scalar_dest) != ARRAY_REF
4659 && TREE_CODE (scalar_dest) != INDIRECT_REF
4660 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4661 return false;
4663 op = GIMPLE_STMT_OPERAND (stmt, 1);
4664 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4666 if (vect_print_dump_info (REPORT_DETAILS))
4667 fprintf (vect_dump, "use not simple.");
4668 return false;
4671 vec_mode = TYPE_MODE (vectype);
4672 /* FORNOW. In some cases we can vectorize even if the data-type is not
4673 supported (e.g., array initialization with 0). */
4674 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4675 return false;
4677 if (!STMT_VINFO_DATA_REF (stmt_info))
4678 return false;
4680 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4682 strided_store = true;
4683 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4684 if (!vect_strided_store_supported (vectype)
4685 && !PURE_SLP_STMT (stmt_info) && !slp)
4686 return false;
4688 if (first_stmt == stmt)
4690 /* STMT is the leader of the group. Check the operands of all the
4691 stmts of the group. */
4692 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4693 while (next_stmt)
4695 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4696 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4698 if (vect_print_dump_info (REPORT_DETAILS))
4699 fprintf (vect_dump, "use not simple.");
4700 return false;
4702 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4707 if (!vec_stmt) /* transformation not required. */
4709 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4710 if (!PURE_SLP_STMT (stmt_info))
4711 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4712 return true;
4715 /** Transform. **/
4717 if (strided_store)
4719 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4720 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4722 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4724 /* FORNOW */
4725 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4727 /* We vectorize all the stmts of the interleaving group when we
4728 reach the last stmt in the group. */
4729 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4730 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4731 && !slp)
4733 *vec_stmt = NULL_TREE;
4734 return true;
4737 if (slp)
4738 strided_store = false;
4740 /* VEC_NUM is the number of vect stmts to be created for this group. */
4741 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4742 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4743 else
4744 vec_num = group_size;
4746 else
4748 first_stmt = stmt;
4749 first_dr = dr;
4750 group_size = vec_num = 1;
4751 first_stmt_vinfo = stmt_info;
4754 if (vect_print_dump_info (REPORT_DETAILS))
4755 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4757 dr_chain = VEC_alloc (tree, heap, group_size);
4758 oprnds = VEC_alloc (tree, heap, group_size);
4760 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4761 gcc_assert (alignment_support_scheme);
4762 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4764 /* In case the vectorization factor (VF) is bigger than the number
4765 of elements that we can fit in a vectype (nunits), we have to generate
4766 more than one vector stmt, i.e., we need to "unroll" the
4767 vector stmt by a factor VF/nunits. For more details see documentation in
4768 vect_get_vec_def_for_copy_stmt. */
4770 /* In case of interleaving (non-unit strided access):
4772 S1: &base + 2 = x2
4773 S2: &base = x0
4774 S3: &base + 1 = x1
4775 S4: &base + 3 = x3
4777 We create vectorized stores starting from base address (the access of the
4778 first stmt in the chain (S2 in the above example), when the last store stmt
4779 of the chain (S4) is reached:
4781 VS1: &base = vx2
4782 VS2: &base + vec_size*1 = vx0
4783 VS3: &base + vec_size*2 = vx1
4784 VS4: &base + vec_size*3 = vx3
4786 Then permutation statements are generated:
4788 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4789 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4792 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4793 (the order of the data-refs in the output of vect_permute_store_chain
4794 corresponds to the order of scalar stmts in the interleaving chain - see
4795 the documentation of vect_permute_store_chain()).
4797 In case of both multiple types and interleaving, the above vector stores and
4798 permutation stmts are created for every copy. The result vector stmts are
4799 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4800 STMT_VINFO_RELATED_STMT for the next copies.
4803 prev_stmt_info = NULL;
4804 for (j = 0; j < ncopies; j++)
4806 tree new_stmt;
4807 tree ptr_incr;
4809 if (j == 0)
4811 if (slp)
4813 /* Get vectorized arguments for SLP_NODE. */
4814 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4816 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4818 else
4820 /* For interleaved stores we collect vectorized defs for all the
4821 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4822 used as an input to vect_permute_store_chain(), and OPRNDS as
4823 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4825 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4826 OPRNDS are of size 1. */
4827 next_stmt = first_stmt;
4828 for (i = 0; i < group_size; i++)
4830 /* Since gaps are not supported for interleaved stores,
4831 GROUP_SIZE is the exact number of stmts in the chain.
4832 Therefore, NEXT_STMT can't be NULL_TREE. If
4833 there is no interleaving, GROUP_SIZE is 1, and only one
4834 iteration of the loop will be executed. */
4835 gcc_assert (next_stmt);
4836 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4838 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4839 NULL);
4840 VEC_quick_push (tree, dr_chain, vec_oprnd);
4841 VEC_quick_push (tree, oprnds, vec_oprnd);
4842 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4845 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4846 &dummy, &ptr_incr, false,
4847 TREE_TYPE (vec_oprnd), &inv_p);
4848 gcc_assert (!inv_p);
4850 else
4852 /* FORNOW SLP doesn't work for multiple types. */
4853 gcc_assert (!slp);
4855 /* For interleaved stores we created vectorized defs for all the
4856 defs stored in OPRNDS in the previous iteration (previous copy).
4857 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4858 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4859 next copy.
4860 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4861 OPRNDS are of size 1. */
4862 for (i = 0; i < group_size; i++)
4864 op = VEC_index (tree, oprnds, i);
4865 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4866 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4867 VEC_replace (tree, dr_chain, i, vec_oprnd);
4868 VEC_replace (tree, oprnds, i, vec_oprnd);
4870 dataref_ptr =
4871 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4874 if (strided_store)
4876 result_chain = VEC_alloc (tree, heap, group_size);
4877 /* Permute. */
4878 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4879 &result_chain))
4880 return false;
4883 next_stmt = first_stmt;
4884 for (i = 0; i < vec_num; i++)
4886 if (i > 0)
4887 /* Bump the vector pointer. */
4888 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4889 NULL_TREE);
4891 if (slp)
4892 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4893 else if (strided_store)
4894 /* For strided stores vectorized defs are interleaved in
4895 vect_permute_store_chain(). */
4896 vec_oprnd = VEC_index (tree, result_chain, i);
4898 data_ref = build_fold_indirect_ref (dataref_ptr);
4899 /* Arguments are ready. Create the new vector stmt. */
4900 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4901 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4902 mark_symbols_for_renaming (new_stmt);
4904 if (j == 0)
4905 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4906 else
4907 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4909 prev_stmt_info = vinfo_for_stmt (new_stmt);
4910 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4911 if (!next_stmt)
4912 break;
4916 return true;
4920 /* Function vect_setup_realignment
4922 This function is called when vectorizing an unaligned load using
4923 the dr_explicit_realign[_optimized] scheme.
4924 This function generates the following code at the loop prolog:
4926 p = initial_addr;
4927 x msq_init = *(floor(p)); # prolog load
4928 realignment_token = call target_builtin;
4929 loop:
4930 x msq = phi (msq_init, ---)
4932 The stmts marked with x are generated only for the case of
4933 dr_explicit_realign_optimized.
4935 The code above sets up a new (vector) pointer, pointing to the first
4936 location accessed by STMT, and a "floor-aligned" load using that pointer.
4937 It also generates code to compute the "realignment-token" (if the relevant
4938 target hook was defined), and creates a phi-node at the loop-header bb
4939 whose arguments are the result of the prolog-load (created by this
4940 function) and the result of a load that takes place in the loop (to be
4941 created by the caller to this function).
4943 For the case of dr_explicit_realign_optimized:
4944 The caller to this function uses the phi-result (msq) to create the
4945 realignment code inside the loop, and sets up the missing phi argument,
4946 as follows:
4947 loop:
4948 msq = phi (msq_init, lsq)
4949 lsq = *(floor(p')); # load in loop
4950 result = realign_load (msq, lsq, realignment_token);
4952 For the case of dr_explicit_realign:
4953 loop:
4954 msq = *(floor(p)); # load in loop
4955 p' = p + (VS-1);
4956 lsq = *(floor(p')); # load in loop
4957 result = realign_load (msq, lsq, realignment_token);
4959 Input:
4960 STMT - (scalar) load stmt to be vectorized. This load accesses
4961 a memory location that may be unaligned.
4962 BSI - place where new code is to be inserted.
4963 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4964 is used.
4966 Output:
4967 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4968 target hook, if defined.
4969 Return value - the result of the loop-header phi node. */
4971 static tree
4972 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4973 tree *realignment_token,
4974 enum dr_alignment_support alignment_support_scheme,
4975 tree init_addr,
4976 struct loop **at_loop)
4978 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4979 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4980 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4981 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4982 edge pe;
4983 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4984 tree vec_dest;
4985 tree inc;
4986 tree ptr;
4987 tree data_ref;
4988 tree new_stmt;
4989 basic_block new_bb;
4990 tree msq_init = NULL_TREE;
4991 tree new_temp;
4992 tree phi_stmt;
4993 tree msq = NULL_TREE;
4994 tree stmts = NULL_TREE;
4995 bool inv_p;
4996 bool compute_in_loop = false;
4997 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4998 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
4999 struct loop *loop_for_initial_load;
5001 gcc_assert (alignment_support_scheme == dr_explicit_realign
5002 || alignment_support_scheme == dr_explicit_realign_optimized);
5004 /* We need to generate three things:
5005 1. the misalignment computation
5006 2. the extra vector load (for the optimized realignment scheme).
5007 3. the phi node for the two vectors from which the realignment is
5008 done (for the optimized realignment scheme).
5011 /* 1. Determine where to generate the misalignment computation.
5013 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5014 calculation will be generated by this function, outside the loop (in the
5015 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5016 caller, inside the loop.
5018 Background: If the misalignment remains fixed throughout the iterations of
5019 the loop, then both realignment schemes are applicable, and also the
5020 misalignment computation can be done outside LOOP. This is because we are
5021 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5022 are a multiple of VS (the Vector Size), and therefore the misalignment in
5023 different vectorized LOOP iterations is always the same.
5024 The problem arises only if the memory access is in an inner-loop nested
5025 inside LOOP, which is now being vectorized using outer-loop vectorization.
5026 This is the only case when the misalignment of the memory access may not
5027 remain fixed throughout the iterations of the inner-loop (as explained in
5028 detail in vect_supportable_dr_alignment). In this case, not only is the
5029 optimized realignment scheme not applicable, but also the misalignment
5030 computation (and generation of the realignment token that is passed to
5031 REALIGN_LOAD) have to be done inside the loop.
5033 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5034 or not, which in turn determines if the misalignment is computed inside
5035 the inner-loop, or outside LOOP. */
5037 if (init_addr != NULL_TREE)
5039 compute_in_loop = true;
5040 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5044 /* 2. Determine where to generate the extra vector load.
5046 For the optimized realignment scheme, instead of generating two vector
5047 loads in each iteration, we generate a single extra vector load in the
5048 preheader of the loop, and in each iteration reuse the result of the
5049 vector load from the previous iteration. In case the memory access is in
5050 an inner-loop nested inside LOOP, which is now being vectorized using
5051 outer-loop vectorization, we need to determine whether this initial vector
5052 load should be generated at the preheader of the inner-loop, or can be
5053 generated at the preheader of LOOP. If the memory access has no evolution
5054 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5055 to be generated inside LOOP (in the preheader of the inner-loop). */
5057 if (nested_in_vect_loop)
5059 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5060 bool invariant_in_outerloop =
5061 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5062 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5064 else
5065 loop_for_initial_load = loop;
5066 if (at_loop)
5067 *at_loop = loop_for_initial_load;
5069 /* 3. For the case of the optimized realignment, create the first vector
5070 load at the loop preheader. */
5072 if (alignment_support_scheme == dr_explicit_realign_optimized)
5074 /* Create msq_init = *(floor(p1)) in the loop preheader */
5076 gcc_assert (!compute_in_loop);
5077 pe = loop_preheader_edge (loop_for_initial_load);
5078 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5079 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5080 &init_addr, &inc, true, NULL_TREE, &inv_p);
5081 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5082 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5083 new_temp = make_ssa_name (vec_dest, new_stmt);
5084 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5085 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5086 gcc_assert (!new_bb);
5087 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5090 /* 4. Create realignment token using a target builtin, if available.
5091 It is done either inside the containing loop, or before LOOP (as
5092 determined above). */
5094 if (targetm.vectorize.builtin_mask_for_load)
5096 tree builtin_decl;
5098 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5099 if (compute_in_loop)
5100 gcc_assert (init_addr); /* already computed by the caller. */
5101 else
5103 /* Generate the INIT_ADDR computation outside LOOP. */
5104 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5105 NULL_TREE, loop);
5106 pe = loop_preheader_edge (loop);
5107 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5108 gcc_assert (!new_bb);
5111 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5112 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5113 vec_dest = vect_create_destination_var (scalar_dest,
5114 TREE_TYPE (new_stmt));
5115 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5116 new_temp = make_ssa_name (vec_dest, new_stmt);
5117 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5119 if (compute_in_loop)
5120 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5121 else
5123 /* Generate the misalignment computation outside LOOP. */
5124 pe = loop_preheader_edge (loop);
5125 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5126 gcc_assert (!new_bb);
5129 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5131 /* The result of the CALL_EXPR to this builtin is determined from
5132 the value of the parameter and no global variables are touched,
5133 which makes the builtin a "const" function. Requiring the
5134 builtin to have the "const" attribute makes it unnecessary
5135 to call mark_call_clobbered. */
5136 gcc_assert (TREE_READONLY (builtin_decl));
5139 if (alignment_support_scheme == dr_explicit_realign)
5140 return msq;
5142 gcc_assert (!compute_in_loop);
5143 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5146 /* 5. Create msq = phi <msq_init, lsq> in loop */
5148 pe = loop_preheader_edge (containing_loop);
5149 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5150 msq = make_ssa_name (vec_dest, NULL_TREE);
5151 phi_stmt = create_phi_node (msq, containing_loop->header);
5152 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5153 add_phi_arg (phi_stmt, msq_init, pe);
5155 return msq;
5159 /* Function vect_strided_load_supported.
5161 Returns TRUE if the EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5162 and FALSE otherwise. */
5164 static bool
5165 vect_strided_load_supported (tree vectype)
5167 optab perm_even_optab, perm_odd_optab;
5168 int mode;
5170 mode = (int) TYPE_MODE (vectype);
5172 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5173 if (!perm_even_optab)
5175 if (vect_print_dump_info (REPORT_DETAILS))
5176 fprintf (vect_dump, "no optab for perm_even.");
5177 return false;
5180 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5182 if (vect_print_dump_info (REPORT_DETAILS))
5183 fprintf (vect_dump, "perm_even op not supported by target.");
5184 return false;
5187 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5188 if (!perm_odd_optab)
5190 if (vect_print_dump_info (REPORT_DETAILS))
5191 fprintf (vect_dump, "no optab for perm_odd.");
5192 return false;
5195 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5197 if (vect_print_dump_info (REPORT_DETAILS))
5198 fprintf (vect_dump, "perm_odd op not supported by target.");
5199 return false;
5201 return true;
5205 /* Function vect_permute_load_chain.
5207 Given a chain of interleaved loads in DR_CHAIN, whose LENGTH must be
5208 a power of 2, generate extract_even/odd stmts to reorder the input data
5209 correctly. Return the final references for loads in RESULT_CHAIN.
5211 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5212 The input is 4 vectors each containing 8 elements. We assign a number to each
5213 element; the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5218 4th vec: 24 25 26 27 28 29 30 31
5220 The output sequence should be:
5222 1st vec: 0 4 8 12 16 20 24 28
5223 2nd vec: 1 5 9 13 17 21 25 29
5224 3rd vec: 2 6 10 14 18 22 26 30
5225 4th vec: 3 7 11 15 19 23 27 31
5227 i.e., the first output vector should contain the first elements of each
5228 interleaving group, etc.
5230 We use extract_even/odd instructions to create such output. The input of each
5231 extract_even/odd operation is two vectors
5232 1st vec 2nd vec
5233 0 1 2 3 4 5 6 7
5235 and the output is the vector of extracted even/odd elements. The output of
5236 extract_even will be: 0 2 4 6
5237 and of extract_odd: 1 3 5 7
5240 The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
5241 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5242 order. In our example,
5244 E1: extract_even (1st vec, 2nd vec)
5245 E2: extract_odd (1st vec, 2nd vec)
5246 E3: extract_even (3rd vec, 4th vec)
5247 E4: extract_odd (3rd vec, 4th vec)
5249 The output for the first stage will be:
5251 E1: 0 2 4 6 8 10 12 14
5252 E2: 1 3 5 7 9 11 13 15
5253 E3: 16 18 20 22 24 26 28 30
5254 E4: 17 19 21 23 25 27 29 31
5256 In order to proceed and create the correct sequence for the next stage (or
5257 for the correct output, if the second stage is the last one, as in our
5258 example), we first put the output of extract_even operation and then the
5259 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5260 The input for the second stage is:
5262 1st vec (E1): 0 2 4 6 8 10 12 14
5263 2nd vec (E3): 16 18 20 22 24 26 28 30
5264 3rd vec (E2): 1 3 5 7 9 11 13 15
5265 4th vec (E4): 17 19 21 23 25 27 29 31
5267 The output of the second stage:
5269 E1: 0 4 8 12 16 20 24 28
5270 E2: 2 6 10 14 18 22 26 30
5271 E3: 1 5 9 13 17 21 25 29
5272 E4: 3 7 11 15 19 23 27 31
5274 And RESULT_CHAIN after reordering:
5276 1st vec (E1): 0 4 8 12 16 20 24 28
5277 2nd vec (E3): 1 5 9 13 17 21 25 29
5278 3rd vec (E2): 2 6 10 14 18 22 26 30
5279 4th vec (E4): 3 7 11 15 19 23 27 31. */
5281 static bool
5282 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5283 unsigned int length,
5284 tree stmt,
5285 block_stmt_iterator *bsi,
5286 VEC(tree,heap) **result_chain)
5288 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5289 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5290 tree tmp;
5291 int i;
5292 unsigned int j;
5294 /* Check that the operation is supported. */
5295 if (!vect_strided_load_supported (vectype))
5296 return false;
5298 *result_chain = VEC_copy (tree, heap, dr_chain);
5299 for (i = 0; i < exact_log2 (length); i++)
5301 for (j = 0; j < length; j +=2)
5303 first_vect = VEC_index (tree, dr_chain, j);
5304 second_vect = VEC_index (tree, dr_chain, j+1);
5306 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5307 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5308 DECL_GIMPLE_REG_P (perm_dest) = 1;
5309 add_referenced_var (perm_dest);
5311 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5312 first_vect, second_vect);
5313 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5315 data_ref = make_ssa_name (perm_dest, perm_stmt);
5316 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5317 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5318 mark_symbols_for_renaming (perm_stmt);
5320 VEC_replace (tree, *result_chain, j/2, data_ref);
5322 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5323 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5324 DECL_GIMPLE_REG_P (perm_dest) = 1;
5325 add_referenced_var (perm_dest);
5327 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5328 first_vect, second_vect);
5329 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5330 data_ref = make_ssa_name (perm_dest, perm_stmt);
5331 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5332 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5333 mark_symbols_for_renaming (perm_stmt);
5335 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5337 dr_chain = VEC_copy (tree, heap, *result_chain);
5339 return true;
5343 /* Function vect_transform_strided_load.
5345 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5346 to perform their permutation and assign the resulting vectorized statements to
5347 the scalar statements.
5350 static bool
5351 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5352 block_stmt_iterator *bsi)
5354 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5355 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5356 tree next_stmt, new_stmt;
5357 VEC(tree,heap) *result_chain = NULL;
5358 unsigned int i, gap_count;
5359 tree tmp_data_ref;
5361 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5362 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
5363 vectors, which are ready for vector computation. */
5364 result_chain = VEC_alloc (tree, heap, size);
5365 /* Permute. */
5366 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5367 return false;
5369 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5370 Since we scan the chain starting from its first node, their order
5371 corresponds to the order of data-refs in RESULT_CHAIN. */
5372 next_stmt = first_stmt;
5373 gap_count = 1;
5374 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5376 if (!next_stmt)
5377 break;
5379 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5380 code elimination pass later.
5381 DR_GROUP_GAP is the number of steps in elements from the previous
5382 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5383 correspond to the gaps.
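/* Hypothetical example: for a group of size 4 in which only a[4i] and
a[4i+2] are actually read, the stmt for a[4i+2] has DR_GROUP_GAP == 2,
so the permuted vector corresponding to the skipped element a[4i+1] is
not recorded here and the load for it is later removed as dead code. */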
5385 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5387 gap_count++;
5388 continue;
5391 while (next_stmt)
5393 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5394 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5395 copies, and we put the new vector statement in the first available
5396 RELATED_STMT. */
5397 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5398 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5399 else
5401 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5402 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5403 vinfo_for_stmt (prev_stmt));
5404 while (rel_stmt)
5406 prev_stmt = rel_stmt;
5407 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5409 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5411 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5412 gap_count = 1;
5413 /* If NEXT_STMT accesses the same DR as the previous statement,
5414 put the same TMP_DATA_REF as its vectorized statement; otherwise
5415 get the next data-ref from RESULT_CHAIN. */
5416 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5417 break;
5420 return true;
5424 /* vectorizable_load.
5426 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5427 can be vectorized.
5428 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5429 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5430 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5432 bool
5433 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5434 slp_tree slp_node)
5436 tree scalar_dest;
5437 tree vec_dest = NULL;
5438 tree data_ref = NULL;
5439 tree op;
5440 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5441 stmt_vec_info prev_stmt_info;
5442 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5443 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5444 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5445 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5446 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5447 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5448 tree new_temp;
5449 int mode;
5450 tree new_stmt = NULL_TREE;
5451 tree dummy;
5452 enum dr_alignment_support alignment_support_scheme;
5453 tree dataref_ptr = NULL_TREE;
5454 tree ptr_incr;
5455 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5456 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5457 int i, j, group_size;
5458 tree msq = NULL_TREE, lsq;
5459 tree offset = NULL_TREE;
5460 tree realignment_token = NULL_TREE;
5461 tree phi = NULL_TREE;
5462 VEC(tree,heap) *dr_chain = NULL;
5463 bool strided_load = false;
5464 tree first_stmt;
5465 tree scalar_type;
5466 bool inv_p;
5467 bool compute_in_loop = false;
5468 struct loop *at_loop;
5469 int vec_num;
5470 bool slp = (slp_node != NULL);
5472 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5473 this, so we can safely override NCOPIES with 1 here. */
5474 if (slp)
5475 ncopies = 1;
5477 gcc_assert (ncopies >= 1);
5479 /* FORNOW. This restriction should be relaxed. */
5480 if (nested_in_vect_loop && ncopies > 1)
5482 if (vect_print_dump_info (REPORT_DETAILS))
5483 fprintf (vect_dump, "multiple types in nested loop.");
5484 return false;
5487 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5488 return false;
5490 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5491 return false;
5493 /* Is vectorizable load? */
5494 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5495 return false;
5497 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5498 if (TREE_CODE (scalar_dest) != SSA_NAME)
5499 return false;
5501 op = GIMPLE_STMT_OPERAND (stmt, 1);
5502 if (TREE_CODE (op) != ARRAY_REF
5503 && TREE_CODE (op) != INDIRECT_REF
5504 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5505 return false;
5507 if (!STMT_VINFO_DATA_REF (stmt_info))
5508 return false;
5510 scalar_type = TREE_TYPE (DR_REF (dr));
5511 mode = (int) TYPE_MODE (vectype);
5513 /* FORNOW. In some cases we can vectorize even if the data-type is not supported
5514 (e.g. - data copies). */
5515 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5517 if (vect_print_dump_info (REPORT_DETAILS))
5518 fprintf (vect_dump, "Aligned load, but unsupported type.");
5519 return false;
5522 /* Check if the load is a part of an interleaving chain. */
5523 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5525 strided_load = true;
5526 /* FORNOW */
5527 gcc_assert (! nested_in_vect_loop);
5529 /* Check if interleaving is supported. */
5530 if (!vect_strided_load_supported (vectype)
5531 && !PURE_SLP_STMT (stmt_info) && !slp)
5532 return false;
5535 if (!vec_stmt) /* transformation not required. */
5537 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5538 vect_model_load_cost (stmt_info, ncopies, NULL);
5539 return true;
5542 if (vect_print_dump_info (REPORT_DETAILS))
5543 fprintf (vect_dump, "transform load.");
5545 /** Transform. **/
5547 if (strided_load)
5549 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5550 /* Check if the chain of loads is already vectorized. */
5551 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5553 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5554 return true;
5556 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5557 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5558 dr_chain = VEC_alloc (tree, heap, group_size);
5560 /* VEC_NUM is the number of vect stmts to be created for this group. */
5561 if (slp)
5563 strided_load = false;
5564 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5566 else
5567 vec_num = group_size;
5569 else
5571 first_stmt = stmt;
5572 first_dr = dr;
5573 group_size = vec_num = 1;
5576 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5577 gcc_assert (alignment_support_scheme);
5579 /* In case the vectorization factor (VF) is bigger than the number
5580 of elements that we can fit in a vectype (nunits), we have to generate
5581 more than one vector stmt - i.e - we need to "unroll" the
5582 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5583 from one copy of the vector stmt to the next, in the field
5584 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5585 stages to find the correct vector defs to be used when vectorizing
5586 stmts that use the defs of the current stmt. The example below illustrates
5587 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5588 4 vectorized stmts):
5590 before vectorization:
5591 RELATED_STMT VEC_STMT
5592 S1: x = memref - -
5593 S2: z = x + 1 - -
5595 step 1: vectorize stmt S1:
5596 We first create the vector stmt VS1_0, and, as usual, record a
5597 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5598 Next, we create the vector stmt VS1_1, and record a pointer to
5599 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5600 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5601 stmts and pointers:
5602 RELATED_STMT VEC_STMT
5603 VS1_0: vx0 = memref0 VS1_1 -
5604 VS1_1: vx1 = memref1 VS1_2 -
5605 VS1_2: vx2 = memref2 VS1_3 -
5606 VS1_3: vx3 = memref3 - -
5607 S1: x = load - VS1_0
5608 S2: z = x + 1 - -
5610 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5611 information we recorded in RELATED_STMT field is used to vectorize
5612 stmt S2. */
5614 /* In case of interleaving (non-unit strided access):
5616 S1: x2 = &base + 2
5617 S2: x0 = &base
5618 S3: x1 = &base + 1
5619 S4: x3 = &base + 3
5621 Vectorized loads are created in the order of memory accesses
5622 starting from the access of the first stmt of the chain:
5624 VS1: vx0 = &base
5625 VS2: vx1 = &base + vec_size*1
5626 VS3: vx2 = &base + vec_size*2
5627 VS4: vx3 = &base + vec_size*3
5629 Then permutation statements are generated:
5631 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5632 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5635 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5636 (the order of the data-refs in the output of vect_permute_load_chain
5637 corresponds to the order of scalar stmts in the interleaving chain - see
5638 the documentation of vect_permute_load_chain()).
5639 The generation of permutation stmts and recording them in
5640 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5642 In case of both multiple types and interleaving, the vector loads and
5643 permutation stmts above are created for every copy. The result vector stmts
5644 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5645 STMT_VINFO_RELATED_STMT for the next copies. */
5647 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5648 on a target that supports unaligned accesses (dr_unaligned_supported)
5649 we generate the following code:
5650 p = initial_addr;
5651 indx = 0;
5652 loop {
5653 p = p + indx * vectype_size;
5654 vec_dest = *(p);
5655 indx = indx + 1;
5658 Otherwise, the data reference is potentially unaligned on a target that
5659 does not support unaligned accesses (dr_explicit_realign_optimized) -
5660 then generate the following code, in which the data in each iteration is
5661 obtained by two vector loads, one from the previous iteration, and one
5662 from the current iteration:
5663 p1 = initial_addr;
5664 msq_init = *(floor(p1))
5665 p2 = initial_addr + VS - 1;
5666 realignment_token = call target_builtin;
5667 indx = 0;
5668 loop {
5669 p2 = p2 + indx * vectype_size
5670 lsq = *(floor(p2))
5671 vec_dest = realign_load (msq, lsq, realignment_token)
5672 indx = indx + 1;
5673 msq = lsq;
5674 } */
5676 /* If the misalignment remains the same throughout the execution of the
5677 loop, we can create the init_addr and permutation mask at the loop
5678 preheader. Otherwise, it needs to be created inside the loop.
5679 This can only occur when vectorizing memory accesses in the inner-loop
5680 nested within an outer-loop that is being vectorized. */
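/* For instance (numbers chosen for illustration): if the inner-loop
data-ref advances by DR_STEP == 12 bytes per outer-loop iteration while
UNITS_PER_SIMD_WORD == 16, the misalignment changes on every outer-loop
iteration and the realignment token must be recomputed inside the loop
(COMPUTE_IN_LOOP). A step of 16 or 32 bytes would keep it fixed. */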
5682 if (nested_in_vect_loop_p (loop, stmt)
5683 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5685 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5686 compute_in_loop = true;
5689 if ((alignment_support_scheme == dr_explicit_realign_optimized
5690 || alignment_support_scheme == dr_explicit_realign)
5691 && !compute_in_loop)
5693 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5694 alignment_support_scheme, NULL_TREE,
5695 &at_loop);
5696 if (alignment_support_scheme == dr_explicit_realign_optimized)
5698 phi = SSA_NAME_DEF_STMT (msq);
5699 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5702 else
5703 at_loop = loop;
5705 prev_stmt_info = NULL;
5706 for (j = 0; j < ncopies; j++)
5708 /* 1. Create the vector pointer update chain. */
5709 if (j == 0)
5710 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5711 at_loop, offset,
5712 &dummy, &ptr_incr, false,
5713 NULL_TREE, &inv_p);
5714 else
5715 dataref_ptr =
5716 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5718 for (i = 0; i < vec_num; i++)
5720 if (i > 0)
5721 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5722 NULL_TREE);
5724 /* 2. Create the vector-load in the loop. */
5725 switch (alignment_support_scheme)
5727 case dr_aligned:
5728 gcc_assert (aligned_access_p (first_dr));
5729 data_ref = build_fold_indirect_ref (dataref_ptr);
5730 break;
5731 case dr_unaligned_supported:
5733 int mis = DR_MISALIGNMENT (first_dr);
5734 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5736 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
5737 data_ref =
5738 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5739 break;
5741 case dr_explicit_realign:
5743 tree ptr, bump;
5744 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5746 if (compute_in_loop)
5747 msq = vect_setup_realignment (first_stmt, bsi,
5748 &realignment_token,
5749 dr_explicit_realign,
5750 dataref_ptr, NULL);
5752 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5753 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5754 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5755 new_temp = make_ssa_name (vec_dest, new_stmt);
5756 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5757 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5758 copy_virtual_operands (new_stmt, stmt);
5759 mark_symbols_for_renaming (new_stmt);
5760 msq = new_temp;
5762 bump = size_binop (MULT_EXPR, vs_minus_1,
5763 TYPE_SIZE_UNIT (scalar_type));
5764 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5765 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5766 break;
5768 case dr_explicit_realign_optimized:
5769 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5770 break;
5771 default:
5772 gcc_unreachable ();
5774 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5775 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5776 new_temp = make_ssa_name (vec_dest, new_stmt);
5777 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5778 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5779 mark_symbols_for_renaming (new_stmt);
5781 /* 3. Handle explicit realignment if necessary/supported. Create in
5782 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5783 if (alignment_support_scheme == dr_explicit_realign_optimized
5784 || alignment_support_scheme == dr_explicit_realign)
5786 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5787 if (!realignment_token)
5788 realignment_token = dataref_ptr;
5789 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5790 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5791 realignment_token);
5792 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5793 new_temp = make_ssa_name (vec_dest, new_stmt);
5794 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5795 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5797 if (alignment_support_scheme == dr_explicit_realign_optimized)
5799 if (i == vec_num - 1 && j == ncopies - 1)
5800 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5801 msq = lsq;
5805 /* 4. Handle invariant-load. */
5806 if (inv_p)
5808 gcc_assert (!strided_load);
5809 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5810 if (j == 0)
5812 int k;
5813 tree t = NULL_TREE;
5814 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5816 /* CHECKME: bitpos depends on endianness? */
5817 bitpos = bitsize_zero_node;
5818 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5819 bitsize, bitpos);
5820 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5821 TYPE_UNSIGNED (scalar_type);
5822 vec_dest =
5823 vect_create_destination_var (scalar_dest, NULL_TREE);
5824 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5825 new_temp = make_ssa_name (vec_dest, new_stmt);
5826 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5827 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5829 for (k = nunits - 1; k >= 0; --k)
5830 t = tree_cons (NULL_TREE, new_temp, t);
5831 /* FIXME: use build_constructor directly. */
5832 vec_inv = build_constructor_from_list (vectype, t);
5833 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5834 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5836 else
5837 gcc_unreachable (); /* FORNOW. */
5840 /* Collect vector loads and later create their permutation in
5841 vect_transform_strided_load (). */
5842 if (strided_load)
5843 VEC_quick_push (tree, dr_chain, new_temp);
5845 /* Store vector loads in the corresponding SLP_NODE. */
5846 if (slp)
5847 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5850 /* FORNOW: SLP with multiple types is unsupported. */
5851 if (slp)
5852 return true;
5854 if (strided_load)
5856 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5857 return false;
5858 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5859 dr_chain = VEC_alloc (tree, heap, group_size);
5861 else
5863 if (j == 0)
5864 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5865 else
5866 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5867 prev_stmt_info = vinfo_for_stmt (new_stmt);
5871 return true;
5875 /* Function vectorizable_live_operation.
5877 STMT computes a value that is used outside the loop. Check if
5878 it can be supported. */
5880 bool
5881 vectorizable_live_operation (tree stmt,
5882 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5883 tree *vec_stmt ATTRIBUTE_UNUSED)
5885 tree operation;
5886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5887 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5888 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5889 int i;
5890 int op_type;
5891 tree op;
5892 tree def, def_stmt;
5893 enum vect_def_type dt;
5895 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5897 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5898 return false;
5900 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5901 return false;
5903 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5904 return false;
5906 /* FORNOW. CHECKME. */
5907 if (nested_in_vect_loop_p (loop, stmt))
5908 return false;
5910 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5911 op_type = TREE_OPERAND_LENGTH (operation);
5913 /* FORNOW: support only if all uses are invariant. This means
5914 that the scalar operations can remain in place, unvectorized.
5915 The original last scalar value that they compute will be used. */
5917 for (i = 0; i < op_type; i++)
5919 op = TREE_OPERAND (operation, i);
5920 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5922 if (vect_print_dump_info (REPORT_DETAILS))
5923 fprintf (vect_dump, "use not simple.");
5924 return false;
5927 if (dt != vect_invariant_def && dt != vect_constant_def)
5928 return false;
5931 /* No transformation is required for the cases we currently support. */
5932 return true;
5936 /* Function vect_is_simple_cond.
5938 Input:
5939 LOOP - the loop that is being vectorized.
5940 COND - Condition that is checked for simple use.
5942 Returns whether COND can be vectorized. Checks whether the
5943 condition operands are supportable using vect_is_simple_use. */
5945 static bool
5946 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5948 tree lhs, rhs;
5949 tree def;
5950 enum vect_def_type dt;
5952 if (!COMPARISON_CLASS_P (cond))
5953 return false;
5955 lhs = TREE_OPERAND (cond, 0);
5956 rhs = TREE_OPERAND (cond, 1);
5958 if (TREE_CODE (lhs) == SSA_NAME)
5960 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5961 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5962 return false;
5964 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5965 && TREE_CODE (lhs) != FIXED_CST)
5966 return false;
5968 if (TREE_CODE (rhs) == SSA_NAME)
5970 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5971 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5972 return false;
5974 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5975 && TREE_CODE (rhs) != FIXED_CST)
5976 return false;
5978 return true;
5981 /* vectorizable_condition.
5983 Check if STMT is a conditional modify expression that can be vectorized.
5984 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5985 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5986 at BSI.
5988 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
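/* Sketch of the transformation (illustrative names): a scalar stmt

x = (a < b) ? c : d

becomes

vx = VEC_COND_EXPR <va < vb, vc, vd>

where va, vb, vc and vd are the vector defs obtained below via
vect_get_vec_def_for_operand. */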
5990 bool
5991 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5993 tree scalar_dest = NULL_TREE;
5994 tree vec_dest = NULL_TREE;
5995 tree op = NULL_TREE;
5996 tree cond_expr, then_clause, else_clause;
5997 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5998 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5999 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6000 tree vec_compare, vec_cond_expr;
6001 tree new_temp;
6002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6003 enum machine_mode vec_mode;
6004 tree def;
6005 enum vect_def_type dt;
6006 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6007 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6009 gcc_assert (ncopies >= 1);
6010 if (ncopies > 1)
6011 return false; /* FORNOW */
6013 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6014 return false;
6016 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6017 return false;
6019 /* FORNOW: SLP not supported. */
6020 if (STMT_SLP_TYPE (stmt_info))
6021 return false;
6023 /* FORNOW: not yet supported. */
6024 if (STMT_VINFO_LIVE_P (stmt_info))
6026 if (vect_print_dump_info (REPORT_DETAILS))
6027 fprintf (vect_dump, "value used after loop.");
6028 return false;
6031 /* Is vectorizable conditional operation? */
6032 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6033 return false;
6035 op = GIMPLE_STMT_OPERAND (stmt, 1);
6037 if (TREE_CODE (op) != COND_EXPR)
6038 return false;
6040 cond_expr = TREE_OPERAND (op, 0);
6041 then_clause = TREE_OPERAND (op, 1);
6042 else_clause = TREE_OPERAND (op, 2);
6044 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6045 return false;
6047 /* We do not handle two different vector types for the condition
6048 and the values. */
6049 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6050 return false;
6052 if (TREE_CODE (then_clause) == SSA_NAME)
6054 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6055 if (!vect_is_simple_use (then_clause, loop_vinfo,
6056 &then_def_stmt, &def, &dt))
6057 return false;
6059 else if (TREE_CODE (then_clause) != INTEGER_CST
6060 && TREE_CODE (then_clause) != REAL_CST
6061 && TREE_CODE (then_clause) != FIXED_CST)
6062 return false;
6064 if (TREE_CODE (else_clause) == SSA_NAME)
6066 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6067 if (!vect_is_simple_use (else_clause, loop_vinfo,
6068 &else_def_stmt, &def, &dt))
6069 return false;
6071 else if (TREE_CODE (else_clause) != INTEGER_CST
6072 && TREE_CODE (else_clause) != REAL_CST
6073 && TREE_CODE (else_clause) != FIXED_CST)
6074 return false;
6077 vec_mode = TYPE_MODE (vectype);
6079 if (!vec_stmt)
6081 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6082 return expand_vec_cond_expr_p (op, vec_mode);
6085 /* Transform */
6087 /* Handle def. */
6088 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6089 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6091 /* Handle cond expr. */
6092 vec_cond_lhs =
6093 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6094 vec_cond_rhs =
6095 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6096 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6097 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6099 /* Arguments are ready. Create the new vector stmt. */
6100 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6101 vec_cond_lhs, vec_cond_rhs);
6102 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6103 vec_compare, vec_then_clause, vec_else_clause);
6105 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6106 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6107 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6108 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6110 return true;
6114 /* Function vect_transform_stmt.
6116 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6118 static bool
6119 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6120 slp_tree slp_node)
6122 bool is_store = false;
6123 tree vec_stmt = NULL_TREE;
6124 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6125 tree orig_stmt_in_pattern;
6126 bool done;
6128 switch (STMT_VINFO_TYPE (stmt_info))
6130 case type_demotion_vec_info_type:
6131 gcc_assert (!slp_node);
6132 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6133 gcc_assert (done);
6134 break;
6136 case type_promotion_vec_info_type:
6137 gcc_assert (!slp_node);
6138 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6139 gcc_assert (done);
6140 break;
6142 case type_conversion_vec_info_type:
6143 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6144 gcc_assert (done);
6145 break;
6147 case induc_vec_info_type:
6148 gcc_assert (!slp_node);
6149 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6150 gcc_assert (done);
6151 break;
6153 case op_vec_info_type:
6154 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6155 gcc_assert (done);
6156 break;
6158 case assignment_vec_info_type:
6159 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6160 gcc_assert (done);
6161 break;
6163 case load_vec_info_type:
6164 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6165 gcc_assert (done);
6166 break;
6168 case store_vec_info_type:
6169 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6170 gcc_assert (done);
6171 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6173 /* In case of interleaving, the whole chain is vectorized when the
6174 last store in the chain is reached. Store stmts before the last
6175 one are skipped, and their stmt_vec_info shouldn't be freed
6176 meanwhile. */
6177 *strided_store = true;
6178 if (STMT_VINFO_VEC_STMT (stmt_info))
6179 is_store = true;
6181 else
6182 is_store = true;
6183 break;
6185 case condition_vec_info_type:
6186 gcc_assert (!slp_node);
6187 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6188 gcc_assert (done);
6189 break;
6191 case call_vec_info_type:
6192 gcc_assert (!slp_node);
6193 done = vectorizable_call (stmt, bsi, &vec_stmt);
6194 break;
6196 case reduc_vec_info_type:
6197 gcc_assert (!slp_node);
6198 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6199 gcc_assert (done);
6200 break;
6202 default:
6203 if (!STMT_VINFO_LIVE_P (stmt_info))
6205 if (vect_print_dump_info (REPORT_DETAILS))
6206 fprintf (vect_dump, "stmt not supported.");
6207 gcc_unreachable ();
6211 if (STMT_VINFO_LIVE_P (stmt_info)
6212 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6214 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6215 gcc_assert (done);
6218 if (vec_stmt)
6220 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6221 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6222 if (orig_stmt_in_pattern)
6224 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6225 /* STMT was inserted by the vectorizer to replace a computation idiom.
6226 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6227 computed this idiom. We need to record a pointer to VEC_STMT in
6228 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6229 documentation of vect_pattern_recog. */
6230 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6232 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6233 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6238 return is_store;
6242 /* This function builds ni_name = the number of iterations the loop executes,
6243 inserting the computation on the loop preheader. */
6245 static tree
6246 vect_build_loop_niters (loop_vec_info loop_vinfo)
6248 tree ni_name, stmt, var;
6249 edge pe;
6250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6251 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6253 var = create_tmp_var (TREE_TYPE (ni), "niters");
6254 add_referenced_var (var);
6255 ni_name = force_gimple_operand (ni, &stmt, false, var);
6257 pe = loop_preheader_edge (loop);
6258 if (stmt)
6260 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6261 gcc_assert (!new_bb);
6264 return ni_name;
6268 /* This function generates the following statements:
6270 ni_name = number of iterations loop executes
6271 ratio = ni_name / vf
6272 ratio_mult_vf_name = ratio * vf
6274 and places them at the loop preheader edge. */
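/* Worked example (illustrative values): with vf == 4 and ni == 103 the
code below emits, on the preheader edge:

ratio = 103 >> 2 = 25
ratio_mult_vf = 25 << 2 = 100

i.e. the vectorized loop runs 25 iterations covering the first 100
scalar iterations; the remaining 3 are left to the epilog loop. */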
6276 static void
6277 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6278 tree *ni_name_ptr,
6279 tree *ratio_mult_vf_name_ptr,
6280 tree *ratio_name_ptr)
6283 edge pe;
6284 basic_block new_bb;
6285 tree stmt, ni_name;
6286 tree var;
6287 tree ratio_name;
6288 tree ratio_mult_vf_name;
6289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6290 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6291 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6292 tree log_vf;
6294 pe = loop_preheader_edge (loop);
6296 /* Generate temporary variable that contains
6297 number of iterations loop executes. */
6299 ni_name = vect_build_loop_niters (loop_vinfo);
6300 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6302 /* Create: ratio = ni >> log2(vf) */
6304 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6305 if (!is_gimple_val (ratio_name))
6307 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6308 add_referenced_var (var);
6310 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6311 pe = loop_preheader_edge (loop);
6312 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6313 gcc_assert (!new_bb);
6316 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6318 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6319 ratio_name, log_vf);
6320 if (!is_gimple_val (ratio_mult_vf_name))
6322 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6323 add_referenced_var (var);
6325 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6326 true, var);
6327 pe = loop_preheader_edge (loop);
6328 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6329 gcc_assert (!new_bb);
6332 *ni_name_ptr = ni_name;
6333 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6334 *ratio_name_ptr = ratio_name;
6336 return;
6340 /* Function vect_update_ivs_after_vectorizer.
6342 "Advance" the induction variables of LOOP to the value they should take
6343 after the execution of LOOP. This is currently necessary because the
6344 vectorizer does not handle induction variables that are used after the
6345 loop. Such a situation occurs when the last iterations of LOOP are
6346 peeled, because:
6347 1. We introduced new uses after LOOP for IVs that were not originally used
6348 after LOOP: the IVs of LOOP are now used by an epilog loop.
6349 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6350 times, whereas the loop IVs should be bumped N times.
6352 Input:
6353 - LOOP - a loop that is going to be vectorized. The last few iterations
6354 of LOOP were peeled.
6355 - NITERS - the number of iterations that LOOP executes (before it is
6356 vectorized). i.e, the number of times the ivs should be bumped.
6357 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6358 coming out from LOOP on which there are uses of the LOOP ivs
6359 (this is the path from LOOP->exit to epilog_loop->preheader).
6361 The new definitions of the ivs are placed in LOOP->exit.
6362 The phi args associated with the edge UPDATE_E in the bb
6363 UPDATE_E->dest are updated accordingly.
6365 Assumption 1: Like the rest of the vectorizer, this function assumes
6366 a single loop exit that has a single predecessor.
6368 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6369 organized in the same order.
6371 Assumption 3: The access function of the ivs is simple enough (see
6372 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6374 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6375 coming out of LOOP on which the ivs of LOOP are used (this is the path
6376 that leads to the epilog loop; other paths skip the epilog loop). This
6377 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6378 needs to have its phis updated.
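/* Illustrative computation (non-pointer IV case): for an IV with initial
value init_expr and per-iteration step step_expr, the value after NITERS
scalar iterations is

ni = init_expr + NITERS * step_expr

e.g. an IV starting at 0 with step 4 is set to 400 at the loop exit when
NITERS == 100, and the phi argument in update_bb is redirected to it. */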
6381 static void
6382 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6383 edge update_e)
6385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6386 basic_block exit_bb = single_exit (loop)->dest;
6387 tree phi, phi1;
6388 basic_block update_bb = update_e->dest;
6390 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6392 /* Make sure there exists a single-predecessor exit bb: */
6393 gcc_assert (single_pred_p (exit_bb));
6395 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6396 phi && phi1;
6397 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6399 tree access_fn = NULL;
6400 tree evolution_part;
6401 tree init_expr;
6402 tree step_expr;
6403 tree var, ni, ni_name;
6404 block_stmt_iterator last_bsi;
6406 if (vect_print_dump_info (REPORT_DETAILS))
6408 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6409 print_generic_expr (vect_dump, phi, TDF_SLIM);
6412 /* Skip virtual phis. */
6413 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6415 if (vect_print_dump_info (REPORT_DETAILS))
6416 fprintf (vect_dump, "virtual phi. skip.");
6417 continue;
6420 /* Skip reduction phis. */
6421 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6423 if (vect_print_dump_info (REPORT_DETAILS))
6424 fprintf (vect_dump, "reduc phi. skip.");
6425 continue;
6428 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6429 gcc_assert (access_fn);
6430 evolution_part =
6431 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6432 gcc_assert (evolution_part != NULL_TREE);
6434 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6435 of degree >= 2 or exponential. */
6436 gcc_assert (!tree_is_chrec (evolution_part));
6438 step_expr = evolution_part;
6439 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6440 loop->num));
6442 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6443 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6444 init_expr,
6445 fold_convert (sizetype,
6446 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6447 niters, step_expr)));
6448 else
6449 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6450 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6451 fold_convert (TREE_TYPE (init_expr),
6452 niters),
6453 step_expr),
6454 init_expr);
6458 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6459 add_referenced_var (var);
6461 last_bsi = bsi_last (exit_bb);
6462 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6463 true, BSI_SAME_STMT);
6465 /* Fix phi expressions in the successor bb. */
6466 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6471 /* Function vect_do_peeling_for_loop_bound.
6473 Peel the last iterations of the loop represented by LOOP_VINFO.
6474 The peeled iterations form a new epilog loop. Given that the loop now
6475 iterates NITERS times, the new epilog loop iterates
6476 NITERS % VECTORIZATION_FACTOR times.
6478 The original loop will later be made to iterate
6479 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
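/* Example (illustrative): for an original loop with NITERS == 103 and
VECTORIZATION_FACTOR == 4, the epilog loop executes 103 % 4 == 3
iterations, and the original loop is made to iterate 103 / 4 == 25
times (the value placed into RATIO). */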
6481 static void
6482 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6484 tree ni_name, ratio_mult_vf_name;
6485 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6486 struct loop *new_loop;
6487 edge update_e;
6488 basic_block preheader;
6489 int loop_num;
6490 unsigned int th;
6491 int min_scalar_loop_bound;
6492 int min_profitable_iters;
6494 if (vect_print_dump_info (REPORT_DETAILS))
6495 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6497 initialize_original_copy_tables ();
6499 /* Generate the following variables on the preheader of original loop:
6501 ni_name = number of iteration the original loop executes
6502 ratio = ni_name / vf
6503 ratio_mult_vf_name = ratio * vf */
6504 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6505 &ratio_mult_vf_name, ratio);
6507 loop_num = loop->num;
6509 /* Analyze cost to set the threshold for the vectorized loop. */
6510 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6511 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6512 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6514 /* Use the cost model only if it is more conservative than user specified
6515 threshold. */
6517 th = (unsigned) min_scalar_loop_bound;
6518 if (min_profitable_iters
6519 && (!min_scalar_loop_bound
6520 || min_profitable_iters > min_scalar_loop_bound))
6521 th = (unsigned) min_profitable_iters;
6523 if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
6524 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6525 && vect_print_dump_info (REPORT_DETAILS))
6526 fprintf (vect_dump, "vectorization may not be profitable.");
6528 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6529 ratio_mult_vf_name, ni_name, false,
6530 th);
6531 gcc_assert (new_loop);
6532 gcc_assert (loop_num == loop->num);
6533 #ifdef ENABLE_CHECKING
6534 slpeel_verify_cfg_after_peeling (loop, new_loop);
6535 #endif
6537 /* A guard that controls whether the new_loop is to be executed or skipped
6538 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6539 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6540 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6541 is on the path where the LOOP IVs are used and need to be updated. */
6543 preheader = loop_preheader_edge (new_loop)->src;
6544 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6545 update_e = EDGE_PRED (preheader, 0);
6546 else
6547 update_e = EDGE_PRED (preheader, 1);
6549 /* Update IVs of original loop as if they were advanced
6550 by ratio_mult_vf_name steps. */
6551 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6553 /* After peeling we have to reset scalar evolution analyzer. */
6554 scev_reset ();
6556 free_original_copy_tables ();
6560 /* Function vect_gen_niters_for_prolog_loop.
6562 Set the number of iterations for the loop represented by LOOP_VINFO
6563 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6564 and the misalignment of DR - the data reference recorded in
6565 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6566 this loop, the data reference DR will refer to an aligned location.
6568 The following computation is generated:
6570 If the misalignment of DR is known at compile time:
6571 addr_mis = DR_MISALIGNMENT (dr);
6572 Else, compute address misalignment in bytes:
6573 addr_mis = addr & (vectype_size - 1)
6575 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6577 (elem_size = element type size; an element is the scalar element
6578 whose type is the inner type of the vectype)
6580 For interleaving,
6582 prolog_niters = min ( LOOP_NITERS ,
6583 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6584 where group_size is the size of the interleaved group.
6586 The above formulas assume that VF == number of elements in the vector. This
6587 may not hold when there are multiple-types in the loop.
6588 In this case, for some data-references in the loop the VF does not represent
6589 the number of elements that fit in the vector. Therefore, instead of VF we
6590 use TYPE_VECTOR_SUBPARTS. */
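/* Worked example (illustrative values): VF == 4, a 4-byte element type,
no interleaving (group_size == 1), and a known byte misalignment of 8:

elem_misalign = 8 / 4 = 2
prolog_niters = (4 - 2) & (4 - 1) = 2

i.e. peeling 2 scalar iterations advances the data reference by 8 bytes,
after which it is aligned on the 16-byte vector boundary. */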
6592 static tree
6593 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6595 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6596 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6597 tree var, stmt;
6598 tree iters, iters_name;
6599 edge pe;
6600 basic_block new_bb;
6601 tree dr_stmt = DR_STMT (dr);
6602 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6603 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6604 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6605 tree niters_type = TREE_TYPE (loop_niters);
6606 int group_size = 1;
6607 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6608 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6610 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6612 /* For interleaved access element size must be multiplied by the size of
6613 the interleaved group. */
6614 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6615 DR_GROUP_FIRST_DR (stmt_info)));
6616 element_size *= group_size;
6619 pe = loop_preheader_edge (loop);
6621 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6623 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6624 int elem_misalign = byte_misalign / element_size;
6626 if (vect_print_dump_info (REPORT_DETAILS))
6627 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6628 iters = build_int_cst (niters_type,
6629 (nelements - elem_misalign)&(nelements/group_size-1));
6631 else
6633 tree new_stmts = NULL_TREE;
6634 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6635 &new_stmts, NULL_TREE, loop);
6636 tree ptr_type = TREE_TYPE (start_addr);
6637 tree size = TYPE_SIZE (ptr_type);
6638 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6639 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6640 tree elem_size_log =
6641 build_int_cst (type, exact_log2 (vectype_align/nelements));
6642 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6643 tree nelements_tree = build_int_cst (type, nelements);
6644 tree byte_misalign;
6645 tree elem_misalign;
6647 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6648 gcc_assert (!new_bb);
6650 /* Create: byte_misalign = addr & (vectype_size - 1) */
6651 byte_misalign =
6652 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6654 /* Create: elem_misalign = byte_misalign / element_size */
6655 elem_misalign =
6656 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6658 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6659 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6660 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6661 iters = fold_convert (niters_type, iters);
6664 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6665 /* If the loop bound is known at compile time we already verified that it is
6666 greater than vf; since the misalignment ('iters') is at most vf, there's
6667 no need to generate the MIN_EXPR in this case. */
6668 if (TREE_CODE (loop_niters) != INTEGER_CST)
6669 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6671 if (vect_print_dump_info (REPORT_DETAILS))
6673 fprintf (vect_dump, "niters for prolog loop: ");
6674 print_generic_expr (vect_dump, iters, TDF_SLIM);
6677 var = create_tmp_var (niters_type, "prolog_loop_niters");
6678 add_referenced_var (var);
6679 iters_name = force_gimple_operand (iters, &stmt, false, var);
6681 /* Insert stmt on loop preheader edge. */
6682 if (stmt)
6684 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6685 gcc_assert (!new_bb);
6688 return iters_name;
6692 /* Function vect_update_init_of_dr.
6694 NITERS iterations were peeled from LOOP. DR represents a data reference
6695 in LOOP. This function updates the information recorded in DR to
6696 account for the fact that the first NITERS iterations had already been
6697 executed. Specifically, it updates the OFFSET field of DR. */
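/* E.g. (illustrative): if NITERS == 3 iterations were peeled and DR_STEP
is 4 bytes, the code below adds 3 * 4 == 12 bytes to DR_OFFSET,
reflecting where the remaining iterations start accessing memory. */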
6699 static void
6700 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6702 tree offset = DR_OFFSET (dr);
6704 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6705 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6706 DR_OFFSET (dr) = offset;
6710 /* Function vect_update_inits_of_drs.
6712 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6713 This function updates the information recorded for the data references in
6714 the loop to account for the fact that the first NITERS iterations had
6715 already been executed. Specifically, it updates the initial_condition of
6716 the access_function of all the data_references in the loop. */
6718 static void
6719 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6721 unsigned int i;
6722 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6723 struct data_reference *dr;
6725 if (vect_print_dump_info (REPORT_DETAILS))
6726 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6728 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6729 vect_update_init_of_dr (dr, niters);
6733 /* Function vect_do_peeling_for_alignment
6735 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6736 'niters' is set to the misalignment of one of the data references in the
6737 loop, thereby forcing it to refer to an aligned location at the beginning
6738 of the execution of this loop. The data reference for which we are
6739 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6741 static void
6742 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6744 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6745 tree niters_of_prolog_loop, ni_name;
6746 tree n_iters;
6747 struct loop *new_loop;
6749 if (vect_print_dump_info (REPORT_DETAILS))
6750 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6752 initialize_original_copy_tables ();
6754 ni_name = vect_build_loop_niters (loop_vinfo);
6755 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6757 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
6758 new_loop =
6759 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6760 niters_of_prolog_loop, ni_name, true, 0);
6761 gcc_assert (new_loop);
6762 #ifdef ENABLE_CHECKING
6763 slpeel_verify_cfg_after_peeling (new_loop, loop);
6764 #endif
6766 /* Update number of times loop executes. */
6767 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6768 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6769 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6771 /* Update the init conditions of the access functions of all data refs. */
6772 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6774 /* After peeling we have to reset scalar evolution analyzer. */
6775 scev_reset ();
6777 free_original_copy_tables ();
6781 /* Function vect_create_cond_for_align_checks.
6783 Create a conditional expression that represents the alignment checks for
6784 all of data references (array element references) whose alignment must be
6785 checked at runtime.
6787 Input:
6788 LOOP_VINFO - two fields of the loop information are used.
6789 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6790 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6792 Output:
6793 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6794 expression.
6795 The returned value is the conditional expression to be used in the if
6796 statement that controls which version of the loop gets executed at runtime.
6798 The algorithm makes two assumptions:
6799 1) The number of bytes "n" in a vector is a power of 2.
6800 2) An address "a" is aligned if a%n is zero and that this
6801 test can be done as a&(n-1) == 0. For example, for 16
6802 byte vectors the test is a&0xf == 0. */
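/* Worked example (illustrative addresses): checking two references at
0x1010 and 0x2020 against 16-byte alignment (mask == 0xf):

or_tmp  = 0x1010 | 0x2020 = 0x3030
and_tmp = 0x3030 & 0xf    = 0

and_tmp == 0, so the vectorized version of the loop is taken; any nonzero
low-order bit would route execution to the scalar version instead. */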
6804 static tree
6805 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6806 tree *cond_expr_stmt_list)
6808 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6809 VEC(tree,heap) *may_misalign_stmts
6810 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6811 tree ref_stmt, tmp;
6812 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6813 tree mask_cst;
6814 unsigned int i;
6815 tree psize;
6816 tree int_ptrsize_type;
6817 char tmp_name[20];
6818 tree or_tmp_name = NULL_TREE;
6819 tree and_tmp, and_tmp_name, and_stmt;
6820 tree ptrsize_zero;
6822 /* Check that mask is one less than a power of 2, i.e., mask is
6823 all zeros followed by all ones. */
6824 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6826 /* CHECKME: what is the best integer or unsigned type to use to hold a
6827 cast from a pointer value? */
6828 psize = TYPE_SIZE (ptr_type_node);
6829 int_ptrsize_type
6830 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6832 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6833 of the first vector of the i'th data reference. */
6835 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6837 tree new_stmt_list = NULL_TREE;
6838 tree addr_base;
6839 tree addr_tmp, addr_tmp_name, addr_stmt;
6840 tree or_tmp, new_or_tmp_name, or_stmt;
6842 /* create: addr_tmp = (int)(address_of_first_vector) */
6843 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6844 &new_stmt_list, NULL_TREE, loop);
6846 if (new_stmt_list != NULL_TREE)
6847 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6849 sprintf (tmp_name, "%s%d", "addr2int", i);
6850 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6851 add_referenced_var (addr_tmp);
6852 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6853 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6854 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6855 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6856 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6858 /* The addresses are OR together. */
6860 if (or_tmp_name != NULL_TREE)
6862 /* create: or_tmp = or_tmp | addr_tmp */
6863 sprintf (tmp_name, "%s%d", "orptrs", i);
6864 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6865 add_referenced_var (or_tmp);
6866 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6867 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6868 or_tmp_name, addr_tmp_name);
6869 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6870 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6871 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6872 or_tmp_name = new_or_tmp_name;
6874 else
6875 or_tmp_name = addr_tmp_name;
6877 } /* end for i */
6879 mask_cst = build_int_cst (int_ptrsize_type, mask);
6881 /* create: and_tmp = or_tmp & mask */
6882 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6883 add_referenced_var (and_tmp);
6884 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6886 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6887 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6888 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6889 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6891 /* Make and_tmp the left operand of the conditional test against zero.
6892 if and_tmp has a nonzero bit then some address is unaligned. */
6893 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6894 return build2 (EQ_EXPR, boolean_type_node,
6895 and_tmp_name, ptrsize_zero);
/* Function vect_vfa_segment_size.

   Create an expression that computes the size of the segment that will be
   accessed for a data reference.  The function takes into account that
   realignment loads may access one additional vector.

   Input:
     DR: The data reference.
     VECT_FACTOR: vectorization factor.

   Return an expression whose value is the size of the segment which will be
   accessed by DR.  */
static tree
vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
{
  tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
                                     DR_STEP (dr), vect_factor);

  if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
    {
      tree vector_size = TYPE_SIZE_UNIT
        (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));

      segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
                                    segment_length, vector_size);
    }

  return fold_convert (sizetype, segment_length);
}
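
/* Worked example (illustrative): for a unit-stride data reference over an
   int array, DR_STEP is 4 bytes; with VF = 4 the segment is 4 * 4 = 16
   bytes.  If the reference uses the dr_explicit_realign_optimized scheme
   with 16-byte vectors, one extra vector may be read, giving
   16 + 16 = 32 bytes.  */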
/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR - input conditional expression.  New conditions will be chained
               with a logical AND operation.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
                to be checked.

   Output:
   COND_EXPR - conditional expression.
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.

   The resulting COND_EXPR is the conditional expression to be used in the
   if statement that controls which version of the loop gets executed at
   runtime.  */
static void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   tree *cond_expr_stmt_list)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  VEC (ddr_p, heap) *may_alias_ddrs =
    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  tree vect_factor =
    build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  ddr_p ddr;
  unsigned int i;
  tree part_cond_expr;

  /* Create expression
     (((store_ptr_0 + store_segment_length_0) < load_ptr_0)
      || ((load_ptr_0 + load_segment_length_0) < store_ptr_0))
     &&
     ...
     &&
     (((store_ptr_n + store_segment_length_n) < load_ptr_n)
      || ((load_ptr_n + load_segment_length_n) < store_ptr_n))  */

  if (VEC_empty (ddr_p, may_alias_ddrs))
    return;

  for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
    {
      struct data_reference *dr_a, *dr_b;
      tree dr_group_first_a, dr_group_first_b;
      tree addr_base_a, addr_base_b;
      tree segment_length_a, segment_length_b;
      tree stmt_a, stmt_b;

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));
      dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
        {
          stmt_a = dr_group_first_a;
          dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
        }

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));
      dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
        {
          stmt_b = dr_group_first_b;
          dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
        }

      addr_base_a =
        vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
                                              NULL_TREE, loop);
      addr_base_b =
        vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
                                              NULL_TREE, loop);

      segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
      segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);

      if (vect_print_dump_info (REPORT_DR_DETAILS))
        {
          fprintf (vect_dump,
                   "create runtime check for data references ");
          print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
          fprintf (vect_dump, " and ");
          print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
        }

      part_cond_expr =
        fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
              addr_base_a,
              segment_length_a),
            addr_base_b),
          fold_build2 (LT_EXPR, boolean_type_node,
            fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
              addr_base_b,
              segment_length_b),
            addr_base_a));

      if (*cond_expr)
        *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                  *cond_expr, part_cond_expr);
      else
        *cond_expr = part_cond_expr;
    }

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "created %u versioning for alias checks.\n",
             VEC_length (ddr_p, may_alias_ddrs));
}
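
/* Worked example (illustrative, hypothetical refs): for a store a[i] and a
   load b[i] over 4-byte elements with VF = 4, vect_vfa_segment_size gives
   16 bytes for each reference, and the condition built above is

       ((&a[0] + 16) < &b[0]) || ((&b[0] + 16) < &a[0])

   i.e. the loop is versioned on the two 16-byte segments being disjoint.  */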
/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned, and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and alignment tests pass.  */
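
/* Illustrative sketch of the control flow this creates:

       if (cond_expr)               // alignment checks ANDed with the
         <loop to be vectorized>    //   alias (non-overlap) checks
       else
         <unmodified scalar copy of the loop>

   where cond_expr is given a branch probability of 4/5 of being true
   (prob below).  */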
static void
vect_loop_versioning (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct loop *nloop;
  tree cond_expr = NULL_TREE;
  tree cond_expr_stmt_list = NULL_TREE;
  basic_block condition_bb;
  block_stmt_iterator cond_exp_bsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  tree orig_phi, new_phi, arg;
  unsigned prob = 4 * REG_BR_PROB_BASE / 5;
  tree gimplify_stmt_list;

  if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    return;

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    cond_expr =
      vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  cond_expr =
    fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
  cond_expr =
    force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
                          NULL_TREE);
  append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);

  initialize_original_copy_tables ();
  nloop = loop_version (loop, cond_expr, &condition_bb,
                        prob, prob, REG_BR_PROB_BASE - prob, true);
  free_original_copy_tables ();

  /* Loop versioning violates an assumption we try to maintain during
     vectorization - that the loop exit block has a single predecessor.
     After versioning, the exit block of both loop versions is the same
     basic block (i.e. it has two predecessors).  Just in order to simplify
     following transformations in the vectorizer, we fix this situation
     here by adding a new (empty) block on the exit-edge of the loop,
     with the proper loop-exit phis to maintain loop-closed-form.  */

  merge_bb = single_exit (loop)->dest;
  gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
  new_exit_bb = split_edge (single_exit (loop));
  new_exit_e = single_exit (loop);
  e = EDGE_SUCC (new_exit_bb, 0);

  for (orig_phi = phi_nodes (merge_bb); orig_phi;
       orig_phi = PHI_CHAIN (orig_phi))
    {
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_exit_bb);
      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      add_phi_arg (new_phi, arg, new_exit_e);
      SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
    }

  /* End loop-exit-fixes after versioning.  */

  update_ssa (TODO_update_ssa);
  if (cond_expr_stmt_list)
    {
      cond_exp_bsi = bsi_last (condition_bb);
      bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
    }
}
/* Remove a group of stores (for SLP or interleaving), free their
   stmt_vec_info.  */
static void
vect_remove_stores (tree first_stmt)
{
  stmt_ann_t ann;
  tree next = first_stmt;
  tree tmp;
  stmt_vec_info next_stmt_info;
  block_stmt_iterator next_si;

  while (next)
    {
      /* Free the attached stmt_vec_info and remove the stmt.  */
      next_si = bsi_for_stmt (next);
      bsi_remove (&next_si, true);
      next_stmt_info = vinfo_for_stmt (next);
      ann = stmt_ann (next);
      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
      free (next_stmt_info);
      set_stmt_info (ann, NULL);
      next = tmp;
    }
}
/* Vectorize SLP instance tree in postorder.  */
static bool
vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
{
  tree stmt;
  bool strided_store, is_store;
  block_stmt_iterator si;
  stmt_vec_info stmt_info;

  if (!node)
    return false;

  vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
  vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);

  stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (node), 0);
  stmt_info = vinfo_for_stmt (stmt);
  SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
      print_generic_expr (vect_dump, stmt, TDF_SLIM);
    }

  si = bsi_for_stmt (stmt);
  is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
  if (is_store)
    {
      if (DR_GROUP_FIRST_DR (stmt_info))
        /* If IS_STORE is TRUE, the vectorization of the
           interleaving chain was completed - free all the stores in
           the chain.  */
        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
      else
        /* FORNOW: SLP originates only from strided stores.  */
        gcc_unreachable ();

      return true;
    }

  /* FORNOW: SLP originates only from strided stores.  */
  return false;
}
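
/* Illustration (hypothetical instance): for an SLP tree

       node0 (group of stores)
        /                \
   node1 (defs)      node2 (defs)

   the postorder recursion above transforms node1 and node2 before node0,
   so the vector operands of the stores already exist when node0 itself is
   vectorized.  */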
static bool
vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
{
  VEC (slp_instance, heap) *slp_instances =
    LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  unsigned int vec_stmts_size;
  unsigned int group_size, i;
  unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool is_store = false;

  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      group_size = SLP_INSTANCE_GROUP_SIZE (instance);
      /* For each SLP instance calculate the number of vector stmts to be
         created for the scalar stmts in each node of the SLP tree.  The
         number of vector elements in one vector iteration is the number of
         scalar elements in one scalar iteration (GROUP_SIZE) multiplied by
         VF, divided by the vector size.  */
      vec_stmts_size = vectorization_factor * group_size / nunits;

      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                             vec_stmts_size);

      if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
          || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
        fprintf (vect_dump, "vectorizing stmts using SLP.");
    }

  return is_store;
}
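
/* Worked example (illustrative): with GROUP_SIZE = 2, a vectorization
   factor of 8 and vectors of nunits = 4 elements, one vector iteration
   covers 2 * 8 = 16 scalar elements, so each node of the SLP tree gets
   vec_stmts_size = 16 / 4 = 4 vector statements.  */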
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */
void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  block_stmt_iterator si, next_si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_transform_loop ===");
  vect_loop_versioning (loop_vinfo);

  /* CHECKME: we wouldn't need this if we called update_ssa once
     for all loops.  */
  bitmap_zero (vect_memsyms_to_rename);

  /* Peel the loop if there are data refs with unknown alignment.
     Only one data ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
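
  /* Worked example (illustrative): with n = 103 and VF = 4, the vectorized
     loop executes 103/4 = 25 iterations, each covering 4 scalar iterations,
     and the scalar epilog loop executes the remaining 103%4 = 3
     iterations.  */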
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed will need
     to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      tree phi;

      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_generic_expr (vect_dump, phi, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL);
            }
        }

      for (si = bsi_start (bb); !bsi_end_p (si);)
        {
          tree stmt = bsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_generic_expr (vect_dump, stmt, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              bsi_next (&si);
              continue;
            }

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              bsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits =
            (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and not
               to the vector size, hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  is_store = vect_schedule_slp (loop_vinfo, nunits);

                  /* IS_STORE is true if STMT is a store.  Stores cannot be
                     of hybrid SLP type.  They are removed in
                     vect_schedule_slp_instance and their vinfo is
                     destroyed.  */
                  if (is_store)
                    {
                      bsi_next (&si);
                      continue;
                    }
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (PURE_SLP_STMT (stmt_info))
                {
                  bsi_next (&si);
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
          if (is_store)
            {
              stmt_ann_t ann;
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain was completed - free all the
                     stores in the chain.  */
                  tree next = DR_GROUP_FIRST_DR (stmt_info);
                  tree tmp;
                  stmt_vec_info next_stmt_info;

                  while (next)
                    {
                      next_si = bsi_for_stmt (next);
                      next_stmt_info = vinfo_for_stmt (next);
                      /* Free the attached stmt_vec_info and remove the
                         stmt.  */
                      ann = stmt_ann (next);
                      tmp = DR_GROUP_NEXT_DR (next_stmt_info);
                      free (next_stmt_info);
                      set_stmt_info (ann, NULL);
                      bsi_remove (&next_si, true);
                      next = tmp;
                    }
                  bsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  ann = stmt_ann (stmt);
                  free (stmt_info);
                  set_stmt_info (ann, NULL);
                  bsi_remove (&si, true);
                  continue;
                }
            }
          bsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  mark_set_for_renaming (vect_memsyms_to_rename);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}