* compare-debug: Avoid spurious errors when .stripped files
[official-gcc.git] / gcc / tree-vect-transform.c
blobf02d912b29f63bc9f027cd2a6aa329e0a62eb626
1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "ggc.h"
26 #include "tree.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
53 static tree vect_create_addr_base_for_vector_ref
54 (tree, tree *, tree, struct loop *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
64 /* Utility function dealing with loop peeling (not peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
75 static int
76 cost_for_stmt (tree stmt)
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
80 switch (STMT_VINFO_TYPE (stmt_info))
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
97 default:
98 gcc_unreachable ();
103 /* Function vect_estimate_min_profitable_iters
105 Return the number of iterations required for the vector version of the
106 loop to be profitable relative to the cost of the scalar version of the
107 loop.
109 TODO: Take profile info into account before making vectorization
110 decisions, if available. */
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
115 int i;
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
124 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
125 int nbbs = loop->num_nodes;
126 int byte_misalign;
127 int peel_guard_costs = 0;
128 int innerloop_iters = 0, factor;
129 VEC (slp_instance, heap) *slp_instances;
130 slp_instance instance;
132 /* Cost model disabled. */
133 if (!flag_vect_cost_model)
135 if (vect_print_dump_info (REPORT_DETAILS))
136 fprintf (vect_dump, "cost model disabled.");
137 return 0;
140 /* Requires loop versioning tests to handle misalignment. */
142 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
144 /* FIXME: Make cost depend on complexity of individual check. */
145 vec_outside_cost +=
146 VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
147 if (vect_print_dump_info (REPORT_DETAILS))
148 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
149 "versioning to treat misalignment.\n");
152 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
154 /* FIXME: Make cost depend on complexity of individual check. */
155 vec_outside_cost +=
156 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
157 if (vect_print_dump_info (REPORT_DETAILS))
158 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
159 "versioning aliasing.\n");
162 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
163 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
168 /* Count statements in scalar loop. Using this as scalar cost for a single
169 iteration for now.
171 TODO: Add outer loop support.
173 TODO: Consider assigning different costs to different scalar
174 statements. */
176 /* FORNOW. */
177 if (loop->inner)
178 innerloop_iters = 50; /* FIXME */
180 for (i = 0; i < nbbs; i++)
182 block_stmt_iterator si;
183 basic_block bb = bbs[i];
185 if (bb->loop_father == loop->inner)
186 factor = innerloop_iters;
187 else
188 factor = 1;
190 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
192 tree stmt = bsi_stmt (si);
193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
194 if (!STMT_VINFO_RELEVANT_P (stmt_info)
195 && !STMT_VINFO_LIVE_P (stmt_info))
196 continue;
197 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
198 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
199 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
200 some of the "outside" costs are generated inside the outer-loop. */
201 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
205 /* Add additional cost for the peeled instructions in prologue and epilogue
206 loop.
208 FORNOW: If we dont know the value of peel_iters for prologue or epilogue
209 at compile-time - we assume it's vf/2 (the worst would be vf-1).
211 TODO: Build an expression that represents peel_iters for prologue and
212 epilogue to be used in a run-time test. */
214 byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
216 if (byte_misalign < 0)
218 peel_iters_prologue = vf/2;
219 if (vect_print_dump_info (REPORT_DETAILS))
220 fprintf (vect_dump, "cost model: "
221 "prologue peel iters set to vf/2.");
223 /* If peeling for alignment is unknown, loop bound of main loop becomes
224 unknown. */
225 peel_iters_epilogue = vf/2;
226 if (vect_print_dump_info (REPORT_DETAILS))
227 fprintf (vect_dump, "cost model: "
228 "epilogue peel iters set to vf/2 because "
229 "peeling for alignment is unknown .");
231 /* If peeled iterations are unknown, count a taken branch and a not taken
232 branch per peeled loop. Even if scalar loop iterations are known,
233 vector iterations are not known since peeled prologue iterations are
234 not known. Hence guards remain the same. */
235 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
236 + TARG_COND_NOT_TAKEN_BRANCH_COST);
239 else
241 if (byte_misalign)
243 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
244 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
245 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
246 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
248 peel_iters_prologue = nelements - (byte_misalign / element_size);
250 else
251 peel_iters_prologue = 0;
253 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
255 peel_iters_epilogue = vf/2;
256 if (vect_print_dump_info (REPORT_DETAILS))
257 fprintf (vect_dump, "cost model: "
258 "epilogue peel iters set to vf/2 because "
259 "loop iterations are unknown .");
261 /* If peeled iterations are known but number of scalar loop
262 iterations are unknown, count a taken branch per peeled loop. */
263 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
266 else
268 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
269 peel_iters_prologue = niters < peel_iters_prologue ?
270 niters : peel_iters_prologue;
271 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
275 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
276 + (peel_iters_epilogue * scalar_single_iter_cost)
277 + peel_guard_costs;
279 /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
280 information we provide for the target is whether testing against the
281 threshold involves a runtime test. */
282 if (targetm.vectorize.builtin_vectorization_cost)
284 bool runtime_test = false;
286 /* If the number of iterations is unknown, or the
287 peeling-for-misalignment amount is unknown, we eill have to generate
288 a runtime test to test the loop count against the threshold. */
289 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
290 || (byte_misalign < 0))
291 runtime_test = true;
292 vec_outside_cost +=
293 targetm.vectorize.builtin_vectorization_cost (runtime_test);
294 if (vect_print_dump_info (REPORT_DETAILS))
295 fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
296 targetm.vectorize.builtin_vectorization_cost (runtime_test));
299 /* Add SLP costs. */
300 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
301 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
303 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
304 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
307 /* Calculate number of iterations required to make the vector version
308 profitable, relative to the loop bodies only. The following condition
309 must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
310 SIC = scalar iteration cost, VIC = vector iteration cost,
311 VOC = vector outside cost and VF = vectorization factor. */
313 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
315 if (vec_outside_cost <= 0)
316 min_profitable_iters = 1;
317 else
319 min_profitable_iters = (vec_outside_cost * vf
320 - vec_inside_cost * peel_iters_prologue
321 - vec_inside_cost * peel_iters_epilogue)
322 / ((scalar_single_iter_cost * vf)
323 - vec_inside_cost);
325 if ((scalar_single_iter_cost * vf * min_profitable_iters)
326 <= ((vec_inside_cost * min_profitable_iters)
327 + (vec_outside_cost * vf)))
328 min_profitable_iters++;
331 /* vector version will never be profitable. */
332 else
334 if (vect_print_dump_info (REPORT_DETAILS))
335 fprintf (vect_dump, "cost model: vector iteration cost = %d "
336 "is divisible by scalar iteration cost = %d by a factor "
337 "greater than or equal to the vectorization factor = %d .",
338 vec_inside_cost, scalar_single_iter_cost, vf);
339 return -1;
342 if (vect_print_dump_info (REPORT_DETAILS))
344 fprintf (vect_dump, "Cost model analysis: \n");
345 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
346 vec_inside_cost);
347 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
348 vec_outside_cost);
349 fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
350 fprintf (vect_dump, " prologue iterations: %d\n",
351 peel_iters_prologue);
352 fprintf (vect_dump, " epilogue iterations: %d\n",
353 peel_iters_epilogue);
354 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
355 min_profitable_iters);
358 min_profitable_iters =
359 min_profitable_iters < vf ? vf : min_profitable_iters;
361 /* Because the condition we create is:
362 if (niters <= min_profitable_iters)
363 then skip the vectorized loop. */
364 min_profitable_iters--;
366 if (vect_print_dump_info (REPORT_DETAILS))
367 fprintf (vect_dump, " Profitability threshold = %d\n",
368 min_profitable_iters);
370 return min_profitable_iters;
374 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
375 functions. Design better to avoid maintenance issues. */
377 /* Function vect_model_reduction_cost.
379 Models cost for a reduction operation, including the vector ops
380 generated within the strip-mine loop, the initial definition before
381 the loop, and the epilogue code that must be generated. */
383 static void
384 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
385 int ncopies)
387 int outer_cost = 0;
388 enum tree_code code;
389 optab optab;
390 tree vectype;
391 tree orig_stmt;
392 tree reduction_op;
393 enum machine_mode mode;
394 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
395 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
399 /* Cost of reduction op inside loop. */
400 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
402 reduction_op = TREE_OPERAND (operation, op_type-1);
403 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
404 mode = TYPE_MODE (vectype);
405 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
407 if (!orig_stmt)
408 orig_stmt = STMT_VINFO_STMT (stmt_info);
410 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
412 /* Add in cost for initial definition. */
413 outer_cost += TARG_SCALAR_TO_VEC_COST;
415 /* Determine cost of epilogue code.
417 We have a reduction operator that will reduce the vector in one statement.
418 Also requires scalar extract. */
420 if (!nested_in_vect_loop_p (loop, orig_stmt))
422 if (reduc_code < NUM_TREE_CODES)
423 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
424 else
426 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
427 tree bitsize =
428 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
429 int element_bitsize = tree_low_cst (bitsize, 1);
430 int nelements = vec_size_in_bits / element_bitsize;
432 optab = optab_for_tree_code (code, vectype);
434 /* We have a whole vector shift available. */
435 if (VECTOR_MODE_P (mode)
436 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
437 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
438 /* Final reduction via vector shifts and the reduction operator. Also
439 requires scalar extract. */
440 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
441 + TARG_VEC_TO_SCALAR_COST);
442 else
443 /* Use extracts and reduction op for final reduction. For N elements,
444 we have N extracts and N-1 reduction ops. */
445 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
449 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
451 if (vect_print_dump_info (REPORT_DETAILS))
452 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
453 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
454 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
458 /* Function vect_model_induction_cost.
460 Models cost for induction operations. */
462 static void
463 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
465 /* loop cost for vec_loop. */
466 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
467 /* prologue cost for vec_init and vec_step. */
468 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
470 if (vect_print_dump_info (REPORT_DETAILS))
471 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
472 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
473 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
477 /* Function vect_model_simple_cost.
479 Models cost for simple operations, i.e. those that only emit ncopies of a
480 single op. Right now, this does not account for multiple insns that could
481 be generated for the single vector op. We will handle that shortly. */
483 void
484 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
485 enum vect_def_type *dt, slp_tree slp_node)
487 int i;
488 int inside_cost = 0, outside_cost = 0;
490 inside_cost = ncopies * TARG_VEC_STMT_COST;
492 /* FORNOW: Assuming maximum 2 args per stmts. */
493 for (i = 0; i < 2; i++)
495 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
496 outside_cost += TARG_SCALAR_TO_VEC_COST;
499 if (vect_print_dump_info (REPORT_DETAILS))
500 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
501 "outside_cost = %d .", inside_cost, outside_cost);
503 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
504 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
505 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
509 /* Function vect_cost_strided_group_size
511 For strided load or store, return the group_size only if it is the first
512 load or store of a group, else return 1. This ensures that group size is
513 only returned once per group. */
515 static int
516 vect_cost_strided_group_size (stmt_vec_info stmt_info)
518 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
520 if (first_stmt == STMT_VINFO_STMT (stmt_info))
521 return DR_GROUP_SIZE (stmt_info);
523 return 1;
527 /* Function vect_model_store_cost
529 Models cost for stores. In the case of strided accesses, one access
530 has the overhead of the strided access attributed to it. */
532 void
533 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
534 enum vect_def_type dt, slp_tree slp_node)
536 int group_size;
537 int inside_cost = 0, outside_cost = 0;
539 if (dt == vect_constant_def || dt == vect_invariant_def)
540 outside_cost = TARG_SCALAR_TO_VEC_COST;
542 /* Strided access? */
543 if (DR_GROUP_FIRST_DR (stmt_info))
544 group_size = vect_cost_strided_group_size (stmt_info);
545 /* Not a strided access. */
546 else
547 group_size = 1;
549 /* Is this an access in a group of stores, which provide strided access?
550 If so, add in the cost of the permutes. */
551 if (group_size > 1)
553 /* Uses a high and low interleave operation for each needed permute. */
554 inside_cost = ncopies * exact_log2(group_size) * group_size
555 * TARG_VEC_STMT_COST;
557 if (vect_print_dump_info (REPORT_DETAILS))
558 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
559 group_size);
563 /* Costs of the stores. */
564 inside_cost += ncopies * TARG_VEC_STORE_COST;
566 if (vect_print_dump_info (REPORT_DETAILS))
567 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
568 "outside_cost = %d .", inside_cost, outside_cost);
570 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
571 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
572 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
576 /* Function vect_model_load_cost
578 Models cost for loads. In the case of strided accesses, the last access
579 has the overhead of the strided access attributed to it. Since unaligned
580 accesses are supported for loads, we also account for the costs of the
581 access scheme chosen. */
583 void
584 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
587 int group_size;
588 int alignment_support_cheme;
589 tree first_stmt;
590 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
591 int inside_cost = 0, outside_cost = 0;
593 /* Strided accesses? */
594 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
595 if (first_stmt && !slp_node)
597 group_size = vect_cost_strided_group_size (stmt_info);
598 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
600 /* Not a strided access. */
601 else
603 group_size = 1;
604 first_dr = dr;
607 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
609 /* Is this an access in a group of loads providing strided access?
610 If so, add in the cost of the permutes. */
611 if (group_size > 1)
613 /* Uses an even and odd extract operations for each needed permute. */
614 inside_cost = ncopies * exact_log2(group_size) * group_size
615 * TARG_VEC_STMT_COST;
617 if (vect_print_dump_info (REPORT_DETAILS))
618 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
619 group_size);
623 /* The loads themselves. */
624 switch (alignment_support_cheme)
626 case dr_aligned:
628 inside_cost += ncopies * TARG_VEC_LOAD_COST;
630 if (vect_print_dump_info (REPORT_DETAILS))
631 fprintf (vect_dump, "vect_model_load_cost: aligned.");
633 break;
635 case dr_unaligned_supported:
637 /* Here, we assign an additional cost for the unaligned load. */
638 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
640 if (vect_print_dump_info (REPORT_DETAILS))
641 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
642 "hardware.");
644 break;
646 case dr_explicit_realign:
648 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
650 /* FIXME: If the misalignment remains fixed across the iterations of
651 the containing loop, the following cost should be added to the
652 outside costs. */
653 if (targetm.vectorize.builtin_mask_for_load)
654 inside_cost += TARG_VEC_STMT_COST;
656 break;
658 case dr_explicit_realign_optimized:
660 if (vect_print_dump_info (REPORT_DETAILS))
661 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
662 "pipelined.");
664 /* Unaligned software pipeline has a load of an address, an initial
665 load, and possibly a mask operation to "prime" the loop. However,
666 if this is an access in a group of loads, which provide strided
667 access, then the above cost should only be considered for one
668 access in the group. Inside the loop, there is a load op
669 and a realignment op. */
671 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
673 outside_cost = 2*TARG_VEC_STMT_COST;
674 if (targetm.vectorize.builtin_mask_for_load)
675 outside_cost += TARG_VEC_STMT_COST;
678 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
680 break;
683 default:
684 gcc_unreachable ();
687 if (vect_print_dump_info (REPORT_DETAILS))
688 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
689 "outside_cost = %d .", inside_cost, outside_cost);
691 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
692 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
693 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
697 /* Function vect_get_new_vect_var.
699 Returns a name for a new variable. The current naming scheme appends the
700 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
701 the name of vectorizer generated variables, and appends that to NAME if
702 provided. */
704 static tree
705 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
707 const char *prefix;
708 tree new_vect_var;
710 switch (var_kind)
712 case vect_simple_var:
713 prefix = "vect_";
714 break;
715 case vect_scalar_var:
716 prefix = "stmp_";
717 break;
718 case vect_pointer_var:
719 prefix = "vect_p";
720 break;
721 default:
722 gcc_unreachable ();
725 if (name)
727 char* tmp = concat (prefix, name, NULL);
728 new_vect_var = create_tmp_var (type, tmp);
729 free (tmp);
731 else
732 new_vect_var = create_tmp_var (type, prefix);
734 /* Mark vector typed variable as a gimple register variable. */
735 if (TREE_CODE (type) == VECTOR_TYPE)
736 DECL_GIMPLE_REG_P (new_vect_var) = true;
738 return new_vect_var;
742 /* Function vect_create_addr_base_for_vector_ref.
744 Create an expression that computes the address of the first memory location
745 that will be accessed for a data reference.
747 Input:
748 STMT: The statement containing the data reference.
749 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
750 OFFSET: Optional. If supplied, it is be added to the initial address.
751 LOOP: Specify relative to which loop-nest should the address be computed.
752 For example, when the dataref is in an inner-loop nested in an
753 outer-loop that is now being vectorized, LOOP can be either the
754 outer-loop, or the inner-loop. The first memory location accessed
755 by the following dataref ('in' points to short):
757 for (i=0; i<N; i++)
758 for (j=0; j<M; j++)
759 s += in[i+j]
761 is as follows:
762 if LOOP=i_loop: &in (relative to i_loop)
763 if LOOP=j_loop: &in+i*2B (relative to j_loop)
765 Output:
766 1. Return an SSA_NAME whose value is the address of the memory location of
767 the first vector of the data reference.
768 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
769 these statement(s) which define the returned SSA_NAME.
771 FORNOW: We are only handling array accesses with step 1. */
773 static tree
774 vect_create_addr_base_for_vector_ref (tree stmt,
775 tree *new_stmt_list,
776 tree offset,
777 struct loop *loop)
779 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
780 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
781 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
782 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
783 tree base_name;
784 tree data_ref_base_var;
785 tree new_base_stmt;
786 tree vec_stmt;
787 tree addr_base, addr_expr;
788 tree dest, new_stmt;
789 tree base_offset = unshare_expr (DR_OFFSET (dr));
790 tree init = unshare_expr (DR_INIT (dr));
791 tree vect_ptr_type, addr_expr2;
792 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
794 gcc_assert (loop);
795 if (loop != containing_loop)
797 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
798 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
800 gcc_assert (nested_in_vect_loop_p (loop, stmt));
802 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
803 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
804 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
807 /* Create data_ref_base */
808 base_name = build_fold_indirect_ref (data_ref_base);
809 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
810 add_referenced_var (data_ref_base_var);
811 data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
812 true, data_ref_base_var);
813 append_to_statement_list_force(new_base_stmt, new_stmt_list);
815 /* Create base_offset */
816 base_offset = size_binop (PLUS_EXPR, base_offset, init);
817 base_offset = fold_convert (sizetype, base_offset);
818 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
819 add_referenced_var (dest);
820 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
821 append_to_statement_list_force (new_stmt, new_stmt_list);
823 if (offset)
825 tree tmp = create_tmp_var (sizetype, "offset");
827 add_referenced_var (tmp);
828 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
829 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
830 base_offset, offset);
831 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
832 append_to_statement_list_force (new_stmt, new_stmt_list);
835 /* base + base_offset */
836 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
837 data_ref_base, base_offset);
839 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
841 /* addr_expr = addr_base */
842 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
843 get_name (base_name));
844 add_referenced_var (addr_expr);
845 vec_stmt = fold_convert (vect_ptr_type, addr_base);
846 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
847 get_name (base_name));
848 add_referenced_var (addr_expr2);
849 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
850 append_to_statement_list_force (new_stmt, new_stmt_list);
852 if (vect_print_dump_info (REPORT_DETAILS))
854 fprintf (vect_dump, "created ");
855 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
857 return vec_stmt;
861 /* Function vect_create_data_ref_ptr.
863 Create a new pointer to vector type (vp), that points to the first location
864 accessed in the loop by STMT, along with the def-use update chain to
865 appropriately advance the pointer through the loop iterations. Also set
866 aliasing information for the pointer. This vector pointer is used by the
867 callers to this function to create a memory reference expression for vector
868 load/store access.
870 Input:
871 1. STMT: a stmt that references memory. Expected to be of the form
872 GIMPLE_MODIFY_STMT <name, data-ref> or
873 GIMPLE_MODIFY_STMT <data-ref, name>.
874 2. AT_LOOP: the loop where the vector memref is to be created.
875 3. OFFSET (optional): an offset to be added to the initial address accessed
876 by the data-ref in STMT.
877 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
878 pointing to the initial address.
879 5. TYPE: if not NULL indicates the required type of the data-ref
881 Output:
882 1. Declare a new ptr to vector_type, and have it point to the base of the
883 data reference (initial addressed accessed by the data reference).
884 For example, for vector of type V8HI, the following code is generated:
886 v8hi *vp;
887 vp = (v8hi *)initial_address;
889 if OFFSET is not supplied:
890 initial_address = &a[init];
891 if OFFSET is supplied:
892 initial_address = &a[init + OFFSET];
894 Return the initial_address in INITIAL_ADDRESS.
896 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
897 update the pointer in each iteration of the loop.
899 Return the increment stmt that updates the pointer in PTR_INCR.
901 3. Set INV_P to true if the access pattern of the data reference in the
902 vectorized loop is invariant. Set it to false otherwise.
904 4. Return the pointer. */
906 static tree
907 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
908 tree offset, tree *initial_address, tree *ptr_incr,
909 bool only_init, tree type, bool *inv_p)
911 tree base_name;
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
914 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
915 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
916 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
918 tree vect_ptr_type;
919 tree vect_ptr;
920 tree tag;
921 tree new_temp;
922 tree vec_stmt;
923 tree new_stmt_list = NULL_TREE;
924 edge pe;
925 basic_block new_bb;
926 tree vect_ptr_init;
927 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
928 tree vptr;
929 block_stmt_iterator incr_bsi;
930 bool insert_after;
931 tree indx_before_incr, indx_after_incr;
932 tree incr;
933 tree step;
935 /* Check the step (evolution) of the load in LOOP, and record
936 whether it's invariant. */
937 if (nested_in_vect_loop)
938 step = STMT_VINFO_DR_STEP (stmt_info);
939 else
940 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
942 if (tree_int_cst_compare (step, size_zero_node) == 0)
943 *inv_p = true;
944 else
945 *inv_p = false;
947 /* Create an expression for the first address accessed by this load
948 in LOOP. */
949 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
951 if (vect_print_dump_info (REPORT_DETAILS))
953 tree data_ref_base = base_name;
954 fprintf (vect_dump, "create vector-pointer variable to type: ");
955 print_generic_expr (vect_dump, vectype, TDF_SLIM);
956 if (TREE_CODE (data_ref_base) == VAR_DECL)
957 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
958 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
959 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
960 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
961 fprintf (vect_dump, " vectorizing a record based array ref: ");
962 else if (TREE_CODE (data_ref_base) == SSA_NAME)
963 fprintf (vect_dump, " vectorizing a pointer ref: ");
964 print_generic_expr (vect_dump, base_name, TDF_SLIM);
967 /** (1) Create the new vector-pointer variable: **/
968 if (type)
969 vect_ptr_type = build_pointer_type (type);
970 else
971 vect_ptr_type = build_pointer_type (vectype);
972 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
973 get_name (base_name));
974 add_referenced_var (vect_ptr);
976 /** (2) Add aliasing information to the new vector-pointer:
977 (The points-to info (DR_PTR_INFO) may be defined later.) **/
979 tag = DR_SYMBOL_TAG (dr);
980 gcc_assert (tag);
982 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
983 tag must be created with tag added to its may alias list. */
984 if (!MTAG_P (tag))
985 new_type_alias (vect_ptr, tag, DR_REF (dr));
986 else
987 set_symbol_mem_tag (vect_ptr, tag);
989 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
991 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
992 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
993 def-use update cycles for the pointer: One relative to the outer-loop
994 (LOOP), which is what steps (3) and (4) below do. The other is relative
995 to the inner-loop (which is the inner-most loop containing the dataref),
996 and this is done be step (5) below.
998 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
999 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1000 redundant. Steps (3),(4) create the following:
1002 vp0 = &base_addr;
1003 LOOP: vp1 = phi(vp0,vp2)
1004 ...
1006 vp2 = vp1 + step
1007 goto LOOP
1009 If there is an inner-loop nested in loop, then step (5) will also be
1010 applied, and an additional update in the inner-loop will be created:
1012 vp0 = &base_addr;
1013 LOOP: vp1 = phi(vp0,vp2)
1015 inner: vp3 = phi(vp1,vp4)
1016 vp4 = vp3 + inner_step
1017 if () goto inner
1019 vp2 = vp1 + step
1020 if () goto LOOP */
1022 /** (3) Calculate the initial address the vector-pointer, and set
1023 the vector-pointer to point to it before the loop: **/
1025 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1027 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1028 offset, loop);
1029 pe = loop_preheader_edge (loop);
1030 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1031 gcc_assert (!new_bb);
1032 *initial_address = new_temp;
1034 /* Create: p = (vectype *) initial_base */
1035 vec_stmt = fold_convert (vect_ptr_type, new_temp);
1036 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1037 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1038 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1039 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1040 gcc_assert (!new_bb);
1043 /** (4) Handle the updating of the vector-pointer inside the loop.
1044 This is needed when ONLY_INIT is false, and also when AT_LOOP
1045 is the inner-loop nested in LOOP (during outer-loop vectorization).
1048 if (only_init && at_loop == loop) /* No update in loop is required. */
1050 /* Copy the points-to information if it exists. */
1051 if (DR_PTR_INFO (dr))
1052 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1053 vptr = vect_ptr_init;
1055 else
1057 /* The step of the vector pointer is the Vector Size. */
1058 tree step = TYPE_SIZE_UNIT (vectype);
1059 /* One exception to the above is when the scalar step of the load in
1060 LOOP is zero. In this case the step here is also zero. */
1061 if (*inv_p)
1062 step = size_zero_node;
1064 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1066 create_iv (vect_ptr_init,
1067 fold_convert (vect_ptr_type, step),
1068 NULL_TREE, loop, &incr_bsi, insert_after,
1069 &indx_before_incr, &indx_after_incr);
1070 incr = bsi_stmt (incr_bsi);
1071 set_stmt_info (stmt_ann (incr),
1072 new_stmt_vec_info (incr, loop_vinfo));
1074 /* Copy the points-to information if it exists. */
1075 if (DR_PTR_INFO (dr))
1077 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1078 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1080 merge_alias_info (vect_ptr_init, indx_before_incr);
1081 merge_alias_info (vect_ptr_init, indx_after_incr);
1082 if (ptr_incr)
1083 *ptr_incr = incr;
1085 vptr = indx_before_incr;
1088 if (!nested_in_vect_loop || only_init)
1089 return vptr;
1092 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1093 nested in LOOP, if exists: **/
1095 gcc_assert (nested_in_vect_loop);
1096 if (!only_init)
1098 standard_iv_increment_position (containing_loop, &incr_bsi,
1099 &insert_after);
1100 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1101 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1102 &indx_after_incr);
1103 incr = bsi_stmt (incr_bsi);
1104 set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1106 /* Copy the points-to information if it exists. */
1107 if (DR_PTR_INFO (dr))
1109 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1110 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1112 merge_alias_info (vect_ptr_init, indx_before_incr);
1113 merge_alias_info (vect_ptr_init, indx_after_incr);
1114 if (ptr_incr)
1115 *ptr_incr = incr;
1117 return indx_before_incr;
1119 else
1120 gcc_unreachable ();
1124 /* Function bump_vector_ptr
1126 Increment a pointer (to a vector type) by vector-size. If requested,
1127 i.e. if PTR-INCR is given, then also connect the new increment stmt
1128 to the existing def-use update-chain of the pointer, by modifying
1129 the PTR_INCR as illustrated below:
1131 The pointer def-use update-chain before this function:
1132 DATAREF_PTR = phi (p_0, p_2)
1133 ....
1134 PTR_INCR: p_2 = DATAREF_PTR + step
1136 The pointer def-use update-chain after this function:
1137 DATAREF_PTR = phi (p_0, p_2)
1138 ....
1139 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1140 ....
1141 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1143 Input:
1144 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1145 in the loop.
1146 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1147 the loop. The increment amount across iterations is expected
1148 to be vector_size.
1149 BSI - location where the new update stmt is to be placed.
1150 STMT - the original scalar memory-access stmt that is being vectorized.
1151 BUMP - optional. The offset by which to bump the pointer. If not given,
1152 the offset is assumed to be vector_size.
1154 Output: Return NEW_DATAREF_PTR as illustrated above.
1158 static tree
1159 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1160 tree stmt, tree bump)
1162 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1163 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1165 tree vptr_type = TREE_TYPE (dataref_ptr);
1166 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1167 tree update = TYPE_SIZE_UNIT (vectype);
1168 tree incr_stmt;
1169 ssa_op_iter iter;
1170 use_operand_p use_p;
1171 tree new_dataref_ptr;
1173 if (bump)
1174 update = bump;
1176 incr_stmt = build_gimple_modify_stmt (ptr_var,
1177 build2 (POINTER_PLUS_EXPR, vptr_type,
1178 dataref_ptr, update));
1179 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1180 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1181 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1183 /* Copy the points-to information if it exists. */
1184 if (DR_PTR_INFO (dr))
1185 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1186 merge_alias_info (new_dataref_ptr, dataref_ptr);
1188 if (!ptr_incr)
1189 return new_dataref_ptr;
1191 /* Update the vector-pointer's cross-iteration increment. */
1192 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1194 tree use = USE_FROM_PTR (use_p);
1196 if (use == dataref_ptr)
1197 SET_USE (use_p, new_dataref_ptr);
1198 else
1199 gcc_assert (tree_int_cst_compare (use, update) == 0);
1202 return new_dataref_ptr;
1206 /* Function vect_create_destination_var.
1208 Create a new temporary of type VECTYPE. */
1210 static tree
1211 vect_create_destination_var (tree scalar_dest, tree vectype)
1213 tree vec_dest;
1214 const char *new_name;
1215 tree type;
1216 enum vect_var_kind kind;
1218 kind = vectype ? vect_simple_var : vect_scalar_var;
1219 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1221 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1223 new_name = get_name (scalar_dest);
1224 if (!new_name)
1225 new_name = "var_";
1226 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1227 add_referenced_var (vec_dest);
1229 return vec_dest;
1233 /* Function vect_init_vector.
1235 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1236 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1237 is not NULL. Otherwise, place the initialization at the loop preheader.
1238 Return the DEF of INIT_STMT.
1239 It will be used in the vectorization of STMT. */
1241 static tree
1242 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1243 block_stmt_iterator *bsi)
1245 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1246 tree new_var;
1247 tree init_stmt;
1248 tree vec_oprnd;
1249 edge pe;
1250 tree new_temp;
1251 basic_block new_bb;
1253 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1254 add_referenced_var (new_var);
1255 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1256 new_temp = make_ssa_name (new_var, init_stmt);
1257 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1259 if (bsi)
1260 vect_finish_stmt_generation (stmt, init_stmt, bsi);
1261 else
1263 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1264 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1266 if (nested_in_vect_loop_p (loop, stmt))
1267 loop = loop->inner;
1268 pe = loop_preheader_edge (loop);
1269 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1270 gcc_assert (!new_bb);
1273 if (vect_print_dump_info (REPORT_DETAILS))
1275 fprintf (vect_dump, "created new init_stmt: ");
1276 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1279 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1280 return vec_oprnd;
1284 /* For constant and loop invariant defs of SLP_NODE this function returns
1285 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1286 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1287 stmts. */
1289 static void
1290 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1291 unsigned int op_num)
1293 VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1294 tree stmt = VEC_index (tree, stmts, 0);
1295 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1296 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1297 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1298 tree vec_cst;
1299 tree t = NULL_TREE;
1300 int j, number_of_places_left_in_vector;
1301 tree vector_type;
1302 tree op, vop, operation;
1303 int group_size = VEC_length (tree, stmts);
1304 unsigned int vec_num, i;
1305 int number_of_copies = 1;
1306 bool is_store = false;
1307 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1308 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1310 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1311 is_store = true;
1313 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1314 created vectors. It is greater than 1 if unrolling is performed.
1316 For example, we have two scalar operands, s1 and s2 (e.g., group of
1317 strided accesses of size two), while NUINTS is four (i.e., four scalars
1318 of this type can be packed in a vector). The output vector will contain
1319 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1320 will be 2).
1322 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1323 containing the operands.
1325 For example, NUINTS is four as before, and the group size is 8
1326 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1327 {s5, s6, s7, s8}. */
1329 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1331 number_of_places_left_in_vector = nunits;
1332 for (j = 0; j < number_of_copies; j++)
1334 for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
1336 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1337 if (is_store)
1338 op = operation;
1339 else
1340 op = TREE_OPERAND (operation, op_num);
1342 /* Create 'vect_ = {op0,op1,...,opn}'. */
1343 t = tree_cons (NULL_TREE, op, t);
1345 number_of_places_left_in_vector--;
1347 if (number_of_places_left_in_vector == 0)
1349 number_of_places_left_in_vector = nunits;
1351 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1352 vec_cst = build_constructor_from_list (vector_type, t);
1353 VEC_quick_push (tree, voprnds,
1354 vect_init_vector (stmt, vec_cst, vector_type,
1355 NULL));
1356 t = NULL_TREE;
1361 /* Since the vectors are created in the reverse order, we should invert
1362 them. */
1363 vec_num = VEC_length (tree, voprnds);
1364 for (j = vec_num - 1; j >= 0; j--)
1366 vop = VEC_index (tree, voprnds, j);
1367 VEC_quick_push (tree, *vec_oprnds, vop);
1370 VEC_free (tree, heap, voprnds);
1372 /* In case that VF is greater than the unrolling factor needed for the SLP
1373 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1374 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1375 to replicate the vectors. */
1376 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1378 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1379 VEC_quick_push (tree, *vec_oprnds, vop);
1384 /* Get vectorized defintions from SLP_NODE that contains corresponding
1385 vectorized def-stmts. */
1387 static void
1388 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1390 tree vec_oprnd;
1391 tree vec_def_stmt;
1392 unsigned int i;
1394 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1396 for (i = 0;
1397 VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1398 i++)
1400 gcc_assert (vec_def_stmt);
1401 vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
1402 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1407 /* Get vectorized definitions for SLP_NODE.
1408 If the scalar definitions are loop invariants or constants, collect them and
1409 call vect_get_constant_vectors() to create vector stmts.
1410 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1411 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1412 vect_get_slp_vect_defs() to retrieve them.
1413 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1414 the right node. This is used when the second operand must remain scalar. */
1416 static void
1417 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1418 VEC (tree,heap) **vec_oprnds1)
1420 tree operation, first_stmt;
1422 /* Allocate memory for vectorized defs. */
1423 *vec_oprnds0 = VEC_alloc (tree, heap,
1424 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1426 /* SLP_NODE corresponds either to a group of stores or to a group of
1427 unary/binary operations. We don't call this function for loads. */
1428 if (SLP_TREE_LEFT (slp_node))
1429 /* The defs are already vectorized. */
1430 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1431 else
1432 /* Build vectors from scalar defs. */
1433 vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
1435 first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1436 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1437 /* Since we don't call this function with loads, this is a group of
1438 stores. */
1439 return;
1441 operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
1442 if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
1443 return;
1445 *vec_oprnds1 = VEC_alloc (tree, heap,
1446 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1448 if (SLP_TREE_RIGHT (slp_node))
1449 /* The defs are already vectorized. */
1450 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1451 else
1452 /* Build vectors from scalar defs. */
1453 vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
1457 /* Function get_initial_def_for_induction
1459 Input:
1460 STMT - a stmt that performs an induction operation in the loop.
1461 IV_PHI - the initial value of the induction variable
1463 Output:
1464 Return a vector variable, initialized with the first VF values of
1465 the induction variable. E.g., for an iv with IV_PHI='X' and
1466 evolution S, for a vector of 4 units, we want to return:
1467 [X, X + S, X + 2*S, X + 3*S]. */
1469 static tree
1470 get_initial_def_for_induction (tree iv_phi)
1472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1475 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1476 tree vectype = get_vectype_for_scalar_type (scalar_type);
1477 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1478 edge pe = loop_preheader_edge (loop);
1479 struct loop *iv_loop;
1480 basic_block new_bb;
1481 tree vec, vec_init, vec_step, t;
1482 tree access_fn;
1483 tree new_var;
1484 tree new_name;
1485 tree init_stmt;
1486 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1487 tree init_expr, step_expr;
1488 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1489 int i;
1490 bool ok;
1491 int ncopies = vf / nunits;
1492 tree expr;
1493 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1494 bool nested_in_vect_loop = false;
1495 tree stmts;
1496 imm_use_iterator imm_iter;
1497 use_operand_p use_p;
1498 tree exit_phi;
1499 edge latch_e;
1500 tree loop_arg;
1501 block_stmt_iterator si;
1502 basic_block bb = bb_for_stmt (iv_phi);
1504 gcc_assert (phi_info);
1505 gcc_assert (ncopies >= 1);
1507 /* Find the first insertion point in the BB. */
1508 si = bsi_after_labels (bb);
1510 if (INTEGRAL_TYPE_P (scalar_type))
1511 step_expr = build_int_cst (scalar_type, 0);
1512 else
1513 step_expr = build_real (scalar_type, dconst0);
1515 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1516 if (nested_in_vect_loop_p (loop, iv_phi))
1518 nested_in_vect_loop = true;
1519 iv_loop = loop->inner;
1521 else
1522 iv_loop = loop;
1523 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1525 latch_e = loop_latch_edge (iv_loop);
1526 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1528 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1529 gcc_assert (access_fn);
1530 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1531 &init_expr, &step_expr);
1532 gcc_assert (ok);
1533 pe = loop_preheader_edge (iv_loop);
1535 /* Create the vector that holds the initial_value of the induction. */
1536 if (nested_in_vect_loop)
1538 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1539 been created during vectorization of previous stmts; We obtain it from
1540 the STMT_VINFO_VEC_STMT of the defining stmt. */
1541 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1542 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1544 else
1546 /* iv_loop is the loop to be vectorized. Create:
1547 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1548 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1549 add_referenced_var (new_var);
1551 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1552 if (stmts)
1554 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1555 gcc_assert (!new_bb);
1558 t = NULL_TREE;
1559 t = tree_cons (NULL_TREE, init_expr, t);
1560 for (i = 1; i < nunits; i++)
1562 tree tmp;
1564 /* Create: new_name_i = new_name + step_expr */
1565 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1566 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1567 new_name = make_ssa_name (new_var, init_stmt);
1568 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1570 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1571 gcc_assert (!new_bb);
1573 if (vect_print_dump_info (REPORT_DETAILS))
1575 fprintf (vect_dump, "created new init_stmt: ");
1576 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1578 t = tree_cons (NULL_TREE, new_name, t);
1580 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1581 vec = build_constructor_from_list (vectype, nreverse (t));
1582 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1586 /* Create the vector that holds the step of the induction. */
1587 if (nested_in_vect_loop)
1588 /* iv_loop is nested in the loop to be vectorized. Generate:
1589 vec_step = [S, S, S, S] */
1590 new_name = step_expr;
1591 else
1593 /* iv_loop is the loop to be vectorized. Generate:
1594 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1595 expr = build_int_cst (scalar_type, vf);
1596 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1599 t = NULL_TREE;
1600 for (i = 0; i < nunits; i++)
1601 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1602 vec = build_constructor_from_list (vectype, t);
1603 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1606 /* Create the following def-use cycle:
1607 loop prolog:
1608 vec_init = ...
1609 vec_step = ...
1610 loop:
1611 vec_iv = PHI <vec_init, vec_loop>
1613 STMT
1615 vec_loop = vec_iv + vec_step; */
1617 /* Create the induction-phi that defines the induction-operand. */
1618 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1619 add_referenced_var (vec_dest);
1620 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1621 set_stmt_info (get_stmt_ann (induction_phi),
1622 new_stmt_vec_info (induction_phi, loop_vinfo));
1623 induc_def = PHI_RESULT (induction_phi);
1625 /* Create the iv update inside the loop */
1626 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1627 build2 (PLUS_EXPR, vectype,
1628 induc_def, vec_step));
1629 vec_def = make_ssa_name (vec_dest, new_stmt);
1630 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1631 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1632 set_stmt_info (get_stmt_ann (new_stmt),
1633 new_stmt_vec_info (new_stmt, loop_vinfo));
1635 /* Set the arguments of the phi node: */
1636 add_phi_arg (induction_phi, vec_init, pe);
1637 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1640 /* In case that vectorization factor (VF) is bigger than the number
1641 of elements that we can fit in a vectype (nunits), we have to generate
1642 more than one vector stmt - i.e - we need to "unroll" the
1643 vector stmt by a factor VF/nunits. For more details see documentation
1644 in vectorizable_operation. */
1646 if (ncopies > 1)
1648 stmt_vec_info prev_stmt_vinfo;
1649 /* FORNOW. This restriction should be relaxed. */
1650 gcc_assert (!nested_in_vect_loop);
1652 /* Create the vector that holds the step of the induction. */
1653 expr = build_int_cst (scalar_type, nunits);
1654 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1655 t = NULL_TREE;
1656 for (i = 0; i < nunits; i++)
1657 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1658 vec = build_constructor_from_list (vectype, t);
1659 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1661 vec_def = induc_def;
1662 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1663 for (i = 1; i < ncopies; i++)
1665 tree tmp;
1667 /* vec_i = vec_prev + vec_step */
1668 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1669 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1670 vec_def = make_ssa_name (vec_dest, new_stmt);
1671 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1672 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1673 set_stmt_info (get_stmt_ann (new_stmt),
1674 new_stmt_vec_info (new_stmt, loop_vinfo));
1675 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1676 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1680 if (nested_in_vect_loop)
1682 /* Find the loop-closed exit-phi of the induction, and record
1683 the final vector of induction results: */
1684 exit_phi = NULL;
1685 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1687 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1689 exit_phi = USE_STMT (use_p);
1690 break;
1693 if (exit_phi)
1695 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1696 /* FORNOW. Currently not supporting the case that an inner-loop induction
1697 is not used in the outer-loop (i.e. only outside the outer-loop). */
1698 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1699 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1701 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1702 if (vect_print_dump_info (REPORT_DETAILS))
1704 fprintf (vect_dump, "vector of inductions after inner-loop:");
1705 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1711 if (vect_print_dump_info (REPORT_DETAILS))
1713 fprintf (vect_dump, "transform induction: created def-use cycle:");
1714 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1715 fprintf (vect_dump, "\n");
1716 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1719 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1720 return induc_def;
1724 /* Function vect_get_vec_def_for_operand.
1726 OP is an operand in STMT. This function returns a (vector) def that will be
1727 used in the vectorized stmt for STMT.
1729 In the case that OP is an SSA_NAME which is defined in the loop, then
1730 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1732 In case OP is an invariant or constant, a new stmt that creates a vector def
1733 needs to be introduced. */
1735 static tree
1736 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1738 tree vec_oprnd;
1739 tree vec_stmt;
1740 tree def_stmt;
1741 stmt_vec_info def_stmt_info = NULL;
1742 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1743 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1744 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1745 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1746 tree vec_inv;
1747 tree vec_cst;
1748 tree t = NULL_TREE;
1749 tree def;
1750 int i;
1751 enum vect_def_type dt;
1752 bool is_simple_use;
1753 tree vector_type;
1755 if (vect_print_dump_info (REPORT_DETAILS))
1757 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1758 print_generic_expr (vect_dump, op, TDF_SLIM);
1761 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1762 gcc_assert (is_simple_use);
1763 if (vect_print_dump_info (REPORT_DETAILS))
1765 if (def)
1767 fprintf (vect_dump, "def = ");
1768 print_generic_expr (vect_dump, def, TDF_SLIM);
1770 if (def_stmt)
1772 fprintf (vect_dump, " def_stmt = ");
1773 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1777 switch (dt)
1779 /* Case 1: operand is a constant. */
1780 case vect_constant_def:
1782 if (scalar_def)
1783 *scalar_def = op;
1785 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1786 if (vect_print_dump_info (REPORT_DETAILS))
1787 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1789 for (i = nunits - 1; i >= 0; --i)
1791 t = tree_cons (NULL_TREE, op, t);
1793 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1794 vec_cst = build_vector (vector_type, t);
1796 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1799 /* Case 2: operand is defined outside the loop - loop invariant. */
1800 case vect_invariant_def:
1802 if (scalar_def)
1803 *scalar_def = def;
1805 /* Create 'vec_inv = {inv,inv,..,inv}' */
1806 if (vect_print_dump_info (REPORT_DETAILS))
1807 fprintf (vect_dump, "Create vector_inv.");
1809 for (i = nunits - 1; i >= 0; --i)
1811 t = tree_cons (NULL_TREE, def, t);
1814 /* FIXME: use build_constructor directly. */
1815 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1816 vec_inv = build_constructor_from_list (vector_type, t);
1817 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1820 /* Case 3: operand is defined inside the loop. */
1821 case vect_loop_def:
1823 if (scalar_def)
1824 *scalar_def = def_stmt;
1826 /* Get the def from the vectorized stmt. */
1827 def_stmt_info = vinfo_for_stmt (def_stmt);
1828 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1829 gcc_assert (vec_stmt);
1830 if (TREE_CODE (vec_stmt) == PHI_NODE)
1831 vec_oprnd = PHI_RESULT (vec_stmt);
1832 else
1833 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1834 return vec_oprnd;
1837 /* Case 4: operand is defined by a loop header phi - reduction */
1838 case vect_reduction_def:
1840 struct loop *loop;
1842 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1843 loop = (bb_for_stmt (def_stmt))->loop_father;
1845 /* Get the def before the loop */
1846 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1847 return get_initial_def_for_reduction (stmt, op, scalar_def);
1850 /* Case 5: operand is defined by loop-header phi - induction. */
1851 case vect_induction_def:
1853 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1855 /* Get the def from the vectorized stmt. */
1856 def_stmt_info = vinfo_for_stmt (def_stmt);
1857 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1858 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1859 vec_oprnd = PHI_RESULT (vec_stmt);
1860 return vec_oprnd;
1863 default:
1864 gcc_unreachable ();
1869 /* Function vect_get_vec_def_for_stmt_copy
1871 Return a vector-def for an operand. This function is used when the
1872 vectorized stmt to be created (by the caller to this function) is a "copy"
1873 created in case the vectorized result cannot fit in one vector, and several
1874 copies of the vector-stmt are required. In this case the vector-def is
1875 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1876 of the stmt that defines VEC_OPRND.
1877 DT is the type of the vector def VEC_OPRND.
1879 Context:
1880 In case the vectorization factor (VF) is bigger than the number
1881 of elements that can fit in a vectype (nunits), we have to generate
1882 more than one vector stmt to vectorize the scalar stmt. This situation
1883 arises when there are multiple data-types operated upon in the loop; the
1884 smallest data-type determines the VF, and as a result, when vectorizing
1885 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1886 vector stmt (each computing a vector of 'nunits' results, and together
1887 computing 'VF' results in each iteration). This function is called when
1888 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1889 which VF=16 and nunits=4, so the number of copies required is 4):
1891 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1893 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1894 VS1.1: vx.1 = memref1 VS1.2
1895 VS1.2: vx.2 = memref2 VS1.3
1896 VS1.3: vx.3 = memref3
1898 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1899 VSnew.1: vz1 = vx.1 + ... VSnew.2
1900 VSnew.2: vz2 = vx.2 + ... VSnew.3
1901 VSnew.3: vz3 = vx.3 + ...
1903 The vectorization of S1 is explained in vectorizable_load.
1904 The vectorization of S2:
1905 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1906 the function 'vect_get_vec_def_for_operand' is called to
1907 get the relevant vector-def for each operand of S2. For operand x it
1908 returns the vector-def 'vx.0'.
1910 To create the remaining copies of the vector-stmt (VSnew.j), this
1911 function is called to get the relevant vector-def for each operand. It is
1912 obtained from the respective VS1.j stmt, which is recorded in the
1913 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1915 For example, to obtain the vector-def 'vx.1' in order to create the
1916 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1917 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
1918 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1919 and return its def ('vx.1').
1920 Overall, to create the above sequence this function will be called 3 times:
1921 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1922 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1923 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1925 static tree
1926 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1928 tree vec_stmt_for_operand;
1929 stmt_vec_info def_stmt_info;
1931 /* Do nothing; can reuse same def. */
1932 if (dt == vect_invariant_def || dt == vect_constant_def )
1933 return vec_oprnd;
1935 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1936 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1937 gcc_assert (def_stmt_info);
1938 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1939 gcc_assert (vec_stmt_for_operand);
1940 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1941 return vec_oprnd;
1945 /* Get vectorized definitions for the operands to create a copy of an original
1946 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
1948 static void
1949 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
1950 VEC(tree,heap) **vec_oprnds0,
1951 VEC(tree,heap) **vec_oprnds1)
1953 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
1955 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
1956 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
1958 if (vec_oprnds1 && *vec_oprnds1)
1960 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
1961 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
1962 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
1967 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
1969 static void
1970 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
1971 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
1973 if (slp_node)
1974 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
1975 else
1977 tree vec_oprnd;
1979 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
1980 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
1981 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
1983 if (op1)
1985 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
1986 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
1987 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
1993 /* Function vect_finish_stmt_generation.
1995 Insert a new stmt. */
1997 static void
1998 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
1999 block_stmt_iterator *bsi)
2001 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2004 gcc_assert (stmt == bsi_stmt (*bsi));
2005 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2007 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2009 set_stmt_info (get_stmt_ann (vec_stmt),
2010 new_stmt_vec_info (vec_stmt, loop_vinfo));
2012 if (vect_print_dump_info (REPORT_DETAILS))
2014 fprintf (vect_dump, "add new stmt: ");
2015 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2018 /* Make sure bsi points to the stmt that is being vectorized. */
2019 gcc_assert (stmt == bsi_stmt (*bsi));
2021 #ifdef USE_MAPPED_LOCATION
2022 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2023 #else
2024 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2025 #endif
2029 /* Function get_initial_def_for_reduction
2031 Input:
2032 STMT - a stmt that performs a reduction operation in the loop.
2033 INIT_VAL - the initial value of the reduction variable
2035 Output:
2036 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2037 of the reduction (used for adjusting the epilog - see below).
2038 Return a vector variable, initialized according to the operation that STMT
2039 performs. This vector will be used as the initial value of the
2040 vector of partial results.
2042 Option1 (adjust in epilog): Initialize the vector as follows:
2043 add: [0,0,...,0,0]
2044 mult: [1,1,...,1,1]
2045 min/max: [init_val,init_val,..,init_val,init_val]
2046 bit and/or: [init_val,init_val,..,init_val,init_val]
2047 and when necessary (e.g. add/mult case) let the caller know
2048 that it needs to adjust the result by init_val.
2050 Option2: Initialize the vector as follows:
2051 add: [0,0,...,0,init_val]
2052 mult: [1,1,...,1,init_val]
2053 min/max: [init_val,init_val,...,init_val]
2054 bit and/or: [init_val,init_val,...,init_val]
2055 and no adjustments are needed.
2057 For example, for the following code:
2059 s = init_val;
2060 for (i=0;i<n;i++)
2061 s = s + a[i];
2063 STMT is 's = s + a[i]', and the reduction variable is 's'.
2064 For a vector of 4 units, we want to return either [0,0,0,init_val],
2065 or [0,0,0,0] and let the caller know that it needs to adjust
2066 the result at the end by 'init_val'.
2068 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2069 initialization vector is simpler (same element in all entries).
2070 A cost model should help decide between these two schemes. */
2072 static tree
2073 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2075 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2076 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2077 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2078 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2079 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2080 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2081 tree type = TREE_TYPE (init_val);
2082 tree vecdef;
2083 tree def_for_init;
2084 tree init_def;
2085 tree t = NULL_TREE;
2086 int i;
2087 tree vector_type;
2088 bool nested_in_vect_loop = false;
2090 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2091 if (nested_in_vect_loop_p (loop, stmt))
2092 nested_in_vect_loop = true;
2093 else
2094 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2096 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2098 switch (code)
2100 case WIDEN_SUM_EXPR:
2101 case DOT_PROD_EXPR:
2102 case PLUS_EXPR:
2103 if (nested_in_vect_loop)
2104 *adjustment_def = vecdef;
2105 else
2106 *adjustment_def = init_val;
2107 /* Create a vector of zeros for init_def. */
2108 if (INTEGRAL_TYPE_P (type))
2109 def_for_init = build_int_cst (type, 0);
2110 else
2111 def_for_init = build_real (type, dconst0);
2112 for (i = nunits - 1; i >= 0; --i)
2113 t = tree_cons (NULL_TREE, def_for_init, t);
2114 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2115 init_def = build_vector (vector_type, t);
2116 break;
2118 case MIN_EXPR:
2119 case MAX_EXPR:
2120 *adjustment_def = NULL_TREE;
2121 init_def = vecdef;
2122 break;
2124 default:
2125 gcc_unreachable ();
2128 return init_def;
2132 /* Function vect_create_epilog_for_reduction
2134 Create code at the loop-epilog to finalize the result of a reduction
2135 computation.
2137 VECT_DEF is a vector of partial results.
2138 REDUC_CODE is the tree-code for the epilog reduction.
2139 STMT is the scalar reduction stmt that is being vectorized.
2140 REDUCTION_PHI is the phi-node that carries the reduction computation.
2142 This function:
2143 1. Creates the reduction def-use cycle: sets the arguments for
2144 REDUCTION_PHI:
2145 The loop-entry argument is the vectorized initial-value of the reduction.
2146 The loop-latch argument is VECT_DEF - the vector of partial sums.
2147 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2148 by applying the operation specified by REDUC_CODE if available, or by
2149 other means (whole-vector shifts or a scalar loop).
2150 The function also creates a new phi node at the loop exit to preserve
2151 loop-closed form, as illustrated below.
2153 The flow at the entry to this function:
2155 loop:
2156 vec_def = phi <null, null> # REDUCTION_PHI
2157 VECT_DEF = vector_stmt # vectorized form of STMT
2158 s_loop = scalar_stmt # (scalar) STMT
2159 loop_exit:
2160 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2161 use <s_out0>
2162 use <s_out0>
2164 The above is transformed by this function into:
2166 loop:
2167 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2168 VECT_DEF = vector_stmt # vectorized form of STMT
2169 s_loop = scalar_stmt # (scalar) STMT
2170 loop_exit:
2171 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2172 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2173 v_out2 = reduce <v_out1>
2174 s_out3 = extract_field <v_out2, 0>
2175 s_out4 = adjust_result <s_out3>
2176 use <s_out4>
2177 use <s_out4>
2180 static void
2181 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2182 enum tree_code reduc_code, tree reduction_phi)
2184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2185 tree vectype;
2186 enum machine_mode mode;
2187 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2189 basic_block exit_bb;
2190 tree scalar_dest;
2191 tree scalar_type;
2192 tree new_phi;
2193 block_stmt_iterator exit_bsi;
2194 tree vec_dest;
2195 tree new_temp = NULL_TREE;
2196 tree new_name;
2197 tree epilog_stmt = NULL_TREE;
2198 tree new_scalar_dest, exit_phi, new_dest;
2199 tree bitsize, bitpos, bytesize;
2200 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2201 tree adjustment_def;
2202 tree vec_initial_def;
2203 tree orig_name;
2204 imm_use_iterator imm_iter;
2205 use_operand_p use_p;
2206 bool extract_scalar_result = false;
2207 tree reduction_op, expr;
2208 tree orig_stmt;
2209 tree use_stmt;
2210 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2211 bool nested_in_vect_loop = false;
2212 int op_type;
2213 VEC(tree,heap) *phis = NULL;
2214 int i;
2216 if (nested_in_vect_loop_p (loop, stmt))
2218 loop = loop->inner;
2219 nested_in_vect_loop = true;
2222 op_type = TREE_OPERAND_LENGTH (operation);
2223 reduction_op = TREE_OPERAND (operation, op_type-1);
2224 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2225 mode = TYPE_MODE (vectype);
2227 /*** 1. Create the reduction def-use cycle ***/
2229 /* 1.1 set the loop-entry arg of the reduction-phi: */
2230 /* For the case of reduction, vect_get_vec_def_for_operand returns
2231 the scalar def before the loop, that defines the initial value
2232 of the reduction variable. */
2233 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2234 &adjustment_def);
2235 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2237 /* 1.2 set the loop-latch arg for the reduction-phi: */
2238 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2240 if (vect_print_dump_info (REPORT_DETAILS))
2242 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2243 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2244 fprintf (vect_dump, "\n");
2245 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2249 /*** 2. Create epilog code
2250 The reduction epilog code operates across the elements of the vector
2251 of partial results computed by the vectorized loop.
2252 The reduction epilog code consists of:
2253 step 1: compute the scalar result in a vector (v_out2)
2254 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2255 step 3: adjust the scalar result (s_out3) if needed.
2257 Step 1 can be accomplished using one the following three schemes:
2258 (scheme 1) using reduc_code, if available.
2259 (scheme 2) using whole-vector shifts, if available.
2260 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2261 combined.
2263 The overall epilog code looks like this:
2265 s_out0 = phi <s_loop> # original EXIT_PHI
2266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2267 v_out2 = reduce <v_out1> # step 1
2268 s_out3 = extract_field <v_out2, 0> # step 2
2269 s_out4 = adjust_result <s_out3> # step 3
2271 (step 3 is optional, and step2 1 and 2 may be combined).
2272 Lastly, the uses of s_out0 are replaced by s_out4.
2274 ***/
2276 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2277 v_out1 = phi <v_loop> */
2279 exit_bb = single_exit (loop)->dest;
2280 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2281 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2282 exit_bsi = bsi_after_labels (exit_bb);
2284 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2285 (i.e. when reduc_code is not available) and in the final adjustment
2286 code (if needed). Also get the original scalar reduction variable as
2287 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2288 represents a reduction pattern), the tree-code and scalar-def are
2289 taken from the original stmt that the pattern-stmt (STMT) replaces.
2290 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2291 are taken from STMT. */
2293 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2294 if (!orig_stmt)
2296 /* Regular reduction */
2297 orig_stmt = stmt;
2299 else
2301 /* Reduction pattern */
2302 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2303 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2304 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2306 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2307 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2308 scalar_type = TREE_TYPE (scalar_dest);
2309 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2310 bitsize = TYPE_SIZE (scalar_type);
2311 bytesize = TYPE_SIZE_UNIT (scalar_type);
2314 /* In case this is a reduction in an inner-loop while vectorizing an outer
2315 loop - we don't need to extract a single scalar result at the end of the
2316 inner-loop. The final vector of partial results will be used in the
2317 vectorized outer-loop, or reduced to a scalar result at the end of the
2318 outer-loop. */
2319 if (nested_in_vect_loop)
2320 goto vect_finalize_reduction;
2322 /* 2.3 Create the reduction code, using one of the three schemes described
2323 above. */
2325 if (reduc_code < NUM_TREE_CODES)
2327 tree tmp;
2329 /*** Case 1: Create:
2330 v_out2 = reduc_expr <v_out1> */
2332 if (vect_print_dump_info (REPORT_DETAILS))
2333 fprintf (vect_dump, "Reduce using direct vector reduction.");
2335 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2336 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2337 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2338 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2339 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2340 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2342 extract_scalar_result = true;
2344 else
2346 enum tree_code shift_code = 0;
2347 bool have_whole_vector_shift = true;
2348 int bit_offset;
2349 int element_bitsize = tree_low_cst (bitsize, 1);
2350 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2351 tree vec_temp;
2353 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2354 shift_code = VEC_RSHIFT_EXPR;
2355 else
2356 have_whole_vector_shift = false;
2358 /* Regardless of whether we have a whole vector shift, if we're
2359 emulating the operation via tree-vect-generic, we don't want
2360 to use it. Only the first round of the reduction is likely
2361 to still be profitable via emulation. */
2362 /* ??? It might be better to emit a reduction tree code here, so that
2363 tree-vect-generic can expand the first round via bit tricks. */
2364 if (!VECTOR_MODE_P (mode))
2365 have_whole_vector_shift = false;
2366 else
2368 optab optab = optab_for_tree_code (code, vectype);
2369 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2370 have_whole_vector_shift = false;
2373 if (have_whole_vector_shift)
2375 /*** Case 2: Create:
2376 for (offset = VS/2; offset >= element_size; offset/=2)
2378 Create: va' = vec_shift <va, offset>
2379 Create: va = vop <va, va'>
2380 } */
2382 if (vect_print_dump_info (REPORT_DETAILS))
2383 fprintf (vect_dump, "Reduce using vector shifts");
2385 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2386 new_temp = PHI_RESULT (new_phi);
2388 for (bit_offset = vec_size_in_bits/2;
2389 bit_offset >= element_bitsize;
2390 bit_offset /= 2)
2392 tree bitpos = size_int (bit_offset);
2393 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2394 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2395 new_name = make_ssa_name (vec_dest, epilog_stmt);
2396 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2397 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2399 tmp = build2 (code, vectype, new_name, new_temp);
2400 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2401 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2402 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2403 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2406 extract_scalar_result = true;
2408 else
2410 tree rhs;
2412 /*** Case 3: Create:
2413 s = extract_field <v_out2, 0>
2414 for (offset = element_size;
2415 offset < vector_size;
2416 offset += element_size;)
2418 Create: s' = extract_field <v_out2, offset>
2419 Create: s = op <s, s'>
2420 } */
2422 if (vect_print_dump_info (REPORT_DETAILS))
2423 fprintf (vect_dump, "Reduce using scalar code. ");
2425 vec_temp = PHI_RESULT (new_phi);
2426 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2427 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2428 bitsize_zero_node);
2429 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2430 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2431 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2432 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2433 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2435 for (bit_offset = element_bitsize;
2436 bit_offset < vec_size_in_bits;
2437 bit_offset += element_bitsize)
2439 tree tmp;
2440 tree bitpos = bitsize_int (bit_offset);
2441 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2442 bitpos);
2444 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2445 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2446 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2447 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2448 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2450 tmp = build2 (code, scalar_type, new_name, new_temp);
2451 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2452 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2453 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2454 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2457 extract_scalar_result = false;
2461 /* 2.4 Extract the final scalar result. Create:
2462 s_out3 = extract_field <v_out2, bitpos> */
2464 if (extract_scalar_result)
2466 tree rhs;
2468 gcc_assert (!nested_in_vect_loop);
2469 if (vect_print_dump_info (REPORT_DETAILS))
2470 fprintf (vect_dump, "extract scalar result");
2472 if (BYTES_BIG_ENDIAN)
2473 bitpos = size_binop (MULT_EXPR,
2474 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2475 TYPE_SIZE (scalar_type));
2476 else
2477 bitpos = bitsize_zero_node;
2479 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2480 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2481 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2482 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2483 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2484 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2487 vect_finalize_reduction:
2489 /* 2.5 Adjust the final result by the initial value of the reduction
2490 variable. (When such adjustment is not needed, then
2491 'adjustment_def' is zero). For example, if code is PLUS we create:
2492 new_temp = loop_exit_def + adjustment_def */
2494 if (adjustment_def)
2496 if (nested_in_vect_loop)
2498 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2499 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2500 new_dest = vect_create_destination_var (scalar_dest, vectype);
2502 else
2504 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2505 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2506 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2508 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2509 new_temp = make_ssa_name (new_dest, epilog_stmt);
2510 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2511 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2515 /* 2.6 Handle the loop-exit phi */
2517 /* Replace uses of s_out0 with uses of s_out3:
2518 Find the loop-closed-use at the loop exit of the original scalar result.
2519 (The reduction result is expected to have two immediate uses - one at the
2520 latch block, and one at the loop exit). */
2521 phis = VEC_alloc (tree, heap, 10);
2522 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2524 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2526 exit_phi = USE_STMT (use_p);
2527 VEC_quick_push (tree, phis, exit_phi);
2530 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2531 gcc_assert (!VEC_empty (tree, phis));
2533 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2535 if (nested_in_vect_loop)
2537 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2539 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2540 is not used in the outer-loop (but only outside the outer-loop). */
2541 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2542 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2544 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2545 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2546 set_stmt_info (get_stmt_ann (epilog_stmt),
2547 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2548 continue;
2551 /* Replace the uses: */
2552 orig_name = PHI_RESULT (exit_phi);
2553 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2554 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2555 SET_USE (use_p, new_temp);
2557 VEC_free (tree, heap, phis);
2561 /* Function vectorizable_reduction.
2563 Check if STMT performs a reduction operation that can be vectorized.
2564 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2565 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2566 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2568 This function also handles reduction idioms (patterns) that have been
2569 recognized in advance during vect_pattern_recog. In this case, STMT may be
2570 of this form:
2571 X = pattern_expr (arg0, arg1, ..., X)
2572 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
2573 sequence that had been detected and replaced by the pattern-stmt (STMT).
2575 In some cases of reduction patterns, the type of the reduction variable X is
2576 different than the type of the other arguments of STMT.
2577 In such cases, the vectype that is used when transforming STMT into a vector
2578 stmt is different than the vectype that is used to determine the
2579 vectorization factor, because it consists of a different number of elements
2580 than the actual number of elements that are being operated upon in parallel.
2582 For example, consider an accumulation of shorts into an int accumulator.
2583 On some targets it's possible to vectorize this pattern operating on 8
2584 shorts at a time (hence, the vectype for purposes of determining the
2585 vectorization factor should be V8HI); on the other hand, the vectype that
2586 is used to create the vector form is actually V4SI (the type of the result).
2588 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2589 indicates what is the actual level of parallelism (V8HI in the example), so
2590 that the right vectorization factor would be derived. This vectype
2591 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2592 be used to create the vectorized stmt. The right vectype for the vectorized
2593 stmt is obtained from the type of the result X:
2594 get_vectype_for_scalar_type (TREE_TYPE (X))
2596 This means that, contrary to "regular" reductions (or "regular" stmts in
2597 general), the following equation:
2598 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2599 does *NOT* necessarily hold for reduction patterns. */
2601 bool
2602 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2604 tree vec_dest;
2605 tree scalar_dest;
2606 tree op;
2607 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2608 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2609 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2610 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2611 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2612 tree operation;
2613 enum tree_code code, orig_code, epilog_reduc_code = 0;
2614 enum machine_mode vec_mode;
2615 int op_type;
2616 optab optab, reduc_optab;
2617 tree new_temp = NULL_TREE;
2618 tree def, def_stmt;
2619 enum vect_def_type dt;
2620 tree new_phi;
2621 tree scalar_type;
2622 bool is_simple_use;
2623 tree orig_stmt;
2624 stmt_vec_info orig_stmt_info;
2625 tree expr = NULL_TREE;
2626 int i;
2627 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2628 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2629 stmt_vec_info prev_stmt_info;
2630 tree reduc_def;
2631 tree new_stmt = NULL_TREE;
2632 int j;
2634 if (nested_in_vect_loop_p (loop, stmt))
2636 loop = loop->inner;
2637 /* FORNOW. This restriction should be relaxed. */
2638 if (ncopies > 1)
2640 if (vect_print_dump_info (REPORT_DETAILS))
2641 fprintf (vect_dump, "multiple types in nested loop.");
2642 return false;
2646 gcc_assert (ncopies >= 1);
2648 /* FORNOW: SLP not supported. */
2649 if (STMT_SLP_TYPE (stmt_info))
2650 return false;
2652 /* 1. Is vectorizable reduction? */
2654 /* Not supportable if the reduction variable is used in the loop. */
2655 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2656 return false;
2658 /* Reductions that are not used even in an enclosing outer-loop,
2659 are expected to be "live" (used out of the loop). */
2660 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2661 && !STMT_VINFO_LIVE_P (stmt_info))
2662 return false;
2664 /* Make sure it was already recognized as a reduction computation. */
2665 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2666 return false;
2668 /* 2. Has this been recognized as a reduction pattern?
2670 Check if STMT represents a pattern that has been recognized
2671 in earlier analysis stages. For stmts that represent a pattern,
2672 the STMT_VINFO_RELATED_STMT field records the last stmt in
2673 the original sequence that constitutes the pattern. */
2675 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2676 if (orig_stmt)
2678 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2679 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2680 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2681 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2684 /* 3. Check the operands of the operation. The first operands are defined
2685 inside the loop body. The last operand is the reduction variable,
2686 which is defined by the loop-header-phi. */
2688 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2690 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2691 code = TREE_CODE (operation);
2692 op_type = TREE_OPERAND_LENGTH (operation);
2693 if (op_type != binary_op && op_type != ternary_op)
2694 return false;
2695 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2696 scalar_type = TREE_TYPE (scalar_dest);
2698 /* All uses but the last are expected to be defined in the loop.
2699 The last use is the reduction variable. */
2700 for (i = 0; i < op_type-1; i++)
2702 op = TREE_OPERAND (operation, i);
2703 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2704 gcc_assert (is_simple_use);
2705 if (dt != vect_loop_def
2706 && dt != vect_invariant_def
2707 && dt != vect_constant_def
2708 && dt != vect_induction_def)
2709 return false;
2712 op = TREE_OPERAND (operation, i);
2713 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2714 gcc_assert (is_simple_use);
2715 gcc_assert (dt == vect_reduction_def);
2716 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2717 if (orig_stmt)
2718 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2719 else
2720 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2722 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2723 return false;
2725 /* 4. Supportable by target? */
2727 /* 4.1. check support for the operation in the loop */
2728 optab = optab_for_tree_code (code, vectype);
2729 if (!optab)
2731 if (vect_print_dump_info (REPORT_DETAILS))
2732 fprintf (vect_dump, "no optab.");
2733 return false;
2735 vec_mode = TYPE_MODE (vectype);
2736 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2738 if (vect_print_dump_info (REPORT_DETAILS))
2739 fprintf (vect_dump, "op not supported by target.");
2740 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2741 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2742 < vect_min_worthwhile_factor (code))
2743 return false;
2744 if (vect_print_dump_info (REPORT_DETAILS))
2745 fprintf (vect_dump, "proceeding using word mode.");
2748 /* Worthwhile without SIMD support? */
2749 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2750 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2751 < vect_min_worthwhile_factor (code))
2753 if (vect_print_dump_info (REPORT_DETAILS))
2754 fprintf (vect_dump, "not worthwhile without SIMD support.");
2755 return false;
2758 /* 4.2. Check support for the epilog operation.
2760 If STMT represents a reduction pattern, then the type of the
2761 reduction variable may be different than the type of the rest
2762 of the arguments. For example, consider the case of accumulation
2763 of shorts into an int accumulator; The original code:
2764 S1: int_a = (int) short_a;
2765 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2767 was replaced with:
2768 STMT: int_acc = widen_sum <short_a, int_acc>
2770 This means that:
2771 1. The tree-code that is used to create the vector operation in the
2772 epilog code (that reduces the partial results) is not the
2773 tree-code of STMT, but is rather the tree-code of the original
2774 stmt from the pattern that STMT is replacing. I.e, in the example
2775 above we want to use 'widen_sum' in the loop, but 'plus' in the
2776 epilog.
2777 2. The type (mode) we use to check available target support
2778 for the vector operation to be created in the *epilog*, is
2779 determined by the type of the reduction variable (in the example
2780 above we'd check this: plus_optab[vect_int_mode]).
2781 However the type (mode) we use to check available target support
2782 for the vector operation to be created *inside the loop*, is
2783 determined by the type of the other arguments to STMT (in the
2784 example we'd check this: widen_sum_optab[vect_short_mode]).
2786 This is contrary to "regular" reductions, in which the types of all
2787 the arguments are the same as the type of the reduction variable.
2788 For "regular" reductions we can therefore use the same vector type
2789 (and also the same tree-code) when generating the epilog code and
2790 when generating the code inside the loop. */
2792 if (orig_stmt)
2794 /* This is a reduction pattern: get the vectype from the type of the
2795 reduction variable, and get the tree-code from orig_stmt. */
2796 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2797 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2798 vec_mode = TYPE_MODE (vectype);
2800 else
2802 /* Regular reduction: use the same vectype and tree-code as used for
2803 the vector code inside the loop can be used for the epilog code. */
2804 orig_code = code;
2807 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2808 return false;
2809 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2810 if (!reduc_optab)
2812 if (vect_print_dump_info (REPORT_DETAILS))
2813 fprintf (vect_dump, "no optab for reduction.");
2814 epilog_reduc_code = NUM_TREE_CODES;
2816 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2818 if (vect_print_dump_info (REPORT_DETAILS))
2819 fprintf (vect_dump, "reduc op not supported by target.");
2820 epilog_reduc_code = NUM_TREE_CODES;
2823 if (!vec_stmt) /* transformation not required. */
2825 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2826 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
2827 return true;
2830 /** Transform. **/
2832 if (vect_print_dump_info (REPORT_DETAILS))
2833 fprintf (vect_dump, "transform reduction.");
2835 /* Create the destination vector */
2836 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2838 /* Create the reduction-phi that defines the reduction-operand. */
2839 new_phi = create_phi_node (vec_dest, loop->header);
2841 /* In case the vectorization factor (VF) is bigger than the number
2842 of elements that we can fit in a vectype (nunits), we have to generate
2843 more than one vector stmt - i.e - we need to "unroll" the
2844 vector stmt by a factor VF/nunits. For more details see documentation
2845 in vectorizable_operation. */
2847 prev_stmt_info = NULL;
2848 for (j = 0; j < ncopies; j++)
2850 /* Handle uses. */
2851 if (j == 0)
2853 op = TREE_OPERAND (operation, 0);
2854 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2855 if (op_type == ternary_op)
2857 op = TREE_OPERAND (operation, 1);
2858 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2861 /* Get the vector def for the reduction variable from the phi node */
2862 reduc_def = PHI_RESULT (new_phi);
2864 else
2866 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2867 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2868 if (op_type == ternary_op)
2869 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2871 /* Get the vector def for the reduction variable from the vectorized
2872 reduction operation generated in the previous iteration (j-1) */
2873 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2876 /* Arguments are ready. create the new vector stmt. */
2877 if (op_type == binary_op)
2878 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2879 else
2880 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2881 reduc_def);
2882 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2883 new_temp = make_ssa_name (vec_dest, new_stmt);
2884 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2885 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2887 if (j == 0)
2888 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2889 else
2890 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2891 prev_stmt_info = vinfo_for_stmt (new_stmt);
2894 /* Finalize the reduction-phi (set it's arguments) and create the
2895 epilog reduction code. */
2896 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2897 return true;
2900 /* Checks if CALL can be vectorized in type VECTYPE. Returns
2901 a function declaration if the target has a vectorized version
2902 of the function, or NULL_TREE if the function cannot be vectorized. */
2904 tree
2905 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2907 tree fndecl = get_callee_fndecl (call);
2908 enum built_in_function code;
2910 /* We only handle functions that do not read or clobber memory -- i.e.
2911 const or novops ones. */
2912 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
2913 return NULL_TREE;
2915 if (!fndecl
2916 || TREE_CODE (fndecl) != FUNCTION_DECL
2917 || !DECL_BUILT_IN (fndecl))
2918 return NULL_TREE;
2920 code = DECL_FUNCTION_CODE (fndecl);
2921 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2922 vectype_in);
2925 /* Function vectorizable_call.
2927 Check if STMT performs a function call that can be vectorized.
2928 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2929 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2930 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2932 bool
2933 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2935 tree vec_dest;
2936 tree scalar_dest;
2937 tree operation;
2938 tree op, type;
2939 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2940 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2941 tree vectype_out, vectype_in;
2942 int nunits_in;
2943 int nunits_out;
2944 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2945 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2946 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2947 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2948 tree new_stmt;
2949 int ncopies, j, nargs;
2950 call_expr_arg_iterator iter;
2951 tree vargs;
2952 enum { NARROW, NONE, WIDEN } modifier;
2954 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2955 return false;
2957 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2958 return false;
2960 /* FORNOW: SLP not supported. */
2961 if (STMT_SLP_TYPE (stmt_info))
2962 return false;
2964 /* FORNOW: not yet supported. */
2965 if (STMT_VINFO_LIVE_P (stmt_info))
2967 if (vect_print_dump_info (REPORT_DETAILS))
2968 fprintf (vect_dump, "value used after loop.");
2969 return false;
2972 /* Is STMT a vectorizable call? */
2973 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2974 return false;
2976 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2977 return false;
2979 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2980 if (TREE_CODE (operation) != CALL_EXPR)
2981 return false;
2983 /* Process function arguments. */
2984 rhs_type = NULL_TREE;
2985 nargs = 0;
2986 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2988 /* Bail out if the function has more than two arguments, we
2989 do not have interesting builtin functions to vectorize with
2990 more than two arguments. */
2991 if (nargs >= 2)
2992 return false;
2994 /* We can only handle calls with arguments of the same type. */
2995 if (rhs_type
2996 && rhs_type != TREE_TYPE (op))
2998 if (vect_print_dump_info (REPORT_DETAILS))
2999 fprintf (vect_dump, "argument types differ.");
3000 return false;
3002 rhs_type = TREE_TYPE (op);
3004 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3006 if (vect_print_dump_info (REPORT_DETAILS))
3007 fprintf (vect_dump, "use not simple.");
3008 return false;
3011 ++nargs;
3014 /* No arguments is also not good. */
3015 if (nargs == 0)
3016 return false;
3018 vectype_in = get_vectype_for_scalar_type (rhs_type);
3019 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3021 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3022 vectype_out = get_vectype_for_scalar_type (lhs_type);
3023 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3025 /* FORNOW */
3026 if (nunits_in == nunits_out / 2)
3027 modifier = NARROW;
3028 else if (nunits_out == nunits_in)
3029 modifier = NONE;
3030 else if (nunits_out == nunits_in / 2)
3031 modifier = WIDEN;
3032 else
3033 return false;
3035 /* For now, we only vectorize functions if a target specific builtin
3036 is available. TODO -- in some cases, it might be profitable to
3037 insert the calls for pieces of the vector, in order to be able
3038 to vectorize other operations in the loop. */
3039 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3040 if (fndecl == NULL_TREE)
3042 if (vect_print_dump_info (REPORT_DETAILS))
3043 fprintf (vect_dump, "function is not vectorizable.");
3045 return false;
3048 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3050 if (modifier == NARROW)
3051 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3052 else
3053 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3055 /* Sanity check: make sure that at least one copy of the vectorized stmt
3056 needs to be generated. */
3057 gcc_assert (ncopies >= 1);
3059 /* FORNOW. This restriction should be relaxed. */
3060 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3062 if (vect_print_dump_info (REPORT_DETAILS))
3063 fprintf (vect_dump, "multiple types in nested loop.");
3064 return false;
3067 if (!vec_stmt) /* transformation not required. */
3069 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3070 if (vect_print_dump_info (REPORT_DETAILS))
3071 fprintf (vect_dump, "=== vectorizable_call ===");
3072 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3073 return true;
3076 /** Transform. **/
3078 if (vect_print_dump_info (REPORT_DETAILS))
3079 fprintf (vect_dump, "transform operation.");
3081 /* FORNOW. This restriction should be relaxed. */
3082 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3084 if (vect_print_dump_info (REPORT_DETAILS))
3085 fprintf (vect_dump, "multiple types in nested loop.");
3086 return false;
3089 /* Handle def. */
3090 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3091 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3093 prev_stmt_info = NULL;
3094 switch (modifier)
3096 case NONE:
3097 for (j = 0; j < ncopies; ++j)
3099 /* Build argument list for the vectorized call. */
3100 /* FIXME: Rewrite this so that it doesn't
3101 construct a temporary list. */
3102 vargs = NULL_TREE;
3103 nargs = 0;
3104 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3106 if (j == 0)
3107 vec_oprnd0
3108 = vect_get_vec_def_for_operand (op, stmt, NULL);
3109 else
3110 vec_oprnd0
3111 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3113 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3115 ++nargs;
3117 vargs = nreverse (vargs);
3119 rhs = build_function_call_expr (fndecl, vargs);
3120 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3121 new_temp = make_ssa_name (vec_dest, new_stmt);
3122 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3124 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3126 if (j == 0)
3127 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3128 else
3129 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3131 prev_stmt_info = vinfo_for_stmt (new_stmt);
3134 break;
3136 case NARROW:
3137 for (j = 0; j < ncopies; ++j)
3139 /* Build argument list for the vectorized call. */
3140 /* FIXME: Rewrite this so that it doesn't
3141 construct a temporary list. */
3142 vargs = NULL_TREE;
3143 nargs = 0;
3144 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3146 if (j == 0)
3148 vec_oprnd0
3149 = vect_get_vec_def_for_operand (op, stmt, NULL);
3150 vec_oprnd1
3151 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3153 else
3155 vec_oprnd0
3156 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3157 vec_oprnd1
3158 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3161 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3162 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3164 ++nargs;
3166 vargs = nreverse (vargs);
3168 rhs = build_function_call_expr (fndecl, vargs);
3169 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3170 new_temp = make_ssa_name (vec_dest, new_stmt);
3171 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3173 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3175 if (j == 0)
3176 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3177 else
3178 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3180 prev_stmt_info = vinfo_for_stmt (new_stmt);
3183 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3185 break;
3187 case WIDEN:
3188 /* No current target implements this case. */
3189 return false;
3192 /* The call in STMT might prevent it from being removed in dce.
3193 We however cannot remove it here, due to the way the ssa name
3194 it defines is mapped to the new definition. So just replace
3195 rhs of the statement with something harmless. */
3196 type = TREE_TYPE (scalar_dest);
3197 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3198 update_stmt (stmt);
3200 return true;
3204 /* Function vect_gen_widened_results_half
3206 Create a vector stmt whose code, type, number of arguments, and result
3207 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3208 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3209 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3210 needs to be created (DECL is a function-decl of a target-builtin).
3211 STMT is the original scalar stmt that we are vectorizing. */
3213 static tree
3214 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3215 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3216 tree vec_dest, block_stmt_iterator *bsi,
3217 tree stmt)
3219 tree expr;
3220 tree new_stmt;
3221 tree new_temp;
3222 tree sym;
3223 ssa_op_iter iter;
3225 /* Generate half of the widened result: */
3226 if (code == CALL_EXPR)
3228 /* Target specific support */
3229 if (op_type == binary_op)
3230 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3231 else
3232 expr = build_call_expr (decl, 1, vec_oprnd0);
3234 else
3236 /* Generic support */
3237 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3238 if (op_type == binary_op)
3239 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3240 else
3241 expr = build1 (code, vectype, vec_oprnd0);
3243 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3244 new_temp = make_ssa_name (vec_dest, new_stmt);
3245 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3246 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3248 if (code == CALL_EXPR)
3250 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3252 if (TREE_CODE (sym) == SSA_NAME)
3253 sym = SSA_NAME_VAR (sym);
3254 mark_sym_for_renaming (sym);
3258 return new_stmt;
3262 /* Check if STMT performs a conversion operation, that can be vectorized.
3263 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3264 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3265 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3267 bool
3268 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3269 tree *vec_stmt, slp_tree slp_node)
3271 tree vec_dest;
3272 tree scalar_dest;
3273 tree operation;
3274 tree op0;
3275 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3279 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3280 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3281 tree new_temp;
3282 tree def, def_stmt;
3283 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3284 tree new_stmt = NULL_TREE;
3285 stmt_vec_info prev_stmt_info;
3286 int nunits_in;
3287 int nunits_out;
3288 tree vectype_out, vectype_in;
3289 int ncopies, j;
3290 tree expr;
3291 tree rhs_type, lhs_type;
3292 tree builtin_decl;
3293 enum { NARROW, NONE, WIDEN } modifier;
3294 int i;
3295 VEC(tree,heap) *vec_oprnds0 = NULL;
3296 tree vop0;
3298 /* Is STMT a vectorizable conversion? */
3300 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3301 return false;
3303 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3304 return false;
3306 if (STMT_VINFO_LIVE_P (stmt_info))
3308 /* FORNOW: not yet supported. */
3309 if (vect_print_dump_info (REPORT_DETAILS))
3310 fprintf (vect_dump, "value used after loop.");
3311 return false;
3314 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3315 return false;
3317 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3318 return false;
3320 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3321 code = TREE_CODE (operation);
3322 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3323 return false;
3325 /* Check types of lhs and rhs. */
3326 op0 = TREE_OPERAND (operation, 0);
3327 rhs_type = TREE_TYPE (op0);
3328 vectype_in = get_vectype_for_scalar_type (rhs_type);
3329 if (!vectype_in)
3330 return false;
3331 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3333 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3334 lhs_type = TREE_TYPE (scalar_dest);
3335 vectype_out = get_vectype_for_scalar_type (lhs_type);
3336 if (!vectype_out)
3337 return false;
3338 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3340 /* FORNOW */
3341 if (nunits_in == nunits_out / 2)
3342 modifier = NARROW;
3343 else if (nunits_out == nunits_in)
3344 modifier = NONE;
3345 else if (nunits_out == nunits_in / 2)
3346 modifier = WIDEN;
3347 else
3348 return false;
3350 if (modifier == NONE)
3351 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3353 /* Bail out if the types are both integral or non-integral. */
3354 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3355 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3356 return false;
3358 if (modifier == NARROW)
3359 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3360 else
3361 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3363 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3364 this, so we can safely override NCOPIES with 1 here. */
3365 if (slp_node)
3366 ncopies = 1;
3368 /* Sanity check: make sure that at least one copy of the vectorized stmt
3369 needs to be generated. */
3370 gcc_assert (ncopies >= 1);
3372 /* FORNOW. This restriction should be relaxed. */
3373 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3375 if (vect_print_dump_info (REPORT_DETAILS))
3376 fprintf (vect_dump, "multiple types in nested loop.");
3377 return false;
3380 /* Check the operands of the operation. */
3381 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3383 if (vect_print_dump_info (REPORT_DETAILS))
3384 fprintf (vect_dump, "use not simple.");
3385 return false;
3388 /* Supportable by target? */
3389 if ((modifier == NONE
3390 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3391 || (modifier == WIDEN
3392 && !supportable_widening_operation (code, stmt, vectype_in,
3393 &decl1, &decl2,
3394 &code1, &code2))
3395 || (modifier == NARROW
3396 && !supportable_narrowing_operation (code, stmt, vectype_in,
3397 &code1)))
3399 if (vect_print_dump_info (REPORT_DETAILS))
3400 fprintf (vect_dump, "op not supported by target.");
3401 return false;
3404 if (modifier != NONE)
3406 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3407 /* FORNOW: SLP not supported. */
3408 if (STMT_SLP_TYPE (stmt_info))
3409 return false;
3412 if (!vec_stmt) /* transformation not required. */
3414 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3415 return true;
3418 /** Transform. **/
3419 if (vect_print_dump_info (REPORT_DETAILS))
3420 fprintf (vect_dump, "transform conversion.");
3422 /* Handle def. */
3423 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3425 if (modifier == NONE && !slp_node)
3426 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3428 prev_stmt_info = NULL;
3429 switch (modifier)
3431 case NONE:
3432 for (j = 0; j < ncopies; j++)
3434 tree sym;
3435 ssa_op_iter iter;
3437 if (j == 0)
3438 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3439 else
3440 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3442 builtin_decl =
3443 targetm.vectorize.builtin_conversion (code, vectype_in);
3444 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3446 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3448 /* Arguments are ready. create the new vector stmt. */
3449 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3450 new_temp = make_ssa_name (vec_dest, new_stmt);
3451 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3452 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3453 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3454 SSA_OP_ALL_VIRTUALS)
3456 if (TREE_CODE (sym) == SSA_NAME)
3457 sym = SSA_NAME_VAR (sym);
3458 mark_sym_for_renaming (sym);
3460 if (slp_node)
3461 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3464 if (j == 0)
3465 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3466 else
3467 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3468 prev_stmt_info = vinfo_for_stmt (new_stmt);
3470 break;
3472 case WIDEN:
3473 /* In case the vectorization factor (VF) is bigger than the number
3474 of elements that we can fit in a vectype (nunits), we have to
3475 generate more than one vector stmt - i.e - we need to "unroll"
3476 the vector stmt by a factor VF/nunits. */
3477 for (j = 0; j < ncopies; j++)
3479 if (j == 0)
3480 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3481 else
3482 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3484 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3486 /* Generate first half of the widened result: */
3487 new_stmt
3488 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3489 vec_oprnd0, vec_oprnd1,
3490 unary_op, vec_dest, bsi, stmt);
3491 if (j == 0)
3492 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3493 else
3494 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3495 prev_stmt_info = vinfo_for_stmt (new_stmt);
3497 /* Generate second half of the widened result: */
3498 new_stmt
3499 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3500 vec_oprnd0, vec_oprnd1,
3501 unary_op, vec_dest, bsi, stmt);
3502 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3503 prev_stmt_info = vinfo_for_stmt (new_stmt);
3505 break;
3507 case NARROW:
3508 /* In case the vectorization factor (VF) is bigger than the number
3509 of elements that we can fit in a vectype (nunits), we have to
3510 generate more than one vector stmt - i.e - we need to "unroll"
3511 the vector stmt by a factor VF/nunits. */
3512 for (j = 0; j < ncopies; j++)
3514 /* Handle uses. */
3515 if (j == 0)
3517 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3518 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3520 else
3522 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3523 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3526 /* Arguments are ready. Create the new vector stmt. */
3527 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3528 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3529 new_temp = make_ssa_name (vec_dest, new_stmt);
3530 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3531 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3533 if (j == 0)
3534 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3535 else
3536 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3538 prev_stmt_info = vinfo_for_stmt (new_stmt);
3541 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3544 return true;
3548 /* Function vectorizable_assignment.
3550 Check if STMT performs an assignment (copy) that can be vectorized.
3551 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3552 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3553 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3555 bool
3556 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3557 slp_tree slp_node)
3559 tree vec_dest;
3560 tree scalar_dest;
3561 tree op;
3562 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3563 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3564 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3565 tree new_temp;
3566 tree def, def_stmt;
3567 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3568 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3569 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3570 int i;
3571 VEC(tree,heap) *vec_oprnds = NULL;
3572 tree vop;
3574 gcc_assert (ncopies >= 1);
3575 if (ncopies > 1)
3576 return false; /* FORNOW */
3578 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3579 return false;
3581 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3582 return false;
3584 /* FORNOW: not yet supported. */
3585 if (STMT_VINFO_LIVE_P (stmt_info))
3587 if (vect_print_dump_info (REPORT_DETAILS))
3588 fprintf (vect_dump, "value used after loop.");
3589 return false;
3592 /* Is vectorizable assignment? */
3593 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3594 return false;
3596 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3597 if (TREE_CODE (scalar_dest) != SSA_NAME)
3598 return false;
3600 op = GIMPLE_STMT_OPERAND (stmt, 1);
3601 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3603 if (vect_print_dump_info (REPORT_DETAILS))
3604 fprintf (vect_dump, "use not simple.");
3605 return false;
3608 if (!vec_stmt) /* transformation not required. */
3610 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3611 if (vect_print_dump_info (REPORT_DETAILS))
3612 fprintf (vect_dump, "=== vectorizable_assignment ===");
3613 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3614 return true;
3617 /** Transform. **/
3618 if (vect_print_dump_info (REPORT_DETAILS))
3619 fprintf (vect_dump, "transform assignment.");
3621 /* Handle def. */
3622 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3624 /* Handle use. */
3625 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3627 /* Arguments are ready. create the new vector stmt. */
3628 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3630 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3631 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3632 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3633 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3634 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3636 if (slp_node)
3637 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3640 VEC_free (tree, heap, vec_oprnds);
3641 return true;
3645 /* Function vect_min_worthwhile_factor.
3647 For a loop where we could vectorize the operation indicated by CODE,
3648 return the minimum vectorization factor that makes it worthwhile
3649 to use generic vectors. */
3650 static int
3651 vect_min_worthwhile_factor (enum tree_code code)
3653 switch (code)
3655 case PLUS_EXPR:
3656 case MINUS_EXPR:
3657 case NEGATE_EXPR:
3658 return 4;
3660 case BIT_AND_EXPR:
3661 case BIT_IOR_EXPR:
3662 case BIT_XOR_EXPR:
3663 case BIT_NOT_EXPR:
3664 return 2;
3666 default:
3667 return INT_MAX;
3672 /* Function vectorizable_induction
3674 Check if PHI performs an induction computation that can be vectorized.
3675 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3676 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3677 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3679 bool
3680 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3681 tree *vec_stmt)
3683 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3684 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3685 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3686 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3687 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3688 tree vec_def;
3690 gcc_assert (ncopies >= 1);
3692 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3693 return false;
3695 /* FORNOW: SLP not supported. */
3696 if (STMT_SLP_TYPE (stmt_info))
3697 return false;
3699 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3701 if (STMT_VINFO_LIVE_P (stmt_info))
3703 /* FORNOW: not yet supported. */
3704 if (vect_print_dump_info (REPORT_DETAILS))
3705 fprintf (vect_dump, "value used after loop.");
3706 return false;
3709 if (TREE_CODE (phi) != PHI_NODE)
3710 return false;
3712 if (!vec_stmt) /* transformation not required. */
3714 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3715 if (vect_print_dump_info (REPORT_DETAILS))
3716 fprintf (vect_dump, "=== vectorizable_induction ===");
3717 vect_model_induction_cost (stmt_info, ncopies);
3718 return true;
3721 /** Transform. **/
3723 if (vect_print_dump_info (REPORT_DETAILS))
3724 fprintf (vect_dump, "transform induction phi.");
3726 vec_def = get_initial_def_for_induction (phi);
3727 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3728 return true;
3732 /* Function vectorizable_operation.
3734 Check if STMT performs a binary or unary operation that can be vectorized.
3735 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3736 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3737 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3739 bool
3740 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3741 slp_tree slp_node)
3743 tree vec_dest;
3744 tree scalar_dest;
3745 tree operation;
3746 tree op0, op1 = NULL;
3747 tree vec_oprnd1 = NULL_TREE;
3748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3751 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3752 enum tree_code code;
3753 enum machine_mode vec_mode;
3754 tree new_temp;
3755 int op_type;
3756 optab optab;
3757 int icode;
3758 enum machine_mode optab_op2_mode;
3759 tree def, def_stmt;
3760 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3761 tree new_stmt = NULL_TREE;
3762 stmt_vec_info prev_stmt_info;
3763 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3764 int nunits_out;
3765 tree vectype_out;
3766 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3767 int j, i;
3768 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3769 tree vop0, vop1;
3771 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3772 this, so we can safely override NCOPIES with 1 here. */
3773 if (slp_node)
3774 ncopies = 1;
3775 gcc_assert (ncopies >= 1);
3776 /* FORNOW. This restriction should be relaxed. */
3777 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3779 if (vect_print_dump_info (REPORT_DETAILS))
3780 fprintf (vect_dump, "multiple types in nested loop.");
3781 return false;
3784 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3785 return false;
3787 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3788 return false;
3790 /* FORNOW: not yet supported. */
3791 if (STMT_VINFO_LIVE_P (stmt_info))
3793 if (vect_print_dump_info (REPORT_DETAILS))
3794 fprintf (vect_dump, "value used after loop.");
3795 return false;
3798 /* Is STMT a vectorizable binary/unary operation? */
3799 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3800 return false;
3802 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3803 return false;
3805 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3806 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3807 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3808 if (nunits_out != nunits_in)
3809 return false;
3811 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3812 code = TREE_CODE (operation);
3814 /* For pointer addition, we should use the normal plus for
3815 the vector addition. */
3816 if (code == POINTER_PLUS_EXPR)
3817 code = PLUS_EXPR;
3819 optab = optab_for_tree_code (code, vectype);
3821 /* Support only unary or binary operations. */
3822 op_type = TREE_OPERAND_LENGTH (operation);
3823 if (op_type != unary_op && op_type != binary_op)
3825 if (vect_print_dump_info (REPORT_DETAILS))
3826 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3827 return false;
3830 op0 = TREE_OPERAND (operation, 0);
3831 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3833 if (vect_print_dump_info (REPORT_DETAILS))
3834 fprintf (vect_dump, "use not simple.");
3835 return false;
3838 if (op_type == binary_op)
3840 op1 = TREE_OPERAND (operation, 1);
3841 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3843 if (vect_print_dump_info (REPORT_DETAILS))
3844 fprintf (vect_dump, "use not simple.");
3845 return false;
3849 /* Supportable by target? */
3850 if (!optab)
3852 if (vect_print_dump_info (REPORT_DETAILS))
3853 fprintf (vect_dump, "no optab.");
3854 return false;
3856 vec_mode = TYPE_MODE (vectype);
3857 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3858 if (icode == CODE_FOR_nothing)
3860 if (vect_print_dump_info (REPORT_DETAILS))
3861 fprintf (vect_dump, "op not supported by target.");
3862 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3863 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3864 < vect_min_worthwhile_factor (code))
3865 return false;
3866 if (vect_print_dump_info (REPORT_DETAILS))
3867 fprintf (vect_dump, "proceeding using word mode.");
3870 /* Worthwhile without SIMD support? */
3871 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3872 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3873 < vect_min_worthwhile_factor (code))
3875 if (vect_print_dump_info (REPORT_DETAILS))
3876 fprintf (vect_dump, "not worthwhile without SIMD support.");
3877 return false;
3880 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3882 /* FORNOW: not yet supported. */
3883 if (!VECTOR_MODE_P (vec_mode))
3884 return false;
3886 /* Invariant argument is needed for a vector shift
3887 by a scalar shift operand. */
3888 optab_op2_mode = insn_data[icode].operand[2].mode;
3889 if (! (VECTOR_MODE_P (optab_op2_mode)
3890 || dt[1] == vect_constant_def
3891 || dt[1] == vect_invariant_def))
3893 if (vect_print_dump_info (REPORT_DETAILS))
3894 fprintf (vect_dump, "operand mode requires invariant argument.");
3895 return false;
3899 if (!vec_stmt) /* transformation not required. */
3901 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3902 if (vect_print_dump_info (REPORT_DETAILS))
3903 fprintf (vect_dump, "=== vectorizable_operation ===");
3904 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3905 return true;
3908 /** Transform. **/
3910 if (vect_print_dump_info (REPORT_DETAILS))
3911 fprintf (vect_dump, "transform binary/unary operation.");
3913 /* Handle def. */
3914 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3916 if (!slp_node)
3917 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3918 if (op_type == binary_op)
3919 vec_oprnds1 = VEC_alloc (tree, heap, 1);
3921 /* In case the vectorization factor (VF) is bigger than the number
3922 of elements that we can fit in a vectype (nunits), we have to generate
3923 more than one vector stmt - i.e - we need to "unroll" the
3924 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3925 from one copy of the vector stmt to the next, in the field
3926 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3927 stages to find the correct vector defs to be used when vectorizing
3928 stmts that use the defs of the current stmt. The example below illustrates
3929 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3930 4 vectorized stmts):
3932 before vectorization:
3933 RELATED_STMT VEC_STMT
3934 S1: x = memref - -
3935 S2: z = x + 1 - -
3937 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3938 there):
3939 RELATED_STMT VEC_STMT
3940 VS1_0: vx0 = memref0 VS1_1 -
3941 VS1_1: vx1 = memref1 VS1_2 -
3942 VS1_2: vx2 = memref2 VS1_3 -
3943 VS1_3: vx3 = memref3 - -
3944 S1: x = load - VS1_0
3945 S2: z = x + 1 - -
3947 step2: vectorize stmt S2 (done here):
3948 To vectorize stmt S2 we first need to find the relevant vector
3949 def for the first operand 'x'. This is, as usual, obtained from
3950 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3951 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3952 relevant vector def 'vx0'. Having found 'vx0' we can generate
3953 the vector stmt VS2_0, and as usual, record it in the
3954 STMT_VINFO_VEC_STMT of stmt S2.
3955 When creating the second copy (VS2_1), we obtain the relevant vector
3956 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3957 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3958 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3959 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3960 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3961 chain of stmts and pointers:
3962 RELATED_STMT VEC_STMT
3963 VS1_0: vx0 = memref0 VS1_1 -
3964 VS1_1: vx1 = memref1 VS1_2 -
3965 VS1_2: vx2 = memref2 VS1_3 -
3966 VS1_3: vx3 = memref3 - -
3967 S1: x = load - VS1_0
3968 VS2_0: vz0 = vx0 + v1 VS2_1 -
3969 VS2_1: vz1 = vx1 + v1 VS2_2 -
3970 VS2_2: vz2 = vx2 + v1 VS2_3 -
3971 VS2_3: vz3 = vx3 + v1 - -
3972 S2: z = x + 1 - VS2_0 */
3974 prev_stmt_info = NULL;
3975 for (j = 0; j < ncopies; j++)
3977 /* Handle uses. */
3978 if (j == 0)
3980 if (op_type == binary_op
3981 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
3983 /* Vector shl and shr insn patterns can be defined with scalar
3984 operand 2 (shift operand). In this case, use constant or loop
3985 invariant op1 directly, without extending it to vector mode
3986 first. */
3987 optab_op2_mode = insn_data[icode].operand[2].mode;
3988 if (!VECTOR_MODE_P (optab_op2_mode))
3990 if (vect_print_dump_info (REPORT_DETAILS))
3991 fprintf (vect_dump, "operand 1 using scalar mode.");
3992 vec_oprnd1 = op1;
3993 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
3997 /* vec_oprnd is available if operand 1 should be of a scalar-type
3998 (a special case for certain kind of vector shifts); otherwise,
3999 operand 1 should be of a vector type (the usual case). */
4000 if (op_type == binary_op && !vec_oprnd1)
4001 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4002 slp_node);
4003 else
4004 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4005 slp_node);
4007 else
4008 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4010 /* Arguments are ready. Create the new vector stmt. */
4011 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4013 if (op_type == binary_op)
4015 vop1 = VEC_index (tree, vec_oprnds1, i);
4016 new_stmt = build_gimple_modify_stmt (vec_dest,
4017 build2 (code, vectype, vop0, vop1));
4019 else
4020 new_stmt = build_gimple_modify_stmt (vec_dest,
4021 build1 (code, vectype, vop0));
4023 new_temp = make_ssa_name (vec_dest, new_stmt);
4024 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4025 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4026 if (slp_node)
4027 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4030 if (j == 0)
4031 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4032 else
4033 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4034 prev_stmt_info = vinfo_for_stmt (new_stmt);
4037 VEC_free (tree, heap, vec_oprnds0);
4038 if (vec_oprnds1)
4039 VEC_free (tree, heap, vec_oprnds1);
4041 return true;
4045 /* Function vectorizable_type_demotion
4047 Check if STMT performs a binary or unary operation that involves
4048 type demotion, and if it can be vectorized.
4049 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4050 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4051 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4053 bool
4054 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4055 tree *vec_stmt)
4057 tree vec_dest;
4058 tree scalar_dest;
4059 tree operation;
4060 tree op0;
4061 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4062 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4063 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4064 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4065 enum tree_code code, code1 = ERROR_MARK;
4066 tree new_temp;
4067 tree def, def_stmt;
4068 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4069 tree new_stmt;
4070 stmt_vec_info prev_stmt_info;
4071 int nunits_in;
4072 int nunits_out;
4073 tree vectype_out;
4074 int ncopies;
4075 int j;
4076 tree expr;
4077 tree vectype_in;
4079 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4080 return false;
4082 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4083 return false;
4085 /* FORNOW: not yet supported. */
4086 if (STMT_VINFO_LIVE_P (stmt_info))
4088 if (vect_print_dump_info (REPORT_DETAILS))
4089 fprintf (vect_dump, "value used after loop.");
4090 return false;
4093 /* Is STMT a vectorizable type-demotion operation? */
4094 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4095 return false;
4097 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4098 return false;
4100 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4101 code = TREE_CODE (operation);
4102 if (code != NOP_EXPR && code != CONVERT_EXPR)
4103 return false;
4105 op0 = TREE_OPERAND (operation, 0);
4106 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4107 if (!vectype_in)
4108 return false;
4109 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4111 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4112 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4113 if (!vectype_out)
4114 return false;
4115 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4116 if (nunits_in != nunits_out / 2) /* FORNOW */
4117 return false;
4119 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4120 gcc_assert (ncopies >= 1);
4121 /* FORNOW. This restriction should be relaxed. */
4122 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4124 if (vect_print_dump_info (REPORT_DETAILS))
4125 fprintf (vect_dump, "multiple types in nested loop.");
4126 return false;
4129 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4130 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4131 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4132 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4133 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4134 return false;
4136 /* Check the operands of the operation. */
4137 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4139 if (vect_print_dump_info (REPORT_DETAILS))
4140 fprintf (vect_dump, "use not simple.");
4141 return false;
4144 /* Supportable by target? */
4145 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4146 return false;
4148 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4150 if (!vec_stmt) /* transformation not required. */
4152 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4153 if (vect_print_dump_info (REPORT_DETAILS))
4154 fprintf (vect_dump, "=== vectorizable_demotion ===");
4155 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4156 return true;
4159 /** Transform. **/
4160 if (vect_print_dump_info (REPORT_DETAILS))
4161 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4162 ncopies);
4164 /* Handle def. */
4165 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4167 /* In case the vectorization factor (VF) is bigger than the number
4168 of elements that we can fit in a vectype (nunits), we have to generate
4169 more than one vector stmt - i.e - we need to "unroll" the
4170 vector stmt by a factor VF/nunits. */
4171 prev_stmt_info = NULL;
4172 for (j = 0; j < ncopies; j++)
4174 /* Handle uses. */
4175 if (j == 0)
4177 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4178 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4180 else
4182 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4183 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4186 /* Arguments are ready. Create the new vector stmt. */
4187 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4188 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4189 new_temp = make_ssa_name (vec_dest, new_stmt);
4190 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4191 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4193 if (j == 0)
4194 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4195 else
4196 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4198 prev_stmt_info = vinfo_for_stmt (new_stmt);
4201 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4202 return true;
4206 /* Function vectorizable_type_promotion
4208 Check if STMT performs a binary or unary operation that involves
4209 type promotion, and if it can be vectorized.
4210 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4211 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4212 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4214 bool
4215 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4216 tree *vec_stmt)
4218 tree vec_dest;
4219 tree scalar_dest;
4220 tree operation;
4221 tree op0, op1 = NULL;
4222 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4223 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4224 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4225 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4226 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4227 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4228 int op_type;
4229 tree def, def_stmt;
4230 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4231 tree new_stmt;
4232 stmt_vec_info prev_stmt_info;
4233 int nunits_in;
4234 int nunits_out;
4235 tree vectype_out;
4236 int ncopies;
4237 int j;
4238 tree vectype_in;
4240 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4241 return false;
4243 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4244 return false;
4246 /* FORNOW: not yet supported. */
4247 if (STMT_VINFO_LIVE_P (stmt_info))
4249 if (vect_print_dump_info (REPORT_DETAILS))
4250 fprintf (vect_dump, "value used after loop.");
4251 return false;
4254 /* Is STMT a vectorizable type-promotion operation? */
4255 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4256 return false;
4258 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4259 return false;
4261 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4262 code = TREE_CODE (operation);
4263 if (code != NOP_EXPR && code != CONVERT_EXPR
4264 && code != WIDEN_MULT_EXPR)
4265 return false;
4267 op0 = TREE_OPERAND (operation, 0);
4268 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4269 if (!vectype_in)
4270 return false;
4271 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4273 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4274 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4275 if (!vectype_out)
4276 return false;
4277 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4278 if (nunits_out != nunits_in / 2) /* FORNOW */
4279 return false;
4281 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4282 gcc_assert (ncopies >= 1);
4283 /* FORNOW. This restriction should be relaxed. */
4284 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4286 if (vect_print_dump_info (REPORT_DETAILS))
4287 fprintf (vect_dump, "multiple types in nested loop.");
4288 return false;
4291 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4292 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4293 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4294 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4295 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4296 return false;
4298 /* Check the operands of the operation. */
4299 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4301 if (vect_print_dump_info (REPORT_DETAILS))
4302 fprintf (vect_dump, "use not simple.");
4303 return false;
4306 op_type = TREE_CODE_LENGTH (code);
4307 if (op_type == binary_op)
4309 op1 = TREE_OPERAND (operation, 1);
4310 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4312 if (vect_print_dump_info (REPORT_DETAILS))
4313 fprintf (vect_dump, "use not simple.");
4314 return false;
4318 /* Supportable by target? */
4319 if (!supportable_widening_operation (code, stmt, vectype_in,
4320 &decl1, &decl2, &code1, &code2))
4321 return false;
4323 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4325 if (!vec_stmt) /* transformation not required. */
4327 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4328 if (vect_print_dump_info (REPORT_DETAILS))
4329 fprintf (vect_dump, "=== vectorizable_promotion ===");
4330 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4331 return true;
4334 /** Transform. **/
4336 if (vect_print_dump_info (REPORT_DETAILS))
4337 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4338 ncopies);
4340 /* Handle def. */
4341 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4343 /* In case the vectorization factor (VF) is bigger than the number
4344 of elements that we can fit in a vectype (nunits), we have to generate
4345 more than one vector stmt - i.e - we need to "unroll" the
4346 vector stmt by a factor VF/nunits. */
4348 prev_stmt_info = NULL;
4349 for (j = 0; j < ncopies; j++)
4351 /* Handle uses. */
4352 if (j == 0)
4354 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4355 if (op_type == binary_op)
4356 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4358 else
4360 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4361 if (op_type == binary_op)
4362 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4365 /* Arguments are ready. Create the new vector stmt. We are creating
4366 two vector defs because the widened result does not fit in one vector.
4367 The vectorized stmt can be expressed as a call to a taregt builtin,
4368 or a using a tree-code. */
4369 /* Generate first half of the widened result: */
4370 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4371 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4372 if (j == 0)
4373 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4374 else
4375 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4376 prev_stmt_info = vinfo_for_stmt (new_stmt);
4378 /* Generate second half of the widened result: */
4379 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4380 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4381 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4382 prev_stmt_info = vinfo_for_stmt (new_stmt);
4386 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4387 return true;
4391 /* Function vect_strided_store_supported.
4393 Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4394 and FALSE otherwise. */
4396 static bool
4397 vect_strided_store_supported (tree vectype)
4399 optab interleave_high_optab, interleave_low_optab;
4400 int mode;
4402 mode = (int) TYPE_MODE (vectype);
4404 /* Check that the operation is supported. */
4405 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4406 vectype);
4407 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4408 vectype);
4409 if (!interleave_high_optab || !interleave_low_optab)
4411 if (vect_print_dump_info (REPORT_DETAILS))
4412 fprintf (vect_dump, "no optab for interleave.");
4413 return false;
4416 if (optab_handler (interleave_high_optab, mode)->insn_code
4417 == CODE_FOR_nothing
4418 || optab_handler (interleave_low_optab, mode)->insn_code
4419 == CODE_FOR_nothing)
4421 if (vect_print_dump_info (REPORT_DETAILS))
4422 fprintf (vect_dump, "interleave op not supported by target.");
4423 return false;
4426 return true;
4430 /* Function vect_permute_store_chain.
4432 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4433 a power of 2, generate interleave_high/low stmts to reorder the data
4434 correctly for the stores. Return the final references for stores in
4435 RESULT_CHAIN.
4437 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4438 The input is 4 vectors each containing 8 elements. We assign a number to each
4439 element, the input sequence is:
4441 1st vec: 0 1 2 3 4 5 6 7
4442 2nd vec: 8 9 10 11 12 13 14 15
4443 3rd vec: 16 17 18 19 20 21 22 23
4444 4th vec: 24 25 26 27 28 29 30 31
4446 The output sequence should be:
4448 1st vec: 0 8 16 24 1 9 17 25
4449 2nd vec: 2 10 18 26 3 11 19 27
4450 3rd vec: 4 12 20 28 5 13 21 30
4451 4th vec: 6 14 22 30 7 15 23 31
4453 i.e., we interleave the contents of the four vectors in their order.
4455 We use interleave_high/low instructions to create such output. The input of
4456 each interleave_high/low operation is two vectors:
4457 1st vec 2nd vec
4458 0 1 2 3 4 5 6 7
4459 the even elements of the result vector are obtained left-to-right from the
4460 high/low elements of the first vector. The odd elements of the result are
4461 obtained left-to-right from the high/low elements of the second vector.
4462 The output of interleave_high will be: 0 4 1 5
4463 and of interleave_low: 2 6 3 7
4466 The permutation is done in log LENGTH stages. In each stage interleave_high
4467 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4468 where the first argument is taken from the first half of DR_CHAIN and the
4469 second argument from it's second half.
4470 In our example,
4472 I1: interleave_high (1st vec, 3rd vec)
4473 I2: interleave_low (1st vec, 3rd vec)
4474 I3: interleave_high (2nd vec, 4th vec)
4475 I4: interleave_low (2nd vec, 4th vec)
4477 The output for the first stage is:
4479 I1: 0 16 1 17 2 18 3 19
4480 I2: 4 20 5 21 6 22 7 23
4481 I3: 8 24 9 25 10 26 11 27
4482 I4: 12 28 13 29 14 30 15 31
4484 The output of the second stage, i.e. the final result is:
4486 I1: 0 8 16 24 1 9 17 25
4487 I2: 2 10 18 26 3 11 19 27
4488 I3: 4 12 20 28 5 13 21 30
4489 I4: 6 14 22 30 7 15 23 31. */
4491 static bool
4492 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4493 unsigned int length,
4494 tree stmt,
4495 block_stmt_iterator *bsi,
4496 VEC(tree,heap) **result_chain)
4498 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4499 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4500 tree scalar_dest, tmp;
4501 int i;
4502 unsigned int j;
4503 VEC(tree,heap) *first, *second;
4505 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4506 first = VEC_alloc (tree, heap, length/2);
4507 second = VEC_alloc (tree, heap, length/2);
4509 /* Check that the operation is supported. */
4510 if (!vect_strided_store_supported (vectype))
4511 return false;
4513 *result_chain = VEC_copy (tree, heap, dr_chain);
4515 for (i = 0; i < exact_log2 (length); i++)
4517 for (j = 0; j < length/2; j++)
4519 vect1 = VEC_index (tree, dr_chain, j);
4520 vect2 = VEC_index (tree, dr_chain, j+length/2);
4522 /* Create interleaving stmt:
4523 in the case of big endian:
4524 high = interleave_high (vect1, vect2)
4525 and in the case of little endian:
4526 high = interleave_low (vect1, vect2). */
4527 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4528 DECL_GIMPLE_REG_P (perm_dest) = 1;
4529 add_referenced_var (perm_dest);
4530 if (BYTES_BIG_ENDIAN)
4531 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4532 else
4533 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4534 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4535 high = make_ssa_name (perm_dest, perm_stmt);
4536 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4537 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4538 VEC_replace (tree, *result_chain, 2*j, high);
4540 /* Create interleaving stmt:
4541 in the case of big endian:
4542 low = interleave_low (vect1, vect2)
4543 and in the case of little endian:
4544 low = interleave_high (vect1, vect2). */
4545 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4546 DECL_GIMPLE_REG_P (perm_dest) = 1;
4547 add_referenced_var (perm_dest);
4548 if (BYTES_BIG_ENDIAN)
4549 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4550 else
4551 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4552 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4553 low = make_ssa_name (perm_dest, perm_stmt);
4554 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4555 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4556 VEC_replace (tree, *result_chain, 2*j+1, low);
4558 dr_chain = VEC_copy (tree, heap, *result_chain);
4560 return true;
4564 /* Function vectorizable_store.
4566 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
4567 can be vectorized.
4568 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4569 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4570 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4572 bool
4573 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4574 slp_tree slp_node)
4576 tree scalar_dest;
4577 tree data_ref;
4578 tree op;
4579 tree vec_oprnd = NULL_TREE;
4580 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4581 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4582 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4583 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4584 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4585 enum machine_mode vec_mode;
4586 tree dummy;
4587 enum dr_alignment_support alignment_support_scheme;
4588 tree def, def_stmt;
4589 enum vect_def_type dt;
4590 stmt_vec_info prev_stmt_info = NULL;
4591 tree dataref_ptr = NULL_TREE;
4592 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4593 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4594 int j;
4595 tree next_stmt, first_stmt;
4596 bool strided_store = false;
4597 unsigned int group_size, i;
4598 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4599 bool inv_p;
4600 VEC(tree,heap) *vec_oprnds = NULL;
4601 bool slp = (slp_node != NULL);
4602 stmt_vec_info first_stmt_vinfo;
4603 unsigned int vec_num;
4605 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4606 this, so we can safely override NCOPIES with 1 here. */
4607 if (slp)
4608 ncopies = 1;
4610 gcc_assert (ncopies >= 1);
4612 /* FORNOW. This restriction should be relaxed. */
4613 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4615 if (vect_print_dump_info (REPORT_DETAILS))
4616 fprintf (vect_dump, "multiple types in nested loop.");
4617 return false;
4620 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4621 return false;
4623 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4624 return false;
4626 if (STMT_VINFO_LIVE_P (stmt_info))
4628 if (vect_print_dump_info (REPORT_DETAILS))
4629 fprintf (vect_dump, "value used after loop.");
4630 return false;
4633 /* Is vectorizable store? */
4635 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4636 return false;
4638 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4639 if (TREE_CODE (scalar_dest) != ARRAY_REF
4640 && TREE_CODE (scalar_dest) != INDIRECT_REF
4641 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4642 return false;
4644 op = GIMPLE_STMT_OPERAND (stmt, 1);
4645 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4647 if (vect_print_dump_info (REPORT_DETAILS))
4648 fprintf (vect_dump, "use not simple.");
4649 return false;
4652 vec_mode = TYPE_MODE (vectype);
4653 /* FORNOW. In some cases can vectorize even if data-type not supported
4654 (e.g. - array initialization with 0). */
4655 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4656 return false;
4658 if (!STMT_VINFO_DATA_REF (stmt_info))
4659 return false;
4661 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4663 strided_store = true;
4664 if (!vect_strided_store_supported (vectype)
4665 && !PURE_SLP_STMT (stmt_info) && !slp)
4666 return false;
4669 if (!vec_stmt) /* transformation not required. */
4671 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4672 if (!PURE_SLP_STMT (stmt_info))
4673 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4674 return true;
4677 /** Transform. **/
4679 if (strided_store)
4681 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4682 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4683 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4685 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4687 /* FORNOW */
4688 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4690 /* We vectorize all the stmts of the interleaving group when we
4691 reach the last stmt in the group. */
4692 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4693 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4694 && !slp)
4696 *vec_stmt = NULL_TREE;
4697 return true;
4700 if (slp)
4701 strided_store = false;
4703 /* VEC_NUM is the number of vect stmts to be created for this group. */
4704 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4705 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4706 else
4707 vec_num = group_size;
4709 else
4711 first_stmt = stmt;
4712 first_dr = dr;
4713 group_size = vec_num = 1;
4714 first_stmt_vinfo = stmt_info;
4717 if (vect_print_dump_info (REPORT_DETAILS))
4718 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4720 dr_chain = VEC_alloc (tree, heap, group_size);
4721 oprnds = VEC_alloc (tree, heap, group_size);
4723 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4724 gcc_assert (alignment_support_scheme);
4725 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4727 /* In case the vectorization factor (VF) is bigger than the number
4728 of elements that we can fit in a vectype (nunits), we have to generate
4729 more than one vector stmt - i.e - we need to "unroll" the
4730 vector stmt by a factor VF/nunits. For more details see documentation in
4731 vect_get_vec_def_for_copy_stmt. */
4733 /* In case of interleaving (non-unit strided access):
4735 S1: &base + 2 = x2
4736 S2: &base = x0
4737 S3: &base + 1 = x1
4738 S4: &base + 3 = x3
4740 We create vectorized stores starting from base address (the access of the
4741 first stmt in the chain (S2 in the above example), when the last store stmt
4742 of the chain (S4) is reached:
4744 VS1: &base = vx2
4745 VS2: &base + vec_size*1 = vx0
4746 VS3: &base + vec_size*2 = vx1
4747 VS4: &base + vec_size*3 = vx3
4749 Then permutation statements are generated:
4751 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4752 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4755 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4756 (the order of the data-refs in the output of vect_permute_store_chain
4757 corresponds to the order of scalar stmts in the interleaving chain - see
4758 the documentation of vect_permute_store_chain()).
4760 In case of both multiple types and interleaving, above vector stores and
4761 permutation stmts are created for every copy. The result vector stmts are
4762 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4763 STMT_VINFO_RELATED_STMT for the next copies.
4766 prev_stmt_info = NULL;
4767 for (j = 0; j < ncopies; j++)
4769 tree new_stmt;
4770 tree ptr_incr;
4772 if (j == 0)
4774 if (slp)
4776 /* Get vectorized arguments for SLP_NODE. */
4777 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4779 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4781 else
4783 /* For interleaved stores we collect vectorized defs for all the
4784 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4785 used as an input to vect_permute_store_chain(), and OPRNDS as
4786 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4788 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4789 OPRNDS are of size 1. */
4790 next_stmt = first_stmt;
4791 for (i = 0; i < group_size; i++)
4793 /* Since gaps are not supported for interleaved stores,
4794 GROUP_SIZE is the exact number of stmts in the chain.
4795 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4796 there is no interleaving, GROUP_SIZE is 1, and only one
4797 iteration of the loop will be executed. */
4798 gcc_assert (next_stmt);
4799 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4801 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4802 NULL);
4803 VEC_quick_push(tree, dr_chain, vec_oprnd);
4804 VEC_quick_push(tree, oprnds, vec_oprnd);
4805 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4808 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4809 &dummy, &ptr_incr, false,
4810 TREE_TYPE (vec_oprnd), &inv_p);
4811 gcc_assert (!inv_p);
4813 else
4815 /* FORNOW SLP doesn't work for multiple types. */
4816 gcc_assert (!slp);
4818 /* For interleaved stores we created vectorized defs for all the
4819 defs stored in OPRNDS in the previous iteration (previous copy).
4820 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4821 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4822 next copy.
4823 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4824 OPRNDS are of size 1. */
4825 for (i = 0; i < group_size; i++)
4827 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
4828 VEC_index (tree, oprnds, i));
4829 VEC_replace(tree, dr_chain, i, vec_oprnd);
4830 VEC_replace(tree, oprnds, i, vec_oprnd);
4832 dataref_ptr =
4833 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4836 if (strided_store)
4838 result_chain = VEC_alloc (tree, heap, group_size);
4839 /* Permute. */
4840 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4841 &result_chain))
4842 return false;
4845 next_stmt = first_stmt;
4846 for (i = 0; i < vec_num; i++)
4848 if (i > 0)
4849 /* Bump the vector pointer. */
4850 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4851 NULL_TREE);
4853 if (slp)
4854 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4855 else if (strided_store)
4856 /* For strided stores vectorized defs are interleaved in
4857 vect_permute_store_chain(). */
4858 vec_oprnd = VEC_index (tree, result_chain, i);
4860 data_ref = build_fold_indirect_ref (dataref_ptr);
4861 /* Arguments are ready. Create the new vector stmt. */
4862 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4863 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4864 mark_symbols_for_renaming (new_stmt);
4866 if (j == 0)
4867 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4868 else
4869 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4871 prev_stmt_info = vinfo_for_stmt (new_stmt);
4872 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4873 if (!next_stmt)
4874 break;
4878 return true;
4882 /* Function vect_setup_realignment
4884 This function is called when vectorizing an unaligned load using
4885 the dr_explicit_realign[_optimized] scheme.
4886 This function generates the following code at the loop prolog:
4888 p = initial_addr;
4889 x msq_init = *(floor(p)); # prolog load
4890 realignment_token = call target_builtin;
4891 loop:
4892 x msq = phi (msq_init, ---)
4894 The stmts marked with x are generated only for the case of
4895 dr_explicit_realign_optimized.
4897 The code above sets up a new (vector) pointer, pointing to the first
4898 location accessed by STMT, and a "floor-aligned" load using that pointer.
4899 It also generates code to compute the "realignment-token" (if the relevant
4900 target hook was defined), and creates a phi-node at the loop-header bb
4901 whose arguments are the result of the prolog-load (created by this
4902 function) and the result of a load that takes place in the loop (to be
4903 created by the caller to this function).
4905 For the case of dr_explicit_realign_optimized:
4906 The caller to this function uses the phi-result (msq) to create the
4907 realignment code inside the loop, and sets up the missing phi argument,
4908 as follows:
4909 loop:
4910 msq = phi (msq_init, lsq)
4911 lsq = *(floor(p')); # load in loop
4912 result = realign_load (msq, lsq, realignment_token);
4914 For the case of dr_explicit_realign:
4915 loop:
4916 msq = *(floor(p)); # load in loop
4917 p' = p + (VS-1);
4918 lsq = *(floor(p')); # load in loop
4919 result = realign_load (msq, lsq, realignment_token);
4921 Input:
4922 STMT - (scalar) load stmt to be vectorized. This load accesses
4923 a memory location that may be unaligned.
4924 BSI - place where new code is to be inserted.
4925 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4926 is used.
4928 Output:
4929 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4930 target hook, if defined.
4931 Return value - the result of the loop-header phi node. */
4933 static tree
4934 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4935 tree *realignment_token,
4936 enum dr_alignment_support alignment_support_scheme,
4937 tree init_addr,
4938 struct loop **at_loop)
4940 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4941 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4942 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4943 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4944 edge pe;
4945 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4946 tree vec_dest;
4947 tree inc;
4948 tree ptr;
4949 tree data_ref;
4950 tree new_stmt;
4951 basic_block new_bb;
4952 tree msq_init = NULL_TREE;
4953 tree new_temp;
4954 tree phi_stmt;
4955 tree msq = NULL_TREE;
4956 tree stmts = NULL_TREE;
4957 bool inv_p;
4958 bool compute_in_loop = false;
4959 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4960 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
4961 struct loop *loop_for_initial_load;
4963 gcc_assert (alignment_support_scheme == dr_explicit_realign
4964 || alignment_support_scheme == dr_explicit_realign_optimized);
4966 /* We need to generate three things:
4967 1. the misalignment computation
4968 2. the extra vector load (for the optimized realignment scheme).
4969 3. the phi node for the two vectors from which the realignment is
4970 done (for the optimized realignment scheme).
4973 /* 1. Determine where to generate the misalignment computation.
4975 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4976 calculation will be generated by this function, outside the loop (in the
4977 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4978 caller, inside the loop.
4980 Background: If the misalignment remains fixed throughout the iterations of
4981 the loop, then both realignment schemes are applicable, and also the
4982 misalignment computation can be done outside LOOP. This is because we are
4983 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4984 are a multiple of VS (the Vector Size), and therefore the misalignment in
4985 different vectorized LOOP iterations is always the same.
4986 The problem arises only if the memory access is in an inner-loop nested
4987 inside LOOP, which is now being vectorized using outer-loop vectorization.
4988 This is the only case when the misalignment of the memory access may not
4989 remain fixed throughout the iterations of the inner-loop (as explained in
4990 detail in vect_supportable_dr_alignment). In this case, not only is the
4991 optimized realignment scheme not applicable, but also the misalignment
4992 computation (and generation of the realignment token that is passed to
4993 REALIGN_LOAD) have to be done inside the loop.
4995 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4996 or not, which in turn determines if the misalignment is computed inside
4997 the inner-loop, or outside LOOP. */
4999 if (init_addr != NULL_TREE)
5001 compute_in_loop = true;
5002 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5006 /* 2. Determine where to generate the extra vector load.
5008 For the optimized realignment scheme, instead of generating two vector
5009 loads in each iteration, we generate a single extra vector load in the
5010 preheader of the loop, and in each iteration reuse the result of the
5011 vector load from the previous iteration. In case the memory access is in
5012 an inner-loop nested inside LOOP, which is now being vectorized using
5013 outer-loop vectorization, we need to determine whether this initial vector
5014 load should be generated at the preheader of the inner-loop, or can be
5015 generated at the preheader of LOOP. If the memory access has no evolution
5016 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5017 to be generated inside LOOP (in the preheader of the inner-loop). */
5019 if (nested_in_vect_loop)
5021 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5022 bool invariant_in_outerloop =
5023 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5024 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5026 else
5027 loop_for_initial_load = loop;
5028 if (at_loop)
5029 *at_loop = loop_for_initial_load;
5031 /* 3. For the case of the optimized realignment, create the first vector
5032 load at the loop preheader. */
5034 if (alignment_support_scheme == dr_explicit_realign_optimized)
5036 /* Create msq_init = *(floor(p1)) in the loop preheader */
5038 gcc_assert (!compute_in_loop);
5039 pe = loop_preheader_edge (loop_for_initial_load);
5040 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5041 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5042 &init_addr, &inc, true, NULL_TREE, &inv_p);
5043 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5044 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5045 new_temp = make_ssa_name (vec_dest, new_stmt);
5046 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5047 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5048 gcc_assert (!new_bb);
5049 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5052 /* 4. Create realignment token using a target builtin, if available.
5053 It is done either inside the containing loop, or before LOOP (as
5054 determined above). */
5056 if (targetm.vectorize.builtin_mask_for_load)
5058 tree builtin_decl;
5060 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5061 if (compute_in_loop)
5062 gcc_assert (init_addr); /* already computed by the caller. */
5063 else
5065 /* Generate the INIT_ADDR computation outside LOOP. */
5066 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5067 NULL_TREE, loop);
5068 pe = loop_preheader_edge (loop);
5069 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5070 gcc_assert (!new_bb);
5073 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5074 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5075 vec_dest = vect_create_destination_var (scalar_dest,
5076 TREE_TYPE (new_stmt));
5077 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5078 new_temp = make_ssa_name (vec_dest, new_stmt);
5079 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5081 if (compute_in_loop)
5082 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5083 else
5085 /* Generate the misalignment computation outside LOOP. */
5086 pe = loop_preheader_edge (loop);
5087 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5088 gcc_assert (!new_bb);
5091 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5093 /* The result of the CALL_EXPR to this builtin is determined from
5094 the value of the parameter and no global variables are touched
5095 which makes the builtin a "const" function. Requiring the
5096 builtin to have the "const" attribute makes it unnecessary
5097 to call mark_call_clobbered. */
5098 gcc_assert (TREE_READONLY (builtin_decl));
5101 if (alignment_support_scheme == dr_explicit_realign)
5102 return msq;
5104 gcc_assert (!compute_in_loop);
5105 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5108 /* 5. Create msq = phi <msq_init, lsq> in loop */
5110 pe = loop_preheader_edge (containing_loop);
5111 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5112 msq = make_ssa_name (vec_dest, NULL_TREE);
5113 phi_stmt = create_phi_node (msq, containing_loop->header);
5114 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5115 add_phi_arg (phi_stmt, msq_init, pe);
5117 return msq;
5121 /* Function vect_strided_load_supported.
5123 Returns TRUE is EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5124 and FALSE otherwise. */
5126 static bool
5127 vect_strided_load_supported (tree vectype)
5129 optab perm_even_optab, perm_odd_optab;
5130 int mode;
5132 mode = (int) TYPE_MODE (vectype);
5134 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5135 if (!perm_even_optab)
5137 if (vect_print_dump_info (REPORT_DETAILS))
5138 fprintf (vect_dump, "no optab for perm_even.");
5139 return false;
5142 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5144 if (vect_print_dump_info (REPORT_DETAILS))
5145 fprintf (vect_dump, "perm_even op not supported by target.");
5146 return false;
5149 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5150 if (!perm_odd_optab)
5152 if (vect_print_dump_info (REPORT_DETAILS))
5153 fprintf (vect_dump, "no optab for perm_odd.");
5154 return false;
5157 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5159 if (vect_print_dump_info (REPORT_DETAILS))
5160 fprintf (vect_dump, "perm_odd op not supported by target.");
5161 return false;
5163 return true;
5167 /* Function vect_permute_load_chain.
5169 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5170 a power of 2, generate extract_even/odd stmts to reorder the input data
5171 correctly. Return the final references for loads in RESULT_CHAIN.
5173 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5174 The input is 4 vectors each containing 8 elements. We assign a number to each
5175 element, the input sequence is:
5177 1st vec: 0 1 2 3 4 5 6 7
5178 2nd vec: 8 9 10 11 12 13 14 15
5179 3rd vec: 16 17 18 19 20 21 22 23
5180 4th vec: 24 25 26 27 28 29 30 31
5182 The output sequence should be:
5184 1st vec: 0 4 8 12 16 20 24 28
5185 2nd vec: 1 5 9 13 17 21 25 29
5186 3rd vec: 2 6 10 14 18 22 26 30
5187 4th vec: 3 7 11 15 19 23 27 31
5189 i.e., the first output vector should contain the first elements of each
5190 interleaving group, etc.
5192 We use extract_even/odd instructions to create such output. The input of each
5193 extract_even/odd operation is two vectors
5194 1st vec 2nd vec
5195 0 1 2 3 4 5 6 7
5197 and the output is the vector of extracted even/odd elements. The output of
5198 extract_even will be: 0 2 4 6
5199 and of extract_odd: 1 3 5 7
5202 The permutation is done in log LENGTH stages. In each stage extract_even and
5203 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5204 order. In our example,
5206 E1: extract_even (1st vec, 2nd vec)
5207 E2: extract_odd (1st vec, 2nd vec)
5208 E3: extract_even (3rd vec, 4th vec)
5209 E4: extract_odd (3rd vec, 4th vec)
5211 The output for the first stage will be:
5213 E1: 0 2 4 6 8 10 12 14
5214 E2: 1 3 5 7 9 11 13 15
5215 E3: 16 18 20 22 24 26 28 30
5216 E4: 17 19 21 23 25 27 29 31
5218 In order to proceed and create the correct sequence for the next stage (or
5219 for the correct output, if the second stage is the last one, as in our
5220 example), we first put the output of extract_even operation and then the
5221 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5222 The input for the second stage is:
5224 1st vec (E1): 0 2 4 6 8 10 12 14
5225 2nd vec (E3): 16 18 20 22 24 26 28 30
5226 3rd vec (E2): 1 3 5 7 9 11 13 15
5227 4th vec (E4): 17 19 21 23 25 27 29 31
5229 The output of the second stage:
5231 E1: 0 4 8 12 16 20 24 28
5232 E2: 2 6 10 14 18 22 26 30
5233 E3: 1 5 9 13 17 21 25 29
5234 E4: 3 7 11 15 19 23 27 31
5236 And RESULT_CHAIN after reordering:
5238 1st vec (E1): 0 4 8 12 16 20 24 28
5239 2nd vec (E3): 1 5 9 13 17 21 25 29
5240 3rd vec (E2): 2 6 10 14 18 22 26 30
5241 4th vec (E4): 3 7 11 15 19 23 27 31. */
5243 static bool
5244 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5245 unsigned int length,
5246 tree stmt,
5247 block_stmt_iterator *bsi,
5248 VEC(tree,heap) **result_chain)
5250 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5251 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252 tree tmp;
5253 int i;
5254 unsigned int j;
5256 /* Check that the operation is supported. */
5257 if (!vect_strided_load_supported (vectype))
5258 return false;
5260 *result_chain = VEC_copy (tree, heap, dr_chain);
5261 for (i = 0; i < exact_log2 (length); i++)
5263 for (j = 0; j < length; j +=2)
5265 first_vect = VEC_index (tree, dr_chain, j);
5266 second_vect = VEC_index (tree, dr_chain, j+1);
5268 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5269 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5270 DECL_GIMPLE_REG_P (perm_dest) = 1;
5271 add_referenced_var (perm_dest);
5273 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5274 first_vect, second_vect);
5275 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5277 data_ref = make_ssa_name (perm_dest, perm_stmt);
5278 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5279 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5280 mark_symbols_for_renaming (perm_stmt);
5282 VEC_replace (tree, *result_chain, j/2, data_ref);
5284 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5285 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5286 DECL_GIMPLE_REG_P (perm_dest) = 1;
5287 add_referenced_var (perm_dest);
5289 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5290 first_vect, second_vect);
5291 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5292 data_ref = make_ssa_name (perm_dest, perm_stmt);
5293 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5294 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5295 mark_symbols_for_renaming (perm_stmt);
5297 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5299 dr_chain = VEC_copy (tree, heap, *result_chain);
5301 return true;
5305 /* Function vect_transform_strided_load.
5307 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5308 to perform their permutation and ascribe the result vectorized statements to
5309 the scalar statements.
5312 static bool
5313 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5314 block_stmt_iterator *bsi)
5316 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5317 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5318 tree next_stmt, new_stmt;
5319 VEC(tree,heap) *result_chain = NULL;
5320 unsigned int i, gap_count;
5321 tree tmp_data_ref;
5323 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5324 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5325 vectors, that are ready for vector computation. */
5326 result_chain = VEC_alloc (tree, heap, size);
5327 /* Permute. */
5328 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5329 return false;
5331 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5332 Since we scan the chain starting from it's first node, their order
5333 corresponds the order of data-refs in RESULT_CHAIN. */
5334 next_stmt = first_stmt;
5335 gap_count = 1;
5336 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5338 if (!next_stmt)
5339 break;
5341 /* Skip the gaps. Loads created for the gaps will be removed by dead
5342 code elimination pass later.
5343 DR_GROUP_GAP is the number of steps in elements from the previous
5344 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5345 correspond to the gaps.
5347 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5349 gap_count++;
5350 continue;
5353 while (next_stmt)
5355 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5356 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5357 copies, and we put the new vector statement in the first available
5358 RELATED_STMT. */
5359 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5360 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5361 else
5363 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5364 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5365 vinfo_for_stmt (prev_stmt));
5366 while (rel_stmt)
5368 prev_stmt = rel_stmt;
5369 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5371 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5373 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5374 gap_count = 1;
5375 /* If NEXT_STMT accesses the same DR as the previous statement,
5376 put the same TMP_DATA_REF as its vectorized statement; otherwise
5377 get the next data-ref from RESULT_CHAIN. */
5378 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5379 break;
5382 return true;
5386 /* vectorizable_load.
5388 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
5389 can be vectorized.
5390 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5391 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5392 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5394 bool
5395 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5396 slp_tree slp_node)
5398 tree scalar_dest;
5399 tree vec_dest = NULL;
5400 tree data_ref = NULL;
5401 tree op;
5402 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5403 stmt_vec_info prev_stmt_info;
5404 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5405 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5406 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5407 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5408 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5409 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5410 tree new_temp;
5411 int mode;
5412 tree new_stmt = NULL_TREE;
5413 tree dummy;
5414 enum dr_alignment_support alignment_support_scheme;
5415 tree dataref_ptr = NULL_TREE;
5416 tree ptr_incr;
5417 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5418 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5419 int i, j, group_size;
5420 tree msq = NULL_TREE, lsq;
5421 tree offset = NULL_TREE;
5422 tree realignment_token = NULL_TREE;
5423 tree phi = NULL_TREE;
5424 VEC(tree,heap) *dr_chain = NULL;
5425 bool strided_load = false;
5426 tree first_stmt;
5427 tree scalar_type;
5428 bool inv_p;
5429 bool compute_in_loop = false;
5430 struct loop *at_loop;
5431 int vec_num;
5432 bool slp = (slp_node != NULL);
5434 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5435 this, so we can safely override NCOPIES with 1 here. */
5436 if (slp)
5437 ncopies = 1;
5439 gcc_assert (ncopies >= 1);
5441 /* FORNOW. This restriction should be relaxed. */
5442 if (nested_in_vect_loop && ncopies > 1)
5444 if (vect_print_dump_info (REPORT_DETAILS))
5445 fprintf (vect_dump, "multiple types in nested loop.");
5446 return false;
5449 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5450 return false;
5452 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5453 return false;
5455 /* FORNOW: not yet supported. */
5456 if (STMT_VINFO_LIVE_P (stmt_info))
5458 if (vect_print_dump_info (REPORT_DETAILS))
5459 fprintf (vect_dump, "value used after loop.");
5460 return false;
5463 /* Is vectorizable load? */
5464 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5465 return false;
5467 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5468 if (TREE_CODE (scalar_dest) != SSA_NAME)
5469 return false;
5471 op = GIMPLE_STMT_OPERAND (stmt, 1);
5472 if (TREE_CODE (op) != ARRAY_REF
5473 && TREE_CODE (op) != INDIRECT_REF
5474 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5475 return false;
5477 if (!STMT_VINFO_DATA_REF (stmt_info))
5478 return false;
5480 scalar_type = TREE_TYPE (DR_REF (dr));
5481 mode = (int) TYPE_MODE (vectype);
5483 /* FORNOW. In some cases can vectorize even if data-type not supported
5484 (e.g. - data copies). */
5485 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5487 if (vect_print_dump_info (REPORT_DETAILS))
5488 fprintf (vect_dump, "Aligned load, but unsupported type.");
5489 return false;
5492 /* Check if the load is a part of an interleaving chain. */
5493 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5495 strided_load = true;
5496 /* FORNOW */
5497 gcc_assert (! nested_in_vect_loop);
5499 /* Check if interleaving is supported. */
5500 if (!vect_strided_load_supported (vectype)
5501 && !PURE_SLP_STMT (stmt_info) && !slp)
5502 return false;
5505 if (!vec_stmt) /* transformation not required. */
5507 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5508 vect_model_load_cost (stmt_info, ncopies, NULL);
5509 return true;
5512 if (vect_print_dump_info (REPORT_DETAILS))
5513 fprintf (vect_dump, "transform load.");
5515 /** Transform. **/
5517 if (strided_load)
5519 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5520 /* Check if the chain of loads is already vectorized. */
5521 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5523 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5524 return true;
5526 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5527 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5528 dr_chain = VEC_alloc (tree, heap, group_size);
5530 /* VEC_NUM is the number of vect stmts to be created for this group. */
5531 if (slp)
5533 strided_load = false;
5534 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5536 else
5537 vec_num = group_size;
5539 else
5541 first_stmt = stmt;
5542 first_dr = dr;
5543 group_size = vec_num = 1;
5546 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5547 gcc_assert (alignment_support_scheme);
5549 /* In case the vectorization factor (VF) is bigger than the number
5550 of elements that we can fit in a vectype (nunits), we have to generate
5551 more than one vector stmt - i.e - we need to "unroll" the
5552 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5553 from one copy of the vector stmt to the next, in the field
5554 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5555 stages to find the correct vector defs to be used when vectorizing
5556 stmts that use the defs of the current stmt. The example below illustrates
5557 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5558 4 vectorized stmts):
5560 before vectorization:
5561 RELATED_STMT VEC_STMT
5562 S1: x = memref - -
5563 S2: z = x + 1 - -
5565 step 1: vectorize stmt S1:
5566 We first create the vector stmt VS1_0, and, as usual, record a
5567 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5568 Next, we create the vector stmt VS1_1, and record a pointer to
5569 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5570 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5571 stmts and pointers:
5572 RELATED_STMT VEC_STMT
5573 VS1_0: vx0 = memref0 VS1_1 -
5574 VS1_1: vx1 = memref1 VS1_2 -
5575 VS1_2: vx2 = memref2 VS1_3 -
5576 VS1_3: vx3 = memref3 - -
5577 S1: x = load - VS1_0
5578 S2: z = x + 1 - -
5580 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5581 information we recorded in RELATED_STMT field is used to vectorize
5582 stmt S2. */
5584 /* In case of interleaving (non-unit strided access):
5586 S1: x2 = &base + 2
5587 S2: x0 = &base
5588 S3: x1 = &base + 1
5589 S4: x3 = &base + 3
5591 Vectorized loads are created in the order of memory accesses
5592 starting from the access of the first stmt of the chain:
5594 VS1: vx0 = &base
5595 VS2: vx1 = &base + vec_size*1
5596 VS3: vx3 = &base + vec_size*2
5597 VS4: vx4 = &base + vec_size*3
5599 Then permutation statements are generated:
5601 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5602 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5605 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5606 (the order of the data-refs in the output of vect_permute_load_chain
5607 corresponds to the order of scalar stmts in the interleaving chain - see
5608 the documentation of vect_permute_load_chain()).
5609 The generation of permutation stmts and recording them in
5610 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5612 In case of both multiple types and interleaving, the vector loads and
5613 permutation stmts above are created for every copy. The result vector stmts
5614 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5615 STMT_VINFO_RELATED_STMT for the next copies. */
5617 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5618 on a target that supports unaligned accesses (dr_unaligned_supported)
5619 we generate the following code:
5620 p = initial_addr;
5621 indx = 0;
5622 loop {
5623 p = p + indx * vectype_size;
5624 vec_dest = *(p);
5625 indx = indx + 1;
5628 Otherwise, the data reference is potentially unaligned on a target that
5629 does not support unaligned accesses (dr_explicit_realign_optimized) -
5630 then generate the following code, in which the data in each iteration is
5631 obtained by two vector loads, one from the previous iteration, and one
5632 from the current iteration:
5633 p1 = initial_addr;
5634 msq_init = *(floor(p1))
5635 p2 = initial_addr + VS - 1;
5636 realignment_token = call target_builtin;
5637 indx = 0;
5638 loop {
5639 p2 = p2 + indx * vectype_size
5640 lsq = *(floor(p2))
5641 vec_dest = realign_load (msq, lsq, realignment_token)
5642 indx = indx + 1;
5643 msq = lsq;
5644 } */
5646 /* If the misalignment remains the same throughout the execution of the
5647 loop, we can create the init_addr and permutation mask at the loop
5648 preheader. Otherwise, it needs to be created inside the loop.
5649 This can only occur when vectorizing memory accesses in the inner-loop
5650 nested within an outer-loop that is being vectorized. */
5652 if (nested_in_vect_loop_p (loop, stmt)
5653 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5655 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5656 compute_in_loop = true;
5659 if ((alignment_support_scheme == dr_explicit_realign_optimized
5660 || alignment_support_scheme == dr_explicit_realign)
5661 && !compute_in_loop)
5663 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5664 alignment_support_scheme, NULL_TREE,
5665 &at_loop);
5666 if (alignment_support_scheme == dr_explicit_realign_optimized)
5668 phi = SSA_NAME_DEF_STMT (msq);
5669 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5672 else
5673 at_loop = loop;
5675 prev_stmt_info = NULL;
5676 for (j = 0; j < ncopies; j++)
5678 /* 1. Create the vector pointer update chain. */
5679 if (j == 0)
5680 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5681 at_loop, offset,
5682 &dummy, &ptr_incr, false,
5683 NULL_TREE, &inv_p);
5684 else
5685 dataref_ptr =
5686 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5688 for (i = 0; i < vec_num; i++)
5690 if (i > 0)
5691 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5692 NULL_TREE);
5694 /* 2. Create the vector-load in the loop. */
5695 switch (alignment_support_scheme)
5697 case dr_aligned:
5698 gcc_assert (aligned_access_p (first_dr));
5699 data_ref = build_fold_indirect_ref (dataref_ptr);
5700 break;
5701 case dr_unaligned_supported:
5703 int mis = DR_MISALIGNMENT (first_dr);
5704 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5706 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5707 data_ref =
5708 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5709 break;
5711 case dr_explicit_realign:
5713 tree ptr, bump;
5714 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5716 if (compute_in_loop)
5717 msq = vect_setup_realignment (first_stmt, bsi,
5718 &realignment_token,
5719 dr_explicit_realign,
5720 dataref_ptr, NULL);
5722 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5723 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5724 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5725 new_temp = make_ssa_name (vec_dest, new_stmt);
5726 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5727 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5728 copy_virtual_operands (new_stmt, stmt);
5729 mark_symbols_for_renaming (new_stmt);
5730 msq = new_temp;
5732 bump = size_binop (MULT_EXPR, vs_minus_1,
5733 TYPE_SIZE_UNIT (scalar_type));
5734 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5735 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5736 break;
5738 case dr_explicit_realign_optimized:
5739 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5740 break;
5741 default:
5742 gcc_unreachable ();
5744 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5745 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5746 new_temp = make_ssa_name (vec_dest, new_stmt);
5747 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5748 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5749 mark_symbols_for_renaming (new_stmt);
5751 /* 3. Handle explicit realignment if necessary/supported. Create in
5752 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5753 if (alignment_support_scheme == dr_explicit_realign_optimized
5754 || alignment_support_scheme == dr_explicit_realign)
5756 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5757 if (!realignment_token)
5758 realignment_token = dataref_ptr;
5759 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5760 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5761 realignment_token);
5762 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5763 new_temp = make_ssa_name (vec_dest, new_stmt);
5764 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5765 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5767 if (alignment_support_scheme == dr_explicit_realign_optimized)
5769 if (i == vec_num - 1 && j == ncopies - 1)
5770 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5771 msq = lsq;
5775 /* 4. Handle invariant-load. */
5776 if (inv_p)
5778 gcc_assert (!strided_load);
5779 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5780 if (j == 0)
5782 int k;
5783 tree t = NULL_TREE;
5784 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5786 /* CHECKME: bitpos depends on endianess? */
5787 bitpos = bitsize_zero_node;
5788 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5789 bitsize, bitpos);
5790 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5791 TYPE_UNSIGNED (scalar_type);
5792 vec_dest =
5793 vect_create_destination_var (scalar_dest, NULL_TREE);
5794 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5795 new_temp = make_ssa_name (vec_dest, new_stmt);
5796 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5797 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5799 for (k = nunits - 1; k >= 0; --k)
5800 t = tree_cons (NULL_TREE, new_temp, t);
5801 /* FIXME: use build_constructor directly. */
5802 vec_inv = build_constructor_from_list (vectype, t);
5803 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5804 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5806 else
5807 gcc_unreachable (); /* FORNOW. */
5810 /* Collect vector loads and later create their permutation in
5811 vect_transform_strided_load (). */
5812 if (strided_load)
5813 VEC_quick_push (tree, dr_chain, new_temp);
5815 /* Store vector loads in the corresponding SLP_NODE. */
5816 if (slp)
5817 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5820 /* FORNOW: SLP with multiple types is unsupported. */
5821 if (slp)
5822 return true;
5824 if (strided_load)
5826 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5827 return false;
5828 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5829 dr_chain = VEC_alloc (tree, heap, group_size);
5831 else
5833 if (j == 0)
5834 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5835 else
5836 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5837 prev_stmt_info = vinfo_for_stmt (new_stmt);
5841 return true;
5845 /* Function vectorizable_live_operation.
5847 STMT computes a value that is used outside the loop. Check if
5848 it can be supported. */
5850 bool
5851 vectorizable_live_operation (tree stmt,
5852 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5853 tree *vec_stmt ATTRIBUTE_UNUSED)
5855 tree operation;
5856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5857 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5858 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5859 int i;
5860 int op_type;
5861 tree op;
5862 tree def, def_stmt;
5863 enum vect_def_type dt;
5865 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5867 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5868 return false;
5870 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5871 return false;
5873 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5874 return false;
5876 /* FORNOW. CHECKME. */
5877 if (nested_in_vect_loop_p (loop, stmt))
5878 return false;
5880 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5881 op_type = TREE_OPERAND_LENGTH (operation);
5883 /* FORNOW: support only if all uses are invariant. This means
5884 that the scalar operations can remain in place, unvectorized.
5885 The original last scalar value that they compute will be used. */
5887 for (i = 0; i < op_type; i++)
5889 op = TREE_OPERAND (operation, i);
5890 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5892 if (vect_print_dump_info (REPORT_DETAILS))
5893 fprintf (vect_dump, "use not simple.");
5894 return false;
5897 if (dt != vect_invariant_def && dt != vect_constant_def)
5898 return false;
5901 /* No transformation is required for the cases we currently support. */
5902 return true;
5906 /* Function vect_is_simple_cond.
5908 Input:
5909 LOOP - the loop that is being vectorized.
5910 COND - Condition that is checked for simple use.
5912 Returns whether a COND can be vectorized. Checks whether
5913 condition operands are supportable using vec_is_simple_use. */
5915 static bool
5916 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5918 tree lhs, rhs;
5919 tree def;
5920 enum vect_def_type dt;
5922 if (!COMPARISON_CLASS_P (cond))
5923 return false;
5925 lhs = TREE_OPERAND (cond, 0);
5926 rhs = TREE_OPERAND (cond, 1);
5928 if (TREE_CODE (lhs) == SSA_NAME)
5930 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5931 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5932 return false;
5934 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5935 && TREE_CODE (lhs) != FIXED_CST)
5936 return false;
5938 if (TREE_CODE (rhs) == SSA_NAME)
5940 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5941 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5942 return false;
5944 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5945 && TREE_CODE (rhs) != FIXED_CST)
5946 return false;
5948 return true;
5951 /* vectorizable_condition.
5953 Check if STMT is conditional modify expression that can be vectorized.
5954 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5955 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5956 at BSI.
5958 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5960 bool
5961 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5963 tree scalar_dest = NULL_TREE;
5964 tree vec_dest = NULL_TREE;
5965 tree op = NULL_TREE;
5966 tree cond_expr, then_clause, else_clause;
5967 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5969 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
5970 tree vec_compare, vec_cond_expr;
5971 tree new_temp;
5972 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5973 enum machine_mode vec_mode;
5974 tree def;
5975 enum vect_def_type dt;
5976 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5977 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5979 gcc_assert (ncopies >= 1);
5980 if (ncopies > 1)
5981 return false; /* FORNOW */
5983 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5984 return false;
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5987 return false;
5989 /* FORNOW: SLP not supported. */
5990 if (STMT_SLP_TYPE (stmt_info))
5991 return false;
5993 /* FORNOW: not yet supported. */
5994 if (STMT_VINFO_LIVE_P (stmt_info))
5996 if (vect_print_dump_info (REPORT_DETAILS))
5997 fprintf (vect_dump, "value used after loop.");
5998 return false;
6001 /* Is vectorizable conditional operation? */
6002 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6003 return false;
6005 op = GIMPLE_STMT_OPERAND (stmt, 1);
6007 if (TREE_CODE (op) != COND_EXPR)
6008 return false;
6010 cond_expr = TREE_OPERAND (op, 0);
6011 then_clause = TREE_OPERAND (op, 1);
6012 else_clause = TREE_OPERAND (op, 2);
6014 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6015 return false;
6017 /* We do not handle two different vector types for the condition
6018 and the values. */
6019 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6020 return false;
6022 if (TREE_CODE (then_clause) == SSA_NAME)
6024 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6025 if (!vect_is_simple_use (then_clause, loop_vinfo,
6026 &then_def_stmt, &def, &dt))
6027 return false;
6029 else if (TREE_CODE (then_clause) != INTEGER_CST
6030 && TREE_CODE (then_clause) != REAL_CST
6031 && TREE_CODE (then_clause) != FIXED_CST)
6032 return false;
6034 if (TREE_CODE (else_clause) == SSA_NAME)
6036 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6037 if (!vect_is_simple_use (else_clause, loop_vinfo,
6038 &else_def_stmt, &def, &dt))
6039 return false;
6041 else if (TREE_CODE (else_clause) != INTEGER_CST
6042 && TREE_CODE (else_clause) != REAL_CST
6043 && TREE_CODE (else_clause) != FIXED_CST)
6044 return false;
6047 vec_mode = TYPE_MODE (vectype);
6049 if (!vec_stmt)
6051 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6052 return expand_vec_cond_expr_p (op, vec_mode);
6055 /* Transform */
6057 /* Handle def. */
6058 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6059 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6061 /* Handle cond expr. */
6062 vec_cond_lhs =
6063 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6064 vec_cond_rhs =
6065 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6066 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6067 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6069 /* Arguments are ready. create the new vector stmt. */
6070 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6071 vec_cond_lhs, vec_cond_rhs);
6072 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6073 vec_compare, vec_then_clause, vec_else_clause);
6075 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6076 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6077 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6078 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6080 return true;
6084 /* Function vect_transform_stmt.
6086 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6088 static bool
6089 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6090 slp_tree slp_node)
6092 bool is_store = false;
6093 tree vec_stmt = NULL_TREE;
6094 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6095 tree orig_stmt_in_pattern;
6096 bool done;
6098 switch (STMT_VINFO_TYPE (stmt_info))
6100 case type_demotion_vec_info_type:
6101 gcc_assert (!slp_node);
6102 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6103 gcc_assert (done);
6104 break;
6106 case type_promotion_vec_info_type:
6107 gcc_assert (!slp_node);
6108 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6109 gcc_assert (done);
6110 break;
6112 case type_conversion_vec_info_type:
6113 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6114 gcc_assert (done);
6115 break;
6117 case induc_vec_info_type:
6118 gcc_assert (!slp_node);
6119 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6120 gcc_assert (done);
6121 break;
6123 case op_vec_info_type:
6124 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6125 gcc_assert (done);
6126 break;
6128 case assignment_vec_info_type:
6129 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6130 gcc_assert (done);
6131 break;
6133 case load_vec_info_type:
6134 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6135 gcc_assert (done);
6136 break;
6138 case store_vec_info_type:
6139 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6140 gcc_assert (done);
6141 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6143 /* In case of interleaving, the whole chain is vectorized when the
6144 last store in the chain is reached. Store stmts before the last
6145 one are skipped, and there vec_stmt_info shouldn't be freed
6146 meanwhile. */
6147 *strided_store = true;
6148 if (STMT_VINFO_VEC_STMT (stmt_info))
6149 is_store = true;
6151 else
6152 is_store = true;
6153 break;
6155 case condition_vec_info_type:
6156 gcc_assert (!slp_node);
6157 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6158 gcc_assert (done);
6159 break;
6161 case call_vec_info_type:
6162 gcc_assert (!slp_node);
6163 done = vectorizable_call (stmt, bsi, &vec_stmt);
6164 break;
6166 case reduc_vec_info_type:
6167 gcc_assert (!slp_node);
6168 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6169 gcc_assert (done);
6170 break;
6172 default:
6173 if (!STMT_VINFO_LIVE_P (stmt_info))
6175 if (vect_print_dump_info (REPORT_DETAILS))
6176 fprintf (vect_dump, "stmt not supported.");
6177 gcc_unreachable ();
6181 if (STMT_VINFO_LIVE_P (stmt_info)
6182 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6184 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6185 gcc_assert (done);
6188 if (vec_stmt)
6190 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6191 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6192 if (orig_stmt_in_pattern)
6194 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6195 /* STMT was inserted by the vectorizer to replace a computation idiom.
6196 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6197 computed this idiom. We need to record a pointer to VEC_STMT in
6198 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6199 documentation of vect_pattern_recog. */
6200 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6202 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6203 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6208 return is_store;
6212 /* This function builds ni_name = number of iterations loop executes
6213 on the loop preheader. */
6215 static tree
6216 vect_build_loop_niters (loop_vec_info loop_vinfo)
6218 tree ni_name, stmt, var;
6219 edge pe;
6220 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6221 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6223 var = create_tmp_var (TREE_TYPE (ni), "niters");
6224 add_referenced_var (var);
6225 ni_name = force_gimple_operand (ni, &stmt, false, var);
6227 pe = loop_preheader_edge (loop);
6228 if (stmt)
6230 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6231 gcc_assert (!new_bb);
6234 return ni_name;
6238 /* This function generates the following statements:
6240 ni_name = number of iterations loop executes
6241 ratio = ni_name / vf
6242 ratio_mult_vf_name = ratio * vf
6244 and places them at the loop preheader edge. */
6246 static void
6247 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6248 tree *ni_name_ptr,
6249 tree *ratio_mult_vf_name_ptr,
6250 tree *ratio_name_ptr)
6253 edge pe;
6254 basic_block new_bb;
6255 tree stmt, ni_name;
6256 tree var;
6257 tree ratio_name;
6258 tree ratio_mult_vf_name;
6259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6260 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6261 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6262 tree log_vf;
6264 pe = loop_preheader_edge (loop);
6266 /* Generate temporary variable that contains
6267 number of iterations loop executes. */
6269 ni_name = vect_build_loop_niters (loop_vinfo);
6270 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6272 /* Create: ratio = ni >> log2(vf) */
6274 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6275 if (!is_gimple_val (ratio_name))
6277 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6278 add_referenced_var (var);
6280 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6281 pe = loop_preheader_edge (loop);
6282 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6283 gcc_assert (!new_bb);
6286 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6288 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6289 ratio_name, log_vf);
6290 if (!is_gimple_val (ratio_mult_vf_name))
6292 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6293 add_referenced_var (var);
6295 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6296 true, var);
6297 pe = loop_preheader_edge (loop);
6298 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6299 gcc_assert (!new_bb);
6302 *ni_name_ptr = ni_name;
6303 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6304 *ratio_name_ptr = ratio_name;
6306 return;
6310 /* Function vect_update_ivs_after_vectorizer.
6312 "Advance" the induction variables of LOOP to the value they should take
6313 after the execution of LOOP. This is currently necessary because the
6314 vectorizer does not handle induction variables that are used after the
6315 loop. Such a situation occurs when the last iterations of LOOP are
6316 peeled, because:
6317 1. We introduced new uses after LOOP for IVs that were not originally used
6318 after LOOP: the IVs of LOOP are now used by an epilog loop.
6319 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6320 times, whereas the loop IVs should be bumped N times.
6322 Input:
6323 - LOOP - a loop that is going to be vectorized. The last few iterations
6324 of LOOP were peeled.
6325 - NITERS - the number of iterations that LOOP executes (before it is
6326 vectorized). i.e, the number of times the ivs should be bumped.
6327 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6328 coming out from LOOP on which there are uses of the LOOP ivs
6329 (this is the path from LOOP->exit to epilog_loop->preheader).
6331 The new definitions of the ivs are placed in LOOP->exit.
6332 The phi args associated with the edge UPDATE_E in the bb
6333 UPDATE_E->dest are updated accordingly.
6335 Assumption 1: Like the rest of the vectorizer, this function assumes
6336 a single loop exit that has a single predecessor.
6338 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6339 organized in the same order.
6341 Assumption 3: The access function of the ivs is simple enough (see
6342 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6344 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6345 coming out of LOOP on which the ivs of LOOP are used (this is the path
6346 that leads to the epilog loop; other paths skip the epilog loop). This
6347 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6348 needs to have its phis updated.
6351 static void
6352 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6353 edge update_e)
6355 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6356 basic_block exit_bb = single_exit (loop)->dest;
6357 tree phi, phi1;
6358 basic_block update_bb = update_e->dest;
6360 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6362 /* Make sure there exists a single-predecessor exit bb: */
6363 gcc_assert (single_pred_p (exit_bb));
6365 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6366 phi && phi1;
6367 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6369 tree access_fn = NULL;
6370 tree evolution_part;
6371 tree init_expr;
6372 tree step_expr;
6373 tree var, ni, ni_name;
6374 block_stmt_iterator last_bsi;
6376 if (vect_print_dump_info (REPORT_DETAILS))
6378 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6379 print_generic_expr (vect_dump, phi, TDF_SLIM);
6382 /* Skip virtual phi's. */
6383 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6385 if (vect_print_dump_info (REPORT_DETAILS))
6386 fprintf (vect_dump, "virtual phi. skip.");
6387 continue;
6390 /* Skip reduction phis. */
6391 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6393 if (vect_print_dump_info (REPORT_DETAILS))
6394 fprintf (vect_dump, "reduc phi. skip.");
6395 continue;
6398 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6399 gcc_assert (access_fn);
6400 evolution_part =
6401 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6402 gcc_assert (evolution_part != NULL_TREE);
6404 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6405 of degree >= 2 or exponential. */
6406 gcc_assert (!tree_is_chrec (evolution_part));
6408 step_expr = evolution_part;
6409 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6410 loop->num));
6412 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6413 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6414 init_expr,
6415 fold_convert (sizetype,
6416 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6417 niters, step_expr)));
6418 else
6419 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6420 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6421 fold_convert (TREE_TYPE (init_expr),
6422 niters),
6423 step_expr),
6424 init_expr);
6428 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6429 add_referenced_var (var);
6431 last_bsi = bsi_last (exit_bb);
6432 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6433 true, BSI_SAME_STMT);
6435 /* Fix phi expressions in the successor bb. */
6436 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6441 /* Function vect_do_peeling_for_loop_bound
6443 Peel the last iterations of the loop represented by LOOP_VINFO.
6444 The peeled iterations form a new epilog loop. Given that the loop now
6445 iterates NITERS times, the new epilog loop iterates
6446 NITERS % VECTORIZATION_FACTOR times.
6448 The original loop will later be made to iterate
6449 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
6451 static void
6452 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6454 tree ni_name, ratio_mult_vf_name;
6455 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6456 struct loop *new_loop;
6457 edge update_e;
6458 basic_block preheader;
6459 int loop_num;
6460 unsigned int th;
6461 int min_scalar_loop_bound;
6462 int min_profitable_iters;
6464 if (vect_print_dump_info (REPORT_DETAILS))
6465 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6467 initialize_original_copy_tables ();
6469 /* Generate the following variables on the preheader of original loop:
6471 ni_name = number of iteration the original loop executes
6472 ratio = ni_name / vf
6473 ratio_mult_vf_name = ratio * vf */
6474 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6475 &ratio_mult_vf_name, ratio);
6477 loop_num = loop->num;
6479 /* Analyze cost to set threshhold for vectorized loop. */
6480 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6481 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6482 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6484 /* Use the cost model only if it is more conservative than user specified
6485 threshold. */
6487 th = (unsigned) min_scalar_loop_bound;
6488 if (min_profitable_iters
6489 && (!min_scalar_loop_bound
6490 || min_profitable_iters > min_scalar_loop_bound))
6491 th = (unsigned) min_profitable_iters;
6493 if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
6494 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6495 && vect_print_dump_info (REPORT_DETAILS))
6496 fprintf (vect_dump, "vectorization may not be profitable.");
6498 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6499 ratio_mult_vf_name, ni_name, false,
6500 th);
6501 gcc_assert (new_loop);
6502 gcc_assert (loop_num == loop->num);
6503 #ifdef ENABLE_CHECKING
6504 slpeel_verify_cfg_after_peeling (loop, new_loop);
6505 #endif
6507 /* A guard that controls whether the new_loop is to be executed or skipped
6508 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6509 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6510 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6511 is on the path where the LOOP IVs are used and need to be updated. */
6513 preheader = loop_preheader_edge (new_loop)->src;
6514 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6515 update_e = EDGE_PRED (preheader, 0);
6516 else
6517 update_e = EDGE_PRED (preheader, 1);
6519 /* Update IVs of original loop as if they were advanced
6520 by ratio_mult_vf_name steps. */
6521 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6523 /* After peeling we have to reset scalar evolution analyzer. */
6524 scev_reset ();
6526 free_original_copy_tables ();
6530 /* Function vect_gen_niters_for_prolog_loop
6532 Set the number of iterations for the loop represented by LOOP_VINFO
6533 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6534 and the misalignment of DR - the data reference recorded in
6535 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6536 this loop, the data reference DR will refer to an aligned location.
6538 The following computation is generated:
6540 If the misalignment of DR is known at compile time:
6541 addr_mis = int mis = DR_MISALIGNMENT (dr);
6542 Else, compute address misalignment in bytes:
6543 addr_mis = addr & (vectype_size - 1)
6545 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6547 (elem_size = element type size; an element is the scalar element
6548 whose type is the inner type of the vectype)
6550 For interleaving,
6552 prolog_niters = min ( LOOP_NITERS ,
6553 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6554 where group_size is the size of the interleaved group.
6556 The above formulas assume that VF == number of elements in the vector. This
6557 may not hold when there are multiple-types in the loop.
6558 In this case, for some data-references in the loop the VF does not represent
6559 the number of elements that fit in the vector. Therefore, instead of VF we
6560 use TYPE_VECTOR_SUBPARTS. */
6562 static tree
6563 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6565 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6566 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6567 tree var, stmt;
6568 tree iters, iters_name;
6569 edge pe;
6570 basic_block new_bb;
6571 tree dr_stmt = DR_STMT (dr);
6572 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6573 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6574 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6575 tree niters_type = TREE_TYPE (loop_niters);
6576 int group_size = 1;
6577 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6578 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6580 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6582 /* For interleaved access element size must be multiplied by the size of
6583 the interleaved group. */
6584 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6585 DR_GROUP_FIRST_DR (stmt_info)));
6586 element_size *= group_size;
6589 pe = loop_preheader_edge (loop);
6591 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6593 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6594 int elem_misalign = byte_misalign / element_size;
6596 if (vect_print_dump_info (REPORT_DETAILS))
6597 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6598 iters = build_int_cst (niters_type,
6599 (nelements - elem_misalign)&(nelements/group_size-1));
6601 else
6603 tree new_stmts = NULL_TREE;
6604 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6605 &new_stmts, NULL_TREE, loop);
6606 tree ptr_type = TREE_TYPE (start_addr);
6607 tree size = TYPE_SIZE (ptr_type);
6608 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6609 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6610 tree elem_size_log =
6611 build_int_cst (type, exact_log2 (vectype_align/nelements));
6612 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6613 tree nelements_tree = build_int_cst (type, nelements);
6614 tree byte_misalign;
6615 tree elem_misalign;
6617 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6618 gcc_assert (!new_bb);
6620 /* Create: byte_misalign = addr & (vectype_size - 1) */
6621 byte_misalign =
6622 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6624 /* Create: elem_misalign = byte_misalign / element_size */
6625 elem_misalign =
6626 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6628 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6629 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6630 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6631 iters = fold_convert (niters_type, iters);
6634 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6635 /* If the loop bound is known at compile time we already verified that it is
6636 greater than vf; since the misalignment ('iters') is at most vf, there's
6637 no need to generate the MIN_EXPR in this case. */
6638 if (TREE_CODE (loop_niters) != INTEGER_CST)
6639 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6641 if (vect_print_dump_info (REPORT_DETAILS))
6643 fprintf (vect_dump, "niters for prolog loop: ");
6644 print_generic_expr (vect_dump, iters, TDF_SLIM);
6647 var = create_tmp_var (niters_type, "prolog_loop_niters");
6648 add_referenced_var (var);
6649 iters_name = force_gimple_operand (iters, &stmt, false, var);
6651 /* Insert stmt on loop preheader edge. */
6652 if (stmt)
6654 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6655 gcc_assert (!new_bb);
6658 return iters_name;
6662 /* Function vect_update_init_of_dr
6664 NITERS iterations were peeled from LOOP. DR represents a data reference
6665 in LOOP. This function updates the information recorded in DR to
6666 account for the fact that the first NITERS iterations had already been
6667 executed. Specifically, it updates the OFFSET field of DR. */
6669 static void
6670 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6672 tree offset = DR_OFFSET (dr);
6674 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6675 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6676 DR_OFFSET (dr) = offset;
6680 /* Function vect_update_inits_of_drs
6682 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6683 This function updates the information recorded for the data references in
6684 the loop to account for the fact that the first NITERS iterations had
6685 already been executed. Specifically, it updates the initial_condition of
6686 the access_function of all the data_references in the loop. */
6688 static void
6689 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6691 unsigned int i;
6692 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6693 struct data_reference *dr;
6695 if (vect_print_dump_info (REPORT_DETAILS))
6696 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
6698 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6699 vect_update_init_of_dr (dr, niters);
6703 /* Function vect_do_peeling_for_alignment
6705 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6706 'niters' is set to the misalignment of one of the data references in the
6707 loop, thereby forcing it to refer to an aligned location at the beginning
6708 of the execution of this loop. The data reference for which we are
6709 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6711 static void
6712 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6714 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6715 tree niters_of_prolog_loop, ni_name;
6716 tree n_iters;
6717 struct loop *new_loop;
6719 if (vect_print_dump_info (REPORT_DETAILS))
6720 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6722 initialize_original_copy_tables ();
6724 ni_name = vect_build_loop_niters (loop_vinfo);
6725 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6727 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
6728 new_loop =
6729 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6730 niters_of_prolog_loop, ni_name, true, 0);
6731 gcc_assert (new_loop);
6732 #ifdef ENABLE_CHECKING
6733 slpeel_verify_cfg_after_peeling (new_loop, loop);
6734 #endif
6736 /* Update number of times loop executes. */
6737 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6738 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6739 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6741 /* Update the init conditions of the access functions of all data refs. */
6742 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6744 /* After peeling we have to reset scalar evolution analyzer. */
6745 scev_reset ();
6747 free_original_copy_tables ();
6751 /* Function vect_create_cond_for_align_checks.
6753 Create a conditional expression that represents the alignment checks for
6754 all of data references (array element references) whose alignment must be
6755 checked at runtime.
6757 Input:
6758 LOOP_VINFO - two fields of the loop information are used.
6759 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6760 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6762 Output:
6763 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6764 expression.
6765 The returned value is the conditional expression to be used in the if
6766 statement that controls which version of the loop gets executed at runtime.
6768 The algorithm makes two assumptions:
6769 1) The number of bytes "n" in a vector is a power of 2.
6770 2) An address "a" is aligned if a%n is zero and that this
6771 test can be done as a&(n-1) == 0. For example, for 16
6772 byte vectors the test is a&0xf == 0. */
6774 static tree
6775 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6776 tree *cond_expr_stmt_list)
6778 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6779 VEC(tree,heap) *may_misalign_stmts
6780 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6781 tree ref_stmt, tmp;
6782 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6783 tree mask_cst;
6784 unsigned int i;
6785 tree psize;
6786 tree int_ptrsize_type;
6787 char tmp_name[20];
6788 tree or_tmp_name = NULL_TREE;
6789 tree and_tmp, and_tmp_name, and_stmt;
6790 tree ptrsize_zero;
6792 /* Check that mask is one less than a power of 2, i.e., mask is
6793 all zeros followed by all ones. */
6794 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6796 /* CHECKME: what is the best integer or unsigned type to use to hold a
6797 cast from a pointer value? */
6798 psize = TYPE_SIZE (ptr_type_node);
6799 int_ptrsize_type
6800 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6802 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
6803 of the first vector of the i'th data reference. */
6805 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6807 tree new_stmt_list = NULL_TREE;
6808 tree addr_base;
6809 tree addr_tmp, addr_tmp_name, addr_stmt;
6810 tree or_tmp, new_or_tmp_name, or_stmt;
6812 /* create: addr_tmp = (int)(address_of_first_vector) */
6813 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6814 &new_stmt_list, NULL_TREE, loop);
6816 if (new_stmt_list != NULL_TREE)
6817 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6819 sprintf (tmp_name, "%s%d", "addr2int", i);
6820 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6821 add_referenced_var (addr_tmp);
6822 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6823 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6824 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6825 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6826 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6828 /* The addresses are OR together. */
6830 if (or_tmp_name != NULL_TREE)
6832 /* create: or_tmp = or_tmp | addr_tmp */
6833 sprintf (tmp_name, "%s%d", "orptrs", i);
6834 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6835 add_referenced_var (or_tmp);
6836 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6837 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6838 or_tmp_name, addr_tmp_name);
6839 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6840 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6841 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6842 or_tmp_name = new_or_tmp_name;
6844 else
6845 or_tmp_name = addr_tmp_name;
6847 } /* end for i */
6849 mask_cst = build_int_cst (int_ptrsize_type, mask);
6851 /* create: and_tmp = or_tmp & mask */
6852 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6853 add_referenced_var (and_tmp);
6854 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6856 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6857 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6858 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6859 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6861 /* Make and_tmp the left operand of the conditional test against zero.
6862 if and_tmp has a nonzero bit then some address is unaligned. */
6863 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6864 return build2 (EQ_EXPR, boolean_type_node,
6865 and_tmp_name, ptrsize_zero);
6868 /* Function vect_vfa_segment_size.
6870 Create an expression that computes the size of segment
6871 that will be accessed for a data reference. The functions takes into
6872 account that realignment loads may access one more vector.
6874 Input:
6875 DR: The data reference.
6876 VECT_FACTOR: vectorization factor.
6878 Return an expression whose value is the size of segment which will be
6879 accessed by DR. */
6881 static tree
6882 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
6884 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
6885 DR_STEP (dr), vect_factor);
6887 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
6889 tree vector_size = TYPE_SIZE_UNIT
6890 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
6892 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
6893 segment_length, vector_size);
6895 return fold_convert (sizetype, segment_length);
6898 /* Function vect_create_cond_for_alias_checks.
6900 Create a conditional expression that represents the run-time checks for
6901 overlapping of address ranges represented by a list of data references
6902 relations passed as input.
6904 Input:
6905 COND_EXPR - input conditional expression. New conditions will be chained
6906 with logical and operation.
6907 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
6908 to be checked.
6910 Output:
6911 COND_EXPR - conditional expression.
6912 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6913 expression.
6916 The returned value is the conditional expression to be used in the if
6917 statement that controls which version of the loop gets executed at runtime.
6920 static void
6921 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
6922 tree * cond_expr,
6923 tree * cond_expr_stmt_list)
6925 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6926 VEC (ddr_p, heap) * may_alias_ddrs =
6927 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
6928 tree vect_factor =
6929 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
6931 ddr_p ddr;
6932 unsigned int i;
6933 tree part_cond_expr;
6935 /* Create expression
6936 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
6937 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
6941 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
6942 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
6944 if (VEC_empty (ddr_p, may_alias_ddrs))
6945 return;
6947 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
6949 struct data_reference *dr_a, *dr_b;
6950 tree dr_group_first_a, dr_group_first_b;
6951 tree addr_base_a, addr_base_b;
6952 tree segment_length_a, segment_length_b;
6953 tree stmt_a, stmt_b;
6955 dr_a = DDR_A (ddr);
6956 stmt_a = DR_STMT (DDR_A (ddr));
6957 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
6958 if (dr_group_first_a)
6960 stmt_a = dr_group_first_a;
6961 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
6964 dr_b = DDR_B (ddr);
6965 stmt_b = DR_STMT (DDR_B (ddr));
6966 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
6967 if (dr_group_first_b)
6969 stmt_b = dr_group_first_b;
6970 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
6973 addr_base_a =
6974 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
6975 NULL_TREE, loop);
6976 addr_base_b =
6977 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
6978 NULL_TREE, loop);
6980 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
6981 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
6983 if (vect_print_dump_info (REPORT_DR_DETAILS))
6985 fprintf (vect_dump,
6986 "create runtime check for data references ");
6987 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
6988 fprintf (vect_dump, " and ");
6989 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
6993 part_cond_expr =
6994 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
6995 fold_build2 (LT_EXPR, boolean_type_node,
6996 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
6997 addr_base_a,
6998 segment_length_a),
6999 addr_base_b),
7000 fold_build2 (LT_EXPR, boolean_type_node,
7001 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7002 addr_base_b,
7003 segment_length_b),
7004 addr_base_a));
7006 if (*cond_expr)
7007 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7008 *cond_expr, part_cond_expr);
7009 else
7010 *cond_expr = part_cond_expr;
7012 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7013 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7014 VEC_length (ddr_p, may_alias_ddrs));
7018 /* Function vect_loop_versioning.
7020 If the loop has data references that may or may not be aligned or/and
7021 has data reference relations whose independence was not proven then
7022 two versions of the loop need to be generated, one which is vectorized
7023 and one which isn't. A test is then generated to control which of the
7024 loops is executed. The test checks for the alignment of all of the
7025 data references that may or may not be aligned. An additional
7026 sequence of runtime tests is generated for each pairs of DDRs whose
7027 independence was not proven. The vectorized version of loop is
7028 executed only if both alias and alignment tests are passed. */
7030 static void
7031 vect_loop_versioning (loop_vec_info loop_vinfo)
7033 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7034 struct loop *nloop;
7035 tree cond_expr = NULL_TREE;
7036 tree cond_expr_stmt_list = NULL_TREE;
7037 basic_block condition_bb;
7038 block_stmt_iterator cond_exp_bsi;
7039 basic_block merge_bb;
7040 basic_block new_exit_bb;
7041 edge new_exit_e, e;
7042 tree orig_phi, new_phi, arg;
7043 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7044 tree gimplify_stmt_list;
7046 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7047 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7048 return;
7050 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7051 cond_expr =
7052 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
7054 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7055 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, &cond_expr_stmt_list);
7057 cond_expr =
7058 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7059 cond_expr =
7060 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7061 NULL_TREE);
7062 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7064 initialize_original_copy_tables ();
7065 nloop = loop_version (loop, cond_expr, &condition_bb,
7066 prob, prob, REG_BR_PROB_BASE - prob, true);
7067 free_original_copy_tables();
7069 /* Loop versioning violates an assumption we try to maintain during
7070 vectorization - that the loop exit block has a single predecessor.
7071 After versioning, the exit block of both loop versions is the same
7072 basic block (i.e. it has two predecessors). Just in order to simplify
7073 following transformations in the vectorizer, we fix this situation
7074 here by adding a new (empty) block on the exit-edge of the loop,
7075 with the proper loop-exit phis to maintain loop-closed-form. */
7077 merge_bb = single_exit (loop)->dest;
7078 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7079 new_exit_bb = split_edge (single_exit (loop));
7080 new_exit_e = single_exit (loop);
7081 e = EDGE_SUCC (new_exit_bb, 0);
7083 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7084 orig_phi = PHI_CHAIN (orig_phi))
7086 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7087 new_exit_bb);
7088 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7089 add_phi_arg (new_phi, arg, new_exit_e);
7090 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7093 /* End loop-exit-fixes after versioning. */
7095 update_ssa (TODO_update_ssa);
7096 if (cond_expr_stmt_list)
7098 cond_exp_bsi = bsi_last (condition_bb);
7099 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
7103 /* Remove a group of stores (for SLP or interleaving), free their
7104 stmt_vec_info. */
7106 static void
7107 vect_remove_stores (tree first_stmt)
7109 stmt_ann_t ann;
7110 tree next = first_stmt;
7111 tree tmp;
7112 stmt_vec_info next_stmt_info;
7113 block_stmt_iterator next_si;
7115 while (next)
7117 /* Free the attached stmt_vec_info and remove the stmt. */
7118 next_si = bsi_for_stmt (next);
7119 bsi_remove (&next_si, true);
7120 next_stmt_info = vinfo_for_stmt (next);
7121 ann = stmt_ann (next);
7122 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7123 free (next_stmt_info);
7124 set_stmt_info (ann, NULL);
7125 next = tmp;
7130 /* Vectorize SLP instance tree in postorder. */
7132 static bool
7133 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7135 tree stmt;
7136 bool strided_store, is_store;
7137 block_stmt_iterator si;
7138 stmt_vec_info stmt_info;
7140 if (!node)
7141 return false;
7143 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7144 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7146 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7147 stmt_info = vinfo_for_stmt (stmt);
7148 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7149 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7151 if (vect_print_dump_info (REPORT_DETAILS))
7153 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7154 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7157 si = bsi_for_stmt (stmt);
7158 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7159 if (is_store)
7161 if (DR_GROUP_FIRST_DR (stmt_info))
7162 /* If IS_STORE is TRUE, the vectorization of the
7163 interleaving chain was completed - free all the stores in
7164 the chain. */
7165 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7166 else
7167 /* FORNOW: SLP originates only from strided stores. */
7168 gcc_unreachable ();
7170 return true;
7173 /* FORNOW: SLP originates only from strided stores. */
7174 return false;
7178 static bool
7179 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7181 VEC (slp_instance, heap) *slp_instances =
7182 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7183 slp_instance instance;
7184 unsigned int vec_stmts_size;
7185 unsigned int group_size, i;
7186 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7187 bool is_store = false;
7189 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7191 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7192 /* For each SLP instance calculate number of vector stmts to be created
7193 for the scalar stmts in each node of the SLP tree. Number of vector
7194 elements in one vector iteration is the number of scalar elements in
7195 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
7196 size. */
7197 vec_stmts_size = vectorization_factor * group_size / nunits;
7199 /* Schedule the tree of INSTANCE. */
7200 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7201 vec_stmts_size);
7203 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7204 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7205 fprintf (vect_dump, "vectorizing stmts using SLP.");
7208 return is_store;
7211 /* Function vect_transform_loop.
7213 The analysis phase has determined that the loop is vectorizable.
7214 Vectorize the loop - created vectorized stmts to replace the scalar
7215 stmts in the loop, and update the loop exit condition. */
7217 void
7218 vect_transform_loop (loop_vec_info loop_vinfo)
7220 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7221 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7222 int nbbs = loop->num_nodes;
7223 block_stmt_iterator si, next_si;
7224 int i;
7225 tree ratio = NULL;
7226 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7227 bool strided_store;
7228 bool slp_scheduled = false;
7229 unsigned int nunits;
7231 if (vect_print_dump_info (REPORT_DETAILS))
7232 fprintf (vect_dump, "=== vec_transform_loop ===");
7233 vect_loop_versioning (loop_vinfo);
7235 /* CHECKME: we wouldn't need this if we called update_ssa once
7236 for all loops. */
7237 bitmap_zero (vect_memsyms_to_rename);
7239 /* Peel the loop if there are data refs with unknown alignment.
7240 Only one data ref with unknown store is allowed. */
7242 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7243 vect_do_peeling_for_alignment (loop_vinfo);
7245 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7246 compile time constant), or it is a constant that doesn't divide by the
7247 vectorization factor, then an epilog loop needs to be created.
7248 We therefore duplicate the loop: the original loop will be vectorized,
7249 and will compute the first (n/VF) iterations. The second copy of the loop
7250 will remain scalar and will compute the remaining (n%VF) iterations.
7251 (VF is the vectorization factor). */
7253 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7254 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7255 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7256 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7257 else
7258 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7259 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7261 /* 1) Make sure the loop header has exactly two entries
7262 2) Make sure we have a preheader basic block. */
7264 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7266 split_edge (loop_preheader_edge (loop));
7268 /* FORNOW: the vectorizer supports only loops which body consist
7269 of one basic block (header + empty latch). When the vectorizer will
7270 support more involved loop forms, the order by which the BBs are
7271 traversed need to be reconsidered. */
7273 for (i = 0; i < nbbs; i++)
7275 basic_block bb = bbs[i];
7276 stmt_vec_info stmt_info;
7277 tree phi;
7279 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7281 if (vect_print_dump_info (REPORT_DETAILS))
7283 fprintf (vect_dump, "------>vectorizing phi: ");
7284 print_generic_expr (vect_dump, phi, TDF_SLIM);
7286 stmt_info = vinfo_for_stmt (phi);
7287 if (!stmt_info)
7288 continue;
7290 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7291 && !STMT_VINFO_LIVE_P (stmt_info))
7292 continue;
7294 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7295 != (unsigned HOST_WIDE_INT) vectorization_factor)
7296 && vect_print_dump_info (REPORT_DETAILS))
7297 fprintf (vect_dump, "multiple-types.");
7299 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7301 if (vect_print_dump_info (REPORT_DETAILS))
7302 fprintf (vect_dump, "transform phi.");
7303 vect_transform_stmt (phi, NULL, NULL, NULL);
7307 for (si = bsi_start (bb); !bsi_end_p (si);)
7309 tree stmt = bsi_stmt (si);
7310 bool is_store;
7312 if (vect_print_dump_info (REPORT_DETAILS))
7314 fprintf (vect_dump, "------>vectorizing statement: ");
7315 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7318 stmt_info = vinfo_for_stmt (stmt);
7320 /* vector stmts created in the outer-loop during vectorization of
7321 stmts in an inner-loop may not have a stmt_info, and do not
7322 need to be vectorized. */
7323 if (!stmt_info)
7325 bsi_next (&si);
7326 continue;
7329 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7330 && !STMT_VINFO_LIVE_P (stmt_info))
7332 bsi_next (&si);
7333 continue;
7336 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7337 nunits =
7338 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7339 if (!STMT_SLP_TYPE (stmt_info)
7340 && nunits != (unsigned int) vectorization_factor
7341 && vect_print_dump_info (REPORT_DETAILS))
7342 /* For SLP VF is set according to unrolling factor, and not to
7343 vector size, hence for SLP this print is not valid. */
7344 fprintf (vect_dump, "multiple-types.");
7346 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7347 reached. */
7348 if (STMT_SLP_TYPE (stmt_info))
7350 if (!slp_scheduled)
7352 slp_scheduled = true;
7354 if (vect_print_dump_info (REPORT_DETAILS))
7355 fprintf (vect_dump, "=== scheduling SLP instances ===");
7357 is_store = vect_schedule_slp (loop_vinfo, nunits);
7359 /* IS_STORE is true if STMT is a store. Stores cannot be of
7360 hybrid SLP type. They are removed in
7361 vect_schedule_slp_instance and their vinfo is destroyed. */
7362 if (is_store)
7364 bsi_next (&si);
7365 continue;
7369 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7370 if (PURE_SLP_STMT (stmt_info))
7372 bsi_next (&si);
7373 continue;
7377 /* -------- vectorize statement ------------ */
7378 if (vect_print_dump_info (REPORT_DETAILS))
7379 fprintf (vect_dump, "transform statement.");
7381 strided_store = false;
7382 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7383 if (is_store)
7385 stmt_ann_t ann;
7386 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7388 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7389 interleaving chain was completed - free all the stores in
7390 the chain. */
7391 tree next = DR_GROUP_FIRST_DR (stmt_info);
7392 tree tmp;
7393 stmt_vec_info next_stmt_info;
7395 while (next)
7397 next_si = bsi_for_stmt (next);
7398 next_stmt_info = vinfo_for_stmt (next);
7399 /* Free the attached stmt_vec_info and remove the stmt. */
7400 ann = stmt_ann (next);
7401 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7402 free (next_stmt_info);
7403 set_stmt_info (ann, NULL);
7404 bsi_remove (&next_si, true);
7405 next = tmp;
7407 bsi_remove (&si, true);
7408 continue;
7410 else
7412 /* Free the attached stmt_vec_info and remove the stmt. */
7413 ann = stmt_ann (stmt);
7414 free (stmt_info);
7415 set_stmt_info (ann, NULL);
7416 bsi_remove (&si, true);
7417 continue;
7420 bsi_next (&si);
7421 } /* stmts in BB */
7422 } /* BBs in loop */
7424 slpeel_make_loop_iterate_ntimes (loop, ratio);
7426 mark_set_for_renaming (vect_memsyms_to_rename);
7428 /* The memory tags and pointers in vectorized statements need to
7429 have their SSA forms updated. FIXME, why can't this be delayed
7430 until all the loops have been transformed? */
7431 update_ssa (TODO_update_ssa);
7433 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7434 fprintf (vect_dump, "LOOP VECTORIZED.");
7435 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7436 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");