mips.c (TARGET_MIN_ANCHOR_OFFSET): Delete.
[official-gcc.git] / gcc / tree-vect-transform.c
blobc718e0742127284762526bcc17542349d737cedf
1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "ggc.h"
26 #include "tree.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
53 static tree vect_create_addr_base_for_vector_ref
54 (tree, tree *, tree, struct loop *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
64 /* Utility function dealing with loop peeling (not peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
75 static int
76 cost_for_stmt (tree stmt)
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
80 switch (STMT_VINFO_TYPE (stmt_info))
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
97 default:
98 gcc_unreachable ();
103 /* Function vect_estimate_min_profitable_iters
105 Return the number of iterations required for the vector version of the
106 loop to be profitable relative to the cost of the scalar version of the
107 loop.
109 TODO: Take profile info into account before making vectorization
110 decisions, if available. */
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
115 int i;
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
124 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
125 int nbbs = loop->num_nodes;
126 int byte_misalign;
127 int peel_guard_costs = 0;
128 int innerloop_iters = 0, factor;
129 VEC (slp_instance, heap) *slp_instances;
130 slp_instance instance;
132 /* Cost model disabled. */
133 if (!flag_vect_cost_model)
135 if (vect_print_dump_info (REPORT_DETAILS))
136 fprintf (vect_dump, "cost model disabled.");
137 return 0;
140 /* Requires loop versioning tests to handle misalignment. */
142 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
144 /* FIXME: Make cost depend on complexity of individual check. */
145 vec_outside_cost +=
146 VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
147 if (vect_print_dump_info (REPORT_DETAILS))
148 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
149 "versioning to treat misalignment.\n");
152 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
154 /* FIXME: Make cost depend on complexity of individual check. */
155 vec_outside_cost +=
156 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
157 if (vect_print_dump_info (REPORT_DETAILS))
158 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
159 "versioning aliasing.\n");
162 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
163 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
165 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
168 /* Count statements in scalar loop. Using this as scalar cost for a single
169 iteration for now.
171 TODO: Add outer loop support.
173 TODO: Consider assigning different costs to different scalar
174 statements. */
176 /* FORNOW. */
177 if (loop->inner)
178 innerloop_iters = 50; /* FIXME */
180 for (i = 0; i < nbbs; i++)
182 block_stmt_iterator si;
183 basic_block bb = bbs[i];
185 if (bb->loop_father == loop->inner)
186 factor = innerloop_iters;
187 else
188 factor = 1;
190 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
192 tree stmt = bsi_stmt (si);
193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
194 if (!STMT_VINFO_RELEVANT_P (stmt_info)
195 && !STMT_VINFO_LIVE_P (stmt_info))
196 continue;
197 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
198 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
199 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
200 some of the "outside" costs are generated inside the outer-loop. */
201 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
205 /* Add additional cost for the peeled instructions in prologue and epilogue
206 loop.
208 FORNOW: If we dont know the value of peel_iters for prologue or epilogue
209 at compile-time - we assume it's vf/2 (the worst would be vf-1).
211 TODO: Build an expression that represents peel_iters for prologue and
212 epilogue to be used in a run-time test. */
214 byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
216 if (byte_misalign < 0)
218 peel_iters_prologue = vf/2;
219 if (vect_print_dump_info (REPORT_DETAILS))
220 fprintf (vect_dump, "cost model: "
221 "prologue peel iters set to vf/2.");
223 /* If peeling for alignment is unknown, loop bound of main loop becomes
224 unknown. */
225 peel_iters_epilogue = vf/2;
226 if (vect_print_dump_info (REPORT_DETAILS))
227 fprintf (vect_dump, "cost model: "
228 "epilogue peel iters set to vf/2 because "
229 "peeling for alignment is unknown .");
231 /* If peeled iterations are unknown, count a taken branch and a not taken
232 branch per peeled loop. Even if scalar loop iterations are known,
233 vector iterations are not known since peeled prologue iterations are
234 not known. Hence guards remain the same. */
235 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
236 + TARG_COND_NOT_TAKEN_BRANCH_COST);
239 else
241 if (byte_misalign)
243 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
244 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
245 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
246 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
248 peel_iters_prologue = nelements - (byte_misalign / element_size);
250 else
251 peel_iters_prologue = 0;
253 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
255 peel_iters_epilogue = vf/2;
256 if (vect_print_dump_info (REPORT_DETAILS))
257 fprintf (vect_dump, "cost model: "
258 "epilogue peel iters set to vf/2 because "
259 "loop iterations are unknown .");
261 /* If peeled iterations are known but number of scalar loop
262 iterations are unknown, count a taken branch per peeled loop. */
263 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
266 else
268 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
269 peel_iters_prologue = niters < peel_iters_prologue ?
270 niters : peel_iters_prologue;
271 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
275 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
276 + (peel_iters_epilogue * scalar_single_iter_cost)
277 + peel_guard_costs;
279 /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
280 information we provide for the target is whether testing against the
281 threshold involves a runtime test. */
282 if (targetm.vectorize.builtin_vectorization_cost)
284 bool runtime_test = false;
286 /* If the number of iterations is unknown, or the
287 peeling-for-misalignment amount is unknown, we eill have to generate
288 a runtime test to test the loop count against the threshold. */
289 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
290 || (byte_misalign < 0))
291 runtime_test = true;
292 vec_outside_cost +=
293 targetm.vectorize.builtin_vectorization_cost (runtime_test);
294 if (vect_print_dump_info (REPORT_DETAILS))
295 fprintf (vect_dump, "cost model : Adding target out-of-loop cost = %d",
296 targetm.vectorize.builtin_vectorization_cost (runtime_test));
299 /* Add SLP costs. */
300 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
301 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
303 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
304 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
307 /* Calculate number of iterations required to make the vector version
308 profitable, relative to the loop bodies only. The following condition
309 must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
310 SIC = scalar iteration cost, VIC = vector iteration cost,
311 VOC = vector outside cost and VF = vectorization factor. */
313 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
315 if (vec_outside_cost <= 0)
316 min_profitable_iters = 1;
317 else
319 min_profitable_iters = (vec_outside_cost * vf
320 - vec_inside_cost * peel_iters_prologue
321 - vec_inside_cost * peel_iters_epilogue)
322 / ((scalar_single_iter_cost * vf)
323 - vec_inside_cost);
325 if ((scalar_single_iter_cost * vf * min_profitable_iters)
326 <= ((vec_inside_cost * min_profitable_iters)
327 + (vec_outside_cost * vf)))
328 min_profitable_iters++;
331 /* vector version will never be profitable. */
332 else
334 if (vect_print_dump_info (REPORT_DETAILS))
335 fprintf (vect_dump, "cost model: vector iteration cost = %d "
336 "is divisible by scalar iteration cost = %d by a factor "
337 "greater than or equal to the vectorization factor = %d .",
338 vec_inside_cost, scalar_single_iter_cost, vf);
339 return -1;
342 if (vect_print_dump_info (REPORT_DETAILS))
344 fprintf (vect_dump, "Cost model analysis: \n");
345 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
346 vec_inside_cost);
347 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
348 vec_outside_cost);
349 fprintf (vect_dump, " Scalar cost: %d\n", scalar_single_iter_cost);
350 fprintf (vect_dump, " prologue iterations: %d\n",
351 peel_iters_prologue);
352 fprintf (vect_dump, " epilogue iterations: %d\n",
353 peel_iters_epilogue);
354 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
355 min_profitable_iters);
358 min_profitable_iters =
359 min_profitable_iters < vf ? vf : min_profitable_iters;
361 /* Because the condition we create is:
362 if (niters <= min_profitable_iters)
363 then skip the vectorized loop. */
364 min_profitable_iters--;
366 if (vect_print_dump_info (REPORT_DETAILS))
367 fprintf (vect_dump, " Profitability threshold = %d\n",
368 min_profitable_iters);
370 return min_profitable_iters;
374 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
375 functions. Design better to avoid maintenance issues. */
377 /* Function vect_model_reduction_cost.
379 Models cost for a reduction operation, including the vector ops
380 generated within the strip-mine loop, the initial definition before
381 the loop, and the epilogue code that must be generated. */
383 static void
384 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
385 int ncopies)
387 int outer_cost = 0;
388 enum tree_code code;
389 optab optab;
390 tree vectype;
391 tree orig_stmt;
392 tree reduction_op;
393 enum machine_mode mode;
394 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
395 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
399 /* Cost of reduction op inside loop. */
400 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
402 reduction_op = TREE_OPERAND (operation, op_type-1);
403 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
404 mode = TYPE_MODE (vectype);
405 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
407 if (!orig_stmt)
408 orig_stmt = STMT_VINFO_STMT (stmt_info);
410 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
412 /* Add in cost for initial definition. */
413 outer_cost += TARG_SCALAR_TO_VEC_COST;
415 /* Determine cost of epilogue code.
417 We have a reduction operator that will reduce the vector in one statement.
418 Also requires scalar extract. */
420 if (!nested_in_vect_loop_p (loop, orig_stmt))
422 if (reduc_code < NUM_TREE_CODES)
423 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
424 else
426 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
427 tree bitsize =
428 TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0)));
429 int element_bitsize = tree_low_cst (bitsize, 1);
430 int nelements = vec_size_in_bits / element_bitsize;
432 optab = optab_for_tree_code (code, vectype);
434 /* We have a whole vector shift available. */
435 if (VECTOR_MODE_P (mode)
436 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
437 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
438 /* Final reduction via vector shifts and the reduction operator. Also
439 requires scalar extract. */
440 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
441 + TARG_VEC_TO_SCALAR_COST);
442 else
443 /* Use extracts and reduction op for final reduction. For N elements,
444 we have N extracts and N-1 reduction ops. */
445 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
449 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
451 if (vect_print_dump_info (REPORT_DETAILS))
452 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
453 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
454 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
458 /* Function vect_model_induction_cost.
460 Models cost for induction operations. */
462 static void
463 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
465 /* loop cost for vec_loop. */
466 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
467 /* prologue cost for vec_init and vec_step. */
468 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
470 if (vect_print_dump_info (REPORT_DETAILS))
471 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
472 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
473 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
477 /* Function vect_model_simple_cost.
479 Models cost for simple operations, i.e. those that only emit ncopies of a
480 single op. Right now, this does not account for multiple insns that could
481 be generated for the single vector op. We will handle that shortly. */
483 void
484 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
485 enum vect_def_type *dt, slp_tree slp_node)
487 int i;
488 int inside_cost = 0, outside_cost = 0;
490 inside_cost = ncopies * TARG_VEC_STMT_COST;
492 /* FORNOW: Assuming maximum 2 args per stmts. */
493 for (i = 0; i < 2; i++)
495 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
496 outside_cost += TARG_SCALAR_TO_VEC_COST;
499 if (vect_print_dump_info (REPORT_DETAILS))
500 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
501 "outside_cost = %d .", inside_cost, outside_cost);
503 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
504 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
505 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
509 /* Function vect_cost_strided_group_size
511 For strided load or store, return the group_size only if it is the first
512 load or store of a group, else return 1. This ensures that group size is
513 only returned once per group. */
515 static int
516 vect_cost_strided_group_size (stmt_vec_info stmt_info)
518 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
520 if (first_stmt == STMT_VINFO_STMT (stmt_info))
521 return DR_GROUP_SIZE (stmt_info);
523 return 1;
527 /* Function vect_model_store_cost
529 Models cost for stores. In the case of strided accesses, one access
530 has the overhead of the strided access attributed to it. */
532 void
533 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
534 enum vect_def_type dt, slp_tree slp_node)
536 int group_size;
537 int inside_cost = 0, outside_cost = 0;
539 if (dt == vect_constant_def || dt == vect_invariant_def)
540 outside_cost = TARG_SCALAR_TO_VEC_COST;
542 /* Strided access? */
543 if (DR_GROUP_FIRST_DR (stmt_info))
544 group_size = vect_cost_strided_group_size (stmt_info);
545 /* Not a strided access. */
546 else
547 group_size = 1;
549 /* Is this an access in a group of stores, which provide strided access?
550 If so, add in the cost of the permutes. */
551 if (group_size > 1)
553 /* Uses a high and low interleave operation for each needed permute. */
554 inside_cost = ncopies * exact_log2(group_size) * group_size
555 * TARG_VEC_STMT_COST;
557 if (vect_print_dump_info (REPORT_DETAILS))
558 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
559 group_size);
563 /* Costs of the stores. */
564 inside_cost += ncopies * TARG_VEC_STORE_COST;
566 if (vect_print_dump_info (REPORT_DETAILS))
567 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
568 "outside_cost = %d .", inside_cost, outside_cost);
570 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
571 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
572 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
576 /* Function vect_model_load_cost
578 Models cost for loads. In the case of strided accesses, the last access
579 has the overhead of the strided access attributed to it. Since unaligned
580 accesses are supported for loads, we also account for the costs of the
581 access scheme chosen. */
583 void
584 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
587 int group_size;
588 int alignment_support_cheme;
589 tree first_stmt;
590 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
591 int inside_cost = 0, outside_cost = 0;
593 /* Strided accesses? */
594 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
595 if (first_stmt && !slp_node)
597 group_size = vect_cost_strided_group_size (stmt_info);
598 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
600 /* Not a strided access. */
601 else
603 group_size = 1;
604 first_dr = dr;
607 alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
609 /* Is this an access in a group of loads providing strided access?
610 If so, add in the cost of the permutes. */
611 if (group_size > 1)
613 /* Uses an even and odd extract operations for each needed permute. */
614 inside_cost = ncopies * exact_log2(group_size) * group_size
615 * TARG_VEC_STMT_COST;
617 if (vect_print_dump_info (REPORT_DETAILS))
618 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
619 group_size);
623 /* The loads themselves. */
624 switch (alignment_support_cheme)
626 case dr_aligned:
628 inside_cost += ncopies * TARG_VEC_LOAD_COST;
630 if (vect_print_dump_info (REPORT_DETAILS))
631 fprintf (vect_dump, "vect_model_load_cost: aligned.");
633 break;
635 case dr_unaligned_supported:
637 /* Here, we assign an additional cost for the unaligned load. */
638 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
640 if (vect_print_dump_info (REPORT_DETAILS))
641 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
642 "hardware.");
644 break;
646 case dr_explicit_realign:
648 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
650 /* FIXME: If the misalignment remains fixed across the iterations of
651 the containing loop, the following cost should be added to the
652 outside costs. */
653 if (targetm.vectorize.builtin_mask_for_load)
654 inside_cost += TARG_VEC_STMT_COST;
656 break;
658 case dr_explicit_realign_optimized:
660 if (vect_print_dump_info (REPORT_DETAILS))
661 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
662 "pipelined.");
664 /* Unaligned software pipeline has a load of an address, an initial
665 load, and possibly a mask operation to "prime" the loop. However,
666 if this is an access in a group of loads, which provide strided
667 access, then the above cost should only be considered for one
668 access in the group. Inside the loop, there is a load op
669 and a realignment op. */
671 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
673 outside_cost = 2*TARG_VEC_STMT_COST;
674 if (targetm.vectorize.builtin_mask_for_load)
675 outside_cost += TARG_VEC_STMT_COST;
678 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
680 break;
683 default:
684 gcc_unreachable ();
687 if (vect_print_dump_info (REPORT_DETAILS))
688 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
689 "outside_cost = %d .", inside_cost, outside_cost);
691 /* Set the costs either in STMT_INFO or SLP_NODE (if exists). */
692 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
693 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
697 /* Function vect_get_new_vect_var.
699 Returns a name for a new variable. The current naming scheme appends the
700 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
701 the name of vectorizer generated variables, and appends that to NAME if
702 provided. */
704 static tree
705 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
707 const char *prefix;
708 tree new_vect_var;
710 switch (var_kind)
712 case vect_simple_var:
713 prefix = "vect_";
714 break;
715 case vect_scalar_var:
716 prefix = "stmp_";
717 break;
718 case vect_pointer_var:
719 prefix = "vect_p";
720 break;
721 default:
722 gcc_unreachable ();
725 if (name)
727 char* tmp = concat (prefix, name, NULL);
728 new_vect_var = create_tmp_var (type, tmp);
729 free (tmp);
731 else
732 new_vect_var = create_tmp_var (type, prefix);
734 /* Mark vector typed variable as a gimple register variable. */
735 if (TREE_CODE (type) == VECTOR_TYPE)
736 DECL_GIMPLE_REG_P (new_vect_var) = true;
738 return new_vect_var;
742 /* Function vect_create_addr_base_for_vector_ref.
744 Create an expression that computes the address of the first memory location
745 that will be accessed for a data reference.
747 Input:
748 STMT: The statement containing the data reference.
749 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
750 OFFSET: Optional. If supplied, it is be added to the initial address.
751 LOOP: Specify relative to which loop-nest should the address be computed.
752 For example, when the dataref is in an inner-loop nested in an
753 outer-loop that is now being vectorized, LOOP can be either the
754 outer-loop, or the inner-loop. The first memory location accessed
755 by the following dataref ('in' points to short):
757 for (i=0; i<N; i++)
758 for (j=0; j<M; j++)
759 s += in[i+j]
761 is as follows:
762 if LOOP=i_loop: &in (relative to i_loop)
763 if LOOP=j_loop: &in+i*2B (relative to j_loop)
765 Output:
766 1. Return an SSA_NAME whose value is the address of the memory location of
767 the first vector of the data reference.
768 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
769 these statement(s) which define the returned SSA_NAME.
771 FORNOW: We are only handling array accesses with step 1. */
773 static tree
774 vect_create_addr_base_for_vector_ref (tree stmt,
775 tree *new_stmt_list,
776 tree offset,
777 struct loop *loop)
779 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
780 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
781 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
782 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
783 tree base_name;
784 tree data_ref_base_var;
785 tree new_base_stmt;
786 tree vec_stmt;
787 tree addr_base, addr_expr;
788 tree dest, new_stmt;
789 tree base_offset = unshare_expr (DR_OFFSET (dr));
790 tree init = unshare_expr (DR_INIT (dr));
791 tree vect_ptr_type, addr_expr2;
792 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
794 gcc_assert (loop);
795 if (loop != containing_loop)
797 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
798 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
800 gcc_assert (nested_in_vect_loop_p (loop, stmt));
802 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
803 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
804 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
807 /* Create data_ref_base */
808 base_name = build_fold_indirect_ref (data_ref_base);
809 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
810 add_referenced_var (data_ref_base_var);
811 data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
812 true, data_ref_base_var);
813 append_to_statement_list_force(new_base_stmt, new_stmt_list);
815 /* Create base_offset */
816 base_offset = size_binop (PLUS_EXPR, base_offset, init);
817 base_offset = fold_convert (sizetype, base_offset);
818 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
819 add_referenced_var (dest);
820 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
821 append_to_statement_list_force (new_stmt, new_stmt_list);
823 if (offset)
825 tree tmp = create_tmp_var (sizetype, "offset");
827 add_referenced_var (tmp);
828 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
829 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
830 base_offset, offset);
831 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
832 append_to_statement_list_force (new_stmt, new_stmt_list);
835 /* base + base_offset */
836 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
837 data_ref_base, base_offset);
839 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
841 /* addr_expr = addr_base */
842 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
843 get_name (base_name));
844 add_referenced_var (addr_expr);
845 vec_stmt = fold_convert (vect_ptr_type, addr_base);
846 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
847 get_name (base_name));
848 add_referenced_var (addr_expr2);
849 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
850 append_to_statement_list_force (new_stmt, new_stmt_list);
852 if (vect_print_dump_info (REPORT_DETAILS))
854 fprintf (vect_dump, "created ");
855 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
857 return vec_stmt;
861 /* Function vect_create_data_ref_ptr.
863 Create a new pointer to vector type (vp), that points to the first location
864 accessed in the loop by STMT, along with the def-use update chain to
865 appropriately advance the pointer through the loop iterations. Also set
866 aliasing information for the pointer. This vector pointer is used by the
867 callers to this function to create a memory reference expression for vector
868 load/store access.
870 Input:
871 1. STMT: a stmt that references memory. Expected to be of the form
872 GIMPLE_MODIFY_STMT <name, data-ref> or
873 GIMPLE_MODIFY_STMT <data-ref, name>.
874 2. AT_LOOP: the loop where the vector memref is to be created.
875 3. OFFSET (optional): an offset to be added to the initial address accessed
876 by the data-ref in STMT.
877 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
878 pointing to the initial address.
879 5. TYPE: if not NULL indicates the required type of the data-ref
881 Output:
882 1. Declare a new ptr to vector_type, and have it point to the base of the
883 data reference (initial addressed accessed by the data reference).
884 For example, for vector of type V8HI, the following code is generated:
886 v8hi *vp;
887 vp = (v8hi *)initial_address;
889 if OFFSET is not supplied:
890 initial_address = &a[init];
891 if OFFSET is supplied:
892 initial_address = &a[init + OFFSET];
894 Return the initial_address in INITIAL_ADDRESS.
896 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
897 update the pointer in each iteration of the loop.
899 Return the increment stmt that updates the pointer in PTR_INCR.
901 3. Set INV_P to true if the access pattern of the data reference in the
902 vectorized loop is invariant. Set it to false otherwise.
904 4. Return the pointer. */
906 static tree
907 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
908 tree offset, tree *initial_address, tree *ptr_incr,
909 bool only_init, tree type, bool *inv_p)
911 tree base_name;
912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
914 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
915 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
916 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
918 tree vect_ptr_type;
919 tree vect_ptr;
920 tree tag;
921 tree new_temp;
922 tree vec_stmt;
923 tree new_stmt_list = NULL_TREE;
924 edge pe;
925 basic_block new_bb;
926 tree vect_ptr_init;
927 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
928 tree vptr;
929 block_stmt_iterator incr_bsi;
930 bool insert_after;
931 tree indx_before_incr, indx_after_incr;
932 tree incr;
933 tree step;
935 /* Check the step (evolution) of the load in LOOP, and record
936 whether it's invariant. */
937 if (nested_in_vect_loop)
938 step = STMT_VINFO_DR_STEP (stmt_info);
939 else
940 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
942 if (tree_int_cst_compare (step, size_zero_node) == 0)
943 *inv_p = true;
944 else
945 *inv_p = false;
947 /* Create an expression for the first address accessed by this load
948 in LOOP. */
949 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
951 if (vect_print_dump_info (REPORT_DETAILS))
953 tree data_ref_base = base_name;
954 fprintf (vect_dump, "create vector-pointer variable to type: ");
955 print_generic_expr (vect_dump, vectype, TDF_SLIM);
956 if (TREE_CODE (data_ref_base) == VAR_DECL)
957 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
958 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
959 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
960 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
961 fprintf (vect_dump, " vectorizing a record based array ref: ");
962 else if (TREE_CODE (data_ref_base) == SSA_NAME)
963 fprintf (vect_dump, " vectorizing a pointer ref: ");
964 print_generic_expr (vect_dump, base_name, TDF_SLIM);
967 /** (1) Create the new vector-pointer variable: **/
968 if (type)
969 vect_ptr_type = build_pointer_type (type);
970 else
971 vect_ptr_type = build_pointer_type (vectype);
972 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
973 get_name (base_name));
974 add_referenced_var (vect_ptr);
976 /** (2) Add aliasing information to the new vector-pointer:
977 (The points-to info (DR_PTR_INFO) may be defined later.) **/
979 tag = DR_SYMBOL_TAG (dr);
980 gcc_assert (tag);
982 /* If tag is a variable (and NOT_A_TAG) than a new symbol memory
983 tag must be created with tag added to its may alias list. */
984 if (!MTAG_P (tag))
985 new_type_alias (vect_ptr, tag, DR_REF (dr));
986 else
987 set_symbol_mem_tag (vect_ptr, tag);
989 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
991 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
992 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
993 def-use update cycles for the pointer: One relative to the outer-loop
994 (LOOP), which is what steps (3) and (4) below do. The other is relative
995 to the inner-loop (which is the inner-most loop containing the dataref),
996 and this is done be step (5) below.
998 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
999 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1000 redundant. Steps (3),(4) create the following:
1002 vp0 = &base_addr;
1003 LOOP: vp1 = phi(vp0,vp2)
1004 ...
1006 vp2 = vp1 + step
1007 goto LOOP
1009 If there is an inner-loop nested in loop, then step (5) will also be
1010 applied, and an additional update in the inner-loop will be created:
1012 vp0 = &base_addr;
1013 LOOP: vp1 = phi(vp0,vp2)
1015 inner: vp3 = phi(vp1,vp4)
1016 vp4 = vp3 + inner_step
1017 if () goto inner
1019 vp2 = vp1 + step
1020 if () goto LOOP */
1022 /** (3) Calculate the initial address the vector-pointer, and set
1023 the vector-pointer to point to it before the loop: **/
1025 /* Create: (&(base[init_val+offset]) in the loop preheader. */
1027 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1028 offset, loop);
1029 pe = loop_preheader_edge (loop);
1030 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1031 gcc_assert (!new_bb);
1032 *initial_address = new_temp;
1034 /* Create: p = (vectype *) initial_base */
1035 vec_stmt = fold_convert (vect_ptr_type, new_temp);
1036 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1037 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1038 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1039 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1040 gcc_assert (!new_bb);
1043 /** (4) Handle the updating of the vector-pointer inside the loop.
1044 This is needed when ONLY_INIT is false, and also when AT_LOOP
1045 is the inner-loop nested in LOOP (during outer-loop vectorization).
1048 if (only_init && at_loop == loop) /* No update in loop is required. */
1050 /* Copy the points-to information if it exists. */
1051 if (DR_PTR_INFO (dr))
1052 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1053 vptr = vect_ptr_init;
1055 else
1057 /* The step of the vector pointer is the Vector Size. */
1058 tree step = TYPE_SIZE_UNIT (vectype);
1059 /* One exception to the above is when the scalar step of the load in
1060 LOOP is zero. In this case the step here is also zero. */
1061 if (*inv_p)
1062 step = size_zero_node;
1064 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1066 create_iv (vect_ptr_init,
1067 fold_convert (vect_ptr_type, step),
1068 NULL_TREE, loop, &incr_bsi, insert_after,
1069 &indx_before_incr, &indx_after_incr);
1070 incr = bsi_stmt (incr_bsi);
1071 set_stmt_info (stmt_ann (incr),
1072 new_stmt_vec_info (incr, loop_vinfo));
1074 /* Copy the points-to information if it exists. */
1075 if (DR_PTR_INFO (dr))
1077 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1078 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1080 merge_alias_info (vect_ptr_init, indx_before_incr);
1081 merge_alias_info (vect_ptr_init, indx_after_incr);
1082 if (ptr_incr)
1083 *ptr_incr = incr;
1085 vptr = indx_before_incr;
1088 if (!nested_in_vect_loop || only_init)
1089 return vptr;
1092 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1093 nested in LOOP, if exists: **/
1095 gcc_assert (nested_in_vect_loop);
1096 if (!only_init)
1098 standard_iv_increment_position (containing_loop, &incr_bsi,
1099 &insert_after);
1100 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1101 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1102 &indx_after_incr);
1103 incr = bsi_stmt (incr_bsi);
1104 set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1106 /* Copy the points-to information if it exists. */
1107 if (DR_PTR_INFO (dr))
1109 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1110 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1112 merge_alias_info (vect_ptr_init, indx_before_incr);
1113 merge_alias_info (vect_ptr_init, indx_after_incr);
1114 if (ptr_incr)
1115 *ptr_incr = incr;
1117 return indx_before_incr;
1119 else
1120 gcc_unreachable ();
1124 /* Function bump_vector_ptr
1126 Increment a pointer (to a vector type) by vector-size. If requested,
1127 i.e. if PTR-INCR is given, then also connect the new increment stmt
1128 to the existing def-use update-chain of the pointer, by modifying
1129 the PTR_INCR as illustrated below:
1131 The pointer def-use update-chain before this function:
1132 DATAREF_PTR = phi (p_0, p_2)
1133 ....
1134 PTR_INCR: p_2 = DATAREF_PTR + step
1136 The pointer def-use update-chain after this function:
1137 DATAREF_PTR = phi (p_0, p_2)
1138 ....
1139 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1140 ....
1141 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1143 Input:
1144 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1145 in the loop.
1146 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1147 the loop. The increment amount across iterations is expected
1148 to be vector_size.
1149 BSI - location where the new update stmt is to be placed.
1150 STMT - the original scalar memory-access stmt that is being vectorized.
1151 BUMP - optional. The offset by which to bump the pointer. If not given,
1152 the offset is assumed to be vector_size.
1154 Output: Return NEW_DATAREF_PTR as illustrated above.
1158 static tree
1159 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1160 tree stmt, tree bump)
1162 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1163 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1165 tree vptr_type = TREE_TYPE (dataref_ptr);
1166 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1167 tree update = TYPE_SIZE_UNIT (vectype);
1168 tree incr_stmt;
1169 ssa_op_iter iter;
1170 use_operand_p use_p;
1171 tree new_dataref_ptr;
1173 if (bump)
1174 update = bump;
1176 incr_stmt = build_gimple_modify_stmt (ptr_var,
1177 build2 (POINTER_PLUS_EXPR, vptr_type,
1178 dataref_ptr, update));
1179 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1180 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1181 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1183 /* Copy the points-to information if it exists. */
1184 if (DR_PTR_INFO (dr))
1185 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1186 merge_alias_info (new_dataref_ptr, dataref_ptr);
1188 if (!ptr_incr)
1189 return new_dataref_ptr;
1191 /* Update the vector-pointer's cross-iteration increment. */
1192 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1194 tree use = USE_FROM_PTR (use_p);
1196 if (use == dataref_ptr)
1197 SET_USE (use_p, new_dataref_ptr);
1198 else
1199 gcc_assert (tree_int_cst_compare (use, update) == 0);
1202 return new_dataref_ptr;
1206 /* Function vect_create_destination_var.
1208 Create a new temporary of type VECTYPE. */
1210 static tree
1211 vect_create_destination_var (tree scalar_dest, tree vectype)
1213 tree vec_dest;
1214 const char *new_name;
1215 tree type;
1216 enum vect_var_kind kind;
1218 kind = vectype ? vect_simple_var : vect_scalar_var;
1219 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1221 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1223 new_name = get_name (scalar_dest);
1224 if (!new_name)
1225 new_name = "var_";
1226 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1227 add_referenced_var (vec_dest);
1229 return vec_dest;
1233 /* Function vect_init_vector.
1235 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1236 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1237 is not NULL. Otherwise, place the initialization at the loop preheader.
1238 Return the DEF of INIT_STMT.
1239 It will be used in the vectorization of STMT. */
1241 static tree
1242 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1243 block_stmt_iterator *bsi)
1245 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1246 tree new_var;
1247 tree init_stmt;
1248 tree vec_oprnd;
1249 edge pe;
1250 tree new_temp;
1251 basic_block new_bb;
1253 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1254 add_referenced_var (new_var);
1255 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1256 new_temp = make_ssa_name (new_var, init_stmt);
1257 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1259 if (bsi)
1260 vect_finish_stmt_generation (stmt, init_stmt, bsi);
1261 else
1263 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1264 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1266 if (nested_in_vect_loop_p (loop, stmt))
1267 loop = loop->inner;
1268 pe = loop_preheader_edge (loop);
1269 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1270 gcc_assert (!new_bb);
1273 if (vect_print_dump_info (REPORT_DETAILS))
1275 fprintf (vect_dump, "created new init_stmt: ");
1276 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1279 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1280 return vec_oprnd;
1284 /* For constant and loop invariant defs of SLP_NODE this function returns
1285 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1286 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1287 stmts. */
1289 static void
1290 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1291 unsigned int op_num)
1293 VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1294 tree stmt = VEC_index (tree, stmts, 0);
1295 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1296 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1297 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1298 tree vec_cst;
1299 tree t = NULL_TREE;
1300 int j, number_of_places_left_in_vector;
1301 tree vector_type;
1302 tree op, vop, operation;
1303 int group_size = VEC_length (tree, stmts);
1304 unsigned int vec_num, i;
1305 int number_of_copies = 1;
1306 bool is_store = false;
1307 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1308 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1310 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1311 is_store = true;
1313 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1314 created vectors. It is greater than 1 if unrolling is performed.
1316 For example, we have two scalar operands, s1 and s2 (e.g., group of
1317 strided accesses of size two), while NUINTS is four (i.e., four scalars
1318 of this type can be packed in a vector). The output vector will contain
1319 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1320 will be 2).
1322 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1323 containing the operands.
1325 For example, NUINTS is four as before, and the group size is 8
1326 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1327 {s5, s6, s7, s8}. */
1329 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1331 number_of_places_left_in_vector = nunits;
1332 for (j = 0; j < number_of_copies; j++)
1334 for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
1336 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1337 if (is_store)
1338 op = operation;
1339 else
1340 op = TREE_OPERAND (operation, op_num);
1342 /* Create 'vect_ = {op0,op1,...,opn}'. */
1343 t = tree_cons (NULL_TREE, op, t);
1345 number_of_places_left_in_vector--;
1347 if (number_of_places_left_in_vector == 0)
1349 number_of_places_left_in_vector = nunits;
1351 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1352 vec_cst = build_constructor_from_list (vector_type, t);
1353 VEC_quick_push (tree, voprnds,
1354 vect_init_vector (stmt, vec_cst, vector_type,
1355 NULL));
1356 t = NULL_TREE;
1361 /* Since the vectors are created in the reverse order, we should invert
1362 them. */
1363 vec_num = VEC_length (tree, voprnds);
1364 for (j = vec_num - 1; j >= 0; j--)
1366 vop = VEC_index (tree, voprnds, j);
1367 VEC_quick_push (tree, *vec_oprnds, vop);
1370 VEC_free (tree, heap, voprnds);
1372 /* In case that VF is greater than the unrolling factor needed for the SLP
1373 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1374 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1375 to replicate the vectors. */
1376 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1378 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1379 VEC_quick_push (tree, *vec_oprnds, vop);
1384 /* Get vectorized definitions from SLP_NODE that contains corresponding
1385 vectorized def-stmts. */
1387 static void
1388 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1390 tree vec_oprnd;
1391 tree vec_def_stmt;
1392 unsigned int i;
1394 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1396 for (i = 0;
1397 VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1398 i++)
1400 gcc_assert (vec_def_stmt);
1401 vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
1402 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1407 /* Get vectorized definitions for SLP_NODE.
1408 If the scalar definitions are loop invariants or constants, collect them and
1409 call vect_get_constant_vectors() to create vector stmts.
1410 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1411 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1412 vect_get_slp_vect_defs() to retrieve them.
1413 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1414 the right node. This is used when the second operand must remain scalar. */
1416 static void
1417 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1418 VEC (tree,heap) **vec_oprnds1)
1420 tree operation, first_stmt;
1422 /* Allocate memory for vectorized defs. */
1423 *vec_oprnds0 = VEC_alloc (tree, heap,
1424 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1426 /* SLP_NODE corresponds either to a group of stores or to a group of
1427 unary/binary operations. We don't call this function for loads. */
1428 if (SLP_TREE_LEFT (slp_node))
1429 /* The defs are already vectorized. */
1430 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1431 else
1432 /* Build vectors from scalar defs. */
1433 vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
1435 first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1436 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1437 /* Since we don't call this function with loads, this is a group of
1438 stores. */
1439 return;
1441 operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
1442 if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
1443 return;
1445 *vec_oprnds1 = VEC_alloc (tree, heap,
1446 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1448 if (SLP_TREE_RIGHT (slp_node))
1449 /* The defs are already vectorized. */
1450 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1451 else
1452 /* Build vectors from scalar defs. */
1453 vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
1457 /* Function get_initial_def_for_induction
1459 Input:
1460 STMT - a stmt that performs an induction operation in the loop.
1461 IV_PHI - the initial value of the induction variable
1463 Output:
1464 Return a vector variable, initialized with the first VF values of
1465 the induction variable. E.g., for an iv with IV_PHI='X' and
1466 evolution S, for a vector of 4 units, we want to return:
1467 [X, X + S, X + 2*S, X + 3*S]. */
1469 static tree
1470 get_initial_def_for_induction (tree iv_phi)
1472 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1474 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1475 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1476 tree vectype = get_vectype_for_scalar_type (scalar_type);
1477 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1478 edge pe = loop_preheader_edge (loop);
1479 struct loop *iv_loop;
1480 basic_block new_bb;
1481 tree vec, vec_init, vec_step, t;
1482 tree access_fn;
1483 tree new_var;
1484 tree new_name;
1485 tree init_stmt;
1486 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1487 tree init_expr, step_expr;
1488 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1489 int i;
1490 bool ok;
1491 int ncopies = vf / nunits;
1492 tree expr;
1493 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1494 bool nested_in_vect_loop = false;
1495 tree stmts;
1496 imm_use_iterator imm_iter;
1497 use_operand_p use_p;
1498 tree exit_phi;
1499 edge latch_e;
1500 tree loop_arg;
1501 block_stmt_iterator si;
1502 basic_block bb = bb_for_stmt (iv_phi);
1504 gcc_assert (phi_info);
1505 gcc_assert (ncopies >= 1);
1507 /* Find the first insertion point in the BB. */
1508 si = bsi_after_labels (bb);
1510 if (INTEGRAL_TYPE_P (scalar_type))
1511 step_expr = build_int_cst (scalar_type, 0);
1512 else
1513 step_expr = build_real (scalar_type, dconst0);
1515 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1516 if (nested_in_vect_loop_p (loop, iv_phi))
1518 nested_in_vect_loop = true;
1519 iv_loop = loop->inner;
1521 else
1522 iv_loop = loop;
1523 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1525 latch_e = loop_latch_edge (iv_loop);
1526 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1528 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1529 gcc_assert (access_fn);
1530 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1531 &init_expr, &step_expr);
1532 gcc_assert (ok);
1533 pe = loop_preheader_edge (iv_loop);
1535 /* Create the vector that holds the initial_value of the induction. */
1536 if (nested_in_vect_loop)
1538 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1539 been created during vectorization of previous stmts; We obtain it from
1540 the STMT_VINFO_VEC_STMT of the defining stmt. */
1541 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1542 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1544 else
1546 /* iv_loop is the loop to be vectorized. Create:
1547 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1548 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1549 add_referenced_var (new_var);
1551 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1552 if (stmts)
1554 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1555 gcc_assert (!new_bb);
1558 t = NULL_TREE;
1559 t = tree_cons (NULL_TREE, init_expr, t);
1560 for (i = 1; i < nunits; i++)
1562 tree tmp;
1564 /* Create: new_name_i = new_name + step_expr */
1565 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1566 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1567 new_name = make_ssa_name (new_var, init_stmt);
1568 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1570 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1571 gcc_assert (!new_bb);
1573 if (vect_print_dump_info (REPORT_DETAILS))
1575 fprintf (vect_dump, "created new init_stmt: ");
1576 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1578 t = tree_cons (NULL_TREE, new_name, t);
1580 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1581 vec = build_constructor_from_list (vectype, nreverse (t));
1582 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1586 /* Create the vector that holds the step of the induction. */
1587 if (nested_in_vect_loop)
1588 /* iv_loop is nested in the loop to be vectorized. Generate:
1589 vec_step = [S, S, S, S] */
1590 new_name = step_expr;
1591 else
1593 /* iv_loop is the loop to be vectorized. Generate:
1594 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1595 expr = build_int_cst (scalar_type, vf);
1596 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1599 t = NULL_TREE;
1600 for (i = 0; i < nunits; i++)
1601 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1602 vec = build_constructor_from_list (vectype, t);
1603 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1606 /* Create the following def-use cycle:
1607 loop prolog:
1608 vec_init = ...
1609 vec_step = ...
1610 loop:
1611 vec_iv = PHI <vec_init, vec_loop>
1613 STMT
1615 vec_loop = vec_iv + vec_step; */
1617 /* Create the induction-phi that defines the induction-operand. */
1618 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1619 add_referenced_var (vec_dest);
1620 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1621 set_stmt_info (get_stmt_ann (induction_phi),
1622 new_stmt_vec_info (induction_phi, loop_vinfo));
1623 induc_def = PHI_RESULT (induction_phi);
1625 /* Create the iv update inside the loop */
1626 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1627 build2 (PLUS_EXPR, vectype,
1628 induc_def, vec_step));
1629 vec_def = make_ssa_name (vec_dest, new_stmt);
1630 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1631 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1632 set_stmt_info (get_stmt_ann (new_stmt),
1633 new_stmt_vec_info (new_stmt, loop_vinfo));
1635 /* Set the arguments of the phi node: */
1636 add_phi_arg (induction_phi, vec_init, pe);
1637 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1640 /* In case that vectorization factor (VF) is bigger than the number
1641 of elements that we can fit in a vectype (nunits), we have to generate
1642 more than one vector stmt - i.e - we need to "unroll" the
1643 vector stmt by a factor VF/nunits. For more details see documentation
1644 in vectorizable_operation. */
1646 if (ncopies > 1)
1648 stmt_vec_info prev_stmt_vinfo;
1649 /* FORNOW. This restriction should be relaxed. */
1650 gcc_assert (!nested_in_vect_loop);
1652 /* Create the vector that holds the step of the induction. */
1653 expr = build_int_cst (scalar_type, nunits);
1654 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1655 t = NULL_TREE;
1656 for (i = 0; i < nunits; i++)
1657 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1658 vec = build_constructor_from_list (vectype, t);
1659 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1661 vec_def = induc_def;
1662 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1663 for (i = 1; i < ncopies; i++)
1665 tree tmp;
1667 /* vec_i = vec_prev + vec_step */
1668 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1669 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1670 vec_def = make_ssa_name (vec_dest, new_stmt);
1671 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1672 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1673 set_stmt_info (get_stmt_ann (new_stmt),
1674 new_stmt_vec_info (new_stmt, loop_vinfo));
1675 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1676 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1680 if (nested_in_vect_loop)
1682 /* Find the loop-closed exit-phi of the induction, and record
1683 the final vector of induction results: */
1684 exit_phi = NULL;
1685 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1687 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1689 exit_phi = USE_STMT (use_p);
1690 break;
1693 if (exit_phi)
1695 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1696 /* FORNOW. Currently not supporting the case that an inner-loop induction
1697 is not used in the outer-loop (i.e. only outside the outer-loop). */
1698 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1699 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1701 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1702 if (vect_print_dump_info (REPORT_DETAILS))
1704 fprintf (vect_dump, "vector of inductions after inner-loop:");
1705 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1711 if (vect_print_dump_info (REPORT_DETAILS))
1713 fprintf (vect_dump, "transform induction: created def-use cycle:");
1714 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1715 fprintf (vect_dump, "\n");
1716 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1719 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1720 return induc_def;
1724 /* Function vect_get_vec_def_for_operand.
1726 OP is an operand in STMT. This function returns a (vector) def that will be
1727 used in the vectorized stmt for STMT.
1729 In the case that OP is an SSA_NAME which is defined in the loop, then
1730 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1732 In case OP is an invariant or constant, a new stmt that creates a vector def
1733 needs to be introduced. */
1735 static tree
1736 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1738 tree vec_oprnd;
1739 tree vec_stmt;
1740 tree def_stmt;
1741 stmt_vec_info def_stmt_info = NULL;
1742 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1743 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1744 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1745 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1746 tree vec_inv;
1747 tree vec_cst;
1748 tree t = NULL_TREE;
1749 tree def;
1750 int i;
1751 enum vect_def_type dt;
1752 bool is_simple_use;
1753 tree vector_type;
1755 if (vect_print_dump_info (REPORT_DETAILS))
1757 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1758 print_generic_expr (vect_dump, op, TDF_SLIM);
1761 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1762 gcc_assert (is_simple_use);
1763 if (vect_print_dump_info (REPORT_DETAILS))
1765 if (def)
1767 fprintf (vect_dump, "def = ");
1768 print_generic_expr (vect_dump, def, TDF_SLIM);
1770 if (def_stmt)
1772 fprintf (vect_dump, " def_stmt = ");
1773 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1777 switch (dt)
1779 /* Case 1: operand is a constant. */
1780 case vect_constant_def:
1782 if (scalar_def)
1783 *scalar_def = op;
1785 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1786 if (vect_print_dump_info (REPORT_DETAILS))
1787 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1789 for (i = nunits - 1; i >= 0; --i)
1791 t = tree_cons (NULL_TREE, op, t);
1793 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1794 vec_cst = build_vector (vector_type, t);
1796 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1799 /* Case 2: operand is defined outside the loop - loop invariant. */
1800 case vect_invariant_def:
1802 if (scalar_def)
1803 *scalar_def = def;
1805 /* Create 'vec_inv = {inv,inv,..,inv}' */
1806 if (vect_print_dump_info (REPORT_DETAILS))
1807 fprintf (vect_dump, "Create vector_inv.");
1809 for (i = nunits - 1; i >= 0; --i)
1811 t = tree_cons (NULL_TREE, def, t);
1814 /* FIXME: use build_constructor directly. */
1815 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1816 vec_inv = build_constructor_from_list (vector_type, t);
1817 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1820 /* Case 3: operand is defined inside the loop. */
1821 case vect_loop_def:
1823 if (scalar_def)
1824 *scalar_def = def_stmt;
1826 /* Get the def from the vectorized stmt. */
1827 def_stmt_info = vinfo_for_stmt (def_stmt);
1828 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1829 gcc_assert (vec_stmt);
1830 if (TREE_CODE (vec_stmt) == PHI_NODE)
1831 vec_oprnd = PHI_RESULT (vec_stmt);
1832 else
1833 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1834 return vec_oprnd;
1837 /* Case 4: operand is defined by a loop header phi - reduction */
1838 case vect_reduction_def:
1840 struct loop *loop;
1842 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1843 loop = (bb_for_stmt (def_stmt))->loop_father;
1845 /* Get the def before the loop */
1846 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1847 return get_initial_def_for_reduction (stmt, op, scalar_def);
1850 /* Case 5: operand is defined by loop-header phi - induction. */
1851 case vect_induction_def:
1853 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1855 /* Get the def from the vectorized stmt. */
1856 def_stmt_info = vinfo_for_stmt (def_stmt);
1857 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1858 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1859 vec_oprnd = PHI_RESULT (vec_stmt);
1860 return vec_oprnd;
1863 default:
1864 gcc_unreachable ();
1869 /* Function vect_get_vec_def_for_stmt_copy
1871 Return a vector-def for an operand. This function is used when the
1872 vectorized stmt to be created (by the caller to this function) is a "copy"
1873 created in case the vectorized result cannot fit in one vector, and several
1874 copies of the vector-stmt are required. In this case the vector-def is
1875 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1876 of the stmt that defines VEC_OPRND.
1877 DT is the type of the vector def VEC_OPRND.
1879 Context:
1880 In case the vectorization factor (VF) is bigger than the number
1881 of elements that can fit in a vectype (nunits), we have to generate
1882 more than one vector stmt to vectorize the scalar stmt. This situation
1883 arises when there are multiple data-types operated upon in the loop; the
1884 smallest data-type determines the VF, and as a result, when vectorizing
1885 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1886 vector stmt (each computing a vector of 'nunits' results, and together
1887 computing 'VF' results in each iteration). This function is called when
1888 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1889 which VF=16 and nunits=4, so the number of copies required is 4):
1891 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1893 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1894 VS1.1: vx.1 = memref1 VS1.2
1895 VS1.2: vx.2 = memref2 VS1.3
1896 VS1.3: vx.3 = memref3
1898 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1899 VSnew.1: vz1 = vx.1 + ... VSnew.2
1900 VSnew.2: vz2 = vx.2 + ... VSnew.3
1901 VSnew.3: vz3 = vx.3 + ...
1903 The vectorization of S1 is explained in vectorizable_load.
1904 The vectorization of S2:
1905 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1906 the function 'vect_get_vec_def_for_operand' is called to
1907 get the relevant vector-def for each operand of S2. For operand x it
1908 returns the vector-def 'vx.0'.
1910 To create the remaining copies of the vector-stmt (VSnew.j), this
1911 function is called to get the relevant vector-def for each operand. It is
1912 obtained from the respective VS1.j stmt, which is recorded in the
1913 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1915 For example, to obtain the vector-def 'vx.1' in order to create the
1916 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1917 Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the
1918 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1919 and return its def ('vx.1').
1920 Overall, to create the above sequence this function will be called 3 times:
1921 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1922 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1923 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1925 static tree
1926 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1928 tree vec_stmt_for_operand;
1929 stmt_vec_info def_stmt_info;
1931 /* Do nothing; can reuse same def. */
1932 if (dt == vect_invariant_def || dt == vect_constant_def )
1933 return vec_oprnd;
1935 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1936 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1937 gcc_assert (def_stmt_info);
1938 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1939 gcc_assert (vec_stmt_for_operand);
1940 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1941 return vec_oprnd;
1945 /* Get vectorized definitions for the operands to create a copy of an original
1946 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
1948 static void
1949 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
1950 VEC(tree,heap) **vec_oprnds0,
1951 VEC(tree,heap) **vec_oprnds1)
1953 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
1955 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
1956 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
1958 if (vec_oprnds1 && *vec_oprnds1)
1960 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
1961 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
1962 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
1967 /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
1969 static void
1970 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
1971 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
1973 if (slp_node)
1974 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
1975 else
1977 tree vec_oprnd;
1979 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
1980 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
1981 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
1983 if (op1)
1985 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
1986 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
1987 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
1993 /* Function vect_finish_stmt_generation.
1995 Insert a new stmt. */
1997 static void
1998 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
1999 block_stmt_iterator *bsi)
2001 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2004 gcc_assert (stmt == bsi_stmt (*bsi));
2005 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2007 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2009 set_stmt_info (get_stmt_ann (vec_stmt),
2010 new_stmt_vec_info (vec_stmt, loop_vinfo));
2012 if (vect_print_dump_info (REPORT_DETAILS))
2014 fprintf (vect_dump, "add new stmt: ");
2015 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2018 /* Make sure bsi points to the stmt that is being vectorized. */
2019 gcc_assert (stmt == bsi_stmt (*bsi));
2021 #ifdef USE_MAPPED_LOCATION
2022 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2023 #else
2024 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2025 #endif
2029 /* Function get_initial_def_for_reduction
2031 Input:
2032 STMT - a stmt that performs a reduction operation in the loop.
2033 INIT_VAL - the initial value of the reduction variable
2035 Output:
2036 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2037 of the reduction (used for adjusting the epilog - see below).
2038 Return a vector variable, initialized according to the operation that STMT
2039 performs. This vector will be used as the initial value of the
2040 vector of partial results.
2042 Option1 (adjust in epilog): Initialize the vector as follows:
2043 add: [0,0,...,0,0]
2044 mult: [1,1,...,1,1]
2045 min/max: [init_val,init_val,..,init_val,init_val]
2046 bit and/or: [init_val,init_val,..,init_val,init_val]
2047 and when necessary (e.g. add/mult case) let the caller know
2048 that it needs to adjust the result by init_val.
2050 Option2: Initialize the vector as follows:
2051 add: [0,0,...,0,init_val]
2052 mult: [1,1,...,1,init_val]
2053 min/max: [init_val,init_val,...,init_val]
2054 bit and/or: [init_val,init_val,...,init_val]
2055 and no adjustments are needed.
2057 For example, for the following code:
2059 s = init_val;
2060 for (i=0;i<n;i++)
2061 s = s + a[i];
2063 STMT is 's = s + a[i]', and the reduction variable is 's'.
2064 For a vector of 4 units, we want to return either [0,0,0,init_val],
2065 or [0,0,0,0] and let the caller know that it needs to adjust
2066 the result at the end by 'init_val'.
2068 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2069 initialization vector is simpler (same element in all entries).
2070 A cost model should help decide between these two schemes. */
2072 static tree
2073 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2075 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2076 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2077 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2078 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2079 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2080 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2081 tree type = TREE_TYPE (init_val);
2082 tree vecdef;
2083 tree def_for_init;
2084 tree init_def;
2085 tree t = NULL_TREE;
2086 int i;
2087 tree vector_type;
2088 bool nested_in_vect_loop = false;
2090 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2091 if (nested_in_vect_loop_p (loop, stmt))
2092 nested_in_vect_loop = true;
2093 else
2094 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2096 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2098 switch (code)
2100 case WIDEN_SUM_EXPR:
2101 case DOT_PROD_EXPR:
2102 case PLUS_EXPR:
2103 if (nested_in_vect_loop)
2104 *adjustment_def = vecdef;
2105 else
2106 *adjustment_def = init_val;
2107 /* Create a vector of zeros for init_def. */
2108 if (INTEGRAL_TYPE_P (type))
2109 def_for_init = build_int_cst (type, 0);
2110 else
2111 def_for_init = build_real (type, dconst0);
2112 for (i = nunits - 1; i >= 0; --i)
2113 t = tree_cons (NULL_TREE, def_for_init, t);
2114 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2115 init_def = build_vector (vector_type, t);
2116 break;
2118 case MIN_EXPR:
2119 case MAX_EXPR:
2120 *adjustment_def = NULL_TREE;
2121 init_def = vecdef;
2122 break;
2124 default:
2125 gcc_unreachable ();
2128 return init_def;
2132 /* Function vect_create_epilog_for_reduction
2134 Create code at the loop-epilog to finalize the result of a reduction
2135 computation.
2137 VECT_DEF is a vector of partial results.
2138 REDUC_CODE is the tree-code for the epilog reduction.
2139 STMT is the scalar reduction stmt that is being vectorized.
2140 REDUCTION_PHI is the phi-node that carries the reduction computation.
2142 This function:
2143 1. Creates the reduction def-use cycle: sets the arguments for
2144 REDUCTION_PHI:
2145 The loop-entry argument is the vectorized initial-value of the reduction.
2146 The loop-latch argument is VECT_DEF - the vector of partial sums.
2147 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2148 by applying the operation specified by REDUC_CODE if available, or by
2149 other means (whole-vector shifts or a scalar loop).
2150 The function also creates a new phi node at the loop exit to preserve
2151 loop-closed form, as illustrated below.
2153 The flow at the entry to this function:
2155 loop:
2156 vec_def = phi <null, null> # REDUCTION_PHI
2157 VECT_DEF = vector_stmt # vectorized form of STMT
2158 s_loop = scalar_stmt # (scalar) STMT
2159 loop_exit:
2160 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2161 use <s_out0>
2162 use <s_out0>
2164 The above is transformed by this function into:
2166 loop:
2167 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2168 VECT_DEF = vector_stmt # vectorized form of STMT
2169 s_loop = scalar_stmt # (scalar) STMT
2170 loop_exit:
2171 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2172 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2173 v_out2 = reduce <v_out1>
2174 s_out3 = extract_field <v_out2, 0>
2175 s_out4 = adjust_result <s_out3>
2176 use <s_out4>
2177 use <s_out4>
2180 static void
2181 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2182 enum tree_code reduc_code, tree reduction_phi)
2184 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2185 tree vectype;
2186 enum machine_mode mode;
2187 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2189 basic_block exit_bb;
2190 tree scalar_dest;
2191 tree scalar_type;
2192 tree new_phi;
2193 block_stmt_iterator exit_bsi;
2194 tree vec_dest;
2195 tree new_temp = NULL_TREE;
2196 tree new_name;
2197 tree epilog_stmt = NULL_TREE;
2198 tree new_scalar_dest, exit_phi, new_dest;
2199 tree bitsize, bitpos, bytesize;
2200 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2201 tree adjustment_def;
2202 tree vec_initial_def;
2203 tree orig_name;
2204 imm_use_iterator imm_iter;
2205 use_operand_p use_p;
2206 bool extract_scalar_result = false;
2207 tree reduction_op, expr;
2208 tree orig_stmt;
2209 tree use_stmt;
2210 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2211 bool nested_in_vect_loop = false;
2212 int op_type;
2213 VEC(tree,heap) *phis = NULL;
2214 int i;
2216 if (nested_in_vect_loop_p (loop, stmt))
2218 loop = loop->inner;
2219 nested_in_vect_loop = true;
2222 op_type = TREE_OPERAND_LENGTH (operation);
2223 reduction_op = TREE_OPERAND (operation, op_type-1);
2224 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2225 mode = TYPE_MODE (vectype);
2227 /*** 1. Create the reduction def-use cycle ***/
2229 /* 1.1 set the loop-entry arg of the reduction-phi: */
2230 /* For the case of reduction, vect_get_vec_def_for_operand returns
2231 the scalar def before the loop, that defines the initial value
2232 of the reduction variable. */
2233 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2234 &adjustment_def);
2235 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2237 /* 1.2 set the loop-latch arg for the reduction-phi: */
2238 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2240 if (vect_print_dump_info (REPORT_DETAILS))
2242 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2243 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2244 fprintf (vect_dump, "\n");
2245 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2249 /*** 2. Create epilog code
2250 The reduction epilog code operates across the elements of the vector
2251 of partial results computed by the vectorized loop.
2252 The reduction epilog code consists of:
2253 step 1: compute the scalar result in a vector (v_out2)
2254 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2255 step 3: adjust the scalar result (s_out3) if needed.
2257 Step 1 can be accomplished using one the following three schemes:
2258 (scheme 1) using reduc_code, if available.
2259 (scheme 2) using whole-vector shifts, if available.
2260 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2261 combined.
2263 The overall epilog code looks like this:
2265 s_out0 = phi <s_loop> # original EXIT_PHI
2266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2267 v_out2 = reduce <v_out1> # step 1
2268 s_out3 = extract_field <v_out2, 0> # step 2
2269 s_out4 = adjust_result <s_out3> # step 3
2271 (step 3 is optional, and step2 1 and 2 may be combined).
2272 Lastly, the uses of s_out0 are replaced by s_out4.
2274 ***/
2276 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2277 v_out1 = phi <v_loop> */
2279 exit_bb = single_exit (loop)->dest;
2280 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2281 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2282 exit_bsi = bsi_after_labels (exit_bb);
2284 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2285 (i.e. when reduc_code is not available) and in the final adjustment
2286 code (if needed). Also get the original scalar reduction variable as
2287 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2288 represents a reduction pattern), the tree-code and scalar-def are
2289 taken from the original stmt that the pattern-stmt (STMT) replaces.
2290 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2291 are taken from STMT. */
2293 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2294 if (!orig_stmt)
2296 /* Regular reduction */
2297 orig_stmt = stmt;
2299 else
2301 /* Reduction pattern */
2302 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2303 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2304 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2306 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2307 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2308 scalar_type = TREE_TYPE (scalar_dest);
2309 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2310 bitsize = TYPE_SIZE (scalar_type);
2311 bytesize = TYPE_SIZE_UNIT (scalar_type);
2314 /* In case this is a reduction in an inner-loop while vectorizing an outer
2315 loop - we don't need to extract a single scalar result at the end of the
2316 inner-loop. The final vector of partial results will be used in the
2317 vectorized outer-loop, or reduced to a scalar result at the end of the
2318 outer-loop. */
2319 if (nested_in_vect_loop)
2320 goto vect_finalize_reduction;
2322 /* 2.3 Create the reduction code, using one of the three schemes described
2323 above. */
2325 if (reduc_code < NUM_TREE_CODES)
2327 tree tmp;
2329 /*** Case 1: Create:
2330 v_out2 = reduc_expr <v_out1> */
2332 if (vect_print_dump_info (REPORT_DETAILS))
2333 fprintf (vect_dump, "Reduce using direct vector reduction.");
2335 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2336 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2337 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2338 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2339 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2340 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2342 extract_scalar_result = true;
2344 else
2346 enum tree_code shift_code = 0;
2347 bool have_whole_vector_shift = true;
2348 int bit_offset;
2349 int element_bitsize = tree_low_cst (bitsize, 1);
2350 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2351 tree vec_temp;
2353 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2354 shift_code = VEC_RSHIFT_EXPR;
2355 else
2356 have_whole_vector_shift = false;
2358 /* Regardless of whether we have a whole vector shift, if we're
2359 emulating the operation via tree-vect-generic, we don't want
2360 to use it. Only the first round of the reduction is likely
2361 to still be profitable via emulation. */
2362 /* ??? It might be better to emit a reduction tree code here, so that
2363 tree-vect-generic can expand the first round via bit tricks. */
2364 if (!VECTOR_MODE_P (mode))
2365 have_whole_vector_shift = false;
2366 else
2368 optab optab = optab_for_tree_code (code, vectype);
2369 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2370 have_whole_vector_shift = false;
2373 if (have_whole_vector_shift)
2375 /*** Case 2: Create:
2376 for (offset = VS/2; offset >= element_size; offset/=2)
2378 Create: va' = vec_shift <va, offset>
2379 Create: va = vop <va, va'>
2380 } */
2382 if (vect_print_dump_info (REPORT_DETAILS))
2383 fprintf (vect_dump, "Reduce using vector shifts");
2385 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2386 new_temp = PHI_RESULT (new_phi);
2388 for (bit_offset = vec_size_in_bits/2;
2389 bit_offset >= element_bitsize;
2390 bit_offset /= 2)
2392 tree bitpos = size_int (bit_offset);
2393 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2394 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2395 new_name = make_ssa_name (vec_dest, epilog_stmt);
2396 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2397 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2399 tmp = build2 (code, vectype, new_name, new_temp);
2400 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2401 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2402 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2403 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2406 extract_scalar_result = true;
2408 else
2410 tree rhs;
2412 /*** Case 3: Create:
2413 s = extract_field <v_out2, 0>
2414 for (offset = element_size;
2415 offset < vector_size;
2416 offset += element_size;)
2418 Create: s' = extract_field <v_out2, offset>
2419 Create: s = op <s, s'>
2420 } */
2422 if (vect_print_dump_info (REPORT_DETAILS))
2423 fprintf (vect_dump, "Reduce using scalar code. ");
2425 vec_temp = PHI_RESULT (new_phi);
2426 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2427 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2428 bitsize_zero_node);
2429 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2430 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2431 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2432 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2433 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2435 for (bit_offset = element_bitsize;
2436 bit_offset < vec_size_in_bits;
2437 bit_offset += element_bitsize)
2439 tree tmp;
2440 tree bitpos = bitsize_int (bit_offset);
2441 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2442 bitpos);
2444 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2445 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2446 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2447 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2448 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2450 tmp = build2 (code, scalar_type, new_name, new_temp);
2451 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2452 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2453 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2454 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2457 extract_scalar_result = false;
2461 /* 2.4 Extract the final scalar result. Create:
2462 s_out3 = extract_field <v_out2, bitpos> */
2464 if (extract_scalar_result)
2466 tree rhs;
2468 gcc_assert (!nested_in_vect_loop);
2469 if (vect_print_dump_info (REPORT_DETAILS))
2470 fprintf (vect_dump, "extract scalar result");
2472 if (BYTES_BIG_ENDIAN)
2473 bitpos = size_binop (MULT_EXPR,
2474 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2475 TYPE_SIZE (scalar_type));
2476 else
2477 bitpos = bitsize_zero_node;
2479 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2480 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2481 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2482 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2483 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2484 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2487 vect_finalize_reduction:
2489 /* 2.5 Adjust the final result by the initial value of the reduction
2490 variable. (When such adjustment is not needed, then
2491 'adjustment_def' is zero). For example, if code is PLUS we create:
2492 new_temp = loop_exit_def + adjustment_def */
2494 if (adjustment_def)
2496 if (nested_in_vect_loop)
2498 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2499 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2500 new_dest = vect_create_destination_var (scalar_dest, vectype);
2502 else
2504 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2505 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2506 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2508 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2509 new_temp = make_ssa_name (new_dest, epilog_stmt);
2510 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2511 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2515 /* 2.6 Handle the loop-exit phi */
2517 /* Replace uses of s_out0 with uses of s_out3:
2518 Find the loop-closed-use at the loop exit of the original scalar result.
2519 (The reduction result is expected to have two immediate uses - one at the
2520 latch block, and one at the loop exit). */
2521 phis = VEC_alloc (tree, heap, 10);
2522 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2524 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2526 exit_phi = USE_STMT (use_p);
2527 VEC_quick_push (tree, phis, exit_phi);
2530 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2531 gcc_assert (!VEC_empty (tree, phis));
2533 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2535 if (nested_in_vect_loop)
2537 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2539 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2540 is not used in the outer-loop (but only outside the outer-loop). */
2541 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2542 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2544 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2545 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2546 set_stmt_info (get_stmt_ann (epilog_stmt),
2547 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2548 continue;
2551 /* Replace the uses: */
2552 orig_name = PHI_RESULT (exit_phi);
2553 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2554 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2555 SET_USE (use_p, new_temp);
2557 VEC_free (tree, heap, phis);
2561 /* Function vectorizable_reduction.
2563 Check if STMT performs a reduction operation that can be vectorized.
2564 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2565 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2566 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2568 This function also handles reduction idioms (patterns) that have been
2569 recognized in advance during vect_pattern_recog. In this case, STMT may be
2570 of this form:
2571 X = pattern_expr (arg0, arg1, ..., X)
2572 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
2573 sequence that had been detected and replaced by the pattern-stmt (STMT).
2575 In some cases of reduction patterns, the type of the reduction variable X is
2576 different than the type of the other arguments of STMT.
2577 In such cases, the vectype that is used when transforming STMT into a vector
2578 stmt is different than the vectype that is used to determine the
2579 vectorization factor, because it consists of a different number of elements
2580 than the actual number of elements that are being operated upon in parallel.
2582 For example, consider an accumulation of shorts into an int accumulator.
2583 On some targets it's possible to vectorize this pattern operating on 8
2584 shorts at a time (hence, the vectype for purposes of determining the
2585 vectorization factor should be V8HI); on the other hand, the vectype that
2586 is used to create the vector form is actually V4SI (the type of the result).
2588 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2589 indicates what is the actual level of parallelism (V8HI in the example), so
2590 that the right vectorization factor would be derived. This vectype
2591 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2592 be used to create the vectorized stmt. The right vectype for the vectorized
2593 stmt is obtained from the type of the result X:
2594 get_vectype_for_scalar_type (TREE_TYPE (X))
2596 This means that, contrary to "regular" reductions (or "regular" stmts in
2597 general), the following equation:
2598 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2599 does *NOT* necessarily hold for reduction patterns. */
2601 bool
2602 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2604 tree vec_dest;
2605 tree scalar_dest;
2606 tree op;
2607 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2608 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2609 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2610 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2611 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2612 tree operation;
2613 enum tree_code code, orig_code, epilog_reduc_code = 0;
2614 enum machine_mode vec_mode;
2615 int op_type;
2616 optab optab, reduc_optab;
2617 tree new_temp = NULL_TREE;
2618 tree def, def_stmt;
2619 enum vect_def_type dt;
2620 tree new_phi;
2621 tree scalar_type;
2622 bool is_simple_use;
2623 tree orig_stmt;
2624 stmt_vec_info orig_stmt_info;
2625 tree expr = NULL_TREE;
2626 int i;
2627 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2628 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2629 stmt_vec_info prev_stmt_info;
2630 tree reduc_def;
2631 tree new_stmt = NULL_TREE;
2632 int j;
2634 if (nested_in_vect_loop_p (loop, stmt))
2636 loop = loop->inner;
2637 /* FORNOW. This restriction should be relaxed. */
2638 if (ncopies > 1)
2640 if (vect_print_dump_info (REPORT_DETAILS))
2641 fprintf (vect_dump, "multiple types in nested loop.");
2642 return false;
2646 gcc_assert (ncopies >= 1);
2648 /* FORNOW: SLP not supported. */
2649 if (STMT_SLP_TYPE (stmt_info))
2650 return false;
2652 /* 1. Is vectorizable reduction? */
2654 /* Not supportable if the reduction variable is used in the loop. */
2655 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2656 return false;
2658 /* Reductions that are not used even in an enclosing outer-loop,
2659 are expected to be "live" (used out of the loop). */
2660 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2661 && !STMT_VINFO_LIVE_P (stmt_info))
2662 return false;
2664 /* Make sure it was already recognized as a reduction computation. */
2665 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2666 return false;
2668 /* 2. Has this been recognized as a reduction pattern?
2670 Check if STMT represents a pattern that has been recognized
2671 in earlier analysis stages. For stmts that represent a pattern,
2672 the STMT_VINFO_RELATED_STMT field records the last stmt in
2673 the original sequence that constitutes the pattern. */
2675 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2676 if (orig_stmt)
2678 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2679 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2680 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2681 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2684 /* 3. Check the operands of the operation. The first operands are defined
2685 inside the loop body. The last operand is the reduction variable,
2686 which is defined by the loop-header-phi. */
2688 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2690 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2691 code = TREE_CODE (operation);
2692 op_type = TREE_OPERAND_LENGTH (operation);
2693 if (op_type != binary_op && op_type != ternary_op)
2694 return false;
2695 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2696 scalar_type = TREE_TYPE (scalar_dest);
2698 /* All uses but the last are expected to be defined in the loop.
2699 The last use is the reduction variable. */
2700 for (i = 0; i < op_type-1; i++)
2702 op = TREE_OPERAND (operation, i);
2703 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2704 gcc_assert (is_simple_use);
2705 if (dt != vect_loop_def
2706 && dt != vect_invariant_def
2707 && dt != vect_constant_def
2708 && dt != vect_induction_def)
2709 return false;
2712 op = TREE_OPERAND (operation, i);
2713 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2714 gcc_assert (is_simple_use);
2715 gcc_assert (dt == vect_reduction_def);
2716 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2717 if (orig_stmt)
2718 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2719 else
2720 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2722 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2723 return false;
2725 /* 4. Supportable by target? */
2727 /* 4.1. check support for the operation in the loop */
2728 optab = optab_for_tree_code (code, vectype);
2729 if (!optab)
2731 if (vect_print_dump_info (REPORT_DETAILS))
2732 fprintf (vect_dump, "no optab.");
2733 return false;
2735 vec_mode = TYPE_MODE (vectype);
2736 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2738 if (vect_print_dump_info (REPORT_DETAILS))
2739 fprintf (vect_dump, "op not supported by target.");
2740 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2741 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2742 < vect_min_worthwhile_factor (code))
2743 return false;
2744 if (vect_print_dump_info (REPORT_DETAILS))
2745 fprintf (vect_dump, "proceeding using word mode.");
2748 /* Worthwhile without SIMD support? */
2749 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2750 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2751 < vect_min_worthwhile_factor (code))
2753 if (vect_print_dump_info (REPORT_DETAILS))
2754 fprintf (vect_dump, "not worthwhile without SIMD support.");
2755 return false;
2758 /* 4.2. Check support for the epilog operation.
2760 If STMT represents a reduction pattern, then the type of the
2761 reduction variable may be different than the type of the rest
2762 of the arguments. For example, consider the case of accumulation
2763 of shorts into an int accumulator; The original code:
2764 S1: int_a = (int) short_a;
2765 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2767 was replaced with:
2768 STMT: int_acc = widen_sum <short_a, int_acc>
2770 This means that:
2771 1. The tree-code that is used to create the vector operation in the
2772 epilog code (that reduces the partial results) is not the
2773 tree-code of STMT, but is rather the tree-code of the original
2774 stmt from the pattern that STMT is replacing. I.e, in the example
2775 above we want to use 'widen_sum' in the loop, but 'plus' in the
2776 epilog.
2777 2. The type (mode) we use to check available target support
2778 for the vector operation to be created in the *epilog*, is
2779 determined by the type of the reduction variable (in the example
2780 above we'd check this: plus_optab[vect_int_mode]).
2781 However the type (mode) we use to check available target support
2782 for the vector operation to be created *inside the loop*, is
2783 determined by the type of the other arguments to STMT (in the
2784 example we'd check this: widen_sum_optab[vect_short_mode]).
2786 This is contrary to "regular" reductions, in which the types of all
2787 the arguments are the same as the type of the reduction variable.
2788 For "regular" reductions we can therefore use the same vector type
2789 (and also the same tree-code) when generating the epilog code and
2790 when generating the code inside the loop. */
2792 if (orig_stmt)
2794 /* This is a reduction pattern: get the vectype from the type of the
2795 reduction variable, and get the tree-code from orig_stmt. */
2796 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2797 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2798 vec_mode = TYPE_MODE (vectype);
2800 else
2802 /* Regular reduction: use the same vectype and tree-code as used for
2803 the vector code inside the loop can be used for the epilog code. */
2804 orig_code = code;
2807 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2808 return false;
2809 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2810 if (!reduc_optab)
2812 if (vect_print_dump_info (REPORT_DETAILS))
2813 fprintf (vect_dump, "no optab for reduction.");
2814 epilog_reduc_code = NUM_TREE_CODES;
2816 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2818 if (vect_print_dump_info (REPORT_DETAILS))
2819 fprintf (vect_dump, "reduc op not supported by target.");
2820 epilog_reduc_code = NUM_TREE_CODES;
2823 if (!vec_stmt) /* transformation not required. */
2825 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2826 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
2827 return true;
2830 /** Transform. **/
2832 if (vect_print_dump_info (REPORT_DETAILS))
2833 fprintf (vect_dump, "transform reduction.");
2835 /* Create the destination vector */
2836 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2838 /* Create the reduction-phi that defines the reduction-operand. */
2839 new_phi = create_phi_node (vec_dest, loop->header);
2841 /* In case the vectorization factor (VF) is bigger than the number
2842 of elements that we can fit in a vectype (nunits), we have to generate
2843 more than one vector stmt - i.e - we need to "unroll" the
2844 vector stmt by a factor VF/nunits. For more details see documentation
2845 in vectorizable_operation. */
2847 prev_stmt_info = NULL;
2848 for (j = 0; j < ncopies; j++)
2850 /* Handle uses. */
2851 if (j == 0)
2853 op = TREE_OPERAND (operation, 0);
2854 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2855 if (op_type == ternary_op)
2857 op = TREE_OPERAND (operation, 1);
2858 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2861 /* Get the vector def for the reduction variable from the phi node */
2862 reduc_def = PHI_RESULT (new_phi);
2864 else
2866 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2867 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2868 if (op_type == ternary_op)
2869 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2871 /* Get the vector def for the reduction variable from the vectorized
2872 reduction operation generated in the previous iteration (j-1) */
2873 reduc_def = GIMPLE_STMT_OPERAND (new_stmt ,0);
2876 /* Arguments are ready. create the new vector stmt. */
2877 if (op_type == binary_op)
2878 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2879 else
2880 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2881 reduc_def);
2882 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2883 new_temp = make_ssa_name (vec_dest, new_stmt);
2884 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2885 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2887 if (j == 0)
2888 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2889 else
2890 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2891 prev_stmt_info = vinfo_for_stmt (new_stmt);
2894 /* Finalize the reduction-phi (set it's arguments) and create the
2895 epilog reduction code. */
2896 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2897 return true;
2900 /* Checks if CALL can be vectorized in type VECTYPE. Returns
2901 a function declaration if the target has a vectorized version
2902 of the function, or NULL_TREE if the function cannot be vectorized. */
2904 tree
2905 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2907 tree fndecl = get_callee_fndecl (call);
2908 enum built_in_function code;
2910 /* We only handle functions that do not read or clobber memory -- i.e.
2911 const or novops ones. */
2912 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
2913 return NULL_TREE;
2915 if (!fndecl
2916 || TREE_CODE (fndecl) != FUNCTION_DECL
2917 || !DECL_BUILT_IN (fndecl))
2918 return NULL_TREE;
2920 code = DECL_FUNCTION_CODE (fndecl);
2921 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2922 vectype_in);
2925 /* Function vectorizable_call.
2927 Check if STMT performs a function call that can be vectorized.
2928 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2929 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2930 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2932 bool
2933 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2935 tree vec_dest;
2936 tree scalar_dest;
2937 tree operation;
2938 tree op, type;
2939 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2940 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2941 tree vectype_out, vectype_in;
2942 int nunits_in;
2943 int nunits_out;
2944 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2945 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2946 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2947 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2948 tree new_stmt;
2949 int ncopies, j, nargs;
2950 call_expr_arg_iterator iter;
2951 tree vargs;
2952 enum { NARROW, NONE, WIDEN } modifier;
2954 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2955 return false;
2957 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2958 return false;
2960 /* FORNOW: SLP not supported. */
2961 if (STMT_SLP_TYPE (stmt_info))
2962 return false;
2964 /* FORNOW: not yet supported. */
2965 if (STMT_VINFO_LIVE_P (stmt_info))
2967 if (vect_print_dump_info (REPORT_DETAILS))
2968 fprintf (vect_dump, "value used after loop.");
2969 return false;
2972 /* Is STMT a vectorizable call? */
2973 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2974 return false;
2976 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2977 return false;
2979 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2980 if (TREE_CODE (operation) != CALL_EXPR)
2981 return false;
2983 /* Process function arguments. */
2984 rhs_type = NULL_TREE;
2985 nargs = 0;
2986 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2988 /* Bail out if the function has more than two arguments, we
2989 do not have interesting builtin functions to vectorize with
2990 more than two arguments. */
2991 if (nargs >= 2)
2992 return false;
2994 /* We can only handle calls with arguments of the same type. */
2995 if (rhs_type
2996 && rhs_type != TREE_TYPE (op))
2998 if (vect_print_dump_info (REPORT_DETAILS))
2999 fprintf (vect_dump, "argument types differ.");
3000 return false;
3002 rhs_type = TREE_TYPE (op);
3004 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3006 if (vect_print_dump_info (REPORT_DETAILS))
3007 fprintf (vect_dump, "use not simple.");
3008 return false;
3011 ++nargs;
3014 /* No arguments is also not good. */
3015 if (nargs == 0)
3016 return false;
3018 vectype_in = get_vectype_for_scalar_type (rhs_type);
3019 if (!vectype_in)
3020 return false;
3021 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3023 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3024 vectype_out = get_vectype_for_scalar_type (lhs_type);
3025 if (!vectype_out)
3026 return false;
3027 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3029 /* FORNOW */
3030 if (nunits_in == nunits_out / 2)
3031 modifier = NARROW;
3032 else if (nunits_out == nunits_in)
3033 modifier = NONE;
3034 else if (nunits_out == nunits_in / 2)
3035 modifier = WIDEN;
3036 else
3037 return false;
3039 /* For now, we only vectorize functions if a target specific builtin
3040 is available. TODO -- in some cases, it might be profitable to
3041 insert the calls for pieces of the vector, in order to be able
3042 to vectorize other operations in the loop. */
3043 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3044 if (fndecl == NULL_TREE)
3046 if (vect_print_dump_info (REPORT_DETAILS))
3047 fprintf (vect_dump, "function is not vectorizable.");
3049 return false;
3052 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3054 if (modifier == NARROW)
3055 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3056 else
3057 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3059 /* Sanity check: make sure that at least one copy of the vectorized stmt
3060 needs to be generated. */
3061 gcc_assert (ncopies >= 1);
3063 /* FORNOW. This restriction should be relaxed. */
3064 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3066 if (vect_print_dump_info (REPORT_DETAILS))
3067 fprintf (vect_dump, "multiple types in nested loop.");
3068 return false;
3071 if (!vec_stmt) /* transformation not required. */
3073 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3074 if (vect_print_dump_info (REPORT_DETAILS))
3075 fprintf (vect_dump, "=== vectorizable_call ===");
3076 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3077 return true;
3080 /** Transform. **/
3082 if (vect_print_dump_info (REPORT_DETAILS))
3083 fprintf (vect_dump, "transform operation.");
3085 /* FORNOW. This restriction should be relaxed. */
3086 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3088 if (vect_print_dump_info (REPORT_DETAILS))
3089 fprintf (vect_dump, "multiple types in nested loop.");
3090 return false;
3093 /* Handle def. */
3094 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3095 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3097 prev_stmt_info = NULL;
3098 switch (modifier)
3100 case NONE:
3101 for (j = 0; j < ncopies; ++j)
3103 /* Build argument list for the vectorized call. */
3104 /* FIXME: Rewrite this so that it doesn't
3105 construct a temporary list. */
3106 vargs = NULL_TREE;
3107 nargs = 0;
3108 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3110 if (j == 0)
3111 vec_oprnd0
3112 = vect_get_vec_def_for_operand (op, stmt, NULL);
3113 else
3114 vec_oprnd0
3115 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3117 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3119 ++nargs;
3121 vargs = nreverse (vargs);
3123 rhs = build_function_call_expr (fndecl, vargs);
3124 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3125 new_temp = make_ssa_name (vec_dest, new_stmt);
3126 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3128 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3130 if (j == 0)
3131 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3132 else
3133 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3135 prev_stmt_info = vinfo_for_stmt (new_stmt);
3138 break;
3140 case NARROW:
3141 for (j = 0; j < ncopies; ++j)
3143 /* Build argument list for the vectorized call. */
3144 /* FIXME: Rewrite this so that it doesn't
3145 construct a temporary list. */
3146 vargs = NULL_TREE;
3147 nargs = 0;
3148 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3150 if (j == 0)
3152 vec_oprnd0
3153 = vect_get_vec_def_for_operand (op, stmt, NULL);
3154 vec_oprnd1
3155 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3157 else
3159 vec_oprnd0
3160 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3161 vec_oprnd1
3162 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3165 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3166 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3168 ++nargs;
3170 vargs = nreverse (vargs);
3172 rhs = build_function_call_expr (fndecl, vargs);
3173 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3174 new_temp = make_ssa_name (vec_dest, new_stmt);
3175 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3177 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3179 if (j == 0)
3180 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3181 else
3182 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3184 prev_stmt_info = vinfo_for_stmt (new_stmt);
3187 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3189 break;
3191 case WIDEN:
3192 /* No current target implements this case. */
3193 return false;
3196 /* The call in STMT might prevent it from being removed in dce.
3197 We however cannot remove it here, due to the way the ssa name
3198 it defines is mapped to the new definition. So just replace
3199 rhs of the statement with something harmless. */
3200 type = TREE_TYPE (scalar_dest);
3201 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3202 update_stmt (stmt);
3204 return true;
3208 /* Function vect_gen_widened_results_half
3210 Create a vector stmt whose code, type, number of arguments, and result
3211 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3212 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3213 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3214 needs to be created (DECL is a function-decl of a target-builtin).
3215 STMT is the original scalar stmt that we are vectorizing. */
3217 static tree
3218 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3219 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3220 tree vec_dest, block_stmt_iterator *bsi,
3221 tree stmt)
3223 tree expr;
3224 tree new_stmt;
3225 tree new_temp;
3226 tree sym;
3227 ssa_op_iter iter;
3229 /* Generate half of the widened result: */
3230 if (code == CALL_EXPR)
3232 /* Target specific support */
3233 if (op_type == binary_op)
3234 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3235 else
3236 expr = build_call_expr (decl, 1, vec_oprnd0);
3238 else
3240 /* Generic support */
3241 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3242 if (op_type == binary_op)
3243 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3244 else
3245 expr = build1 (code, vectype, vec_oprnd0);
3247 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3248 new_temp = make_ssa_name (vec_dest, new_stmt);
3249 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3250 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3252 if (code == CALL_EXPR)
3254 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3256 if (TREE_CODE (sym) == SSA_NAME)
3257 sym = SSA_NAME_VAR (sym);
3258 mark_sym_for_renaming (sym);
3262 return new_stmt;
3266 /* Check if STMT performs a conversion operation, that can be vectorized.
3267 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3268 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3269 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3271 bool
3272 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3273 tree *vec_stmt, slp_tree slp_node)
3275 tree vec_dest;
3276 tree scalar_dest;
3277 tree operation;
3278 tree op0;
3279 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3283 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3284 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3285 tree new_temp;
3286 tree def, def_stmt;
3287 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3288 tree new_stmt = NULL_TREE;
3289 stmt_vec_info prev_stmt_info;
3290 int nunits_in;
3291 int nunits_out;
3292 tree vectype_out, vectype_in;
3293 int ncopies, j;
3294 tree expr;
3295 tree rhs_type, lhs_type;
3296 tree builtin_decl;
3297 enum { NARROW, NONE, WIDEN } modifier;
3298 int i;
3299 VEC(tree,heap) *vec_oprnds0 = NULL;
3300 tree vop0;
3302 /* Is STMT a vectorizable conversion? */
3304 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3305 return false;
3307 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3308 return false;
3310 if (STMT_VINFO_LIVE_P (stmt_info))
3312 /* FORNOW: not yet supported. */
3313 if (vect_print_dump_info (REPORT_DETAILS))
3314 fprintf (vect_dump, "value used after loop.");
3315 return false;
3318 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3319 return false;
3321 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3322 return false;
3324 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3325 code = TREE_CODE (operation);
3326 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3327 return false;
3329 /* Check types of lhs and rhs. */
3330 op0 = TREE_OPERAND (operation, 0);
3331 rhs_type = TREE_TYPE (op0);
3332 vectype_in = get_vectype_for_scalar_type (rhs_type);
3333 if (!vectype_in)
3334 return false;
3335 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3337 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3338 lhs_type = TREE_TYPE (scalar_dest);
3339 vectype_out = get_vectype_for_scalar_type (lhs_type);
3340 if (!vectype_out)
3341 return false;
3342 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3344 /* FORNOW */
3345 if (nunits_in == nunits_out / 2)
3346 modifier = NARROW;
3347 else if (nunits_out == nunits_in)
3348 modifier = NONE;
3349 else if (nunits_out == nunits_in / 2)
3350 modifier = WIDEN;
3351 else
3352 return false;
3354 if (modifier == NONE)
3355 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3357 /* Bail out if the types are both integral or non-integral. */
3358 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3359 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3360 return false;
3362 if (modifier == NARROW)
3363 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3364 else
3365 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3367 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3368 this, so we can safely override NCOPIES with 1 here. */
3369 if (slp_node)
3370 ncopies = 1;
3372 /* Sanity check: make sure that at least one copy of the vectorized stmt
3373 needs to be generated. */
3374 gcc_assert (ncopies >= 1);
3376 /* FORNOW. This restriction should be relaxed. */
3377 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3379 if (vect_print_dump_info (REPORT_DETAILS))
3380 fprintf (vect_dump, "multiple types in nested loop.");
3381 return false;
3384 /* Check the operands of the operation. */
3385 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3387 if (vect_print_dump_info (REPORT_DETAILS))
3388 fprintf (vect_dump, "use not simple.");
3389 return false;
3392 /* Supportable by target? */
3393 if ((modifier == NONE
3394 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3395 || (modifier == WIDEN
3396 && !supportable_widening_operation (code, stmt, vectype_in,
3397 &decl1, &decl2,
3398 &code1, &code2))
3399 || (modifier == NARROW
3400 && !supportable_narrowing_operation (code, stmt, vectype_in,
3401 &code1)))
3403 if (vect_print_dump_info (REPORT_DETAILS))
3404 fprintf (vect_dump, "op not supported by target.");
3405 return false;
3408 if (modifier != NONE)
3410 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3411 /* FORNOW: SLP not supported. */
3412 if (STMT_SLP_TYPE (stmt_info))
3413 return false;
3416 if (!vec_stmt) /* transformation not required. */
3418 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3419 return true;
3422 /** Transform. **/
3423 if (vect_print_dump_info (REPORT_DETAILS))
3424 fprintf (vect_dump, "transform conversion.");
3426 /* Handle def. */
3427 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3429 if (modifier == NONE && !slp_node)
3430 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3432 prev_stmt_info = NULL;
3433 switch (modifier)
3435 case NONE:
3436 for (j = 0; j < ncopies; j++)
3438 tree sym;
3439 ssa_op_iter iter;
3441 if (j == 0)
3442 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3443 else
3444 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3446 builtin_decl =
3447 targetm.vectorize.builtin_conversion (code, vectype_in);
3448 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3450 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3452 /* Arguments are ready. create the new vector stmt. */
3453 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3454 new_temp = make_ssa_name (vec_dest, new_stmt);
3455 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3456 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3457 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3458 SSA_OP_ALL_VIRTUALS)
3460 if (TREE_CODE (sym) == SSA_NAME)
3461 sym = SSA_NAME_VAR (sym);
3462 mark_sym_for_renaming (sym);
3464 if (slp_node)
3465 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3468 if (j == 0)
3469 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3470 else
3471 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3472 prev_stmt_info = vinfo_for_stmt (new_stmt);
3474 break;
3476 case WIDEN:
3477 /* In case the vectorization factor (VF) is bigger than the number
3478 of elements that we can fit in a vectype (nunits), we have to
3479 generate more than one vector stmt - i.e - we need to "unroll"
3480 the vector stmt by a factor VF/nunits. */
3481 for (j = 0; j < ncopies; j++)
3483 if (j == 0)
3484 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3485 else
3486 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3488 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3490 /* Generate first half of the widened result: */
3491 new_stmt
3492 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3493 vec_oprnd0, vec_oprnd1,
3494 unary_op, vec_dest, bsi, stmt);
3495 if (j == 0)
3496 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3497 else
3498 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3499 prev_stmt_info = vinfo_for_stmt (new_stmt);
3501 /* Generate second half of the widened result: */
3502 new_stmt
3503 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3504 vec_oprnd0, vec_oprnd1,
3505 unary_op, vec_dest, bsi, stmt);
3506 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3507 prev_stmt_info = vinfo_for_stmt (new_stmt);
3509 break;
3511 case NARROW:
3512 /* In case the vectorization factor (VF) is bigger than the number
3513 of elements that we can fit in a vectype (nunits), we have to
3514 generate more than one vector stmt - i.e - we need to "unroll"
3515 the vector stmt by a factor VF/nunits. */
3516 for (j = 0; j < ncopies; j++)
3518 /* Handle uses. */
3519 if (j == 0)
3521 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3522 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3524 else
3526 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3527 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3530 /* Arguments are ready. Create the new vector stmt. */
3531 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3532 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3533 new_temp = make_ssa_name (vec_dest, new_stmt);
3534 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3535 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3537 if (j == 0)
3538 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3539 else
3540 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3542 prev_stmt_info = vinfo_for_stmt (new_stmt);
3545 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3548 return true;
3552 /* Function vectorizable_assignment.
3554 Check if STMT performs an assignment (copy) that can be vectorized.
3555 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3556 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3557 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3559 bool
3560 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3561 slp_tree slp_node)
3563 tree vec_dest;
3564 tree scalar_dest;
3565 tree op;
3566 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3567 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3568 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3569 tree new_temp;
3570 tree def, def_stmt;
3571 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3572 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3573 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3574 int i;
3575 VEC(tree,heap) *vec_oprnds = NULL;
3576 tree vop;
3578 gcc_assert (ncopies >= 1);
3579 if (ncopies > 1)
3580 return false; /* FORNOW */
3582 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3583 return false;
3585 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3586 return false;
3588 /* FORNOW: not yet supported. */
3589 if (STMT_VINFO_LIVE_P (stmt_info))
3591 if (vect_print_dump_info (REPORT_DETAILS))
3592 fprintf (vect_dump, "value used after loop.");
3593 return false;
3596 /* Is vectorizable assignment? */
3597 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3598 return false;
3600 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3601 if (TREE_CODE (scalar_dest) != SSA_NAME)
3602 return false;
3604 op = GIMPLE_STMT_OPERAND (stmt, 1);
3605 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3607 if (vect_print_dump_info (REPORT_DETAILS))
3608 fprintf (vect_dump, "use not simple.");
3609 return false;
3612 if (!vec_stmt) /* transformation not required. */
3614 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3615 if (vect_print_dump_info (REPORT_DETAILS))
3616 fprintf (vect_dump, "=== vectorizable_assignment ===");
3617 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3618 return true;
3621 /** Transform. **/
3622 if (vect_print_dump_info (REPORT_DETAILS))
3623 fprintf (vect_dump, "transform assignment.");
3625 /* Handle def. */
3626 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3628 /* Handle use. */
3629 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3631 /* Arguments are ready. create the new vector stmt. */
3632 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3634 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3635 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3636 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3637 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3638 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3640 if (slp_node)
3641 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3644 VEC_free (tree, heap, vec_oprnds);
3645 return true;
3649 /* Function vect_min_worthwhile_factor.
3651 For a loop where we could vectorize the operation indicated by CODE,
3652 return the minimum vectorization factor that makes it worthwhile
3653 to use generic vectors. */
3654 static int
3655 vect_min_worthwhile_factor (enum tree_code code)
3657 switch (code)
3659 case PLUS_EXPR:
3660 case MINUS_EXPR:
3661 case NEGATE_EXPR:
3662 return 4;
3664 case BIT_AND_EXPR:
3665 case BIT_IOR_EXPR:
3666 case BIT_XOR_EXPR:
3667 case BIT_NOT_EXPR:
3668 return 2;
3670 default:
3671 return INT_MAX;
3676 /* Function vectorizable_induction
3678 Check if PHI performs an induction computation that can be vectorized.
3679 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3680 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3681 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3683 bool
3684 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3685 tree *vec_stmt)
3687 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3688 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3689 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3690 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3691 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3692 tree vec_def;
3694 gcc_assert (ncopies >= 1);
3696 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3697 return false;
3699 /* FORNOW: SLP not supported. */
3700 if (STMT_SLP_TYPE (stmt_info))
3701 return false;
3703 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3705 if (STMT_VINFO_LIVE_P (stmt_info))
3707 /* FORNOW: not yet supported. */
3708 if (vect_print_dump_info (REPORT_DETAILS))
3709 fprintf (vect_dump, "value used after loop.");
3710 return false;
3713 if (TREE_CODE (phi) != PHI_NODE)
3714 return false;
3716 if (!vec_stmt) /* transformation not required. */
3718 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3719 if (vect_print_dump_info (REPORT_DETAILS))
3720 fprintf (vect_dump, "=== vectorizable_induction ===");
3721 vect_model_induction_cost (stmt_info, ncopies);
3722 return true;
3725 /** Transform. **/
3727 if (vect_print_dump_info (REPORT_DETAILS))
3728 fprintf (vect_dump, "transform induction phi.");
3730 vec_def = get_initial_def_for_induction (phi);
3731 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3732 return true;
3736 /* Function vectorizable_operation.
3738 Check if STMT performs a binary or unary operation that can be vectorized.
3739 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3740 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3741 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3743 bool
3744 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3745 slp_tree slp_node)
3747 tree vec_dest;
3748 tree scalar_dest;
3749 tree operation;
3750 tree op0, op1 = NULL;
3751 tree vec_oprnd1 = NULL_TREE;
3752 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3753 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3754 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3755 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3756 enum tree_code code;
3757 enum machine_mode vec_mode;
3758 tree new_temp;
3759 int op_type;
3760 optab optab;
3761 int icode;
3762 enum machine_mode optab_op2_mode;
3763 tree def, def_stmt;
3764 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3765 tree new_stmt = NULL_TREE;
3766 stmt_vec_info prev_stmt_info;
3767 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3768 int nunits_out;
3769 tree vectype_out;
3770 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3771 int j, i;
3772 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3773 tree vop0, vop1;
3775 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3776 this, so we can safely override NCOPIES with 1 here. */
3777 if (slp_node)
3778 ncopies = 1;
3779 gcc_assert (ncopies >= 1);
3780 /* FORNOW. This restriction should be relaxed. */
3781 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3783 if (vect_print_dump_info (REPORT_DETAILS))
3784 fprintf (vect_dump, "multiple types in nested loop.");
3785 return false;
3788 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3789 return false;
3791 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3792 return false;
3794 /* FORNOW: not yet supported. */
3795 if (STMT_VINFO_LIVE_P (stmt_info))
3797 if (vect_print_dump_info (REPORT_DETAILS))
3798 fprintf (vect_dump, "value used after loop.");
3799 return false;
3802 /* Is STMT a vectorizable binary/unary operation? */
3803 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3804 return false;
3806 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3807 return false;
3809 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3810 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3811 if (!vectype_out)
3812 return false;
3813 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3814 if (nunits_out != nunits_in)
3815 return false;
3817 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3818 code = TREE_CODE (operation);
3820 /* For pointer addition, we should use the normal plus for
3821 the vector addition. */
3822 if (code == POINTER_PLUS_EXPR)
3823 code = PLUS_EXPR;
3825 optab = optab_for_tree_code (code, vectype);
3827 /* Support only unary or binary operations. */
3828 op_type = TREE_OPERAND_LENGTH (operation);
3829 if (op_type != unary_op && op_type != binary_op)
3831 if (vect_print_dump_info (REPORT_DETAILS))
3832 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3833 return false;
3836 op0 = TREE_OPERAND (operation, 0);
3837 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3839 if (vect_print_dump_info (REPORT_DETAILS))
3840 fprintf (vect_dump, "use not simple.");
3841 return false;
3844 if (op_type == binary_op)
3846 op1 = TREE_OPERAND (operation, 1);
3847 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3849 if (vect_print_dump_info (REPORT_DETAILS))
3850 fprintf (vect_dump, "use not simple.");
3851 return false;
3855 /* Supportable by target? */
3856 if (!optab)
3858 if (vect_print_dump_info (REPORT_DETAILS))
3859 fprintf (vect_dump, "no optab.");
3860 return false;
3862 vec_mode = TYPE_MODE (vectype);
3863 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3864 if (icode == CODE_FOR_nothing)
3866 if (vect_print_dump_info (REPORT_DETAILS))
3867 fprintf (vect_dump, "op not supported by target.");
3868 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3869 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3870 < vect_min_worthwhile_factor (code))
3871 return false;
3872 if (vect_print_dump_info (REPORT_DETAILS))
3873 fprintf (vect_dump, "proceeding using word mode.");
3876 /* Worthwhile without SIMD support? */
3877 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3878 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3879 < vect_min_worthwhile_factor (code))
3881 if (vect_print_dump_info (REPORT_DETAILS))
3882 fprintf (vect_dump, "not worthwhile without SIMD support.");
3883 return false;
3886 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3888 /* FORNOW: not yet supported. */
3889 if (!VECTOR_MODE_P (vec_mode))
3890 return false;
3892 /* Invariant argument is needed for a vector shift
3893 by a scalar shift operand. */
3894 optab_op2_mode = insn_data[icode].operand[2].mode;
3895 if (! (VECTOR_MODE_P (optab_op2_mode)
3896 || dt[1] == vect_constant_def
3897 || dt[1] == vect_invariant_def))
3899 if (vect_print_dump_info (REPORT_DETAILS))
3900 fprintf (vect_dump, "operand mode requires invariant argument.");
3901 return false;
3905 if (!vec_stmt) /* transformation not required. */
3907 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3908 if (vect_print_dump_info (REPORT_DETAILS))
3909 fprintf (vect_dump, "=== vectorizable_operation ===");
3910 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3911 return true;
3914 /** Transform. **/
3916 if (vect_print_dump_info (REPORT_DETAILS))
3917 fprintf (vect_dump, "transform binary/unary operation.");
3919 /* Handle def. */
3920 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3922 if (!slp_node)
3923 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3924 if (op_type == binary_op)
3925 vec_oprnds1 = VEC_alloc (tree, heap, 1);
3927 /* In case the vectorization factor (VF) is bigger than the number
3928 of elements that we can fit in a vectype (nunits), we have to generate
3929 more than one vector stmt - i.e - we need to "unroll" the
3930 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3931 from one copy of the vector stmt to the next, in the field
3932 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3933 stages to find the correct vector defs to be used when vectorizing
3934 stmts that use the defs of the current stmt. The example below illustrates
3935 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3936 4 vectorized stmts):
3938 before vectorization:
3939 RELATED_STMT VEC_STMT
3940 S1: x = memref - -
3941 S2: z = x + 1 - -
3943 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3944 there):
3945 RELATED_STMT VEC_STMT
3946 VS1_0: vx0 = memref0 VS1_1 -
3947 VS1_1: vx1 = memref1 VS1_2 -
3948 VS1_2: vx2 = memref2 VS1_3 -
3949 VS1_3: vx3 = memref3 - -
3950 S1: x = load - VS1_0
3951 S2: z = x + 1 - -
3953 step2: vectorize stmt S2 (done here):
3954 To vectorize stmt S2 we first need to find the relevant vector
3955 def for the first operand 'x'. This is, as usual, obtained from
3956 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3957 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3958 relevant vector def 'vx0'. Having found 'vx0' we can generate
3959 the vector stmt VS2_0, and as usual, record it in the
3960 STMT_VINFO_VEC_STMT of stmt S2.
3961 When creating the second copy (VS2_1), we obtain the relevant vector
3962 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3963 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3964 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3965 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3966 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3967 chain of stmts and pointers:
3968 RELATED_STMT VEC_STMT
3969 VS1_0: vx0 = memref0 VS1_1 -
3970 VS1_1: vx1 = memref1 VS1_2 -
3971 VS1_2: vx2 = memref2 VS1_3 -
3972 VS1_3: vx3 = memref3 - -
3973 S1: x = load - VS1_0
3974 VS2_0: vz0 = vx0 + v1 VS2_1 -
3975 VS2_1: vz1 = vx1 + v1 VS2_2 -
3976 VS2_2: vz2 = vx2 + v1 VS2_3 -
3977 VS2_3: vz3 = vx3 + v1 - -
3978 S2: z = x + 1 - VS2_0 */
3980 prev_stmt_info = NULL;
3981 for (j = 0; j < ncopies; j++)
3983 /* Handle uses. */
3984 if (j == 0)
3986 if (op_type == binary_op
3987 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
3989 /* Vector shl and shr insn patterns can be defined with scalar
3990 operand 2 (shift operand). In this case, use constant or loop
3991 invariant op1 directly, without extending it to vector mode
3992 first. */
3993 optab_op2_mode = insn_data[icode].operand[2].mode;
3994 if (!VECTOR_MODE_P (optab_op2_mode))
3996 if (vect_print_dump_info (REPORT_DETAILS))
3997 fprintf (vect_dump, "operand 1 using scalar mode.");
3998 vec_oprnd1 = op1;
3999 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4003 /* vec_oprnd is available if operand 1 should be of a scalar-type
4004 (a special case for certain kind of vector shifts); otherwise,
4005 operand 1 should be of a vector type (the usual case). */
4006 if (op_type == binary_op && !vec_oprnd1)
4007 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4008 slp_node);
4009 else
4010 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4011 slp_node);
4013 else
4014 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4016 /* Arguments are ready. Create the new vector stmt. */
4017 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4019 if (op_type == binary_op)
4021 vop1 = VEC_index (tree, vec_oprnds1, i);
4022 new_stmt = build_gimple_modify_stmt (vec_dest,
4023 build2 (code, vectype, vop0, vop1));
4025 else
4026 new_stmt = build_gimple_modify_stmt (vec_dest,
4027 build1 (code, vectype, vop0));
4029 new_temp = make_ssa_name (vec_dest, new_stmt);
4030 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4031 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4032 if (slp_node)
4033 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4036 if (j == 0)
4037 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4038 else
4039 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4040 prev_stmt_info = vinfo_for_stmt (new_stmt);
4043 VEC_free (tree, heap, vec_oprnds0);
4044 if (vec_oprnds1)
4045 VEC_free (tree, heap, vec_oprnds1);
4047 return true;
4051 /* Function vectorizable_type_demotion
4053 Check if STMT performs a binary or unary operation that involves
4054 type demotion, and if it can be vectorized.
4055 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4056 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4057 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4059 bool
4060 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4061 tree *vec_stmt)
4063 tree vec_dest;
4064 tree scalar_dest;
4065 tree operation;
4066 tree op0;
4067 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4068 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4071 enum tree_code code, code1 = ERROR_MARK;
4072 tree new_temp;
4073 tree def, def_stmt;
4074 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4075 tree new_stmt;
4076 stmt_vec_info prev_stmt_info;
4077 int nunits_in;
4078 int nunits_out;
4079 tree vectype_out;
4080 int ncopies;
4081 int j;
4082 tree expr;
4083 tree vectype_in;
4085 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4086 return false;
4088 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4089 return false;
4091 /* FORNOW: not yet supported. */
4092 if (STMT_VINFO_LIVE_P (stmt_info))
4094 if (vect_print_dump_info (REPORT_DETAILS))
4095 fprintf (vect_dump, "value used after loop.");
4096 return false;
4099 /* Is STMT a vectorizable type-demotion operation? */
4100 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4101 return false;
4103 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4104 return false;
4106 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4107 code = TREE_CODE (operation);
4108 if (code != NOP_EXPR && code != CONVERT_EXPR)
4109 return false;
4111 op0 = TREE_OPERAND (operation, 0);
4112 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4113 if (!vectype_in)
4114 return false;
4115 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4117 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4118 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4119 if (!vectype_out)
4120 return false;
4121 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4122 if (nunits_in != nunits_out / 2) /* FORNOW */
4123 return false;
4125 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4126 gcc_assert (ncopies >= 1);
4127 /* FORNOW. This restriction should be relaxed. */
4128 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4130 if (vect_print_dump_info (REPORT_DETAILS))
4131 fprintf (vect_dump, "multiple types in nested loop.");
4132 return false;
4135 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4136 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4137 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4138 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4139 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4140 return false;
4142 /* Check the operands of the operation. */
4143 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4145 if (vect_print_dump_info (REPORT_DETAILS))
4146 fprintf (vect_dump, "use not simple.");
4147 return false;
4150 /* Supportable by target? */
4151 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4152 return false;
4154 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4156 if (!vec_stmt) /* transformation not required. */
4158 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4159 if (vect_print_dump_info (REPORT_DETAILS))
4160 fprintf (vect_dump, "=== vectorizable_demotion ===");
4161 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4162 return true;
4165 /** Transform. **/
4166 if (vect_print_dump_info (REPORT_DETAILS))
4167 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4168 ncopies);
4170 /* Handle def. */
4171 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4173 /* In case the vectorization factor (VF) is bigger than the number
4174 of elements that we can fit in a vectype (nunits), we have to generate
4175 more than one vector stmt - i.e - we need to "unroll" the
4176 vector stmt by a factor VF/nunits. */
4177 prev_stmt_info = NULL;
4178 for (j = 0; j < ncopies; j++)
4180 /* Handle uses. */
4181 if (j == 0)
4183 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4184 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4186 else
4188 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4189 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4192 /* Arguments are ready. Create the new vector stmt. */
4193 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4194 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4195 new_temp = make_ssa_name (vec_dest, new_stmt);
4196 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4197 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4199 if (j == 0)
4200 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4201 else
4202 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4204 prev_stmt_info = vinfo_for_stmt (new_stmt);
4207 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4208 return true;
4212 /* Function vectorizable_type_promotion
4214 Check if STMT performs a binary or unary operation that involves
4215 type promotion, and if it can be vectorized.
4216 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4217 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4218 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4220 bool
4221 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4222 tree *vec_stmt)
4224 tree vec_dest;
4225 tree scalar_dest;
4226 tree operation;
4227 tree op0, op1 = NULL;
4228 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4229 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4230 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4231 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4232 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4233 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4234 int op_type;
4235 tree def, def_stmt;
4236 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4237 tree new_stmt;
4238 stmt_vec_info prev_stmt_info;
4239 int nunits_in;
4240 int nunits_out;
4241 tree vectype_out;
4242 int ncopies;
4243 int j;
4244 tree vectype_in;
4246 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4247 return false;
4249 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4250 return false;
4252 /* FORNOW: not yet supported. */
4253 if (STMT_VINFO_LIVE_P (stmt_info))
4255 if (vect_print_dump_info (REPORT_DETAILS))
4256 fprintf (vect_dump, "value used after loop.");
4257 return false;
4260 /* Is STMT a vectorizable type-promotion operation? */
4261 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4262 return false;
4264 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4265 return false;
4267 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4268 code = TREE_CODE (operation);
4269 if (code != NOP_EXPR && code != CONVERT_EXPR
4270 && code != WIDEN_MULT_EXPR)
4271 return false;
4273 op0 = TREE_OPERAND (operation, 0);
4274 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4275 if (!vectype_in)
4276 return false;
4277 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4279 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4280 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4281 if (!vectype_out)
4282 return false;
4283 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4284 if (nunits_out != nunits_in / 2) /* FORNOW */
4285 return false;
4287 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4288 gcc_assert (ncopies >= 1);
4289 /* FORNOW. This restriction should be relaxed. */
4290 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4292 if (vect_print_dump_info (REPORT_DETAILS))
4293 fprintf (vect_dump, "multiple types in nested loop.");
4294 return false;
4297 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4298 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4299 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4300 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4301 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4302 return false;
4304 /* Check the operands of the operation. */
4305 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4307 if (vect_print_dump_info (REPORT_DETAILS))
4308 fprintf (vect_dump, "use not simple.");
4309 return false;
4312 op_type = TREE_CODE_LENGTH (code);
4313 if (op_type == binary_op)
4315 op1 = TREE_OPERAND (operation, 1);
4316 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4318 if (vect_print_dump_info (REPORT_DETAILS))
4319 fprintf (vect_dump, "use not simple.");
4320 return false;
4324 /* Supportable by target? */
4325 if (!supportable_widening_operation (code, stmt, vectype_in,
4326 &decl1, &decl2, &code1, &code2))
4327 return false;
4329 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4331 if (!vec_stmt) /* transformation not required. */
4333 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4334 if (vect_print_dump_info (REPORT_DETAILS))
4335 fprintf (vect_dump, "=== vectorizable_promotion ===");
4336 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4337 return true;
4340 /** Transform. **/
4342 if (vect_print_dump_info (REPORT_DETAILS))
4343 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4344 ncopies);
4346 /* Handle def. */
4347 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4349 /* In case the vectorization factor (VF) is bigger than the number
4350 of elements that we can fit in a vectype (nunits), we have to generate
4351 more than one vector stmt - i.e - we need to "unroll" the
4352 vector stmt by a factor VF/nunits. */
4354 prev_stmt_info = NULL;
4355 for (j = 0; j < ncopies; j++)
4357 /* Handle uses. */
4358 if (j == 0)
4360 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4361 if (op_type == binary_op)
4362 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4364 else
4366 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4367 if (op_type == binary_op)
4368 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4371 /* Arguments are ready. Create the new vector stmt. We are creating
4372 two vector defs because the widened result does not fit in one vector.
4373 The vectorized stmt can be expressed as a call to a taregt builtin,
4374 or a using a tree-code. */
4375 /* Generate first half of the widened result: */
4376 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4377 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4378 if (j == 0)
4379 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4380 else
4381 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4382 prev_stmt_info = vinfo_for_stmt (new_stmt);
4384 /* Generate second half of the widened result: */
4385 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4386 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4387 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4388 prev_stmt_info = vinfo_for_stmt (new_stmt);
4392 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4393 return true;
4397 /* Function vect_strided_store_supported.
4399 Returns TRUE is INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4400 and FALSE otherwise. */
4402 static bool
4403 vect_strided_store_supported (tree vectype)
4405 optab interleave_high_optab, interleave_low_optab;
4406 int mode;
4408 mode = (int) TYPE_MODE (vectype);
4410 /* Check that the operation is supported. */
4411 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4412 vectype);
4413 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4414 vectype);
4415 if (!interleave_high_optab || !interleave_low_optab)
4417 if (vect_print_dump_info (REPORT_DETAILS))
4418 fprintf (vect_dump, "no optab for interleave.");
4419 return false;
4422 if (optab_handler (interleave_high_optab, mode)->insn_code
4423 == CODE_FOR_nothing
4424 || optab_handler (interleave_low_optab, mode)->insn_code
4425 == CODE_FOR_nothing)
4427 if (vect_print_dump_info (REPORT_DETAILS))
4428 fprintf (vect_dump, "interleave op not supported by target.");
4429 return false;
4432 return true;
4436 /* Function vect_permute_store_chain.
4438 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4439 a power of 2, generate interleave_high/low stmts to reorder the data
4440 correctly for the stores. Return the final references for stores in
4441 RESULT_CHAIN.
4443 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4444 The input is 4 vectors each containing 8 elements. We assign a number to each
4445 element, the input sequence is:
4447 1st vec: 0 1 2 3 4 5 6 7
4448 2nd vec: 8 9 10 11 12 13 14 15
4449 3rd vec: 16 17 18 19 20 21 22 23
4450 4th vec: 24 25 26 27 28 29 30 31
4452 The output sequence should be:
4454 1st vec: 0 8 16 24 1 9 17 25
4455 2nd vec: 2 10 18 26 3 11 19 27
4456 3rd vec: 4 12 20 28 5 13 21 30
4457 4th vec: 6 14 22 30 7 15 23 31
4459 i.e., we interleave the contents of the four vectors in their order.
4461 We use interleave_high/low instructions to create such output. The input of
4462 each interleave_high/low operation is two vectors:
4463 1st vec 2nd vec
4464 0 1 2 3 4 5 6 7
4465 the even elements of the result vector are obtained left-to-right from the
4466 high/low elements of the first vector. The odd elements of the result are
4467 obtained left-to-right from the high/low elements of the second vector.
4468 The output of interleave_high will be: 0 4 1 5
4469 and of interleave_low: 2 6 3 7
4472 The permutation is done in log LENGTH stages. In each stage interleave_high
4473 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4474 where the first argument is taken from the first half of DR_CHAIN and the
4475 second argument from it's second half.
4476 In our example,
4478 I1: interleave_high (1st vec, 3rd vec)
4479 I2: interleave_low (1st vec, 3rd vec)
4480 I3: interleave_high (2nd vec, 4th vec)
4481 I4: interleave_low (2nd vec, 4th vec)
4483 The output for the first stage is:
4485 I1: 0 16 1 17 2 18 3 19
4486 I2: 4 20 5 21 6 22 7 23
4487 I3: 8 24 9 25 10 26 11 27
4488 I4: 12 28 13 29 14 30 15 31
4490 The output of the second stage, i.e. the final result is:
4492 I1: 0 8 16 24 1 9 17 25
4493 I2: 2 10 18 26 3 11 19 27
4494 I3: 4 12 20 28 5 13 21 30
4495 I4: 6 14 22 30 7 15 23 31. */
4497 static bool
4498 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4499 unsigned int length,
4500 tree stmt,
4501 block_stmt_iterator *bsi,
4502 VEC(tree,heap) **result_chain)
4504 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4505 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4506 tree scalar_dest, tmp;
4507 int i;
4508 unsigned int j;
4509 VEC(tree,heap) *first, *second;
4511 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4512 first = VEC_alloc (tree, heap, length/2);
4513 second = VEC_alloc (tree, heap, length/2);
4515 /* Check that the operation is supported. */
4516 if (!vect_strided_store_supported (vectype))
4517 return false;
4519 *result_chain = VEC_copy (tree, heap, dr_chain);
4521 for (i = 0; i < exact_log2 (length); i++)
4523 for (j = 0; j < length/2; j++)
4525 vect1 = VEC_index (tree, dr_chain, j);
4526 vect2 = VEC_index (tree, dr_chain, j+length/2);
4528 /* Create interleaving stmt:
4529 in the case of big endian:
4530 high = interleave_high (vect1, vect2)
4531 and in the case of little endian:
4532 high = interleave_low (vect1, vect2). */
4533 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4534 DECL_GIMPLE_REG_P (perm_dest) = 1;
4535 add_referenced_var (perm_dest);
4536 if (BYTES_BIG_ENDIAN)
4537 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4538 else
4539 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4540 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4541 high = make_ssa_name (perm_dest, perm_stmt);
4542 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4543 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4544 VEC_replace (tree, *result_chain, 2*j, high);
4546 /* Create interleaving stmt:
4547 in the case of big endian:
4548 low = interleave_low (vect1, vect2)
4549 and in the case of little endian:
4550 low = interleave_high (vect1, vect2). */
4551 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4552 DECL_GIMPLE_REG_P (perm_dest) = 1;
4553 add_referenced_var (perm_dest);
4554 if (BYTES_BIG_ENDIAN)
4555 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4556 else
4557 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4558 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4559 low = make_ssa_name (perm_dest, perm_stmt);
4560 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4561 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4562 VEC_replace (tree, *result_chain, 2*j+1, low);
4564 dr_chain = VEC_copy (tree, heap, *result_chain);
4566 return true;
4570 /* Function vectorizable_store.
4572 Check if STMT defines a non scalar data-ref (array/pointer/structure) that
4573 can be vectorized.
4574 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4575 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4576 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4578 bool
4579 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4580 slp_tree slp_node)
4582 tree scalar_dest;
4583 tree data_ref;
4584 tree op;
4585 tree vec_oprnd = NULL_TREE;
4586 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4587 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4588 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4589 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4590 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4591 enum machine_mode vec_mode;
4592 tree dummy;
4593 enum dr_alignment_support alignment_support_scheme;
4594 tree def, def_stmt;
4595 enum vect_def_type dt;
4596 stmt_vec_info prev_stmt_info = NULL;
4597 tree dataref_ptr = NULL_TREE;
4598 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4599 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4600 int j;
4601 tree next_stmt, first_stmt;
4602 bool strided_store = false;
4603 unsigned int group_size, i;
4604 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4605 bool inv_p;
4606 VEC(tree,heap) *vec_oprnds = NULL;
4607 bool slp = (slp_node != NULL);
4608 stmt_vec_info first_stmt_vinfo;
4609 unsigned int vec_num;
4611 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4612 this, so we can safely override NCOPIES with 1 here. */
4613 if (slp)
4614 ncopies = 1;
4616 gcc_assert (ncopies >= 1);
4618 /* FORNOW. This restriction should be relaxed. */
4619 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4621 if (vect_print_dump_info (REPORT_DETAILS))
4622 fprintf (vect_dump, "multiple types in nested loop.");
4623 return false;
4626 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4627 return false;
4629 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4630 return false;
4632 if (STMT_VINFO_LIVE_P (stmt_info))
4634 if (vect_print_dump_info (REPORT_DETAILS))
4635 fprintf (vect_dump, "value used after loop.");
4636 return false;
4639 /* Is vectorizable store? */
4641 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4642 return false;
4644 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4645 if (TREE_CODE (scalar_dest) != ARRAY_REF
4646 && TREE_CODE (scalar_dest) != INDIRECT_REF
4647 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4648 return false;
4650 op = GIMPLE_STMT_OPERAND (stmt, 1);
4651 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4653 if (vect_print_dump_info (REPORT_DETAILS))
4654 fprintf (vect_dump, "use not simple.");
4655 return false;
4658 vec_mode = TYPE_MODE (vectype);
4659 /* FORNOW. In some cases can vectorize even if data-type not supported
4660 (e.g. - array initialization with 0). */
4661 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4662 return false;
4664 if (!STMT_VINFO_DATA_REF (stmt_info))
4665 return false;
4667 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4669 strided_store = true;
4670 if (!vect_strided_store_supported (vectype)
4671 && !PURE_SLP_STMT (stmt_info) && !slp)
4672 return false;
4675 if (!vec_stmt) /* transformation not required. */
4677 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4678 if (!PURE_SLP_STMT (stmt_info))
4679 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4680 return true;
4683 /** Transform. **/
4685 if (strided_store)
4687 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4688 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4689 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4691 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4693 /* FORNOW */
4694 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4696 /* We vectorize all the stmts of the interleaving group when we
4697 reach the last stmt in the group. */
4698 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4699 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4700 && !slp)
4702 *vec_stmt = NULL_TREE;
4703 return true;
4706 if (slp)
4707 strided_store = false;
4709 /* VEC_NUM is the number of vect stmts to be created for this group. */
4710 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4711 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4712 else
4713 vec_num = group_size;
4715 else
4717 first_stmt = stmt;
4718 first_dr = dr;
4719 group_size = vec_num = 1;
4720 first_stmt_vinfo = stmt_info;
4723 if (vect_print_dump_info (REPORT_DETAILS))
4724 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4726 dr_chain = VEC_alloc (tree, heap, group_size);
4727 oprnds = VEC_alloc (tree, heap, group_size);
4729 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4730 gcc_assert (alignment_support_scheme);
4731 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4733 /* In case the vectorization factor (VF) is bigger than the number
4734 of elements that we can fit in a vectype (nunits), we have to generate
4735 more than one vector stmt - i.e - we need to "unroll" the
4736 vector stmt by a factor VF/nunits. For more details see documentation in
4737 vect_get_vec_def_for_copy_stmt. */
4739 /* In case of interleaving (non-unit strided access):
4741 S1: &base + 2 = x2
4742 S2: &base = x0
4743 S3: &base + 1 = x1
4744 S4: &base + 3 = x3
4746 We create vectorized stores starting from base address (the access of the
4747 first stmt in the chain (S2 in the above example), when the last store stmt
4748 of the chain (S4) is reached:
4750 VS1: &base = vx2
4751 VS2: &base + vec_size*1 = vx0
4752 VS3: &base + vec_size*2 = vx1
4753 VS4: &base + vec_size*3 = vx3
4755 Then permutation statements are generated:
4757 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4758 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4761 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4762 (the order of the data-refs in the output of vect_permute_store_chain
4763 corresponds to the order of scalar stmts in the interleaving chain - see
4764 the documentation of vect_permute_store_chain()).
4766 In case of both multiple types and interleaving, above vector stores and
4767 permutation stmts are created for every copy. The result vector stmts are
4768 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4769 STMT_VINFO_RELATED_STMT for the next copies.
4772 prev_stmt_info = NULL;
4773 for (j = 0; j < ncopies; j++)
4775 tree new_stmt;
4776 tree ptr_incr;
4778 if (j == 0)
4780 if (slp)
4782 /* Get vectorized arguments for SLP_NODE. */
4783 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4785 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4787 else
4789 /* For interleaved stores we collect vectorized defs for all the
4790 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4791 used as an input to vect_permute_store_chain(), and OPRNDS as
4792 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4794 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4795 OPRNDS are of size 1. */
4796 next_stmt = first_stmt;
4797 for (i = 0; i < group_size; i++)
4799 /* Since gaps are not supported for interleaved stores,
4800 GROUP_SIZE is the exact number of stmts in the chain.
4801 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4802 there is no interleaving, GROUP_SIZE is 1, and only one
4803 iteration of the loop will be executed. */
4804 gcc_assert (next_stmt);
4805 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4807 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4808 NULL);
4809 VEC_quick_push(tree, dr_chain, vec_oprnd);
4810 VEC_quick_push(tree, oprnds, vec_oprnd);
4811 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4814 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4815 &dummy, &ptr_incr, false,
4816 TREE_TYPE (vec_oprnd), &inv_p);
4817 gcc_assert (!inv_p);
4819 else
4821 /* FORNOW SLP doesn't work for multiple types. */
4822 gcc_assert (!slp);
4824 /* For interleaved stores we created vectorized defs for all the
4825 defs stored in OPRNDS in the previous iteration (previous copy).
4826 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4827 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4828 next copy.
4829 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4830 OPRNDS are of size 1. */
4831 for (i = 0; i < group_size; i++)
4833 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
4834 VEC_index (tree, oprnds, i));
4835 VEC_replace(tree, dr_chain, i, vec_oprnd);
4836 VEC_replace(tree, oprnds, i, vec_oprnd);
4838 dataref_ptr =
4839 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4842 if (strided_store)
4844 result_chain = VEC_alloc (tree, heap, group_size);
4845 /* Permute. */
4846 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4847 &result_chain))
4848 return false;
4851 next_stmt = first_stmt;
4852 for (i = 0; i < vec_num; i++)
4854 if (i > 0)
4855 /* Bump the vector pointer. */
4856 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4857 NULL_TREE);
4859 if (slp)
4860 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4861 else if (strided_store)
4862 /* For strided stores vectorized defs are interleaved in
4863 vect_permute_store_chain(). */
4864 vec_oprnd = VEC_index (tree, result_chain, i);
4866 data_ref = build_fold_indirect_ref (dataref_ptr);
4867 /* Arguments are ready. Create the new vector stmt. */
4868 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4869 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4870 mark_symbols_for_renaming (new_stmt);
4872 if (j == 0)
4873 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4874 else
4875 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4877 prev_stmt_info = vinfo_for_stmt (new_stmt);
4878 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4879 if (!next_stmt)
4880 break;
4884 return true;
4888 /* Function vect_setup_realignment
4890 This function is called when vectorizing an unaligned load using
4891 the dr_explicit_realign[_optimized] scheme.
4892 This function generates the following code at the loop prolog:
4894 p = initial_addr;
4895 x msq_init = *(floor(p)); # prolog load
4896 realignment_token = call target_builtin;
4897 loop:
4898 x msq = phi (msq_init, ---)
4900 The stmts marked with x are generated only for the case of
4901 dr_explicit_realign_optimized.
4903 The code above sets up a new (vector) pointer, pointing to the first
4904 location accessed by STMT, and a "floor-aligned" load using that pointer.
4905 It also generates code to compute the "realignment-token" (if the relevant
4906 target hook was defined), and creates a phi-node at the loop-header bb
4907 whose arguments are the result of the prolog-load (created by this
4908 function) and the result of a load that takes place in the loop (to be
4909 created by the caller to this function).
4911 For the case of dr_explicit_realign_optimized:
4912 The caller to this function uses the phi-result (msq) to create the
4913 realignment code inside the loop, and sets up the missing phi argument,
4914 as follows:
4915 loop:
4916 msq = phi (msq_init, lsq)
4917 lsq = *(floor(p')); # load in loop
4918 result = realign_load (msq, lsq, realignment_token);
4920 For the case of dr_explicit_realign:
4921 loop:
4922 msq = *(floor(p)); # load in loop
4923 p' = p + (VS-1);
4924 lsq = *(floor(p')); # load in loop
4925 result = realign_load (msq, lsq, realignment_token);
4927 Input:
4928 STMT - (scalar) load stmt to be vectorized. This load accesses
4929 a memory location that may be unaligned.
4930 BSI - place where new code is to be inserted.
4931 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4932 is used.
4934 Output:
4935 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4936 target hook, if defined.
4937 Return value - the result of the loop-header phi node. */
4939 static tree
4940 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4941 tree *realignment_token,
4942 enum dr_alignment_support alignment_support_scheme,
4943 tree init_addr,
4944 struct loop **at_loop)
4946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4948 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4949 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4950 edge pe;
4951 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4952 tree vec_dest;
4953 tree inc;
4954 tree ptr;
4955 tree data_ref;
4956 tree new_stmt;
4957 basic_block new_bb;
4958 tree msq_init = NULL_TREE;
4959 tree new_temp;
4960 tree phi_stmt;
4961 tree msq = NULL_TREE;
4962 tree stmts = NULL_TREE;
4963 bool inv_p;
4964 bool compute_in_loop = false;
4965 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4966 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
4967 struct loop *loop_for_initial_load;
4969 gcc_assert (alignment_support_scheme == dr_explicit_realign
4970 || alignment_support_scheme == dr_explicit_realign_optimized);
4972 /* We need to generate three things:
4973 1. the misalignment computation
4974 2. the extra vector load (for the optimized realignment scheme).
4975 3. the phi node for the two vectors from which the realignment is
4976 done (for the optimized realignment scheme).
4979 /* 1. Determine where to generate the misalignment computation.
4981 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4982 calculation will be generated by this function, outside the loop (in the
4983 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4984 caller, inside the loop.
4986 Background: If the misalignment remains fixed throughout the iterations of
4987 the loop, then both realignment schemes are applicable, and also the
4988 misalignment computation can be done outside LOOP. This is because we are
4989 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4990 are a multiple of VS (the Vector Size), and therefore the misalignment in
4991 different vectorized LOOP iterations is always the same.
4992 The problem arises only if the memory access is in an inner-loop nested
4993 inside LOOP, which is now being vectorized using outer-loop vectorization.
4994 This is the only case when the misalignment of the memory access may not
4995 remain fixed throughout the iterations of the inner-loop (as explained in
4996 detail in vect_supportable_dr_alignment). In this case, not only is the
4997 optimized realignment scheme not applicable, but also the misalignment
4998 computation (and generation of the realignment token that is passed to
4999 REALIGN_LOAD) have to be done inside the loop.
5001 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5002 or not, which in turn determines if the misalignment is computed inside
5003 the inner-loop, or outside LOOP. */
5005 if (init_addr != NULL_TREE)
5007 compute_in_loop = true;
5008 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5012 /* 2. Determine where to generate the extra vector load.
5014 For the optimized realignment scheme, instead of generating two vector
5015 loads in each iteration, we generate a single extra vector load in the
5016 preheader of the loop, and in each iteration reuse the result of the
5017 vector load from the previous iteration. In case the memory access is in
5018 an inner-loop nested inside LOOP, which is now being vectorized using
5019 outer-loop vectorization, we need to determine whether this initial vector
5020 load should be generated at the preheader of the inner-loop, or can be
5021 generated at the preheader of LOOP. If the memory access has no evolution
5022 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5023 to be generated inside LOOP (in the preheader of the inner-loop). */
5025 if (nested_in_vect_loop)
5027 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5028 bool invariant_in_outerloop =
5029 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5030 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5032 else
5033 loop_for_initial_load = loop;
5034 if (at_loop)
5035 *at_loop = loop_for_initial_load;
5037 /* 3. For the case of the optimized realignment, create the first vector
5038 load at the loop preheader. */
5040 if (alignment_support_scheme == dr_explicit_realign_optimized)
5042 /* Create msq_init = *(floor(p1)) in the loop preheader */
5044 gcc_assert (!compute_in_loop);
5045 pe = loop_preheader_edge (loop_for_initial_load);
5046 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5047 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5048 &init_addr, &inc, true, NULL_TREE, &inv_p);
5049 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5050 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5051 new_temp = make_ssa_name (vec_dest, new_stmt);
5052 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5053 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5054 gcc_assert (!new_bb);
5055 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5058 /* 4. Create realignment token using a target builtin, if available.
5059 It is done either inside the containing loop, or before LOOP (as
5060 determined above). */
5062 if (targetm.vectorize.builtin_mask_for_load)
5064 tree builtin_decl;
5066 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5067 if (compute_in_loop)
5068 gcc_assert (init_addr); /* already computed by the caller. */
5069 else
5071 /* Generate the INIT_ADDR computation outside LOOP. */
5072 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5073 NULL_TREE, loop);
5074 pe = loop_preheader_edge (loop);
5075 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5076 gcc_assert (!new_bb);
5079 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5080 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5081 vec_dest = vect_create_destination_var (scalar_dest,
5082 TREE_TYPE (new_stmt));
5083 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5084 new_temp = make_ssa_name (vec_dest, new_stmt);
5085 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5087 if (compute_in_loop)
5088 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5089 else
5091 /* Generate the misalignment computation outside LOOP. */
5092 pe = loop_preheader_edge (loop);
5093 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5094 gcc_assert (!new_bb);
5097 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5099 /* The result of the CALL_EXPR to this builtin is determined from
5100 the value of the parameter and no global variables are touched
5101 which makes the builtin a "const" function. Requiring the
5102 builtin to have the "const" attribute makes it unnecessary
5103 to call mark_call_clobbered. */
5104 gcc_assert (TREE_READONLY (builtin_decl));
5107 if (alignment_support_scheme == dr_explicit_realign)
5108 return msq;
5110 gcc_assert (!compute_in_loop);
5111 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5114 /* 5. Create msq = phi <msq_init, lsq> in loop */
5116 pe = loop_preheader_edge (containing_loop);
5117 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5118 msq = make_ssa_name (vec_dest, NULL_TREE);
5119 phi_stmt = create_phi_node (msq, containing_loop->header);
5120 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5121 add_phi_arg (phi_stmt, msq_init, pe);
5123 return msq;
5127 /* Function vect_strided_load_supported.
5129 Returns TRUE is EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5130 and FALSE otherwise. */
5132 static bool
5133 vect_strided_load_supported (tree vectype)
5135 optab perm_even_optab, perm_odd_optab;
5136 int mode;
5138 mode = (int) TYPE_MODE (vectype);
5140 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5141 if (!perm_even_optab)
5143 if (vect_print_dump_info (REPORT_DETAILS))
5144 fprintf (vect_dump, "no optab for perm_even.");
5145 return false;
5148 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5150 if (vect_print_dump_info (REPORT_DETAILS))
5151 fprintf (vect_dump, "perm_even op not supported by target.");
5152 return false;
5155 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5156 if (!perm_odd_optab)
5158 if (vect_print_dump_info (REPORT_DETAILS))
5159 fprintf (vect_dump, "no optab for perm_odd.");
5160 return false;
5163 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5165 if (vect_print_dump_info (REPORT_DETAILS))
5166 fprintf (vect_dump, "perm_odd op not supported by target.");
5167 return false;
5169 return true;
5173 /* Function vect_permute_load_chain.
5175 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5176 a power of 2, generate extract_even/odd stmts to reorder the input data
5177 correctly. Return the final references for loads in RESULT_CHAIN.
5179 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5180 The input is 4 vectors each containing 8 elements. We assign a number to each
5181 element, the input sequence is:
5183 1st vec: 0 1 2 3 4 5 6 7
5184 2nd vec: 8 9 10 11 12 13 14 15
5185 3rd vec: 16 17 18 19 20 21 22 23
5186 4th vec: 24 25 26 27 28 29 30 31
5188 The output sequence should be:
5190 1st vec: 0 4 8 12 16 20 24 28
5191 2nd vec: 1 5 9 13 17 21 25 29
5192 3rd vec: 2 6 10 14 18 22 26 30
5193 4th vec: 3 7 11 15 19 23 27 31
5195 i.e., the first output vector should contain the first elements of each
5196 interleaving group, etc.
5198 We use extract_even/odd instructions to create such output. The input of each
5199 extract_even/odd operation is two vectors
5200 1st vec 2nd vec
5201 0 1 2 3 4 5 6 7
5203 and the output is the vector of extracted even/odd elements. The output of
5204 extract_even will be: 0 2 4 6
5205 and of extract_odd: 1 3 5 7
5208 The permutation is done in log LENGTH stages. In each stage extract_even and
5209 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5210 order. In our example,
5212 E1: extract_even (1st vec, 2nd vec)
5213 E2: extract_odd (1st vec, 2nd vec)
5214 E3: extract_even (3rd vec, 4th vec)
5215 E4: extract_odd (3rd vec, 4th vec)
5217 The output for the first stage will be:
5219 E1: 0 2 4 6 8 10 12 14
5220 E2: 1 3 5 7 9 11 13 15
5221 E3: 16 18 20 22 24 26 28 30
5222 E4: 17 19 21 23 25 27 29 31
5224 In order to proceed and create the correct sequence for the next stage (or
5225 for the correct output, if the second stage is the last one, as in our
5226 example), we first put the output of extract_even operation and then the
5227 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5228 The input for the second stage is:
5230 1st vec (E1): 0 2 4 6 8 10 12 14
5231 2nd vec (E3): 16 18 20 22 24 26 28 30
5232 3rd vec (E2): 1 3 5 7 9 11 13 15
5233 4th vec (E4): 17 19 21 23 25 27 29 31
5235 The output of the second stage:
5237 E1: 0 4 8 12 16 20 24 28
5238 E2: 2 6 10 14 18 22 26 30
5239 E3: 1 5 9 13 17 21 25 29
5240 E4: 3 7 11 15 19 23 27 31
5242 And RESULT_CHAIN after reordering:
5244 1st vec (E1): 0 4 8 12 16 20 24 28
5245 2nd vec (E3): 1 5 9 13 17 21 25 29
5246 3rd vec (E2): 2 6 10 14 18 22 26 30
5247 4th vec (E4): 3 7 11 15 19 23 27 31. */
5249 static bool
5250 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5251 unsigned int length,
5252 tree stmt,
5253 block_stmt_iterator *bsi,
5254 VEC(tree,heap) **result_chain)
5256 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5257 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5258 tree tmp;
5259 int i;
5260 unsigned int j;
5262 /* Check that the operation is supported. */
5263 if (!vect_strided_load_supported (vectype))
5264 return false;
5266 *result_chain = VEC_copy (tree, heap, dr_chain);
5267 for (i = 0; i < exact_log2 (length); i++)
5269 for (j = 0; j < length; j +=2)
5271 first_vect = VEC_index (tree, dr_chain, j);
5272 second_vect = VEC_index (tree, dr_chain, j+1);
5274 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5275 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5276 DECL_GIMPLE_REG_P (perm_dest) = 1;
5277 add_referenced_var (perm_dest);
5279 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5280 first_vect, second_vect);
5281 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5283 data_ref = make_ssa_name (perm_dest, perm_stmt);
5284 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5285 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5286 mark_symbols_for_renaming (perm_stmt);
5288 VEC_replace (tree, *result_chain, j/2, data_ref);
5290 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5291 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5292 DECL_GIMPLE_REG_P (perm_dest) = 1;
5293 add_referenced_var (perm_dest);
5295 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5296 first_vect, second_vect);
5297 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5298 data_ref = make_ssa_name (perm_dest, perm_stmt);
5299 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5300 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5301 mark_symbols_for_renaming (perm_stmt);
5303 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5305 dr_chain = VEC_copy (tree, heap, *result_chain);
5307 return true;
5311 /* Function vect_transform_strided_load.
5313 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5314 to perform their permutation and ascribe the result vectorized statements to
5315 the scalar statements.
5318 static bool
5319 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5320 block_stmt_iterator *bsi)
5322 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5323 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5324 tree next_stmt, new_stmt;
5325 VEC(tree,heap) *result_chain = NULL;
5326 unsigned int i, gap_count;
5327 tree tmp_data_ref;
5329 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5330 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5331 vectors, that are ready for vector computation. */
5332 result_chain = VEC_alloc (tree, heap, size);
5333 /* Permute. */
5334 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5335 return false;
5337 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5338 Since we scan the chain starting from it's first node, their order
5339 corresponds the order of data-refs in RESULT_CHAIN. */
5340 next_stmt = first_stmt;
5341 gap_count = 1;
5342 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5344 if (!next_stmt)
5345 break;
5347 /* Skip the gaps. Loads created for the gaps will be removed by dead
5348 code elimination pass later.
5349 DR_GROUP_GAP is the number of steps in elements from the previous
5350 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5351 correspond to the gaps.
5353 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5355 gap_count++;
5356 continue;
5359 while (next_stmt)
5361 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5362 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5363 copies, and we put the new vector statement in the first available
5364 RELATED_STMT. */
5365 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5366 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5367 else
5369 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5370 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5371 vinfo_for_stmt (prev_stmt));
5372 while (rel_stmt)
5374 prev_stmt = rel_stmt;
5375 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5377 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5379 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5380 gap_count = 1;
5381 /* If NEXT_STMT accesses the same DR as the previous statement,
5382 put the same TMP_DATA_REF as its vectorized statement; otherwise
5383 get the next data-ref from RESULT_CHAIN. */
5384 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5385 break;
5388 return true;
5392 /* vectorizable_load.
5394 Check if STMT reads a non scalar data-ref (array/pointer/structure) that
5395 can be vectorized.
5396 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5397 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5398 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5400 bool
5401 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5402 slp_tree slp_node)
5404 tree scalar_dest;
5405 tree vec_dest = NULL;
5406 tree data_ref = NULL;
5407 tree op;
5408 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5409 stmt_vec_info prev_stmt_info;
5410 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5411 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5412 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5413 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5414 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5415 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5416 tree new_temp;
5417 int mode;
5418 tree new_stmt = NULL_TREE;
5419 tree dummy;
5420 enum dr_alignment_support alignment_support_scheme;
5421 tree dataref_ptr = NULL_TREE;
5422 tree ptr_incr;
5423 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5424 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5425 int i, j, group_size;
5426 tree msq = NULL_TREE, lsq;
5427 tree offset = NULL_TREE;
5428 tree realignment_token = NULL_TREE;
5429 tree phi = NULL_TREE;
5430 VEC(tree,heap) *dr_chain = NULL;
5431 bool strided_load = false;
5432 tree first_stmt;
5433 tree scalar_type;
5434 bool inv_p;
5435 bool compute_in_loop = false;
5436 struct loop *at_loop;
5437 int vec_num;
5438 bool slp = (slp_node != NULL);
5440 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5441 this, so we can safely override NCOPIES with 1 here. */
5442 if (slp)
5443 ncopies = 1;
5445 gcc_assert (ncopies >= 1);
5447 /* FORNOW. This restriction should be relaxed. */
5448 if (nested_in_vect_loop && ncopies > 1)
5450 if (vect_print_dump_info (REPORT_DETAILS))
5451 fprintf (vect_dump, "multiple types in nested loop.");
5452 return false;
5455 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5456 return false;
5458 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5459 return false;
5461 /* FORNOW: not yet supported. */
5462 if (STMT_VINFO_LIVE_P (stmt_info))
5464 if (vect_print_dump_info (REPORT_DETAILS))
5465 fprintf (vect_dump, "value used after loop.");
5466 return false;
5469 /* Is vectorizable load? */
5470 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5471 return false;
5473 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5474 if (TREE_CODE (scalar_dest) != SSA_NAME)
5475 return false;
5477 op = GIMPLE_STMT_OPERAND (stmt, 1);
5478 if (TREE_CODE (op) != ARRAY_REF
5479 && TREE_CODE (op) != INDIRECT_REF
5480 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5481 return false;
5483 if (!STMT_VINFO_DATA_REF (stmt_info))
5484 return false;
5486 scalar_type = TREE_TYPE (DR_REF (dr));
5487 mode = (int) TYPE_MODE (vectype);
5489 /* FORNOW. In some cases can vectorize even if data-type not supported
5490 (e.g. - data copies). */
5491 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5493 if (vect_print_dump_info (REPORT_DETAILS))
5494 fprintf (vect_dump, "Aligned load, but unsupported type.");
5495 return false;
5498 /* Check if the load is a part of an interleaving chain. */
5499 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5501 strided_load = true;
5502 /* FORNOW */
5503 gcc_assert (! nested_in_vect_loop);
5505 /* Check if interleaving is supported. */
5506 if (!vect_strided_load_supported (vectype)
5507 && !PURE_SLP_STMT (stmt_info) && !slp)
5508 return false;
5511 if (!vec_stmt) /* transformation not required. */
5513 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5514 vect_model_load_cost (stmt_info, ncopies, NULL);
5515 return true;
5518 if (vect_print_dump_info (REPORT_DETAILS))
5519 fprintf (vect_dump, "transform load.");
5521 /** Transform. **/
5523 if (strided_load)
5525 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5526 /* Check if the chain of loads is already vectorized. */
5527 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5529 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5530 return true;
5532 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5533 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5534 dr_chain = VEC_alloc (tree, heap, group_size);
5536 /* VEC_NUM is the number of vect stmts to be created for this group. */
5537 if (slp)
5539 strided_load = false;
5540 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5542 else
5543 vec_num = group_size;
5545 else
5547 first_stmt = stmt;
5548 first_dr = dr;
5549 group_size = vec_num = 1;
5552 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5553 gcc_assert (alignment_support_scheme);
5555 /* In case the vectorization factor (VF) is bigger than the number
5556 of elements that we can fit in a vectype (nunits), we have to generate
5557 more than one vector stmt - i.e - we need to "unroll" the
5558 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5559 from one copy of the vector stmt to the next, in the field
5560 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5561 stages to find the correct vector defs to be used when vectorizing
5562 stmts that use the defs of the current stmt. The example below illustrates
5563 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5564 4 vectorized stmts):
5566 before vectorization:
5567 RELATED_STMT VEC_STMT
5568 S1: x = memref - -
5569 S2: z = x + 1 - -
5571 step 1: vectorize stmt S1:
5572 We first create the vector stmt VS1_0, and, as usual, record a
5573 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5574 Next, we create the vector stmt VS1_1, and record a pointer to
5575 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5576 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5577 stmts and pointers:
5578 RELATED_STMT VEC_STMT
5579 VS1_0: vx0 = memref0 VS1_1 -
5580 VS1_1: vx1 = memref1 VS1_2 -
5581 VS1_2: vx2 = memref2 VS1_3 -
5582 VS1_3: vx3 = memref3 - -
5583 S1: x = load - VS1_0
5584 S2: z = x + 1 - -
5586 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5587 information we recorded in RELATED_STMT field is used to vectorize
5588 stmt S2. */
5590 /* In case of interleaving (non-unit strided access):
5592 S1: x2 = &base + 2
5593 S2: x0 = &base
5594 S3: x1 = &base + 1
5595 S4: x3 = &base + 3
5597 Vectorized loads are created in the order of memory accesses
5598 starting from the access of the first stmt of the chain:
5600 VS1: vx0 = &base
5601 VS2: vx1 = &base + vec_size*1
5602 VS3: vx3 = &base + vec_size*2
5603 VS4: vx4 = &base + vec_size*3
5605 Then permutation statements are generated:
5607 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5608 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5611 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5612 (the order of the data-refs in the output of vect_permute_load_chain
5613 corresponds to the order of scalar stmts in the interleaving chain - see
5614 the documentation of vect_permute_load_chain()).
5615 The generation of permutation stmts and recording them in
5616 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5618 In case of both multiple types and interleaving, the vector loads and
5619 permutation stmts above are created for every copy. The result vector stmts
5620 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5621 STMT_VINFO_RELATED_STMT for the next copies. */
5623 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5624 on a target that supports unaligned accesses (dr_unaligned_supported)
5625 we generate the following code:
5626 p = initial_addr;
5627 indx = 0;
5628 loop {
5629 p = p + indx * vectype_size;
5630 vec_dest = *(p);
5631 indx = indx + 1;
5634 Otherwise, the data reference is potentially unaligned on a target that
5635 does not support unaligned accesses (dr_explicit_realign_optimized) -
5636 then generate the following code, in which the data in each iteration is
5637 obtained by two vector loads, one from the previous iteration, and one
5638 from the current iteration:
5639 p1 = initial_addr;
5640 msq_init = *(floor(p1))
5641 p2 = initial_addr + VS - 1;
5642 realignment_token = call target_builtin;
5643 indx = 0;
5644 loop {
5645 p2 = p2 + indx * vectype_size
5646 lsq = *(floor(p2))
5647 vec_dest = realign_load (msq, lsq, realignment_token)
5648 indx = indx + 1;
5649 msq = lsq;
5650 } */
5652 /* If the misalignment remains the same throughout the execution of the
5653 loop, we can create the init_addr and permutation mask at the loop
5654 preheader. Otherwise, it needs to be created inside the loop.
5655 This can only occur when vectorizing memory accesses in the inner-loop
5656 nested within an outer-loop that is being vectorized. */
5658 if (nested_in_vect_loop_p (loop, stmt)
5659 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5661 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5662 compute_in_loop = true;
5665 if ((alignment_support_scheme == dr_explicit_realign_optimized
5666 || alignment_support_scheme == dr_explicit_realign)
5667 && !compute_in_loop)
5669 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5670 alignment_support_scheme, NULL_TREE,
5671 &at_loop);
5672 if (alignment_support_scheme == dr_explicit_realign_optimized)
5674 phi = SSA_NAME_DEF_STMT (msq);
5675 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5678 else
5679 at_loop = loop;
5681 prev_stmt_info = NULL;
5682 for (j = 0; j < ncopies; j++)
5684 /* 1. Create the vector pointer update chain. */
5685 if (j == 0)
5686 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5687 at_loop, offset,
5688 &dummy, &ptr_incr, false,
5689 NULL_TREE, &inv_p);
5690 else
5691 dataref_ptr =
5692 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5694 for (i = 0; i < vec_num; i++)
5696 if (i > 0)
5697 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5698 NULL_TREE);
5700 /* 2. Create the vector-load in the loop. */
5701 switch (alignment_support_scheme)
5703 case dr_aligned:
5704 gcc_assert (aligned_access_p (first_dr));
5705 data_ref = build_fold_indirect_ref (dataref_ptr);
5706 break;
5707 case dr_unaligned_supported:
5709 int mis = DR_MISALIGNMENT (first_dr);
5710 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5712 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5713 data_ref =
5714 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5715 break;
5717 case dr_explicit_realign:
5719 tree ptr, bump;
5720 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5722 if (compute_in_loop)
5723 msq = vect_setup_realignment (first_stmt, bsi,
5724 &realignment_token,
5725 dr_explicit_realign,
5726 dataref_ptr, NULL);
5728 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5729 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5730 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5731 new_temp = make_ssa_name (vec_dest, new_stmt);
5732 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5733 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5734 copy_virtual_operands (new_stmt, stmt);
5735 mark_symbols_for_renaming (new_stmt);
5736 msq = new_temp;
5738 bump = size_binop (MULT_EXPR, vs_minus_1,
5739 TYPE_SIZE_UNIT (scalar_type));
5740 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5741 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5742 break;
5744 case dr_explicit_realign_optimized:
5745 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5746 break;
5747 default:
5748 gcc_unreachable ();
5750 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5751 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5752 new_temp = make_ssa_name (vec_dest, new_stmt);
5753 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5754 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5755 mark_symbols_for_renaming (new_stmt);
5757 /* 3. Handle explicit realignment if necessary/supported. Create in
5758 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5759 if (alignment_support_scheme == dr_explicit_realign_optimized
5760 || alignment_support_scheme == dr_explicit_realign)
5762 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5763 if (!realignment_token)
5764 realignment_token = dataref_ptr;
5765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5766 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5767 realignment_token);
5768 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5769 new_temp = make_ssa_name (vec_dest, new_stmt);
5770 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5771 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5773 if (alignment_support_scheme == dr_explicit_realign_optimized)
5775 if (i == vec_num - 1 && j == ncopies - 1)
5776 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5777 msq = lsq;
5781 /* 4. Handle invariant-load. */
5782 if (inv_p)
5784 gcc_assert (!strided_load);
5785 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5786 if (j == 0)
5788 int k;
5789 tree t = NULL_TREE;
5790 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5792 /* CHECKME: bitpos depends on endianess? */
5793 bitpos = bitsize_zero_node;
5794 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5795 bitsize, bitpos);
5796 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5797 TYPE_UNSIGNED (scalar_type);
5798 vec_dest =
5799 vect_create_destination_var (scalar_dest, NULL_TREE);
5800 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5801 new_temp = make_ssa_name (vec_dest, new_stmt);
5802 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5803 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5805 for (k = nunits - 1; k >= 0; --k)
5806 t = tree_cons (NULL_TREE, new_temp, t);
5807 /* FIXME: use build_constructor directly. */
5808 vec_inv = build_constructor_from_list (vectype, t);
5809 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5810 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5812 else
5813 gcc_unreachable (); /* FORNOW. */
5816 /* Collect vector loads and later create their permutation in
5817 vect_transform_strided_load (). */
5818 if (strided_load)
5819 VEC_quick_push (tree, dr_chain, new_temp);
5821 /* Store vector loads in the corresponding SLP_NODE. */
5822 if (slp)
5823 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5826 /* FORNOW: SLP with multiple types is unsupported. */
5827 if (slp)
5828 return true;
5830 if (strided_load)
5832 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5833 return false;
5834 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5835 dr_chain = VEC_alloc (tree, heap, group_size);
5837 else
5839 if (j == 0)
5840 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5841 else
5842 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5843 prev_stmt_info = vinfo_for_stmt (new_stmt);
5847 return true;
5851 /* Function vectorizable_live_operation.
5853 STMT computes a value that is used outside the loop. Check if
5854 it can be supported. */
5856 bool
5857 vectorizable_live_operation (tree stmt,
5858 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5859 tree *vec_stmt ATTRIBUTE_UNUSED)
5861 tree operation;
5862 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5864 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5865 int i;
5866 int op_type;
5867 tree op;
5868 tree def, def_stmt;
5869 enum vect_def_type dt;
5871 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5873 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5874 return false;
5876 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5877 return false;
5879 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5880 return false;
5882 /* FORNOW. CHECKME. */
5883 if (nested_in_vect_loop_p (loop, stmt))
5884 return false;
5886 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5887 op_type = TREE_OPERAND_LENGTH (operation);
5889 /* FORNOW: support only if all uses are invariant. This means
5890 that the scalar operations can remain in place, unvectorized.
5891 The original last scalar value that they compute will be used. */
5893 for (i = 0; i < op_type; i++)
5895 op = TREE_OPERAND (operation, i);
5896 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5898 if (vect_print_dump_info (REPORT_DETAILS))
5899 fprintf (vect_dump, "use not simple.");
5900 return false;
5903 if (dt != vect_invariant_def && dt != vect_constant_def)
5904 return false;
5907 /* No transformation is required for the cases we currently support. */
5908 return true;
5912 /* Function vect_is_simple_cond.
5914 Input:
5915 LOOP - the loop that is being vectorized.
5916 COND - Condition that is checked for simple use.
5918 Returns whether a COND can be vectorized. Checks whether
5919 condition operands are supportable using vec_is_simple_use. */
5921 static bool
5922 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5924 tree lhs, rhs;
5925 tree def;
5926 enum vect_def_type dt;
5928 if (!COMPARISON_CLASS_P (cond))
5929 return false;
5931 lhs = TREE_OPERAND (cond, 0);
5932 rhs = TREE_OPERAND (cond, 1);
5934 if (TREE_CODE (lhs) == SSA_NAME)
5936 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5937 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5938 return false;
5940 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5941 && TREE_CODE (lhs) != FIXED_CST)
5942 return false;
5944 if (TREE_CODE (rhs) == SSA_NAME)
5946 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5947 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5948 return false;
5950 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5951 && TREE_CODE (rhs) != FIXED_CST)
5952 return false;
5954 return true;
5957 /* vectorizable_condition.
5959 Check if STMT is conditional modify expression that can be vectorized.
5960 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5961 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5962 at BSI.
5964 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5966 bool
5967 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5969 tree scalar_dest = NULL_TREE;
5970 tree vec_dest = NULL_TREE;
5971 tree op = NULL_TREE;
5972 tree cond_expr, then_clause, else_clause;
5973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5975 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
5976 tree vec_compare, vec_cond_expr;
5977 tree new_temp;
5978 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5979 enum machine_mode vec_mode;
5980 tree def;
5981 enum vect_def_type dt;
5982 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5983 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5985 gcc_assert (ncopies >= 1);
5986 if (ncopies > 1)
5987 return false; /* FORNOW */
5989 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5990 return false;
5992 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5993 return false;
5995 /* FORNOW: SLP not supported. */
5996 if (STMT_SLP_TYPE (stmt_info))
5997 return false;
5999 /* FORNOW: not yet supported. */
6000 if (STMT_VINFO_LIVE_P (stmt_info))
6002 if (vect_print_dump_info (REPORT_DETAILS))
6003 fprintf (vect_dump, "value used after loop.");
6004 return false;
6007 /* Is vectorizable conditional operation? */
6008 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6009 return false;
6011 op = GIMPLE_STMT_OPERAND (stmt, 1);
6013 if (TREE_CODE (op) != COND_EXPR)
6014 return false;
6016 cond_expr = TREE_OPERAND (op, 0);
6017 then_clause = TREE_OPERAND (op, 1);
6018 else_clause = TREE_OPERAND (op, 2);
6020 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6021 return false;
6023 /* We do not handle two different vector types for the condition
6024 and the values. */
6025 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6026 return false;
6028 if (TREE_CODE (then_clause) == SSA_NAME)
6030 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6031 if (!vect_is_simple_use (then_clause, loop_vinfo,
6032 &then_def_stmt, &def, &dt))
6033 return false;
6035 else if (TREE_CODE (then_clause) != INTEGER_CST
6036 && TREE_CODE (then_clause) != REAL_CST
6037 && TREE_CODE (then_clause) != FIXED_CST)
6038 return false;
6040 if (TREE_CODE (else_clause) == SSA_NAME)
6042 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6043 if (!vect_is_simple_use (else_clause, loop_vinfo,
6044 &else_def_stmt, &def, &dt))
6045 return false;
6047 else if (TREE_CODE (else_clause) != INTEGER_CST
6048 && TREE_CODE (else_clause) != REAL_CST
6049 && TREE_CODE (else_clause) != FIXED_CST)
6050 return false;
6053 vec_mode = TYPE_MODE (vectype);
6055 if (!vec_stmt)
6057 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6058 return expand_vec_cond_expr_p (op, vec_mode);
6061 /* Transform */
6063 /* Handle def. */
6064 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6065 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6067 /* Handle cond expr. */
6068 vec_cond_lhs =
6069 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6070 vec_cond_rhs =
6071 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6072 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6073 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6075 /* Arguments are ready. create the new vector stmt. */
6076 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6077 vec_cond_lhs, vec_cond_rhs);
6078 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6079 vec_compare, vec_then_clause, vec_else_clause);
6081 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6082 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6083 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6084 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6086 return true;
6090 /* Function vect_transform_stmt.
6092 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6094 static bool
6095 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6096 slp_tree slp_node)
6098 bool is_store = false;
6099 tree vec_stmt = NULL_TREE;
6100 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6101 tree orig_stmt_in_pattern;
6102 bool done;
6104 switch (STMT_VINFO_TYPE (stmt_info))
6106 case type_demotion_vec_info_type:
6107 gcc_assert (!slp_node);
6108 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6109 gcc_assert (done);
6110 break;
6112 case type_promotion_vec_info_type:
6113 gcc_assert (!slp_node);
6114 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6115 gcc_assert (done);
6116 break;
6118 case type_conversion_vec_info_type:
6119 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6120 gcc_assert (done);
6121 break;
6123 case induc_vec_info_type:
6124 gcc_assert (!slp_node);
6125 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6126 gcc_assert (done);
6127 break;
6129 case op_vec_info_type:
6130 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6131 gcc_assert (done);
6132 break;
6134 case assignment_vec_info_type:
6135 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6136 gcc_assert (done);
6137 break;
6139 case load_vec_info_type:
6140 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6141 gcc_assert (done);
6142 break;
6144 case store_vec_info_type:
6145 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6146 gcc_assert (done);
6147 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6149 /* In case of interleaving, the whole chain is vectorized when the
6150 last store in the chain is reached. Store stmts before the last
6151 one are skipped, and there vec_stmt_info shouldn't be freed
6152 meanwhile. */
6153 *strided_store = true;
6154 if (STMT_VINFO_VEC_STMT (stmt_info))
6155 is_store = true;
6157 else
6158 is_store = true;
6159 break;
6161 case condition_vec_info_type:
6162 gcc_assert (!slp_node);
6163 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6164 gcc_assert (done);
6165 break;
6167 case call_vec_info_type:
6168 gcc_assert (!slp_node);
6169 done = vectorizable_call (stmt, bsi, &vec_stmt);
6170 break;
6172 case reduc_vec_info_type:
6173 gcc_assert (!slp_node);
6174 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6175 gcc_assert (done);
6176 break;
6178 default:
6179 if (!STMT_VINFO_LIVE_P (stmt_info))
6181 if (vect_print_dump_info (REPORT_DETAILS))
6182 fprintf (vect_dump, "stmt not supported.");
6183 gcc_unreachable ();
6187 if (STMT_VINFO_LIVE_P (stmt_info)
6188 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6190 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6191 gcc_assert (done);
6194 if (vec_stmt)
6196 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6197 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6198 if (orig_stmt_in_pattern)
6200 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6201 /* STMT was inserted by the vectorizer to replace a computation idiom.
6202 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6203 computed this idiom. We need to record a pointer to VEC_STMT in
6204 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6205 documentation of vect_pattern_recog. */
6206 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6208 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6209 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6214 return is_store;
6218 /* This function builds ni_name = number of iterations loop executes
6219 on the loop preheader. */
6221 static tree
6222 vect_build_loop_niters (loop_vec_info loop_vinfo)
6224 tree ni_name, stmt, var;
6225 edge pe;
6226 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6227 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6229 var = create_tmp_var (TREE_TYPE (ni), "niters");
6230 add_referenced_var (var);
6231 ni_name = force_gimple_operand (ni, &stmt, false, var);
6233 pe = loop_preheader_edge (loop);
6234 if (stmt)
6236 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6237 gcc_assert (!new_bb);
6240 return ni_name;
6244 /* This function generates the following statements:
6246 ni_name = number of iterations loop executes
6247 ratio = ni_name / vf
6248 ratio_mult_vf_name = ratio * vf
6250 and places them at the loop preheader edge. */
6252 static void
6253 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6254 tree *ni_name_ptr,
6255 tree *ratio_mult_vf_name_ptr,
6256 tree *ratio_name_ptr)
6259 edge pe;
6260 basic_block new_bb;
6261 tree stmt, ni_name;
6262 tree var;
6263 tree ratio_name;
6264 tree ratio_mult_vf_name;
6265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6266 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6267 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6268 tree log_vf;
6270 pe = loop_preheader_edge (loop);
6272 /* Generate temporary variable that contains
6273 number of iterations loop executes. */
6275 ni_name = vect_build_loop_niters (loop_vinfo);
6276 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6278 /* Create: ratio = ni >> log2(vf) */
6280 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6281 if (!is_gimple_val (ratio_name))
6283 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6284 add_referenced_var (var);
6286 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6287 pe = loop_preheader_edge (loop);
6288 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6289 gcc_assert (!new_bb);
6292 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6294 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6295 ratio_name, log_vf);
6296 if (!is_gimple_val (ratio_mult_vf_name))
6298 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6299 add_referenced_var (var);
6301 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6302 true, var);
6303 pe = loop_preheader_edge (loop);
6304 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6305 gcc_assert (!new_bb);
6308 *ni_name_ptr = ni_name;
6309 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6310 *ratio_name_ptr = ratio_name;
6312 return;
6316 /* Function vect_update_ivs_after_vectorizer.
6318 "Advance" the induction variables of LOOP to the value they should take
6319 after the execution of LOOP. This is currently necessary because the
6320 vectorizer does not handle induction variables that are used after the
6321 loop. Such a situation occurs when the last iterations of LOOP are
6322 peeled, because:
6323 1. We introduced new uses after LOOP for IVs that were not originally used
6324 after LOOP: the IVs of LOOP are now used by an epilog loop.
6325 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6326 times, whereas the loop IVs should be bumped N times.
6328 Input:
6329 - LOOP - a loop that is going to be vectorized. The last few iterations
6330 of LOOP were peeled.
6331 - NITERS - the number of iterations that LOOP executes (before it is
6332 vectorized). i.e, the number of times the ivs should be bumped.
6333 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6334 coming out from LOOP on which there are uses of the LOOP ivs
6335 (this is the path from LOOP->exit to epilog_loop->preheader).
6337 The new definitions of the ivs are placed in LOOP->exit.
6338 The phi args associated with the edge UPDATE_E in the bb
6339 UPDATE_E->dest are updated accordingly.
6341 Assumption 1: Like the rest of the vectorizer, this function assumes
6342 a single loop exit that has a single predecessor.
6344 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6345 organized in the same order.
6347 Assumption 3: The access function of the ivs is simple enough (see
6348 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6350 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6351 coming out of LOOP on which the ivs of LOOP are used (this is the path
6352 that leads to the epilog loop; other paths skip the epilog loop). This
6353 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6354 needs to have its phis updated.
6357 static void
6358 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6359 edge update_e)
6361 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6362 basic_block exit_bb = single_exit (loop)->dest;
6363 tree phi, phi1;
6364 basic_block update_bb = update_e->dest;
6366 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6368 /* Make sure there exists a single-predecessor exit bb: */
6369 gcc_assert (single_pred_p (exit_bb));
6371 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6372 phi && phi1;
6373 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6375 tree access_fn = NULL;
6376 tree evolution_part;
6377 tree init_expr;
6378 tree step_expr;
6379 tree var, ni, ni_name;
6380 block_stmt_iterator last_bsi;
6382 if (vect_print_dump_info (REPORT_DETAILS))
6384 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6385 print_generic_expr (vect_dump, phi, TDF_SLIM);
6388 /* Skip virtual phi's. */
6389 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6391 if (vect_print_dump_info (REPORT_DETAILS))
6392 fprintf (vect_dump, "virtual phi. skip.");
6393 continue;
6396 /* Skip reduction phis. */
6397 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6399 if (vect_print_dump_info (REPORT_DETAILS))
6400 fprintf (vect_dump, "reduc phi. skip.");
6401 continue;
6404 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6405 gcc_assert (access_fn);
6406 evolution_part =
6407 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6408 gcc_assert (evolution_part != NULL_TREE);
6410 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6411 of degree >= 2 or exponential. */
6412 gcc_assert (!tree_is_chrec (evolution_part));
6414 step_expr = evolution_part;
6415 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6416 loop->num));
6418 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6419 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6420 init_expr,
6421 fold_convert (sizetype,
6422 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6423 niters, step_expr)));
6424 else
6425 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6426 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6427 fold_convert (TREE_TYPE (init_expr),
6428 niters),
6429 step_expr),
6430 init_expr);
6434 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6435 add_referenced_var (var);
6437 last_bsi = bsi_last (exit_bb);
6438 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6439 true, BSI_SAME_STMT);
6441 /* Fix phi expressions in the successor bb. */
6442 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6447 /* Function vect_do_peeling_for_loop_bound
6449 Peel the last iterations of the loop represented by LOOP_VINFO.
6450 The peeled iterations form a new epilog loop. Given that the loop now
6451 iterates NITERS times, the new epilog loop iterates
6452 NITERS % VECTORIZATION_FACTOR times.
6454 The original loop will later be made to iterate
6455 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
6457 static void
6458 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6460 tree ni_name, ratio_mult_vf_name;
6461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6462 struct loop *new_loop;
6463 edge update_e;
6464 basic_block preheader;
6465 int loop_num;
6466 unsigned int th;
6467 int min_scalar_loop_bound;
6468 int min_profitable_iters;
6470 if (vect_print_dump_info (REPORT_DETAILS))
6471 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6473 initialize_original_copy_tables ();
6475 /* Generate the following variables on the preheader of original loop:
6477 ni_name = number of iteration the original loop executes
6478 ratio = ni_name / vf
6479 ratio_mult_vf_name = ratio * vf */
6480 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6481 &ratio_mult_vf_name, ratio);
6483 loop_num = loop->num;
6485 /* Analyze cost to set threshhold for vectorized loop. */
6486 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6487 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6488 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6490 /* Use the cost model only if it is more conservative than user specified
6491 threshold. */
6493 th = (unsigned) min_scalar_loop_bound;
6494 if (min_profitable_iters
6495 && (!min_scalar_loop_bound
6496 || min_profitable_iters > min_scalar_loop_bound))
6497 th = (unsigned) min_profitable_iters;
6499 if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
6500 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6501 && vect_print_dump_info (REPORT_DETAILS))
6502 fprintf (vect_dump, "vectorization may not be profitable.");
6504 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6505 ratio_mult_vf_name, ni_name, false,
6506 th);
6507 gcc_assert (new_loop);
6508 gcc_assert (loop_num == loop->num);
6509 #ifdef ENABLE_CHECKING
6510 slpeel_verify_cfg_after_peeling (loop, new_loop);
6511 #endif
6513 /* A guard that controls whether the new_loop is to be executed or skipped
6514 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6515 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6516 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6517 is on the path where the LOOP IVs are used and need to be updated. */
6519 preheader = loop_preheader_edge (new_loop)->src;
6520 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6521 update_e = EDGE_PRED (preheader, 0);
6522 else
6523 update_e = EDGE_PRED (preheader, 1);
6525 /* Update IVs of original loop as if they were advanced
6526 by ratio_mult_vf_name steps. */
6527 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6529 /* After peeling we have to reset scalar evolution analyzer. */
6530 scev_reset ();
6532 free_original_copy_tables ();
6536 /* Function vect_gen_niters_for_prolog_loop
6538 Set the number of iterations for the loop represented by LOOP_VINFO
6539 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6540 and the misalignment of DR - the data reference recorded in
6541 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6542 this loop, the data reference DR will refer to an aligned location.
6544 The following computation is generated:
6546 If the misalignment of DR is known at compile time:
6547 addr_mis = int mis = DR_MISALIGNMENT (dr);
6548 Else, compute address misalignment in bytes:
6549 addr_mis = addr & (vectype_size - 1)
6551 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6553 (elem_size = element type size; an element is the scalar element
6554 whose type is the inner type of the vectype)
6556 For interleaving,
6558 prolog_niters = min ( LOOP_NITERS ,
6559 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6560 where group_size is the size of the interleaved group.
6562 The above formulas assume that VF == number of elements in the vector. This
6563 may not hold when there are multiple-types in the loop.
6564 In this case, for some data-references in the loop the VF does not represent
6565 the number of elements that fit in the vector. Therefore, instead of VF we
6566 use TYPE_VECTOR_SUBPARTS. */
6568 static tree
6569 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6571 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6572 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6573 tree var, stmt;
6574 tree iters, iters_name;
6575 edge pe;
6576 basic_block new_bb;
6577 tree dr_stmt = DR_STMT (dr);
6578 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6579 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6580 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6581 tree niters_type = TREE_TYPE (loop_niters);
6582 int group_size = 1;
6583 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6584 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6586 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6588 /* For interleaved access element size must be multiplied by the size of
6589 the interleaved group. */
6590 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6591 DR_GROUP_FIRST_DR (stmt_info)));
6592 element_size *= group_size;
6595 pe = loop_preheader_edge (loop);
6597 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6599 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6600 int elem_misalign = byte_misalign / element_size;
6602 if (vect_print_dump_info (REPORT_DETAILS))
6603 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6604 iters = build_int_cst (niters_type,
6605 (nelements - elem_misalign)&(nelements/group_size-1));
6607 else
6609 tree new_stmts = NULL_TREE;
6610 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6611 &new_stmts, NULL_TREE, loop);
6612 tree ptr_type = TREE_TYPE (start_addr);
6613 tree size = TYPE_SIZE (ptr_type);
6614 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6615 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6616 tree elem_size_log =
6617 build_int_cst (type, exact_log2 (vectype_align/nelements));
6618 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6619 tree nelements_tree = build_int_cst (type, nelements);
6620 tree byte_misalign;
6621 tree elem_misalign;
6623 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6624 gcc_assert (!new_bb);
6626 /* Create: byte_misalign = addr & (vectype_size - 1) */
6627 byte_misalign =
6628 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6630 /* Create: elem_misalign = byte_misalign / element_size */
6631 elem_misalign =
6632 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6634 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6635 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6636 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6637 iters = fold_convert (niters_type, iters);
6640 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6641 /* If the loop bound is known at compile time we already verified that it is
6642 greater than vf; since the misalignment ('iters') is at most vf, there's
6643 no need to generate the MIN_EXPR in this case. */
6644 if (TREE_CODE (loop_niters) != INTEGER_CST)
6645 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6647 if (vect_print_dump_info (REPORT_DETAILS))
6649 fprintf (vect_dump, "niters for prolog loop: ");
6650 print_generic_expr (vect_dump, iters, TDF_SLIM);
6653 var = create_tmp_var (niters_type, "prolog_loop_niters");
6654 add_referenced_var (var);
6655 iters_name = force_gimple_operand (iters, &stmt, false, var);
6657 /* Insert stmt on loop preheader edge. */
6658 if (stmt)
6660 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6661 gcc_assert (!new_bb);
6664 return iters_name;
6668 /* Function vect_update_init_of_dr
6670 NITERS iterations were peeled from LOOP. DR represents a data reference
6671 in LOOP. This function updates the information recorded in DR to
6672 account for the fact that the first NITERS iterations had already been
6673 executed. Specifically, it updates the OFFSET field of DR. */
6675 static void
6676 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6678 tree offset = DR_OFFSET (dr);
6680 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6681 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6682 DR_OFFSET (dr) = offset;
6686 /* Function vect_update_inits_of_drs
6688 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6689 This function updates the information recorded for the data references in
6690 the loop to account for the fact that the first NITERS iterations had
6691 already been executed. Specifically, it updates the initial_condition of
6692 the access_function of all the data_references in the loop. */
6694 static void
6695 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6697 unsigned int i;
6698 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6699 struct data_reference *dr;
6701 if (vect_print_dump_info (REPORT_DETAILS))
6702 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
6704 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6705 vect_update_init_of_dr (dr, niters);
6709 /* Function vect_do_peeling_for_alignment
6711 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6712 'niters' is set to the misalignment of one of the data references in the
6713 loop, thereby forcing it to refer to an aligned location at the beginning
6714 of the execution of this loop. The data reference for which we are
6715 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6717 static void
6718 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6720 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6721 tree niters_of_prolog_loop, ni_name;
6722 tree n_iters;
6723 struct loop *new_loop;
6725 if (vect_print_dump_info (REPORT_DETAILS))
6726 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6728 initialize_original_copy_tables ();
6730 ni_name = vect_build_loop_niters (loop_vinfo);
6731 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6733 /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
6734 new_loop =
6735 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6736 niters_of_prolog_loop, ni_name, true, 0);
6737 gcc_assert (new_loop);
6738 #ifdef ENABLE_CHECKING
6739 slpeel_verify_cfg_after_peeling (new_loop, loop);
6740 #endif
6742 /* Update number of times loop executes. */
6743 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6744 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6745 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6747 /* Update the init conditions of the access functions of all data refs. */
6748 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6750 /* After peeling we have to reset scalar evolution analyzer. */
6751 scev_reset ();
6753 free_original_copy_tables ();
6757 /* Function vect_create_cond_for_align_checks.
6759 Create a conditional expression that represents the alignment checks for
6760 all of data references (array element references) whose alignment must be
6761 checked at runtime.
6763 Input:
6764 LOOP_VINFO - two fields of the loop information are used.
6765 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6766 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6768 Output:
6769 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6770 expression.
6771 The returned value is the conditional expression to be used in the if
6772 statement that controls which version of the loop gets executed at runtime.
6774 The algorithm makes two assumptions:
6775 1) The number of bytes "n" in a vector is a power of 2.
6776 2) An address "a" is aligned if a%n is zero and that this
6777 test can be done as a&(n-1) == 0. For example, for 16
6778 byte vectors the test is a&0xf == 0. */
6780 static tree
6781 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6782 tree *cond_expr_stmt_list)
6784 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6785 VEC(tree,heap) *may_misalign_stmts
6786 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6787 tree ref_stmt, tmp;
6788 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6789 tree mask_cst;
6790 unsigned int i;
6791 tree psize;
6792 tree int_ptrsize_type;
6793 char tmp_name[20];
6794 tree or_tmp_name = NULL_TREE;
6795 tree and_tmp, and_tmp_name, and_stmt;
6796 tree ptrsize_zero;
6798 /* Check that mask is one less than a power of 2, i.e., mask is
6799 all zeros followed by all ones. */
6800 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6802 /* CHECKME: what is the best integer or unsigned type to use to hold a
6803 cast from a pointer value? */
6804 psize = TYPE_SIZE (ptr_type_node);
6805 int_ptrsize_type
6806 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6808 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
6809 of the first vector of the i'th data reference. */
6811 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6813 tree new_stmt_list = NULL_TREE;
6814 tree addr_base;
6815 tree addr_tmp, addr_tmp_name, addr_stmt;
6816 tree or_tmp, new_or_tmp_name, or_stmt;
6818 /* create: addr_tmp = (int)(address_of_first_vector) */
6819 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6820 &new_stmt_list, NULL_TREE, loop);
6822 if (new_stmt_list != NULL_TREE)
6823 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6825 sprintf (tmp_name, "%s%d", "addr2int", i);
6826 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6827 add_referenced_var (addr_tmp);
6828 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6829 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6830 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6831 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6832 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6834 /* The addresses are OR together. */
6836 if (or_tmp_name != NULL_TREE)
6838 /* create: or_tmp = or_tmp | addr_tmp */
6839 sprintf (tmp_name, "%s%d", "orptrs", i);
6840 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6841 add_referenced_var (or_tmp);
6842 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6843 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6844 or_tmp_name, addr_tmp_name);
6845 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6846 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6847 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6848 or_tmp_name = new_or_tmp_name;
6850 else
6851 or_tmp_name = addr_tmp_name;
6853 } /* end for i */
6855 mask_cst = build_int_cst (int_ptrsize_type, mask);
6857 /* create: and_tmp = or_tmp & mask */
6858 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6859 add_referenced_var (and_tmp);
6860 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6862 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6863 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6864 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6865 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6867 /* Make and_tmp the left operand of the conditional test against zero.
6868 if and_tmp has a nonzero bit then some address is unaligned. */
6869 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6870 return build2 (EQ_EXPR, boolean_type_node,
6871 and_tmp_name, ptrsize_zero);
6874 /* Function vect_vfa_segment_size.
6876 Create an expression that computes the size of segment
6877 that will be accessed for a data reference. The functions takes into
6878 account that realignment loads may access one more vector.
6880 Input:
6881 DR: The data reference.
6882 VECT_FACTOR: vectorization factor.
6884 Return an expression whose value is the size of segment which will be
6885 accessed by DR. */
6887 static tree
6888 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
6890 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
6891 DR_STEP (dr), vect_factor);
6893 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
6895 tree vector_size = TYPE_SIZE_UNIT
6896 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
6898 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
6899 segment_length, vector_size);
6901 return fold_convert (sizetype, segment_length);
6904 /* Function vect_create_cond_for_alias_checks.
6906 Create a conditional expression that represents the run-time checks for
6907 overlapping of address ranges represented by a list of data references
6908 relations passed as input.
6910 Input:
6911 COND_EXPR - input conditional expression. New conditions will be chained
6912 with logical and operation.
6913 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
6914 to be checked.
6916 Output:
6917 COND_EXPR - conditional expression.
6918 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6919 expression.
6922 The returned value is the conditional expression to be used in the if
6923 statement that controls which version of the loop gets executed at runtime.
6926 static void
6927 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
6928 tree * cond_expr,
6929 tree * cond_expr_stmt_list)
6931 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6932 VEC (ddr_p, heap) * may_alias_ddrs =
6933 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
6934 tree vect_factor =
6935 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
6937 ddr_p ddr;
6938 unsigned int i;
6939 tree part_cond_expr;
6941 /* Create expression
6942 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
6943 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
6947 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
6948 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
6950 if (VEC_empty (ddr_p, may_alias_ddrs))
6951 return;
6953 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
6955 struct data_reference *dr_a, *dr_b;
6956 tree dr_group_first_a, dr_group_first_b;
6957 tree addr_base_a, addr_base_b;
6958 tree segment_length_a, segment_length_b;
6959 tree stmt_a, stmt_b;
6961 dr_a = DDR_A (ddr);
6962 stmt_a = DR_STMT (DDR_A (ddr));
6963 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
6964 if (dr_group_first_a)
6966 stmt_a = dr_group_first_a;
6967 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
6970 dr_b = DDR_B (ddr);
6971 stmt_b = DR_STMT (DDR_B (ddr));
6972 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
6973 if (dr_group_first_b)
6975 stmt_b = dr_group_first_b;
6976 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
6979 addr_base_a =
6980 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
6981 NULL_TREE, loop);
6982 addr_base_b =
6983 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
6984 NULL_TREE, loop);
6986 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
6987 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
6989 if (vect_print_dump_info (REPORT_DR_DETAILS))
6991 fprintf (vect_dump,
6992 "create runtime check for data references ");
6993 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
6994 fprintf (vect_dump, " and ");
6995 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
6999 part_cond_expr =
7000 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7001 fold_build2 (LT_EXPR, boolean_type_node,
7002 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7003 addr_base_a,
7004 segment_length_a),
7005 addr_base_b),
7006 fold_build2 (LT_EXPR, boolean_type_node,
7007 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7008 addr_base_b,
7009 segment_length_b),
7010 addr_base_a));
7012 if (*cond_expr)
7013 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7014 *cond_expr, part_cond_expr);
7015 else
7016 *cond_expr = part_cond_expr;
7018 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7019 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7020 VEC_length (ddr_p, may_alias_ddrs));
7024 /* Function vect_loop_versioning.
7026 If the loop has data references that may or may not be aligned or/and
7027 has data reference relations whose independence was not proven then
7028 two versions of the loop need to be generated, one which is vectorized
7029 and one which isn't. A test is then generated to control which of the
7030 loops is executed. The test checks for the alignment of all of the
7031 data references that may or may not be aligned. An additional
7032 sequence of runtime tests is generated for each pairs of DDRs whose
7033 independence was not proven. The vectorized version of loop is
7034 executed only if both alias and alignment tests are passed. */
7036 static void
7037 vect_loop_versioning (loop_vec_info loop_vinfo)
7039 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7040 struct loop *nloop;
7041 tree cond_expr = NULL_TREE;
7042 tree cond_expr_stmt_list = NULL_TREE;
7043 basic_block condition_bb;
7044 block_stmt_iterator cond_exp_bsi;
7045 basic_block merge_bb;
7046 basic_block new_exit_bb;
7047 edge new_exit_e, e;
7048 tree orig_phi, new_phi, arg;
7049 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7050 tree gimplify_stmt_list;
7052 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7053 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7054 return;
7056 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7057 cond_expr =
7058 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
7060 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7061 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, &cond_expr_stmt_list);
7063 cond_expr =
7064 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7065 cond_expr =
7066 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7067 NULL_TREE);
7068 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7070 initialize_original_copy_tables ();
7071 nloop = loop_version (loop, cond_expr, &condition_bb,
7072 prob, prob, REG_BR_PROB_BASE - prob, true);
7073 free_original_copy_tables();
7075 /* Loop versioning violates an assumption we try to maintain during
7076 vectorization - that the loop exit block has a single predecessor.
7077 After versioning, the exit block of both loop versions is the same
7078 basic block (i.e. it has two predecessors). Just in order to simplify
7079 following transformations in the vectorizer, we fix this situation
7080 here by adding a new (empty) block on the exit-edge of the loop,
7081 with the proper loop-exit phis to maintain loop-closed-form. */
7083 merge_bb = single_exit (loop)->dest;
7084 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7085 new_exit_bb = split_edge (single_exit (loop));
7086 new_exit_e = single_exit (loop);
7087 e = EDGE_SUCC (new_exit_bb, 0);
7089 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7090 orig_phi = PHI_CHAIN (orig_phi))
7092 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7093 new_exit_bb);
7094 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7095 add_phi_arg (new_phi, arg, new_exit_e);
7096 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7099 /* End loop-exit-fixes after versioning. */
7101 update_ssa (TODO_update_ssa);
7102 if (cond_expr_stmt_list)
7104 cond_exp_bsi = bsi_last (condition_bb);
7105 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
7109 /* Remove a group of stores (for SLP or interleaving), free their
7110 stmt_vec_info. */
7112 static void
7113 vect_remove_stores (tree first_stmt)
7115 stmt_ann_t ann;
7116 tree next = first_stmt;
7117 tree tmp;
7118 stmt_vec_info next_stmt_info;
7119 block_stmt_iterator next_si;
7121 while (next)
7123 /* Free the attached stmt_vec_info and remove the stmt. */
7124 next_si = bsi_for_stmt (next);
7125 bsi_remove (&next_si, true);
7126 next_stmt_info = vinfo_for_stmt (next);
7127 ann = stmt_ann (next);
7128 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7129 free (next_stmt_info);
7130 set_stmt_info (ann, NULL);
7131 next = tmp;
7136 /* Vectorize SLP instance tree in postorder. */
7138 static bool
7139 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7141 tree stmt;
7142 bool strided_store, is_store;
7143 block_stmt_iterator si;
7144 stmt_vec_info stmt_info;
7146 if (!node)
7147 return false;
7149 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7150 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7152 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7153 stmt_info = vinfo_for_stmt (stmt);
7154 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7155 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7157 if (vect_print_dump_info (REPORT_DETAILS))
7159 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7160 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7163 si = bsi_for_stmt (stmt);
7164 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7165 if (is_store)
7167 if (DR_GROUP_FIRST_DR (stmt_info))
7168 /* If IS_STORE is TRUE, the vectorization of the
7169 interleaving chain was completed - free all the stores in
7170 the chain. */
7171 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7172 else
7173 /* FORNOW: SLP originates only from strided stores. */
7174 gcc_unreachable ();
7176 return true;
7179 /* FORNOW: SLP originates only from strided stores. */
7180 return false;
7184 static bool
7185 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7187 VEC (slp_instance, heap) *slp_instances =
7188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7189 slp_instance instance;
7190 unsigned int vec_stmts_size;
7191 unsigned int group_size, i;
7192 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7193 bool is_store = false;
7195 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7197 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7198 /* For each SLP instance calculate number of vector stmts to be created
7199 for the scalar stmts in each node of the SLP tree. Number of vector
7200 elements in one vector iteration is the number of scalar elements in
7201 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
7202 size. */
7203 vec_stmts_size = vectorization_factor * group_size / nunits;
7205 /* Schedule the tree of INSTANCE. */
7206 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7207 vec_stmts_size);
7209 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7210 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7211 fprintf (vect_dump, "vectorizing stmts using SLP.");
7214 return is_store;
7217 /* Function vect_transform_loop.
7219 The analysis phase has determined that the loop is vectorizable.
7220 Vectorize the loop - created vectorized stmts to replace the scalar
7221 stmts in the loop, and update the loop exit condition. */
7223 void
7224 vect_transform_loop (loop_vec_info loop_vinfo)
7226 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7227 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7228 int nbbs = loop->num_nodes;
7229 block_stmt_iterator si, next_si;
7230 int i;
7231 tree ratio = NULL;
7232 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7233 bool strided_store;
7234 bool slp_scheduled = false;
7235 unsigned int nunits;
7237 if (vect_print_dump_info (REPORT_DETAILS))
7238 fprintf (vect_dump, "=== vec_transform_loop ===");
7239 vect_loop_versioning (loop_vinfo);
7241 /* CHECKME: we wouldn't need this if we called update_ssa once
7242 for all loops. */
7243 bitmap_zero (vect_memsyms_to_rename);
7245 /* Peel the loop if there are data refs with unknown alignment.
7246 Only one data ref with unknown store is allowed. */
7248 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7249 vect_do_peeling_for_alignment (loop_vinfo);
7251 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7252 compile time constant), or it is a constant that doesn't divide by the
7253 vectorization factor, then an epilog loop needs to be created.
7254 We therefore duplicate the loop: the original loop will be vectorized,
7255 and will compute the first (n/VF) iterations. The second copy of the loop
7256 will remain scalar and will compute the remaining (n%VF) iterations.
7257 (VF is the vectorization factor). */
7259 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7260 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7261 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7262 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7263 else
7264 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7265 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7267 /* 1) Make sure the loop header has exactly two entries
7268 2) Make sure we have a preheader basic block. */
7270 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7272 split_edge (loop_preheader_edge (loop));
7274 /* FORNOW: the vectorizer supports only loops which body consist
7275 of one basic block (header + empty latch). When the vectorizer will
7276 support more involved loop forms, the order by which the BBs are
7277 traversed need to be reconsidered. */
7279 for (i = 0; i < nbbs; i++)
7281 basic_block bb = bbs[i];
7282 stmt_vec_info stmt_info;
7283 tree phi;
7285 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7287 if (vect_print_dump_info (REPORT_DETAILS))
7289 fprintf (vect_dump, "------>vectorizing phi: ");
7290 print_generic_expr (vect_dump, phi, TDF_SLIM);
7292 stmt_info = vinfo_for_stmt (phi);
7293 if (!stmt_info)
7294 continue;
7296 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7297 && !STMT_VINFO_LIVE_P (stmt_info))
7298 continue;
7300 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7301 != (unsigned HOST_WIDE_INT) vectorization_factor)
7302 && vect_print_dump_info (REPORT_DETAILS))
7303 fprintf (vect_dump, "multiple-types.");
7305 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7307 if (vect_print_dump_info (REPORT_DETAILS))
7308 fprintf (vect_dump, "transform phi.");
7309 vect_transform_stmt (phi, NULL, NULL, NULL);
7313 for (si = bsi_start (bb); !bsi_end_p (si);)
7315 tree stmt = bsi_stmt (si);
7316 bool is_store;
7318 if (vect_print_dump_info (REPORT_DETAILS))
7320 fprintf (vect_dump, "------>vectorizing statement: ");
7321 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7324 stmt_info = vinfo_for_stmt (stmt);
7326 /* vector stmts created in the outer-loop during vectorization of
7327 stmts in an inner-loop may not have a stmt_info, and do not
7328 need to be vectorized. */
7329 if (!stmt_info)
7331 bsi_next (&si);
7332 continue;
7335 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7336 && !STMT_VINFO_LIVE_P (stmt_info))
7338 bsi_next (&si);
7339 continue;
7342 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7343 nunits =
7344 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7345 if (!STMT_SLP_TYPE (stmt_info)
7346 && nunits != (unsigned int) vectorization_factor
7347 && vect_print_dump_info (REPORT_DETAILS))
7348 /* For SLP VF is set according to unrolling factor, and not to
7349 vector size, hence for SLP this print is not valid. */
7350 fprintf (vect_dump, "multiple-types.");
7352 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7353 reached. */
7354 if (STMT_SLP_TYPE (stmt_info))
7356 if (!slp_scheduled)
7358 slp_scheduled = true;
7360 if (vect_print_dump_info (REPORT_DETAILS))
7361 fprintf (vect_dump, "=== scheduling SLP instances ===");
7363 is_store = vect_schedule_slp (loop_vinfo, nunits);
7365 /* IS_STORE is true if STMT is a store. Stores cannot be of
7366 hybrid SLP type. They are removed in
7367 vect_schedule_slp_instance and their vinfo is destroyed. */
7368 if (is_store)
7370 bsi_next (&si);
7371 continue;
7375 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7376 if (PURE_SLP_STMT (stmt_info))
7378 bsi_next (&si);
7379 continue;
7383 /* -------- vectorize statement ------------ */
7384 if (vect_print_dump_info (REPORT_DETAILS))
7385 fprintf (vect_dump, "transform statement.");
7387 strided_store = false;
7388 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7389 if (is_store)
7391 stmt_ann_t ann;
7392 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7394 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7395 interleaving chain was completed - free all the stores in
7396 the chain. */
7397 tree next = DR_GROUP_FIRST_DR (stmt_info);
7398 tree tmp;
7399 stmt_vec_info next_stmt_info;
7401 while (next)
7403 next_si = bsi_for_stmt (next);
7404 next_stmt_info = vinfo_for_stmt (next);
7405 /* Free the attached stmt_vec_info and remove the stmt. */
7406 ann = stmt_ann (next);
7407 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7408 free (next_stmt_info);
7409 set_stmt_info (ann, NULL);
7410 bsi_remove (&next_si, true);
7411 next = tmp;
7413 bsi_remove (&si, true);
7414 continue;
7416 else
7418 /* Free the attached stmt_vec_info and remove the stmt. */
7419 ann = stmt_ann (stmt);
7420 free (stmt_info);
7421 set_stmt_info (ann, NULL);
7422 bsi_remove (&si, true);
7423 continue;
7426 bsi_next (&si);
7427 } /* stmts in BB */
7428 } /* BBs in loop */
7430 slpeel_make_loop_iterate_ntimes (loop, ratio);
7432 mark_set_for_renaming (vect_memsyms_to_rename);
7434 /* The memory tags and pointers in vectorized statements need to
7435 have their SSA forms updated. FIXME, why can't this be delayed
7436 until all the loops have been transformed? */
7437 update_ssa (TODO_update_ssa);
7439 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7440 fprintf (vect_dump, "LOOP VECTORIZED.");
7441 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7442 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");