/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign;
  int innerloop_iters = 0, factor;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* Requires loop versioning tests to handle misalignment.
     FIXME: Make cost depend on number of stmts in may_misalign list.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      vec_outside_cost += TARG_COND_BRANCH_COST;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning.\n");
    }

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);

  if (byte_misalign < 0)
    {
      peel_iters_prologue = (vf - 1)/2;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to (vf-1)/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = (vf - 1)/2;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to (vf-1)/2 because "
                 "peeling for alignment is unknown.");
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = (vf - 1)/2;
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to (vf-1)/2 because "
                     "loop iterations are unknown.");
        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }
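  /* Editorial example (illustrative numbers only, not from the source): for
     a V4SI access (nelements = 4, element_size = 4 bytes) with
     byte_misalign = 8, the prologue above peels 4 - 8/4 = 2 iterations;
     with a known niters = 103 and vf = 4, the epilogue then peels
     (103 - 2) % 4 = 1 iteration.  */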
  /* Requires a prologue loop when peeling to handle misalignment.  Add cost of
     two guards, one for the peeled loop and one for the vector loop.  */

  if (peel_iters_prologue)
    {
      vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for "
                 "prologue.\n");
    }

  /* Requires an epilogue loop to finish up remaining iterations after vector
     loop.  Add cost of two guards, one for the peeled loop and one for the
     vector loop.  */

  if (peel_iters_epilogue
      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
    {
      vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding cost of checks for "
                 "epilogue.\n");
    }

  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost);

  /* Allow targets to add additional (outside-of-loop) costs.  FORNOW, the only
     information we provide for the target is whether testing against the
     threshold involves a runtime test.  */
  if (targetm.vectorize.builtin_vectorization_cost)
    {
      bool runtime_test = false;

      /* If the number of iterations is unknown, or the
         peeling-for-misalignment amount is unknown, we will have to generate
         a runtime test to test the loop count against the threshold.  */
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          || (byte_misalign < 0))
        runtime_test = true;
      vec_outside_cost +=
        targetm.vectorize.builtin_vectorization_cost (runtime_test);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: Adding target out-of-loop cost = %d",
                 targetm.vectorize.builtin_vectorization_cost (runtime_test));
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost and VF = vectorization factor.  */
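  /* Editorial example (illustrative numbers, not from any target's cost
     table): with SIC = 4, VIC = 6, VOC = 20 and VF = 4, the division below
     gives (20*4) / (4*4 - 6) = 80/10 = 8; since 4*4*8 <= 6*8 + 20*4
     (128 <= 128) the result is bumped to 9, later clamped to at least VF
     and decremented before returning.  */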
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost == 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = (vec_outside_cost * vf)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + (vec_outside_cost * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d.",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar cost: %d\n", scalar_single_iter_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
      fprintf (vect_dump, "  Actual minimum iters for profitability: %d\n",
               min_profitable_iters < vf ? vf : min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;
  return min_profitable_iters;
}
/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static void
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type - 1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.
               Also requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
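  /* Editorial example (illustrative only): for a V4SI reduction
     (nelements = 4), the whole-vector-shift variant above costs
     exact_log2 (4) * 2 = 4 vector stmts plus one vector-to-scalar extract,
     while the extract-based fallback costs 4 + 3 = 7 vector stmts.  */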
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

static void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type *dt)
{
  int i;

  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmt.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}
/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

static void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type dt)
{
  int cost = 0;
  int group_size;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      cost = ncopies * exact_log2 (group_size) * group_size
             * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);
    }
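  /* Editorial example (illustrative only): storing an interleaved group of
     group_size = 4 with ncopies = 2 costs 2 * exact_log2 (4) * 4 = 16
     interleave (high/low) stmts, before the cost of the stores themselves
     is added below.  */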
  /* Costs of the stores.  */
  cost += ncopies * TARG_VEC_STORE_COST;

  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

static void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
{
  int inner_cost = 0;
  int group_size;
  int alignment_support_scheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inner_cost = ncopies * exact_log2 (group_size) * group_size
                   * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);
    }

  /* The loads themselves.  */
  switch (alignment_support_scheme)
    {
    case dr_aligned:
      {
        inner_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inner_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inner_cost += ncopies * (2 * TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inner_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        int outer_cost = 0;

        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1)
          {
            outer_cost = 2 * TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outer_cost += TARG_VEC_STMT_COST;
          }

        STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

        inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }
    default:
      gcc_unreachable ();
    }

  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends the
   prefix "vect_", "stmp_" or "vect_p" (depending on the value of VAR_KIND)
   to NAME if provided, and uses the bare prefix otherwise.  */

static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    {
      char *tmp = concat (prefix, name, NULL);
      new_vect_var = create_tmp_var (type, tmp);
      free (tmp);
    }
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
   LOOP: Specify relative to which loop-nest the address should be computed.
         For example, when the dataref is in an inner-loop nested in an
         outer-loop that is now being vectorized, LOOP can be either the
         outer-loop, or the inner-loop.  The first memory location accessed
         by the following dataref ('in' points to short):

           for (i=0; i<N; i++)
             for (j=0; j<M; j++)
               s += in[i+j]

         is as follows:
         if LOOP=i_loop:  &in            (relative to i_loop)
         if LOOP=j_loop:  &in+i*2B       (relative to j_loop)

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */

static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset,
                                      struct loop *loop)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name;
  tree data_ref_base_var;
  tree new_base_stmt;
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));

  gcc_assert (loop);
  if (loop != containing_loop)
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      gcc_assert (nested_in_vect_loop_p (loop, stmt));

      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
    }

  /* Create data_ref_base.  */
  base_name = build_fold_indirect_ref (data_ref_base);
  data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
  add_referenced_var (data_ref_base_var);
  data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
                                        true, data_ref_base_var);
  append_to_statement_list_force (new_base_stmt, new_stmt_list);

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  base_offset = fold_convert (sizetype, base_offset);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (sizetype, "offset");

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset  */
  addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
                           data_ref_base, base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. AT_LOOP: the loop where the vector memref is to be created.
   3. OFFSET (optional): an offset to be added to the initial address accessed
         by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
         pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *vp;
      vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type, bool *inv_p)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
  struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe;
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vptr;
  block_stmt_iterator incr_bsi;
  bool insert_after;
  tree indx_before_incr, indx_after_incr;
  tree incr;
  tree step;

  /* Check the step (evolution) of the load in LOOP, and record
     whether it's invariant.  */
  if (nested_in_vect_loop)
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));

  if (tree_int_cst_compare (step, size_zero_node) == 0)
    *inv_p = true;
  else
    *inv_p = false;

  /* Create an expression for the first address accessed by this load
     in LOOP.  */
  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, "  vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, "  vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, "  vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, "  vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/
  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_SYMBOL_TAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
      vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
      def-use update cycles for the pointer: one relative to the outer-loop
      (LOOP), which is what steps (3) and (4) below do.  The other is relative
      to the inner-loop (which is the inner-most loop containing the dataref),
      and this is done by step (5) below.

      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
      inner-most loop, and so steps (3),(4) work the same, and step (5) is
      redundant.  Steps (3),(4) create the following:

         vp0 = &base_addr;
         LOOP:  vp1 = phi(vp0,vp2)
                ...
                vp2 = vp1 + step
                goto LOOP

      If there is an inner-loop nested in loop, then step (5) will also be
      applied, and an additional update in the inner-loop will be created:

         vp0 = &base_addr;
         LOOP:  vp1 = phi(vp0,vp2)
                ...
         inner: vp3 = phi(vp1,vp4)
                vp4 = vp3 + inner_step
                if () goto inner
                ...
                vp2 = vp1 + step
                if () goto LOOP  */

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: &(base[init_val+offset]) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset, loop);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop.
          This is needed when ONLY_INIT is false, and also when AT_LOOP
          is the inner-loop nested in LOOP (during outer-loop
          vectorization).  **/

  if (only_init && at_loop == loop) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      vptr = vect_ptr_init;
    }
  else
    {
      /* The step of the vector pointer is the Vector Size.  */
      tree step = TYPE_SIZE_UNIT (vectype);
      /* One exception to the above is when the scalar step of the load in
         LOOP is zero.  In this case the step here is also zero.  */
      if (*inv_p)
        step = size_zero_node;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);

      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, step),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      vptr = indx_before_incr;
    }

  if (!nested_in_vect_loop || only_init)
    return vptr;

  /** (5) Handle the updating of the vector-pointer inside the inner-loop
          nested in LOOP, if exists:  **/

  gcc_assert (nested_in_vect_loop);
  if (!only_init)
    {
      standard_iv_increment_position (containing_loop, &incr_bsi,
                                      &insert_after);
      create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
                 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
  else
    gcc_unreachable ();
}
/* Function bump_vector_ptr

   Increment a pointer (to a vector type) by vector-size.  If requested,
   i.e. if PTR_INCR is given, then also connect the new increment stmt
   to the existing def-use update-chain of the pointer, by modifying
   the PTR_INCR as illustrated below:

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
              the loop.  The increment amount across iterations is expected
              to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
          the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */

static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = TYPE_SIZE_UNIT (vectype);
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  if (bump)
    update = bump;

  incr_stmt = build_gimple_modify_stmt (ptr_var,
                                        build2 (POINTER_PLUS_EXPR, vptr_type,
                                                dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Place the initialization at BSI if it
   is not NULL.  Otherwise, place the initialization at the loop preheader.
   Return the DEF of INIT_STMT.
   It will be used in the vectorization of STMT.  */

static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type,
                  block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);
  init_stmt = build_gimple_modify_stmt (new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  if (bsi)
    vect_finish_stmt_generation (stmt, init_stmt, bsi);
  else
    {
      loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

      if (nested_in_vect_loop_p (loop, stmt))
        loop = loop->inner;
      pe = loop_preheader_edge (loop);
      new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
      gcc_assert (!new_bb);
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (tree iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
  tree vectype = get_vectype_for_scalar_type (scalar_type);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  tree init_stmt;
  tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies = vf / nunits;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  tree stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  tree exit_phi;
  edge latch_e;
  tree loop_arg;
  block_stmt_iterator si;
  basic_block bb = bb_for_stmt (iv_phi);

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = bsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = bsi_insert_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          tree tmp;

          /* Create: new_name_i = new_name + step_expr  */
          tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
          init_stmt = build_gimple_modify_stmt (new_var, tmp);
          new_name = make_ssa_name (new_var, init_stmt);
          GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;

          new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }

  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
         vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (scalar_type, vf);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  vec = build_constructor_from_list (vectype, t);
  vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);


  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_stmt_info (get_stmt_ann (induction_phi),
                 new_stmt_vec_info (induction_phi, loop_vinfo));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = build_gimple_modify_stmt (NULL_TREE,
                                       build2 (PLUS_EXPR, vectype,
                                               induc_def, vec_step));
  vec_def = make_ssa_name (vec_dest, new_stmt);
  GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
  bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (new_stmt),
                 new_stmt_vec_info (new_stmt, loop_vinfo));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));


  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */
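  /* Editorial example (illustrative only): with VF = 8 and a V4SI vectype
     (nunits = 4), ncopies = 2; one extra copy is emitted below, stepping by
     [nunits*S, nunits*S, nunits*S, nunits*S] relative to the first.  */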
  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW.  This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (scalar_type, nunits);
      new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      vec = build_constructor_from_list (vectype, t);
      vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          tree tmp;

          /* vec_i = vec_prev + vec_step  */
          tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
          new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
          bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
          set_stmt_info (get_stmt_ann (new_stmt),
                         new_stmt_vec_info (new_stmt, loop_vinfo));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW.  Currently not supporting the case that an inner-loop
             induction is not used in the outer-loop (i.e. only outside the
             outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
            }
        }
    }


  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle:");
      print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */

static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, op, t);
          }
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type, NULL);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          {
            t = tree_cons (NULL_TREE, def, t);
          }

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type, NULL);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        if (TREE_CODE (vec_stmt) == PHI_NODE)
          vec_oprnd = PHI_RESULT (vec_stmt);
        else
          vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        struct loop *loop;

        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
        loop = (bb_for_stmt (def_stmt))->loop_father;

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
        vec_oprnd = PHI_RESULT (vec_stmt);
        return vec_oprnd;
      }

    default:
      gcc_unreachable ();
    }
}
1671 /* Function vect_get_vec_def_for_stmt_copy
1673 Return a vector-def for an operand. This function is used when the
1674 vectorized stmt to be created (by the caller to this function) is a "copy"
1675 created in case the vectorized result cannot fit in one vector, and several
1676 copies of the vector-stmt are required. In this case the vector-def is
1677 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1678 of the stmt that defines VEC_OPRND.
1679 DT is the type of the vector def VEC_OPRND.
1681 Context:
1682 In case the vectorization factor (VF) is bigger than the number
1683 of elements that can fit in a vectype (nunits), we have to generate
1684 more than one vector stmt to vectorize the scalar stmt. This situation
1685 arises when there are multiple data-types operated upon in the loop; the
1686 smallest data-type determines the VF, and as a result, when vectorizing
1687 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1688 vector stmt (each computing a vector of 'nunits' results, and together
1689 computing 'VF' results in each iteration). This function is called when
1690 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1691 which VF=16 and nunits=4, so the number of copies required is 4):
1693 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1695 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1696 VS1.1: vx.1 = memref1 VS1.2
1697 VS1.2: vx.2 = memref2 VS1.3
1698 VS1.3: vx.3 = memref3
1700 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1701 VSnew.1: vz1 = vx.1 + ... VSnew.2
1702 VSnew.2: vz2 = vx.2 + ... VSnew.3
1703 VSnew.3: vz3 = vx.3 + ...
1705 The vectorization of S1 is explained in vectorizable_load.
1706 The vectorization of S2:
1707 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1708 the function 'vect_get_vec_def_for_operand' is called to
1709 get the relevant vector-def for each operand of S2. For operand x it
1710 returns the vector-def 'vx.0'.
1712 To create the remaining copies of the vector-stmt (VSnew.j), this
1713 function is called to get the relevant vector-def for each operand. It is
1714 obtained from the respective VS1.j stmt, which is recorded in the
1715 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
1717 For example, to obtain the vector-def 'vx.1' in order to create the
1718 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
1719 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
1720 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
1721 and return its def ('vx.1').
1722 Overall, to create the above sequence this function will be called 3 times:
1723 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
1724 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
1725 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
1727 static tree
1728 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
1730 tree vec_stmt_for_operand;
1731 stmt_vec_info def_stmt_info;
1733 /* Do nothing; can reuse same def. */
1734 if (dt == vect_invariant_def || dt == vect_constant_def)
1735 return vec_oprnd;
1737 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
1738 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
1739 gcc_assert (def_stmt_info);
1740 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
1741 gcc_assert (vec_stmt_for_operand);
1742 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
1743 return vec_oprnd;
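/* A minimal caller-side sketch (mirroring the loops in the
   vectorizable_* routines below), assuming a stmt with one operand OP0
   of def-type DT0 for which NCOPIES copies are needed:

     vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
     for (j = 1; j < ncopies; j++)
       vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);

   Each call advances one link along the STMT_VINFO_RELATED_STMT chain
   (vx.0 -> vx.1 -> vx.2 -> vx.3 in the illustration above).  */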
1747 /* Function vect_finish_stmt_generation.
1749 Insert a new stmt. */
1751 static void
1752 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
1753 block_stmt_iterator *bsi)
1755 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1756 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1758 gcc_assert (stmt == bsi_stmt (*bsi));
1759 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
1761 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
1763 set_stmt_info (get_stmt_ann (vec_stmt),
1764 new_stmt_vec_info (vec_stmt, loop_vinfo));
1766 if (vect_print_dump_info (REPORT_DETAILS))
1768 fprintf (vect_dump, "add new stmt: ");
1769 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
1772 /* Make sure bsi points to the stmt that is being vectorized. */
1773 gcc_assert (stmt == bsi_stmt (*bsi));
1775 #ifdef USE_MAPPED_LOCATION
1776 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
1777 #else
1778 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
1779 #endif
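/* This helper is always used together with the same emit idiom; as a
   sketch (VEC_DEST and RHS stand for whatever the caller built):

     new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
     new_temp = make_ssa_name (vec_dest, new_stmt);
     GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
     vect_finish_stmt_generation (stmt, new_stmt, bsi);

   i.e. build the vector stmt, give it an SSA result, then insert it
   before STMT and attach its stmt_vec_info.  */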
1783 /* Function get_initial_def_for_reduction
1785 Input:
1786 STMT - a stmt that performs a reduction operation in the loop.
1787 INIT_VAL - the initial value of the reduction variable
1789 Output:
1790 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
1791 of the reduction (used for adjusting the epilog - see below).
1792 Return a vector variable, initialized according to the operation that STMT
1793 performs. This vector will be used as the initial value of the
1794 vector of partial results.
1796 Option 1 (adjust in epilog): Initialize the vector as follows:
1797 add: [0,0,...,0,0]
1798 mult: [1,1,...,1,1]
1799 min/max: [init_val,init_val,..,init_val,init_val]
1800 bit and/or: [init_val,init_val,..,init_val,init_val]
1801 and when necessary (e.g. add/mult case) let the caller know
1802 that it needs to adjust the result by init_val.
1804 Option 2: Initialize the vector as follows:
1805 add: [0,0,...,0,init_val]
1806 mult: [1,1,...,1,init_val]
1807 min/max: [init_val,init_val,...,init_val]
1808 bit and/or: [init_val,init_val,...,init_val]
1809 and no adjustments are needed.
1811 For example, for the following code:
1813 s = init_val;
1814 for (i=0;i<n;i++)
1815 s = s + a[i];
1817 STMT is 's = s + a[i]', and the reduction variable is 's'.
1818 For a vector of 4 units, we want to return either [0,0,0,init_val],
1819 or [0,0,0,0] and let the caller know that it needs to adjust
1820 the result at the end by 'init_val'.
1822 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
1823 initialization vector is simpler (same element in all entries).
1824 A cost model should help decide between these two schemes. */
1826 static tree
1827 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
1829 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1832 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1833 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1834 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1835 tree type = TREE_TYPE (init_val);
1836 tree vecdef;
1837 tree def_for_init;
1838 tree init_def;
1839 tree t = NULL_TREE;
1840 int i;
1841 tree vector_type;
1842 bool nested_in_vect_loop = false;
1844 gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
1845 if (nested_in_vect_loop_p (loop, stmt))
1846 nested_in_vect_loop = true;
1847 else
1848 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
1850 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
1852 switch (code)
1854 case WIDEN_SUM_EXPR:
1855 case DOT_PROD_EXPR:
1856 case PLUS_EXPR:
1857 if (nested_in_vect_loop)
1858 *adjustment_def = vecdef;
1859 else
1860 *adjustment_def = init_val;
1861 /* Create a vector of zeros for init_def. */
1862 if (INTEGRAL_TYPE_P (type))
1863 def_for_init = build_int_cst (type, 0);
1864 else
1865 def_for_init = build_real (type, dconst0);
1866 for (i = nunits - 1; i >= 0; --i)
1867 t = tree_cons (NULL_TREE, def_for_init, t);
1868 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
1869 init_def = build_vector (vector_type, t);
1870 break;
1872 case MIN_EXPR:
1873 case MAX_EXPR:
1874 *adjustment_def = NULL_TREE;
1875 init_def = vecdef;
1876 break;
1878 default:
1879 gcc_unreachable ();
1882 return init_def;
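/* A worked example of the 'adjust in epilog' scheme used above,
   assuming V4SI and the loop from the head comment
   (s = init_val; for (i=0;i<n;i++) s = s + a[i];):

     code == PLUS_EXPR:  init_def = {0,0,0,0},
                         *adjustment_def = init_val;
     code == MIN_EXPR:   init_def = splat of init_val,
                         *adjustment_def = NULL_TREE.  */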
1886 /* Function vect_create_epilog_for_reduction
1888 Create code at the loop-epilog to finalize the result of a reduction
1889 computation.
1891 VECT_DEF is a vector of partial results.
1892 REDUC_CODE is the tree-code for the epilog reduction.
1893 STMT is the scalar reduction stmt that is being vectorized.
1894 REDUCTION_PHI is the phi-node that carries the reduction computation.
1896 This function:
1897 1. Creates the reduction def-use cycle: sets the arguments for
1898 REDUCTION_PHI:
1899 The loop-entry argument is the vectorized initial-value of the reduction.
1900 The loop-latch argument is VECT_DEF - the vector of partial sums.
1901 2. "Reduces" the vector of partial results VECT_DEF into a single result,
1902 by applying the operation specified by REDUC_CODE if available, or by
1903 other means (whole-vector shifts or a scalar loop).
1904 The function also creates a new phi node at the loop exit to preserve
1905 loop-closed form, as illustrated below.
1907 The flow at the entry to this function:
1909 loop:
1910 vec_def = phi <null, null> # REDUCTION_PHI
1911 VECT_DEF = vector_stmt # vectorized form of STMT
1912 s_loop = scalar_stmt # (scalar) STMT
1913 loop_exit:
1914 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1915 use <s_out0>
1916 use <s_out0>
1918 The above is transformed by this function into:
1920 loop:
1921 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
1922 VECT_DEF = vector_stmt # vectorized form of STMT
1923 s_loop = scalar_stmt # (scalar) STMT
1924 loop_exit:
1925 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
1926 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
1927 v_out2 = reduce <v_out1>
1928 s_out3 = extract_field <v_out2, 0>
1929 s_out4 = adjust_result <s_out3>
1930 use <s_out4>
1931 use <s_out4>
1934 static void
1935 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
1936 enum tree_code reduc_code, tree reduction_phi)
1938 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1939 tree vectype;
1940 enum machine_mode mode;
1941 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1942 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block exit_bb;
1944 tree scalar_dest;
1945 tree scalar_type;
1946 tree new_phi;
1947 block_stmt_iterator exit_bsi;
1948 tree vec_dest;
1949 tree new_temp = NULL_TREE;
1950 tree new_name;
1951 tree epilog_stmt = NULL_TREE;
1952 tree new_scalar_dest, exit_phi, new_dest;
1953 tree bitsize, bitpos, bytesize;
1954 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
1955 tree adjustment_def;
1956 tree vec_initial_def;
1957 tree orig_name;
1958 imm_use_iterator imm_iter;
1959 use_operand_p use_p;
1960 bool extract_scalar_result = false;
1961 tree reduction_op, expr;
1962 tree orig_stmt;
1963 tree use_stmt;
1964 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
1965 bool nested_in_vect_loop = false;
1966 int op_type;
1968 if (nested_in_vect_loop_p (loop, stmt))
1970 loop = loop->inner;
1971 nested_in_vect_loop = true;
1974 op_type = TREE_OPERAND_LENGTH (operation);
1975 reduction_op = TREE_OPERAND (operation, op_type-1);
1976 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
1977 mode = TYPE_MODE (vectype);
1979 /*** 1. Create the reduction def-use cycle ***/
1981 /* 1.1 set the loop-entry arg of the reduction-phi: */
1982 /* For the case of reduction, vect_get_vec_def_for_operand returns
1983 the scalar def before the loop, that defines the initial value
1984 of the reduction variable. */
1985 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
1986 &adjustment_def);
1987 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
1989 /* 1.2 set the loop-latch arg for the reduction-phi: */
1990 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
1992 if (vect_print_dump_info (REPORT_DETAILS))
1994 fprintf (vect_dump, "transform reduction: created def-use cycle:");
1995 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
1996 fprintf (vect_dump, "\n");
1997 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2001 /*** 2. Create epilog code
2002 The reduction epilog code operates across the elements of the vector
2003 of partial results computed by the vectorized loop.
2004 The reduction epilog code consists of:
2005 step 1: compute the scalar result in a vector (v_out2)
2006 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2007 step 3: adjust the scalar result (s_out3) if needed.
2009 Step 1 can be accomplished using one of the following three schemes:
2010 (scheme 1) using reduc_code, if available.
2011 (scheme 2) using whole-vector shifts, if available.
2012 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2013 combined.
2015 The overall epilog code looks like this:
2017 s_out0 = phi <s_loop> # original EXIT_PHI
2018 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2019 v_out2 = reduce <v_out1> # step 1
2020 s_out3 = extract_field <v_out2, 0> # step 2
2021 s_out4 = adjust_result <s_out3> # step 3
2023 (step 3 is optional, and steps 1 and 2 may be combined).
2024 Lastly, the uses of s_out0 are replaced by s_out4.
2026 ***/
2028 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2029 v_out1 = phi <v_loop> */
2031 exit_bb = single_exit (loop)->dest;
2032 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2033 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2034 exit_bsi = bsi_after_labels (exit_bb);
2036 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2037 (i.e. when reduc_code is not available) and in the final adjustment
2038 code (if needed). Also get the original scalar reduction variable as
2039 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2040 represents a reduction pattern), the tree-code and scalar-def are
2041 taken from the original stmt that the pattern-stmt (STMT) replaces.
2042 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2043 are taken from STMT. */
2045 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2046 if (!orig_stmt)
2048 /* Regular reduction */
2049 orig_stmt = stmt;
2051 else
2053 /* Reduction pattern */
2054 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2055 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2056 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2058 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2059 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2060 scalar_type = TREE_TYPE (scalar_dest);
2061 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2062 bitsize = TYPE_SIZE (scalar_type);
2063 bytesize = TYPE_SIZE_UNIT (scalar_type);
2066 /* In case this is a reduction in an inner-loop while vectorizing an outer
2067 loop - we don't need to extract a single scalar result at the end of the
2068 inner-loop. The final vector of partial results will be used in the
2069 vectorized outer-loop, or reduced to a scalar result at the end of the
2070 outer-loop. */
2071 if (nested_in_vect_loop)
2072 goto vect_finalize_reduction;
2074 /* 2.3 Create the reduction code, using one of the three schemes described
2075 above. */
2077 if (reduc_code < NUM_TREE_CODES)
2079 tree tmp;
2081 /*** Case 1: Create:
2082 v_out2 = reduc_expr <v_out1> */
2084 if (vect_print_dump_info (REPORT_DETAILS))
2085 fprintf (vect_dump, "Reduce using direct vector reduction.");
2087 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2088 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2089 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2090 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2091 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2092 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2094 extract_scalar_result = true;
2096 else
2098 enum tree_code shift_code = ERROR_MARK;
2099 bool have_whole_vector_shift = true;
2100 int bit_offset;
2101 int element_bitsize = tree_low_cst (bitsize, 1);
2102 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2103 tree vec_temp;
2105 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2106 shift_code = VEC_RSHIFT_EXPR;
2107 else
2108 have_whole_vector_shift = false;
2110 /* Regardless of whether we have a whole vector shift, if we're
2111 emulating the operation via tree-vect-generic, we don't want
2112 to use it. Only the first round of the reduction is likely
2113 to still be profitable via emulation. */
2114 /* ??? It might be better to emit a reduction tree code here, so that
2115 tree-vect-generic can expand the first round via bit tricks. */
2116 if (!VECTOR_MODE_P (mode))
2117 have_whole_vector_shift = false;
2118 else
2120 optab optab = optab_for_tree_code (code, vectype);
2121 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2122 have_whole_vector_shift = false;
2125 if (have_whole_vector_shift)
2127 /*** Case 2: Create:
2128 for (offset = VS/2; offset >= element_size; offset/=2)
2130 Create: va' = vec_shift <va, offset>
2131 Create: va = vop <va, va'>
2132 } */
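/* For instance, with a 128-bit vector of four 32-bit elements
   (vec_size_in_bits == 128, element_bitsize == 32) the loop below
   emits two shift/op pairs:

     va' = vec_shift <va, 64>   va = vop <va, va'>
     va' = vec_shift <va, 32>   va = vop <va, va'>

   after which the scalar result sits in one element of VA and is
   picked out by the extraction code in section 2.4.  */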
2134 if (vect_print_dump_info (REPORT_DETAILS))
2135 fprintf (vect_dump, "Reduce using vector shifts");
2137 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2138 new_temp = PHI_RESULT (new_phi);
2140 for (bit_offset = vec_size_in_bits/2;
2141 bit_offset >= element_bitsize;
2142 bit_offset /= 2)
2144 tree bitpos = size_int (bit_offset);
2145 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2146 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2147 new_name = make_ssa_name (vec_dest, epilog_stmt);
2148 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2149 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2151 tmp = build2 (code, vectype, new_name, new_temp);
2152 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2153 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2154 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2155 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2158 extract_scalar_result = true;
2160 else
2162 tree rhs;
2164 /*** Case 3: Create:
2165 s = extract_field <v_out2, 0>
2166 for (offset = element_size;
2167 offset < vector_size;
2168 offset += element_size)
2170 Create: s' = extract_field <v_out2, offset>
2171 Create: s = op <s, s'>
2172 } */
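/* With four 32-bit elements this unrolls to the straight-line
   scalar sequence:

     s  = extract_field <v_out2, 0>
     s' = extract_field <v_out2, 32>   s = op <s, s'>
     s' = extract_field <v_out2, 64>   s = op <s, s'>
     s' = extract_field <v_out2, 96>   s = op <s, s'>  */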
2174 if (vect_print_dump_info (REPORT_DETAILS))
2175 fprintf (vect_dump, "Reduce using scalar code. ");
2177 vec_temp = PHI_RESULT (new_phi);
2178 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2179 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2180 bitsize_zero_node);
2181 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2182 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2183 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2184 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2185 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2187 for (bit_offset = element_bitsize;
2188 bit_offset < vec_size_in_bits;
2189 bit_offset += element_bitsize)
2191 tree tmp;
2192 tree bitpos = bitsize_int (bit_offset);
2193 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2194 bitpos);
2196 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2197 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2198 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2199 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2200 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2202 tmp = build2 (code, scalar_type, new_name, new_temp);
2203 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2204 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2205 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2206 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2209 extract_scalar_result = false;
2213 /* 2.4 Extract the final scalar result. Create:
2214 s_out3 = extract_field <v_out2, bitpos> */
2216 if (extract_scalar_result)
2218 tree rhs;
2220 gcc_assert (!nested_in_vect_loop);
2221 if (vect_print_dump_info (REPORT_DETAILS))
2222 fprintf (vect_dump, "extract scalar result");
2224 if (BYTES_BIG_ENDIAN)
2225 bitpos = size_binop (MULT_EXPR,
2226 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2227 TYPE_SIZE (scalar_type));
2228 else
2229 bitpos = bitsize_zero_node;
2231 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2232 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2233 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2234 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2235 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2236 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2239 vect_finalize_reduction:
2241 /* 2.5 Adjust the final result by the initial value of the reduction
2242 variable. (When such adjustment is not needed, then
2243 'adjustment_def' is NULL). For example, if code is PLUS we create:
2244 new_temp = loop_exit_def + adjustment_def */
2246 if (adjustment_def)
2248 if (nested_in_vect_loop)
2250 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2251 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2252 new_dest = vect_create_destination_var (scalar_dest, vectype);
2254 else
2256 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2257 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2258 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2260 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2261 new_temp = make_ssa_name (new_dest, epilog_stmt);
2262 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2263 #if 0
2264 bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
2265 #else
2266 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2267 #endif
2271 /* 2.6 Handle the loop-exit phi */
2273 /* Replace uses of s_out0 with uses of s_out3:
2274 Find the loop-closed-use at the loop exit of the original scalar result.
2275 (The reduction result is expected to have two immediate uses - one at the
2276 latch block, and one at the loop exit). */
2277 exit_phi = NULL;
2278 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2280 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2282 exit_phi = USE_STMT (use_p);
2283 break;
2286 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2287 gcc_assert (exit_phi);
2289 if (nested_in_vect_loop)
2291 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2293 /* FORNOW. Currently not supporting the case that an inner-loop reduction
2294 is not used in the outer-loop (but only outside the outer-loop). */
2295 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2296 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2298 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2299 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2300 set_stmt_info (get_stmt_ann (epilog_stmt),
2301 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2303 if (vect_print_dump_info (REPORT_DETAILS))
2305 fprintf (vect_dump, "vector of partial results after inner-loop:");
2306 print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
2308 return;
2311 /* Replace the uses: */
2312 orig_name = PHI_RESULT (exit_phi);
2313 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2314 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2315 SET_USE (use_p, new_temp);
2319 /* Function vectorizable_reduction.
2321 Check if STMT performs a reduction operation that can be vectorized.
2322 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2323 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2324 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2326 This function also handles reduction idioms (patterns) that have been
2327 recognized in advance during vect_pattern_recog. In this case, STMT may be
2328 of this form:
2329 X = pattern_expr (arg0, arg1, ..., X)
2330 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2331 sequence that had been detected and replaced by the pattern-stmt (STMT).
2333 In some cases of reduction patterns, the type of the reduction variable X is
2334 different than the type of the other arguments of STMT.
2335 In such cases, the vectype that is used when transforming STMT into a vector
2336 stmt is different than the vectype that is used to determine the
2337 vectorization factor, because it consists of a different number of elements
2338 than the actual number of elements that are being operated upon in parallel.
2340 For example, consider an accumulation of shorts into an int accumulator.
2341 On some targets it's possible to vectorize this pattern operating on 8
2342 shorts at a time (hence, the vectype for purposes of determining the
2343 vectorization factor should be V8HI); on the other hand, the vectype that
2344 is used to create the vector form is actually V4SI (the type of the result).
2346 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2347 indicates the actual level of parallelism (V8HI in the example), so
2348 that the right vectorization factor is derived. This vectype
2349 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2350 be used to create the vectorized stmt. The right vectype for the vectorized
2351 stmt is obtained from the type of the result X:
2352 get_vectype_for_scalar_type (TREE_TYPE (X))
2354 This means that, contrary to "regular" reductions (or "regular" stmts in
2355 general), the following equation:
2356 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2357 does *NOT* necessarily hold for reduction patterns. */
2359 bool
2360 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2362 tree vec_dest;
2363 tree scalar_dest;
2364 tree op;
2365 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2366 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2367 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2368 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2369 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2370 tree operation;
2371 enum tree_code code, orig_code, epilog_reduc_code = 0;
2372 enum machine_mode vec_mode;
2373 int op_type;
2374 optab optab, reduc_optab;
2375 tree new_temp = NULL_TREE;
2376 tree def, def_stmt;
2377 enum vect_def_type dt;
2378 tree new_phi;
2379 tree scalar_type;
2380 bool is_simple_use;
2381 tree orig_stmt;
2382 stmt_vec_info orig_stmt_info;
2383 tree expr = NULL_TREE;
2384 int i;
2385 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2386 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2387 stmt_vec_info prev_stmt_info;
2388 tree reduc_def;
2389 tree new_stmt = NULL_TREE;
2390 int j;
2392 if (nested_in_vect_loop_p (loop, stmt))
2394 loop = loop->inner;
2395 /* FORNOW. This restriction should be relaxed. */
2396 if (ncopies > 1)
2398 if (vect_print_dump_info (REPORT_DETAILS))
2399 fprintf (vect_dump, "multiple types in nested loop.");
2400 return false;
2404 gcc_assert (ncopies >= 1);
2406 /* 1. Is vectorizable reduction? */
2408 /* Not supportable if the reduction variable is used in the loop. */
2409 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2410 return false;
2412 /* Reductions that are not used even in an enclosing outer-loop
2413 are expected to be "live" (used out of the loop). */
2414 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2415 && !STMT_VINFO_LIVE_P (stmt_info))
2416 return false;
2418 /* Make sure it was already recognized as a reduction computation. */
2419 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2420 return false;
2422 /* 2. Has this been recognized as a reduction pattern?
2424 Check if STMT represents a pattern that has been recognized
2425 in earlier analysis stages. For stmts that represent a pattern,
2426 the STMT_VINFO_RELATED_STMT field records the last stmt in
2427 the original sequence that constitutes the pattern. */
2429 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2430 if (orig_stmt)
2432 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2433 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2434 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2435 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2438 /* 3. Check the operands of the operation. The first operands are defined
2439 inside the loop body. The last operand is the reduction variable,
2440 which is defined by the loop-header-phi. */
2442 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2444 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2445 code = TREE_CODE (operation);
2446 op_type = TREE_OPERAND_LENGTH (operation);
2447 if (op_type != binary_op && op_type != ternary_op)
2448 return false;
2449 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2450 scalar_type = TREE_TYPE (scalar_dest);
2452 /* All uses but the last are expected to be defined in the loop.
2453 The last use is the reduction variable. */
2454 for (i = 0; i < op_type-1; i++)
2456 op = TREE_OPERAND (operation, i);
2457 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2458 gcc_assert (is_simple_use);
2459 if (dt != vect_loop_def
2460 && dt != vect_invariant_def
2461 && dt != vect_constant_def
2462 && dt != vect_induction_def)
2463 return false;
2466 op = TREE_OPERAND (operation, i);
2467 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2468 gcc_assert (is_simple_use);
2469 gcc_assert (dt == vect_reduction_def);
2470 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2471 if (orig_stmt)
2472 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2473 else
2474 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2476 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2477 return false;
2479 /* 4. Supportable by target? */
2481 /* 4.1. Check support for the operation in the loop. */
2482 optab = optab_for_tree_code (code, vectype);
2483 if (!optab)
2485 if (vect_print_dump_info (REPORT_DETAILS))
2486 fprintf (vect_dump, "no optab.");
2487 return false;
2489 vec_mode = TYPE_MODE (vectype);
2490 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2492 if (vect_print_dump_info (REPORT_DETAILS))
2493 fprintf (vect_dump, "op not supported by target.");
2494 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2495 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2496 < vect_min_worthwhile_factor (code))
2497 return false;
2498 if (vect_print_dump_info (REPORT_DETAILS))
2499 fprintf (vect_dump, "proceeding using word mode.");
2502 /* Worthwhile without SIMD support? */
2503 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2504 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2505 < vect_min_worthwhile_factor (code))
2507 if (vect_print_dump_info (REPORT_DETAILS))
2508 fprintf (vect_dump, "not worthwhile without SIMD support.");
2509 return false;
2512 /* 4.2. Check support for the epilog operation.
2514 If STMT represents a reduction pattern, then the type of the
2515 reduction variable may be different than the type of the rest
2516 of the arguments. For example, consider the case of accumulation
2517 of shorts into an int accumulator; the original code:
2518 S1: int_a = (int) short_a;
2519 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2521 was replaced with:
2522 STMT: int_acc = widen_sum <short_a, int_acc>
2524 This means that:
2525 1. The tree-code that is used to create the vector operation in the
2526 epilog code (that reduces the partial results) is not the
2527 tree-code of STMT, but is rather the tree-code of the original
2528 stmt from the pattern that STMT is replacing. I.e., in the example
2529 above we want to use 'widen_sum' in the loop, but 'plus' in the
2530 epilog.
2531 2. The type (mode) we use to check available target support
2532 for the vector operation to be created in the *epilog*, is
2533 determined by the type of the reduction variable (in the example
2534 above we'd check this: plus_optab[vect_int_mode]).
2535 However the type (mode) we use to check available target support
2536 for the vector operation to be created *inside the loop*, is
2537 determined by the type of the other arguments to STMT (in the
2538 example we'd check this: widen_sum_optab[vect_short_mode]).
2540 This is contrary to "regular" reductions, in which the types of all
2541 the arguments are the same as the type of the reduction variable.
2542 For "regular" reductions we can therefore use the same vector type
2543 (and also the same tree-code) when generating the epilog code and
2544 when generating the code inside the loop. */
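/* Concretely, for the widen_sum example above: section 4.1 checked
   the optab of WIDEN_SUM_EXPR with the mode of the arguments
   (e.g. V8HImode), whereas the code below checks the optab of the
   epilog reduction code (e.g. REDUC_PLUS_EXPR) with the mode of the
   int accumulator (e.g. V4SImode).  */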
2546 if (orig_stmt)
2548 /* This is a reduction pattern: get the vectype from the type of the
2549 reduction variable, and get the tree-code from orig_stmt. */
2550 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2551 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2552 vec_mode = TYPE_MODE (vectype);
2554 else
2556 /* Regular reduction: the same vectype and tree-code as used for
2557 the vector code inside the loop can also be used for the epilog code. */
2558 orig_code = code;
2561 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2562 return false;
2563 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2564 if (!reduc_optab)
2566 if (vect_print_dump_info (REPORT_DETAILS))
2567 fprintf (vect_dump, "no optab for reduction.");
2568 epilog_reduc_code = NUM_TREE_CODES;
2570 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2572 if (vect_print_dump_info (REPORT_DETAILS))
2573 fprintf (vect_dump, "reduc op not supported by target.");
2574 epilog_reduc_code = NUM_TREE_CODES;
2577 if (!vec_stmt) /* transformation not required. */
2579 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2580 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
2581 return true;
2584 /** Transform. **/
2586 if (vect_print_dump_info (REPORT_DETAILS))
2587 fprintf (vect_dump, "transform reduction.");
2589 /* Create the destination vector */
2590 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2592 /* Create the reduction-phi that defines the reduction-operand. */
2593 new_phi = create_phi_node (vec_dest, loop->header);
2595 /* In case the vectorization factor (VF) is bigger than the number
2596 of elements that we can fit in a vectype (nunits), we have to generate
2597 more than one vector stmt - i.e - we need to "unroll" the
2598 vector stmt by a factor VF/nunits. For more details see documentation
2599 in vectorizable_operation. */
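/* E.g. with VF == 8 and nunits == 4 we get ncopies == 2: the j == 0
   copy takes its reduction operand from PHI_RESULT (new_phi), the
   j == 1 copy takes it from the result of the j == 0 copy, and the
   copies are chained through STMT_VINFO_RELATED_STMT.  */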
2601 prev_stmt_info = NULL;
2602 for (j = 0; j < ncopies; j++)
2604 /* Handle uses. */
2605 if (j == 0)
2607 op = TREE_OPERAND (operation, 0);
2608 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2609 if (op_type == ternary_op)
2611 op = TREE_OPERAND (operation, 1);
2612 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2615 /* Get the vector def for the reduction variable from the phi node */
2616 reduc_def = PHI_RESULT (new_phi);
2618 else
2620 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2621 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2622 if (op_type == ternary_op)
2623 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2625 /* Get the vector def for the reduction variable from the vectorized
2626 reduction operation generated in the previous iteration (j-1) */
2627 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2630 /* Arguments are ready. Create the new vector stmt. */
2631 if (op_type == binary_op)
2632 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2633 else
2634 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2635 reduc_def);
2636 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2637 new_temp = make_ssa_name (vec_dest, new_stmt);
2638 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2639 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2641 if (j == 0)
2642 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2643 else
2644 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2645 prev_stmt_info = vinfo_for_stmt (new_stmt);
2648 /* Finalize the reduction-phi (set its arguments) and create the
2649 epilog reduction code. */
2650 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
2651 return true;
2654 /* Checks if CALL can be vectorized using types VECTYPE_OUT and VECTYPE_IN. Returns
2655 a function declaration if the target has a vectorized version
2656 of the function, or NULL_TREE if the function cannot be vectorized. */
2658 tree
2659 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
2661 tree fndecl = get_callee_fndecl (call);
2662 enum built_in_function code;
2664 /* We only handle functions that do not read or clobber memory -- i.e.
2665 const or novops ones. */
2666 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
2667 return NULL_TREE;
2669 if (!fndecl
2670 || TREE_CODE (fndecl) != FUNCTION_DECL
2671 || !DECL_BUILT_IN (fndecl))
2672 return NULL_TREE;
2674 code = DECL_FUNCTION_CODE (fndecl);
2675 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
2676 vectype_in);
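/* A sketch of what a matching target hook might look like; the names
   example_builtin_vectorized_function and example_v4sf_sinf_decl are
   hypothetical, as is the choice of mapping BUILT_IN_SINF to a
   V4SF -> V4SF builtin:

     static tree
     example_builtin_vectorized_function (enum built_in_function code,
                                          tree type_out, tree type_in)
     {
       if (code == BUILT_IN_SINF
           && type_out == type_in
           && TYPE_MODE (type_in) == V4SFmode)
         return example_v4sf_sinf_decl;
       return NULL_TREE;
     }
*/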
2679 /* Function vectorizable_call.
2681 Check if STMT performs a function call that can be vectorized.
2682 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2683 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2684 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2686 bool
2687 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2689 tree vec_dest;
2690 tree scalar_dest;
2691 tree operation;
2692 tree op, type;
2693 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
2694 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
2695 tree vectype_out, vectype_in;
2696 int nunits_in;
2697 int nunits_out;
2698 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2699 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2700 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
2701 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
2702 tree new_stmt;
2703 int ncopies, j, nargs;
2704 call_expr_arg_iterator iter;
2705 tree vargs;
2706 enum { NARROW, NONE, WIDEN } modifier;
2708 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2709 return false;
2711 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
2712 return false;
2714 /* FORNOW: not yet supported. */
2715 if (STMT_VINFO_LIVE_P (stmt_info))
2717 if (vect_print_dump_info (REPORT_DETAILS))
2718 fprintf (vect_dump, "value used after loop.");
2719 return false;
2722 /* Is STMT a vectorizable call? */
2723 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2724 return false;
2726 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2727 return false;
2729 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2730 if (TREE_CODE (operation) != CALL_EXPR)
2731 return false;
2733 /* Process function arguments. */
2734 rhs_type = NULL_TREE;
2735 nargs = 0;
2736 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2738 /* Bail out if the function has more than two arguments; we
2739 do not have interesting builtin functions to vectorize with
2740 more than two arguments. */
2741 if (nargs >= 2)
2742 return false;
2744 /* We can only handle calls with arguments of the same type. */
2745 if (rhs_type
2746 && rhs_type != TREE_TYPE (op))
2748 if (vect_print_dump_info (REPORT_DETAILS))
2749 fprintf (vect_dump, "argument types differ.");
2750 return false;
2752 rhs_type = TREE_TYPE (op);
2754 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
2756 if (vect_print_dump_info (REPORT_DETAILS))
2757 fprintf (vect_dump, "use not simple.");
2758 return false;
2761 ++nargs;
2764 /* No arguments is also not good. */
2765 if (nargs == 0)
2766 return false;
2768 vectype_in = get_vectype_for_scalar_type (rhs_type);
2769 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2771 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
2772 vectype_out = get_vectype_for_scalar_type (lhs_type);
2773 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2775 /* FORNOW */
2776 if (nunits_in == nunits_out / 2)
2777 modifier = NARROW;
2778 else if (nunits_out == nunits_in)
2779 modifier = NONE;
2780 else if (nunits_out == nunits_in / 2)
2781 modifier = WIDEN;
2782 else
2783 return false;
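/* E.g. a call taking floats and returning doubles gives V4SF in
   (nunits_in == 4) and V2DF out (nunits_out == 2), hence WIDEN; the
   double-taking, float-returning mirror image gives NARROW; matching
   unit counts give NONE.  */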
2785 /* For now, we only vectorize functions if a target specific builtin
2786 is available. TODO -- in some cases, it might be profitable to
2787 insert the calls for pieces of the vector, in order to be able
2788 to vectorize other operations in the loop. */
2789 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
2790 if (fndecl == NULL_TREE)
2792 if (vect_print_dump_info (REPORT_DETAILS))
2793 fprintf (vect_dump, "function is not vectorizable.");
2795 return false;
2798 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
2800 if (modifier == NARROW)
2801 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2802 else
2803 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2805 /* Sanity check: make sure that at least one copy of the vectorized stmt
2806 needs to be generated. */
2807 gcc_assert (ncopies >= 1);
2809 /* FORNOW. This restriction should be relaxed. */
2810 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2812 if (vect_print_dump_info (REPORT_DETAILS))
2813 fprintf (vect_dump, "multiple types in nested loop.");
2814 return false;
2817 if (!vec_stmt) /* transformation not required. */
2819 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
2820 if (vect_print_dump_info (REPORT_DETAILS))
2821 fprintf (vect_dump, "=== vectorizable_call ===");
2822 vect_model_simple_cost (stmt_info, ncopies, dt);
2823 return true;
2826 /** Transform. **/
2828 if (vect_print_dump_info (REPORT_DETAILS))
2829 fprintf (vect_dump, "transform operation.");
2831 /* FORNOW. This restriction should be relaxed. */
2832 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
2834 if (vect_print_dump_info (REPORT_DETAILS))
2835 fprintf (vect_dump, "multiple types in nested loop.");
2836 return false;
2839 /* Handle def. */
2840 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2841 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2843 prev_stmt_info = NULL;
2844 switch (modifier)
2846 case NONE:
2847 for (j = 0; j < ncopies; ++j)
2849 /* Build argument list for the vectorized call. */
2850 /* FIXME: Rewrite this so that it doesn't
2851 construct a temporary list. */
2852 vargs = NULL_TREE;
2853 nargs = 0;
2854 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2856 if (j == 0)
2857 vec_oprnd0
2858 = vect_get_vec_def_for_operand (op, stmt, NULL);
2859 else
2860 vec_oprnd0
2861 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2863 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
2865 ++nargs;
2867 vargs = nreverse (vargs);
2869 rhs = build_function_call_expr (fndecl, vargs);
2870 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2871 new_temp = make_ssa_name (vec_dest, new_stmt);
2872 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2874 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2876 if (j == 0)
2877 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2878 else
2879 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2881 prev_stmt_info = vinfo_for_stmt (new_stmt);
2884 break;
2886 case NARROW:
2887 for (j = 0; j < ncopies; ++j)
2889 /* Build argument list for the vectorized call. */
2890 /* FIXME: Rewrite this so that it doesn't
2891 construct a temporary list. */
2892 vargs = NULL_TREE;
2893 nargs = 0;
2894 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
2896 if (j == 0)
2898 vec_oprnd0
2899 = vect_get_vec_def_for_operand (op, stmt, NULL);
2900 vec_oprnd1
2901 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2903 else
2905 vec_oprnd0
2906 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
2907 vec_oprnd1
2908 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
2911 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
2912 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
2914 ++nargs;
2916 vargs = nreverse (vargs);
2918 rhs = build_function_call_expr (fndecl, vargs);
2919 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
2920 new_temp = make_ssa_name (vec_dest, new_stmt);
2921 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2923 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2925 if (j == 0)
2926 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2927 else
2928 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2930 prev_stmt_info = vinfo_for_stmt (new_stmt);
2933 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2935 break;
2937 case WIDEN:
2938 /* No current target implements this case. */
2939 return false;
2942 /* The call in STMT might prevent it from being removed in dce.
2943 We however cannot remove it here, due to the way the ssa name
2944 it defines is mapped to the new definition. So just replace
2945 rhs of the statement with something harmless. */
2946 type = TREE_TYPE (scalar_dest);
2947 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
2948 update_stmt (stmt);
2950 return true;
2954 /* Function vect_gen_widened_results_half
2956 Create a vector stmt whose code, type, number of arguments, and result
2957 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2958 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2959 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2960 needs to be created (DECL is a function-decl of a target-builtin).
2961 STMT is the original scalar stmt that we are vectorizing. */
2963 static tree
2964 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2965 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2966 tree vec_dest, block_stmt_iterator *bsi,
2967 tree stmt)
2969 tree expr;
2970 tree new_stmt;
2971 tree new_temp;
2972 tree sym;
2973 ssa_op_iter iter;
2975 /* Generate half of the widened result: */
2976 if (code == CALL_EXPR)
2978 /* Target specific support */
2979 if (op_type == binary_op)
2980 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
2981 else
2982 expr = build_call_expr (decl, 1, vec_oprnd0);
2984 else
2986 /* Generic support */
2987 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2988 if (op_type == binary_op)
2989 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2990 else
2991 expr = build1 (code, vectype, vec_oprnd0);
2993 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2994 new_temp = make_ssa_name (vec_dest, new_stmt);
2995 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2996 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2998 if (code == CALL_EXPR)
3000 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3002 if (TREE_CODE (sym) == SSA_NAME)
3003 sym = SSA_NAME_VAR (sym);
3004 mark_sym_for_renaming (sym);
3008 return new_stmt;
3012 /* Function vectorizable_conversion.
3014 Check if STMT performs a conversion operation that can be vectorized.
3015 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3016 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3017 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3019 bool
3020 vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
3021 tree * vec_stmt)
3023 tree vec_dest;
3024 tree scalar_dest;
3025 tree operation;
3026 tree op0;
3027 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3028 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3029 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3030 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3031 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3032 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3033 tree new_temp;
3034 tree def, def_stmt;
3035 enum vect_def_type dt0;
3036 tree new_stmt;
3037 stmt_vec_info prev_stmt_info;
3038 int nunits_in;
3039 int nunits_out;
3040 tree vectype_out, vectype_in;
3041 int ncopies, j;
3042 tree expr;
3043 tree rhs_type, lhs_type;
3044 tree builtin_decl;
3045 enum { NARROW, NONE, WIDEN } modifier;
3047 /* Is STMT a vectorizable conversion? */
3049 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3050 return false;
3052 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3053 return false;
3055 if (STMT_VINFO_LIVE_P (stmt_info))
3057 /* FORNOW: not yet supported. */
3058 if (vect_print_dump_info (REPORT_DETAILS))
3059 fprintf (vect_dump, "value used after loop.");
3060 return false;
3063 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3064 return false;
3066 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3067 return false;
3069 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3070 code = TREE_CODE (operation);
3071 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3072 return false;
3074 /* Check types of lhs and rhs */
3075 op0 = TREE_OPERAND (operation, 0);
3076 rhs_type = TREE_TYPE (op0);
3077 vectype_in = get_vectype_for_scalar_type (rhs_type);
3078 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3080 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3081 lhs_type = TREE_TYPE (scalar_dest);
3082 vectype_out = get_vectype_for_scalar_type (lhs_type);
3083 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3085 /* FORNOW */
3086 if (nunits_in == nunits_out / 2)
3087 modifier = NARROW;
3088 else if (nunits_out == nunits_in)
3089 modifier = NONE;
3090 else if (nunits_out == nunits_in / 2)
3091 modifier = WIDEN;
3092 else
3093 return false;
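/* E.g. FLOAT_EXPR from int to double gives V4SI in (nunits_in == 4)
   and V2DF out (nunits_out == 2), hence WIDEN; FIX_TRUNC_EXPR from
   double to int is the NARROW mirror image.  */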
3095 if (modifier == NONE)
3096 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3098 /* Bail out if the types are both integral or non-integral */
3099 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3100 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3101 return false;
3103 if (modifier == NARROW)
3104 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3105 else
3106 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3108 /* Sanity check: make sure that at least one copy of the vectorized stmt
3109 needs to be generated. */
3110 gcc_assert (ncopies >= 1);
3112 /* FORNOW. This restriction should be relaxed. */
3113 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3115 if (vect_print_dump_info (REPORT_DETAILS))
3116 fprintf (vect_dump, "multiple types in nested loop.");
3117 return false;
3120 /* Check the operands of the operation. */
3121 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
3123 if (vect_print_dump_info (REPORT_DETAILS))
3124 fprintf (vect_dump, "use not simple.");
3125 return false;
3128 /* Supportable by target? */
3129 if ((modifier == NONE
3130 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3131 || (modifier == WIDEN
3132 && !supportable_widening_operation (code, stmt, vectype_in,
3133 &decl1, &decl2,
3134 &code1, &code2))
3135 || (modifier == NARROW
3136 && !supportable_narrowing_operation (code, stmt, vectype_in,
3137 &code1)))
3139 if (vect_print_dump_info (REPORT_DETAILS))
3140 fprintf (vect_dump, "op not supported by target.");
3141 return false;
3144 if (modifier != NONE)
3145 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3147 if (!vec_stmt) /* transformation not required. */
3149 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3150 return true;
3153 /** Transform. **/
3154 if (vect_print_dump_info (REPORT_DETAILS))
3155 fprintf (vect_dump, "transform conversion.");
3157 /* Handle def. */
3158 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3160 prev_stmt_info = NULL;
3161 switch (modifier)
3163 case NONE:
3164 for (j = 0; j < ncopies; j++)
3166 tree sym;
3167 ssa_op_iter iter;
3169 if (j == 0)
3170 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3171 else
3172 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3174 builtin_decl =
3175 targetm.vectorize.builtin_conversion (code, vectype_in);
3176 new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
3178 /* Arguments are ready. Create the new vector stmt. */
3179 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3180 new_temp = make_ssa_name (vec_dest, new_stmt);
3181 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3182 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3183 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3185 if (TREE_CODE (sym) == SSA_NAME)
3186 sym = SSA_NAME_VAR (sym);
3187 mark_sym_for_renaming (sym);
3190 if (j == 0)
3191 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3192 else
3193 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3194 prev_stmt_info = vinfo_for_stmt (new_stmt);
3196 break;
3198 case WIDEN:
3199 /* In case the vectorization factor (VF) is bigger than the number
3200 of elements that we can fit in a vectype (nunits), we have to
3201 generate more than one vector stmt - i.e - we need to "unroll"
3202 the vector stmt by a factor VF/nunits. */
3203 for (j = 0; j < ncopies; j++)
3205 if (j == 0)
3206 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3207 else
3208 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3210 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3212 /* Generate first half of the widened result: */
3213 new_stmt
3214 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3215 vec_oprnd0, vec_oprnd1,
3216 unary_op, vec_dest, bsi, stmt);
3217 if (j == 0)
3218 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3219 else
3220 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3221 prev_stmt_info = vinfo_for_stmt (new_stmt);
3223 /* Generate second half of the widened result: */
3224 new_stmt
3225 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3226 vec_oprnd0, vec_oprnd1,
3227 unary_op, vec_dest, bsi, stmt);
3228 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3229 prev_stmt_info = vinfo_for_stmt (new_stmt);
3231 break;
3233 case NARROW:
3234 /* In case the vectorization factor (VF) is bigger than the number
3235 of elements that we can fit in a vectype (nunits), we have to
3236 generate more than one vector stmt - i.e - we need to "unroll"
3237 the vector stmt by a factor VF/nunits. */
3238 for (j = 0; j < ncopies; j++)
3240 /* Handle uses. */
3241 if (j == 0)
3243 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3244 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3246 else
3248 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
3249 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
3252 /* Arguments are ready. Create the new vector stmt. */
3253 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3254 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3255 new_temp = make_ssa_name (vec_dest, new_stmt);
3256 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3257 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3259 if (j == 0)
3260 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3261 else
3262 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3264 prev_stmt_info = vinfo_for_stmt (new_stmt);
3267 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3269 return true;
3273 /* Function vectorizable_assignment.
3275 Check if STMT performs an assignment (copy) that can be vectorized.
3276 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3277 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3278 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3280 bool
3281 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3283 tree vec_dest;
3284 tree scalar_dest;
3285 tree op;
3286 tree vec_oprnd;
3287 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3288 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3289 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3290 tree new_temp;
3291 tree def, def_stmt;
3292 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3293 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3294 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3296 gcc_assert (ncopies >= 1);
3297 if (ncopies > 1)
3298 return false; /* FORNOW */
3300 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3301 return false;
3303 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3304 return false;
3306 /* FORNOW: not yet supported. */
3307 if (STMT_VINFO_LIVE_P (stmt_info))
3309 if (vect_print_dump_info (REPORT_DETAILS))
3310 fprintf (vect_dump, "value used after loop.");
3311 return false;
3314 /* Is vectorizable assignment? */
3315 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3316 return false;
3318 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3319 if (TREE_CODE (scalar_dest) != SSA_NAME)
3320 return false;
3322 op = GIMPLE_STMT_OPERAND (stmt, 1);
3323 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3325 if (vect_print_dump_info (REPORT_DETAILS))
3326 fprintf (vect_dump, "use not simple.");
3327 return false;
3330 if (!vec_stmt) /* transformation not required. */
3332 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3333 if (vect_print_dump_info (REPORT_DETAILS))
3334 fprintf (vect_dump, "=== vectorizable_assignment ===");
3335 vect_model_simple_cost (stmt_info, ncopies, dt);
3336 return true;
3339 /** Transform. **/
3340 if (vect_print_dump_info (REPORT_DETAILS))
3341 fprintf (vect_dump, "transform assignment.");
3343 /* Handle def. */
3344 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3346 /* Handle use. */
3347 op = GIMPLE_STMT_OPERAND (stmt, 1);
3348 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
3350 /* Arguments are ready. Create the new vector stmt. */
3351 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
3352 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3353 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3354 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3356 return true;
3360 /* Function vect_min_worthwhile_factor.
3362 For a loop where we could vectorize the operation indicated by CODE,
3363 return the minimum vectorization factor that makes it worthwhile
3364 to use generic vectors. */
3365 static int
3366 vect_min_worthwhile_factor (enum tree_code code)
3368 switch (code)
3370 case PLUS_EXPR:
3371 case MINUS_EXPR:
3372 case NEGATE_EXPR:
3373 return 4;
3375 case BIT_AND_EXPR:
3376 case BIT_IOR_EXPR:
3377 case BIT_XOR_EXPR:
3378 case BIT_NOT_EXPR:
3379 return 2;
3381 default:
3382 return INT_MAX;
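/* So e.g. a PLUS_EXPR emulated in word mode is only considered
   worthwhile when the vectorization factor is at least 4; callers such
   as vectorizable_reduction and vectorizable_operation apply it as:

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
            < vect_min_worthwhile_factor (code))
       return false;
*/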
3387 /* Function vectorizable_induction
3389 Check if PHI performs an induction computation that can be vectorized.
3390 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3391 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3392 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3394 bool
3395 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3396 tree *vec_stmt)
3398 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3399 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3400 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3401 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3402 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3403 tree vec_def;
3405 gcc_assert (ncopies >= 1);
3407 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3408 return false;
3410 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3412 if (STMT_VINFO_LIVE_P (stmt_info))
3414 /* FORNOW: not yet supported. */
3415 if (vect_print_dump_info (REPORT_DETAILS))
3416 fprintf (vect_dump, "value used after loop.");
3417 return false;
3420 if (TREE_CODE (phi) != PHI_NODE)
3421 return false;
3423 if (!vec_stmt) /* transformation not required. */
3425 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3426 if (vect_print_dump_info (REPORT_DETAILS))
3427 fprintf (vect_dump, "=== vectorizable_induction ===");
3428 vect_model_induction_cost (stmt_info, ncopies);
3429 return true;
3432 /** Transform. **/
3434 if (vect_print_dump_info (REPORT_DETAILS))
3435 fprintf (vect_dump, "transform induction phi.");
3437 vec_def = get_initial_def_for_induction (phi);
3438 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3439 return true;
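
/* As an illustration of what get_initial_def_for_induction builds (a
   sketch; the actual code handles arbitrary steps and types): for a
   scalar induction

       i = PHI <i0 (preheader), i + s (latch)>

   and nunits = 4, the vectorized phi computes

       vi = PHI <{i0, i0+s, i0+2*s, i0+3*s}, vi + {4*s, 4*s, 4*s, 4*s}>

   i.e. the initial def holds the first nunits values of the induction,
   and every vector iteration advances all lanes by nunits * s.  */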
3443 /* Function vectorizable_operation.
3445 Check if STMT performs a binary or unary operation that can be vectorized.
3446 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3447 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3448 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3450 bool
3451 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3453 tree vec_dest;
3454 tree scalar_dest;
3455 tree operation;
3456 tree op0, op1 = NULL;
3457 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3458 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3459 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3462 enum tree_code code;
3463 enum machine_mode vec_mode;
3464 tree new_temp;
3465 int op_type;
3466 optab optab;
3467 int icode;
3468 enum machine_mode optab_op2_mode;
3469 tree def, def_stmt;
3470 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3471 tree new_stmt;
3472 stmt_vec_info prev_stmt_info;
3473 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3474 int nunits_out;
3475 tree vectype_out;
3476 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3477 int j;
3479 gcc_assert (ncopies >= 1);
3480 /* FORNOW. This restriction should be relaxed. */
3481 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3483 if (vect_print_dump_info (REPORT_DETAILS))
3484 fprintf (vect_dump, "multiple types in nested loop.");
3485 return false;
3488 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3489 return false;
3491 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3492 return false;
3494 /* FORNOW: not yet supported. */
3495 if (STMT_VINFO_LIVE_P (stmt_info))
3497 if (vect_print_dump_info (REPORT_DETAILS))
3498 fprintf (vect_dump, "value used after loop.");
3499 return false;
3502 /* Is STMT a vectorizable binary/unary operation? */
3503 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3504 return false;
3506 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3507 return false;
3509 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3510 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3511 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3512 if (nunits_out != nunits_in)
3513 return false;
3515 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3516 code = TREE_CODE (operation);
3518 /* For pointer addition, we should use the normal plus for
3519 the vector addition. */
3520 if (code == POINTER_PLUS_EXPR)
3521 code = PLUS_EXPR;
3523 optab = optab_for_tree_code (code, vectype);
3525 /* Support only unary or binary operations. */
3526 op_type = TREE_OPERAND_LENGTH (operation);
3527 if (op_type != unary_op && op_type != binary_op)
3529 if (vect_print_dump_info (REPORT_DETAILS))
3530 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3531 return false;
3534 op0 = TREE_OPERAND (operation, 0);
3535 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3537 if (vect_print_dump_info (REPORT_DETAILS))
3538 fprintf (vect_dump, "use not simple.");
3539 return false;
3542 if (op_type == binary_op)
3544 op1 = TREE_OPERAND (operation, 1);
3545 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3547 if (vect_print_dump_info (REPORT_DETAILS))
3548 fprintf (vect_dump, "use not simple.");
3549 return false;
3553 /* Supportable by target? */
3554 if (!optab)
3556 if (vect_print_dump_info (REPORT_DETAILS))
3557 fprintf (vect_dump, "no optab.");
3558 return false;
3560 vec_mode = TYPE_MODE (vectype);
3561 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3562 if (icode == CODE_FOR_nothing)
3564 if (vect_print_dump_info (REPORT_DETAILS))
3565 fprintf (vect_dump, "op not supported by target.");
3566 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3567 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3568 < vect_min_worthwhile_factor (code))
3569 return false;
3570 if (vect_print_dump_info (REPORT_DETAILS))
3571 fprintf (vect_dump, "proceeding using word mode.");
3574 /* Worthwhile without SIMD support? */
3575 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3576 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3577 < vect_min_worthwhile_factor (code))
3579 if (vect_print_dump_info (REPORT_DETAILS))
3580 fprintf (vect_dump, "not worthwhile without SIMD support.");
3581 return false;
3584 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3586 /* FORNOW: not yet supported. */
3587 if (!VECTOR_MODE_P (vec_mode))
3588 return false;
3590 /* Invariant argument is needed for a vector shift
3591 by a scalar shift operand. */
3592 optab_op2_mode = insn_data[icode].operand[2].mode;
3593 if (! (VECTOR_MODE_P (optab_op2_mode)
3594 || dt[1] == vect_constant_def
3595 || dt[1] == vect_invariant_def))
3597 if (vect_print_dump_info (REPORT_DETAILS))
3598 fprintf (vect_dump, "operand mode requires invariant argument.");
3599 return false;
3603 if (!vec_stmt) /* transformation not required. */
3605 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3606 if (vect_print_dump_info (REPORT_DETAILS))
3607 fprintf (vect_dump, "=== vectorizable_operation ===");
3608 vect_model_simple_cost (stmt_info, ncopies, dt);
3609 return true;
3612 /** Transform. **/
3614 if (vect_print_dump_info (REPORT_DETAILS))
3615 fprintf (vect_dump, "transform binary/unary operation.");
3617 /* Handle def. */
3618 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3620 /* In case the vectorization factor (VF) is bigger than the number
3621 of elements that we can fit in a vectype (nunits), we have to generate
3622 more than one vector stmt - i.e - we need to "unroll" the
3623 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3624 from one copy of the vector stmt to the next, in the field
3625 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3626 stages to find the correct vector defs to be used when vectorizing
3627 stmts that use the defs of the current stmt. The example below illustrates
3628 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
3629 4 vectorized stmts):
3631 before vectorization:
3632 RELATED_STMT VEC_STMT
3633 S1: x = memref - -
3634 S2: z = x + 1 - -
3636 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
3637 there):
3638 RELATED_STMT VEC_STMT
3639 VS1_0: vx0 = memref0 VS1_1 -
3640 VS1_1: vx1 = memref1 VS1_2 -
3641 VS1_2: vx2 = memref2 VS1_3 -
3642 VS1_3: vx3 = memref3 - -
3643 S1: x = load - VS1_0
3644 S2: z = x + 1 - -
3646 step2: vectorize stmt S2 (done here):
3647 To vectorize stmt S2 we first need to find the relevant vector
3648 def for the first operand 'x'. This is, as usual, obtained from
3649 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
3650 that defines 'x' (S1). This way we find the stmt VS1_0, and the
3651 relevant vector def 'vx0'. Having found 'vx0' we can generate
3652 the vector stmt VS2_0, and as usual, record it in the
3653 STMT_VINFO_VEC_STMT of stmt S2.
3654 When creating the second copy (VS2_1), we obtain the relevant vector
3655 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
3656 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
3657 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
3658 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
3659 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
3660 chain of stmts and pointers:
3661 RELATED_STMT VEC_STMT
3662 VS1_0: vx0 = memref0 VS1_1 -
3663 VS1_1: vx1 = memref1 VS1_2 -
3664 VS1_2: vx2 = memref2 VS1_3 -
3665 VS1_3: vx3 = memref3 - -
3666 S1: x = load - VS1_0
3667 VS2_0: vz0 = vx0 + v1 VS2_1 -
3668 VS2_1: vz1 = vx1 + v1 VS2_2 -
3669 VS2_2: vz2 = vx2 + v1 VS2_3 -
3670 VS2_3: vz3 = vx3 + v1 - -
3671 S2: z = x + 1 - VS2_0 */
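
/* A sketch (names as in the example above) of how the J-th vector def
   of a used value is found by following this chain; conceptually,
   vect_get_vec_def_for_stmt_copy performs one such step per call:

       vec_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (S1));
       for (k = 0; k < j; k++)
         vec_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (vec_stmt));
       vec_def = GIMPLE_STMT_OPERAND (vec_stmt, 0);

   The loop below relies on this when it creates copy j > 0.  */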
3673 prev_stmt_info = NULL;
3674 for (j = 0; j < ncopies; j++)
3676 /* Handle uses. */
3677 if (j == 0)
3679 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3680 if (op_type == binary_op)
3682 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3684 /* Vector shl and shr insn patterns can be defined with
3685 scalar operand 2 (shift operand). In this case, use
3686 constant or loop invariant op1 directly, without
3687 extending it to vector mode first. */
3688 optab_op2_mode = insn_data[icode].operand[2].mode;
3689 if (!VECTOR_MODE_P (optab_op2_mode))
3691 if (vect_print_dump_info (REPORT_DETAILS))
3692 fprintf (vect_dump, "operand 1 using scalar mode.");
3693 vec_oprnd1 = op1;
3696 if (!vec_oprnd1)
3697 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
3700 else
3702 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3703 if (op_type == binary_op)
3704 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
3707 /* Arguments are ready. Create the new vector stmt. */
3709 if (op_type == binary_op)
3710 new_stmt = build_gimple_modify_stmt (vec_dest,
3711 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
3712 else
3713 new_stmt = build_gimple_modify_stmt (vec_dest,
3714 build1 (code, vectype, vec_oprnd0));
3715 new_temp = make_ssa_name (vec_dest, new_stmt);
3716 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3717 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3719 if (j == 0)
3720 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3721 else
3722 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3723 prev_stmt_info = vinfo_for_stmt (new_stmt);
3726 return true;
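
/* Shift example (illustrative): for  z = x << c  with loop-invariant c,
   if the target's shift pattern takes a scalar shift count (operand 2
   mode is not a vector mode), the stmt created above is simply

       vz = vx << c;

   otherwise c is first duplicated into the invariant vector
   {c, c, ..., c} by vect_get_vec_def_for_operand.  */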
3730 /* Function vectorizable_type_demotion
3732 Check if STMT performs a binary or unary operation that involves
3733 type demotion, and if it can be vectorized.
3734 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3735 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3736 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3738 bool
3739 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
3740 tree *vec_stmt)
3742 tree vec_dest;
3743 tree scalar_dest;
3744 tree operation;
3745 tree op0;
3746 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3747 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3748 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3749 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3750 enum tree_code code, code1 = ERROR_MARK;
3751 tree new_temp;
3752 tree def, def_stmt;
3753 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3754 tree new_stmt;
3755 stmt_vec_info prev_stmt_info;
3756 int nunits_in;
3757 int nunits_out;
3758 tree vectype_out;
3759 int ncopies;
3760 int j;
3761 tree expr;
3762 tree vectype_in;
3764 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3765 return false;
3767 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3768 return false;
3770 /* FORNOW: not yet supported. */
3771 if (STMT_VINFO_LIVE_P (stmt_info))
3773 if (vect_print_dump_info (REPORT_DETAILS))
3774 fprintf (vect_dump, "value used after loop.");
3775 return false;
3778 /* Is STMT a vectorizable type-demotion operation? */
3779 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3780 return false;
3782 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3783 return false;
3785 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3786 code = TREE_CODE (operation);
3787 if (code != NOP_EXPR && code != CONVERT_EXPR)
3788 return false;
3790 op0 = TREE_OPERAND (operation, 0);
3791 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3792 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3794 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3795 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3796 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3797 if (nunits_in != nunits_out / 2) /* FORNOW */
3798 return false;
3800 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3801 gcc_assert (ncopies >= 1);
3802 /* FORNOW. This restriction should be relaxed. */
3803 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3805 if (vect_print_dump_info (REPORT_DETAILS))
3806 fprintf (vect_dump, "multiple types in nested loop.");
3807 return false;
3810 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3811 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3812 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3813 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3814 && (code == NOP_EXPR || code == CONVERT_EXPR))))
3815 return false;
3817 /* Check the operands of the operation. */
3818 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3820 if (vect_print_dump_info (REPORT_DETAILS))
3821 fprintf (vect_dump, "use not simple.");
3822 return false;
3825 /* Supportable by target? */
3826 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
3827 return false;
3829 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3831 if (!vec_stmt) /* transformation not required. */
3833 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
3834 if (vect_print_dump_info (REPORT_DETAILS))
3835 fprintf (vect_dump, "=== vectorizable_demotion ===");
3836 vect_model_simple_cost (stmt_info, ncopies, dt);
3837 return true;
3840 /** Transform. **/
3841 if (vect_print_dump_info (REPORT_DETAILS))
3842 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
3843 ncopies);
3845 /* Handle def. */
3846 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3848 /* In case the vectorization factor (VF) is bigger than the number
3849 of elements that we can fit in a vectype (nunits), we have to generate
3850 more than one vector stmt - i.e - we need to "unroll" the
3851 vector stmt by a factor VF/nunits. */
3852 prev_stmt_info = NULL;
3853 for (j = 0; j < ncopies; j++)
3855 /* Handle uses. */
3856 if (j == 0)
3858 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3859 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3861 else
3863 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3864 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3867 /* Arguments are ready. Create the new vector stmt. */
3868 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3869 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3870 new_temp = make_ssa_name (vec_dest, new_stmt);
3871 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3872 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3874 if (j == 0)
3875 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3876 else
3877 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3879 prev_stmt_info = vinfo_for_stmt (new_stmt);
3882 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3883 return true;
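
/* Demotion example (illustrative; the tree code is whatever
   supportable_narrowing_operation returned in CODE1, typically
   VEC_PACK_TRUNC_EXPR): for int -> short with nunits_in = 4 and
   nunits_out = 8, each output vector packs two input vectors:

       vx0 = {a0, a1, a2, a3}
       vx1 = {a4, a5, a6, a7}
       vz  = VEC_PACK_TRUNC_EXPR <vx0, vx1>

   which is why each copy above consumes two vector defs (VEC_OPRND0
   and VEC_OPRND1).  */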
3887 /* Function vectorizable_type_promotion
3889 Check if STMT performs a binary or unary operation that involves
3890 type promotion, and if it can be vectorized.
3891 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3892 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3893 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3895 bool
3896 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
3897 tree *vec_stmt)
3899 tree vec_dest;
3900 tree scalar_dest;
3901 tree operation;
3902 tree op0, op1 = NULL;
3903 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
3904 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3907 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3908 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3909 int op_type;
3910 tree def, def_stmt;
3911 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3912 tree new_stmt;
3913 stmt_vec_info prev_stmt_info;
3914 int nunits_in;
3915 int nunits_out;
3916 tree vectype_out;
3917 int ncopies;
3918 int j;
3919 tree vectype_in;
3921 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3922 return false;
3924 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3925 return false;
3927 /* FORNOW: not yet supported. */
3928 if (STMT_VINFO_LIVE_P (stmt_info))
3930 if (vect_print_dump_info (REPORT_DETAILS))
3931 fprintf (vect_dump, "value used after loop.");
3932 return false;
3935 /* Is STMT a vectorizable type-promotion operation? */
3936 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3937 return false;
3939 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3940 return false;
3942 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3943 code = TREE_CODE (operation);
3944 if (code != NOP_EXPR && code != CONVERT_EXPR
3945 && code != WIDEN_MULT_EXPR)
3946 return false;
3948 op0 = TREE_OPERAND (operation, 0);
3949 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
3950 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3952 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3953 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3954 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3955 if (nunits_out != nunits_in / 2) /* FORNOW */
3956 return false;
3958 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3959 gcc_assert (ncopies >= 1);
3960 /* FORNOW. This restriction should be relaxed. */
3961 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3963 if (vect_print_dump_info (REPORT_DETAILS))
3964 fprintf (vect_dump, "multiple types in nested loop.");
3965 return false;
3968 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
3969 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3970 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
3971 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
3972 && (code == CONVERT_EXPR || code == NOP_EXPR))))
3973 return false;
3975 /* Check the operands of the operation. */
3976 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3978 if (vect_print_dump_info (REPORT_DETAILS))
3979 fprintf (vect_dump, "use not simple.");
3980 return false;
3983 op_type = TREE_CODE_LENGTH (code);
3984 if (op_type == binary_op)
3986 op1 = TREE_OPERAND (operation, 1);
3987 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3989 if (vect_print_dump_info (REPORT_DETAILS))
3990 fprintf (vect_dump, "use not simple.");
3991 return false;
3995 /* Supportable by target? */
3996 if (!supportable_widening_operation (code, stmt, vectype_in,
3997 &decl1, &decl2, &code1, &code2))
3998 return false;
4000 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4002 if (!vec_stmt) /* transformation not required. */
4004 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4005 if (vect_print_dump_info (REPORT_DETAILS))
4006 fprintf (vect_dump, "=== vectorizable_promotion ===");
4007 vect_model_simple_cost (stmt_info, 2*ncopies, dt);
4008 return true;
4011 /** Transform. **/
4013 if (vect_print_dump_info (REPORT_DETAILS))
4014 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4015 ncopies);
4017 /* Handle def. */
4018 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4020 /* In case the vectorization factor (VF) is bigger than the number
4021 of elements that we can fit in a vectype (nunits), we have to generate
4022 more than one vector stmt - i.e - we need to "unroll" the
4023 vector stmt by a factor VF/nunits. */
4025 prev_stmt_info = NULL;
4026 for (j = 0; j < ncopies; j++)
4028 /* Handle uses. */
4029 if (j == 0)
4031 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4032 if (op_type == binary_op)
4033 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4035 else
4037 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4038 if (op_type == binary_op)
4039 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4042 /* Arguments are ready. Create the new vector stmt. We are creating
4043 two vector defs because the widened result does not fit in one vector.
4044 The vectorized stmt can be expressed as a call to a target builtin,
4045 or using a tree-code. */
4046 /* Generate first half of the widened result: */
4047 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4048 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4049 if (j == 0)
4050 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4051 else
4052 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4053 prev_stmt_info = vinfo_for_stmt (new_stmt);
4055 /* Generate second half of the widened result: */
4056 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4057 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4058 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4059 prev_stmt_info = vinfo_for_stmt (new_stmt);
4063 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4064 return true;
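
/* Promotion example (illustrative; CODE1/CODE2 and DECL1/DECL2 come
   from supportable_widening_operation, e.g. unpack tree codes or
   target builtins): for short -> int with nunits_in = 8 and
   nunits_out = 4, each input vector yields two output vectors:

       vx  = {a0, a1, ..., a7}
       vz0 = <code1> vx        the first four widened elements
       vz1 = <code2> vx        the last four widened elements

   hence the two calls to vect_gen_widened_results_half per copy.  */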
4068 /* Function vect_strided_store_supported.
4070 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4071 and FALSE otherwise. */
4073 static bool
4074 vect_strided_store_supported (tree vectype)
4076 optab interleave_high_optab, interleave_low_optab;
4077 int mode;
4079 mode = (int) TYPE_MODE (vectype);
4081 /* Check that the operation is supported. */
4082 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4083 vectype);
4084 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4085 vectype);
4086 if (!interleave_high_optab || !interleave_low_optab)
4088 if (vect_print_dump_info (REPORT_DETAILS))
4089 fprintf (vect_dump, "no optab for interleave.");
4090 return false;
4093 if (optab_handler (interleave_high_optab, mode)->insn_code
4094 == CODE_FOR_nothing
4095 || optab_handler (interleave_low_optab, mode)->insn_code
4096 == CODE_FOR_nothing)
4098 if (vect_print_dump_info (REPORT_DETAILS))
4099 fprintf (vect_dump, "interleave op not supported by target.");
4100 return false;
4102 return true;
4106 /* Function vect_permute_store_chain.
4108 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4109 a power of 2, generate interleave_high/low stmts to reorder the data
4110 correctly for the stores. Return the final references for stores in
4111 RESULT_CHAIN.
4113 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4114 The input is 4 vectors each containing 8 elements. We assign a number to each
4115 element, the input sequence is:
4117 1st vec: 0 1 2 3 4 5 6 7
4118 2nd vec: 8 9 10 11 12 13 14 15
4119 3rd vec: 16 17 18 19 20 21 22 23
4120 4th vec: 24 25 26 27 28 29 30 31
4122 The output sequence should be:
4124 1st vec: 0 8 16 24 1 9 17 25
4125 2nd vec: 2 10 18 26 3 11 19 27
4126 3rd vec: 4 12 20 28 5 13 21 29
4127 4th vec: 6 14 22 30 7 15 23 31
4129 i.e., we interleave the contents of the four vectors in their order.
4131 We use interleave_high/low instructions to create such output. The input of
4132 each interleave_high/low operation is two vectors:
4133 1st vec 2nd vec
4134 0 1 2 3 4 5 6 7
4135 the even elements of the result vector are obtained left-to-right from the
4136 high/low elements of the first vector. The odd elements of the result are
4137 obtained left-to-right from the high/low elements of the second vector.
4138 The output of interleave_high will be: 0 4 1 5
4139 and of interleave_low: 2 6 3 7
4142 The permutation is done in log LENGTH stages. In each stage interleave_high
4143 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4144 where the first argument is taken from the first half of DR_CHAIN and the
4145 second argument from its second half.
4146 In our example,
4148 I1: interleave_high (1st vec, 3rd vec)
4149 I2: interleave_low (1st vec, 3rd vec)
4150 I3: interleave_high (2nd vec, 4th vec)
4151 I4: interleave_low (2nd vec, 4th vec)
4153 The output for the first stage is:
4155 I1: 0 16 1 17 2 18 3 19
4156 I2: 4 20 5 21 6 22 7 23
4157 I3: 8 24 9 25 10 26 11 27
4158 I4: 12 28 13 29 14 30 15 31
4160 The output of the second stage, i.e. the final result is:
4162 I1: 0 8 16 24 1 9 17 25
4163 I2: 2 10 18 26 3 11 19 27
4164 I3: 4 12 20 28 5 13 21 29
4165 I4: 6 14 22 30 7 15 23 31. */
4167 static bool
4168 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4169 unsigned int length,
4170 tree stmt,
4171 block_stmt_iterator *bsi,
4172 VEC(tree,heap) **result_chain)
4174 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4175 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4176 tree scalar_dest, tmp;
4177 int i;
4178 unsigned int j;
4179 VEC(tree,heap) *first, *second;
4181 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4182 first = VEC_alloc (tree, heap, length/2);
4183 second = VEC_alloc (tree, heap, length/2);
4185 /* Check that the operation is supported. */
4186 if (!vect_strided_store_supported (vectype))
4187 return false;
4189 *result_chain = VEC_copy (tree, heap, dr_chain);
4191 for (i = 0; i < exact_log2 (length); i++)
4193 for (j = 0; j < length/2; j++)
4195 vect1 = VEC_index (tree, dr_chain, j);
4196 vect2 = VEC_index (tree, dr_chain, j+length/2);
4198 /* Create interleaving stmt:
4199 in the case of big endian:
4200 high = interleave_high (vect1, vect2)
4201 and in the case of little endian:
4202 high = interleave_low (vect1, vect2). */
4203 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4204 DECL_GIMPLE_REG_P (perm_dest) = 1;
4205 add_referenced_var (perm_dest);
4206 if (BYTES_BIG_ENDIAN)
4207 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4208 else
4209 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4210 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4211 high = make_ssa_name (perm_dest, perm_stmt);
4212 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4213 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4214 VEC_replace (tree, *result_chain, 2*j, high);
4216 /* Create interleaving stmt:
4217 in the case of big endian:
4218 low = interleave_low (vect1, vect2)
4219 and in the case of little endian:
4220 low = interleave_high (vect1, vect2). */
4221 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4222 DECL_GIMPLE_REG_P (perm_dest) = 1;
4223 add_referenced_var (perm_dest);
4224 if (BYTES_BIG_ENDIAN)
4225 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4226 else
4227 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4228 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4229 low = make_ssa_name (perm_dest, perm_stmt);
4230 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4231 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4232 VEC_replace (tree, *result_chain, 2*j+1, low);
4234 dr_chain = VEC_copy (tree, heap, *result_chain);
4236 return true;
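
/* A scalar model of the permutation performed above (illustrative
   only; the real code emits VEC_INTERLEAVE_{HIGH,LOW}_EXPR stmts and
   honors BYTES_BIG_ENDIAN):

       for (stage = 0; stage < log2 (length); stage++)
         {
           for (j = 0; j < length / 2; j++)
             {
               out[2*j]   = interleave_high (in[j], in[j + length/2]);
               out[2*j+1] = interleave_low  (in[j], in[j + length/2]);
             }
           copy out[] to in[] for the next stage;
         }

   After log2 (LENGTH) stages RESULT_CHAIN holds the fully interleaved
   data shown in the example above.  */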
4240 /* Function vectorizable_store.
4242 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4243 can be vectorized.
4244 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4245 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4246 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4248 bool
4249 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
4251 tree scalar_dest;
4252 tree data_ref;
4253 tree op;
4254 tree vec_oprnd = NULL_TREE;
4255 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4256 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4260 enum machine_mode vec_mode;
4261 tree dummy;
4262 enum dr_alignment_support alignment_support_scheme;
4263 tree def, def_stmt;
4264 enum vect_def_type dt;
4265 stmt_vec_info prev_stmt_info = NULL;
4266 tree dataref_ptr = NULL_TREE;
4267 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4268 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4269 int j;
4270 tree next_stmt, first_stmt;
4271 bool strided_store = false;
4272 unsigned int group_size, i;
4273 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4274 bool inv_p;
4276 gcc_assert (ncopies >= 1);
4278 /* FORNOW. This restriction should be relaxed. */
4279 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4281 if (vect_print_dump_info (REPORT_DETAILS))
4282 fprintf (vect_dump, "multiple types in nested loop.");
4283 return false;
4286 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4287 return false;
4289 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4290 return false;
4292 if (STMT_VINFO_LIVE_P (stmt_info))
4294 if (vect_print_dump_info (REPORT_DETAILS))
4295 fprintf (vect_dump, "value used after loop.");
4296 return false;
4299 /* Is vectorizable store? */
4301 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4302 return false;
4304 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4305 if (TREE_CODE (scalar_dest) != ARRAY_REF
4306 && TREE_CODE (scalar_dest) != INDIRECT_REF
4307 && !DR_GROUP_FIRST_DR (stmt_info))
4308 return false;
4310 op = GIMPLE_STMT_OPERAND (stmt, 1);
4311 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4313 if (vect_print_dump_info (REPORT_DETAILS))
4314 fprintf (vect_dump, "use not simple.");
4315 return false;
4318 vec_mode = TYPE_MODE (vectype);
4319 /* FORNOW. In some cases we can vectorize even if the data-type is not
4320 supported (e.g. array initialization with 0). */
4321 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4322 return false;
4324 if (!STMT_VINFO_DATA_REF (stmt_info))
4325 return false;
4327 if (DR_GROUP_FIRST_DR (stmt_info))
4329 strided_store = true;
4330 if (!vect_strided_store_supported (vectype))
4331 return false;
4334 if (!vec_stmt) /* transformation not required. */
4336 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4337 vect_model_store_cost (stmt_info, ncopies, dt);
4338 return true;
4341 /** Transform. **/
4343 if (strided_store)
4345 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4346 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4347 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4349 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4351 /* FORNOW */
4352 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4354 /* We vectorize all the stmts of the interleaving group when we
4355 reach the last stmt in the group. */
4356 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4357 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
4359 *vec_stmt = NULL_TREE;
4360 return true;
4363 else
4365 first_stmt = stmt;
4366 first_dr = dr;
4367 group_size = 1;
4370 if (vect_print_dump_info (REPORT_DETAILS))
4371 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4373 dr_chain = VEC_alloc (tree, heap, group_size);
4374 oprnds = VEC_alloc (tree, heap, group_size);
4376 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4377 gcc_assert (alignment_support_scheme);
4378 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4380 /* In case the vectorization factor (VF) is bigger than the number
4381 of elements that we can fit in a vectype (nunits), we have to generate
4382 more than one vector stmt - i.e - we need to "unroll" the
4383 vector stmt by a factor VF/nunits. For more details see documentation in
4384 vect_get_vec_def_for_copy_stmt. */
4386 /* In case of interleaving (non-unit strided access):
4388 S1: &base + 2 = x2
4389 S2: &base = x0
4390 S3: &base + 1 = x1
4391 S4: &base + 3 = x3
4393 We create vectorized stores starting from base address (the access of the
4394 first stmt in the chain (S2 in the above example), when the last store stmt
4395 of the chain (S4) is reached:
4397 VS1: &base = vx2
4398 VS2: &base + vec_size*1 = vx0
4399 VS3: &base + vec_size*2 = vx1
4400 VS4: &base + vec_size*3 = vx3
4402 Then permutation statements are generated:
4404 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4405 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4408 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4409 (the order of the data-refs in the output of vect_permute_store_chain
4410 corresponds to the order of scalar stmts in the interleaving chain - see
4411 the documentation of vect_permute_store_chain()).
4413 In case of both multiple types and interleaving, above vector stores and
4414 permutation stmts are created for every copy. The result vector stmts are
4415 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4416 STMT_VINFO_RELATED_STMT for the next copies. */
4419 prev_stmt_info = NULL;
4420 for (j = 0; j < ncopies; j++)
4422 tree new_stmt;
4423 tree ptr_incr;
4425 if (j == 0)
4427 /* For interleaved stores we collect vectorized defs for all the
4428 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
4429 as an input to vect_permute_store_chain(), and OPRNDS as an input
4430 to vect_get_vec_def_for_stmt_copy() for the next copy.
4431 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4432 OPRNDS are of size 1. */
4433 next_stmt = first_stmt;
4434 for (i = 0; i < group_size; i++)
4436 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
4437 is the exact number of stmts in the chain. Therefore, NEXT_STMT
4438 can't be NULL_TREE. In case that there is no interleaving,
4439 GROUP_SIZE is 1, and only one iteration of the loop will be
4440 executed. */
4441 gcc_assert (next_stmt);
4442 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4443 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
4444 VEC_quick_push(tree, dr_chain, vec_oprnd);
4445 VEC_quick_push(tree, oprnds, vec_oprnd);
4446 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4448 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4449 &dummy, &ptr_incr, false,
4450 TREE_TYPE (vec_oprnd), &inv_p);
4451 gcc_assert (!inv_p);
4453 else
4455 /* For interleaved stores we created vectorized defs for all the
4456 defs stored in OPRNDS in the previous iteration (previous copy).
4457 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4458 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4459 next copy.
4460 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4461 OPRNDS are of size 1. */
4462 for (i = 0; i < group_size; i++)
4464 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
4465 VEC_index (tree, oprnds, i));
4466 VEC_replace(tree, dr_chain, i, vec_oprnd);
4467 VEC_replace(tree, oprnds, i, vec_oprnd);
4469 dataref_ptr =
4470 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4473 if (strided_store)
4475 result_chain = VEC_alloc (tree, heap, group_size);
4476 /* Permute. */
4477 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4478 &result_chain))
4479 return false;
4482 next_stmt = first_stmt;
4483 for (i = 0; i < group_size; i++)
4485 /* For strided stores vectorized defs are interleaved in
4486 vect_permute_store_chain(). */
4487 if (strided_store)
4488 vec_oprnd = VEC_index(tree, result_chain, i);
4490 data_ref = build_fold_indirect_ref (dataref_ptr);
4491 /* Arguments are ready. Create the new vector stmt. */
4492 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4493 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4494 mark_symbols_for_renaming (new_stmt);
4496 if (j == 0)
4497 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4498 else
4499 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4501 prev_stmt_info = vinfo_for_stmt (new_stmt);
4502 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4503 if (!next_stmt)
4504 break;
4505 /* Bump the vector pointer. */
4506 dataref_ptr =
4507 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4511 return true;
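
/* Illustrative shape of the code generated above for GROUP_SIZE = 2,
   ncopies = 1, on a big-endian target (names hypothetical):

       vx0 = <vector def of the first scalar store in the chain>;
       vx1 = <vector def of the second scalar store in the chain>;
       vy0 = VEC_INTERLEAVE_HIGH_EXPR <vx0, vx1>;
       vy1 = VEC_INTERLEAVE_LOW_EXPR <vx0, vx1>;
       *p = vy0;
       *(p + vec_size) = vy1;                                          */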
4515 /* Function vect_setup_realignment
4517 This function is called when vectorizing an unaligned load using
4518 the dr_explicit_realign[_optimized] scheme.
4519 This function generates the following code at the loop prolog:
4521 p = initial_addr;
4522 x msq_init = *(floor(p)); # prolog load
4523 realignment_token = call target_builtin;
4524 loop:
4525 x msq = phi (msq_init, ---)
4527 The stmts marked with x are generated only for the case of
4528 dr_explicit_realign_optimized.
4530 The code above sets up a new (vector) pointer, pointing to the first
4531 location accessed by STMT, and a "floor-aligned" load using that pointer.
4532 It also generates code to compute the "realignment-token" (if the relevant
4533 target hook was defined), and creates a phi-node at the loop-header bb
4534 whose arguments are the result of the prolog-load (created by this
4535 function) and the result of a load that takes place in the loop (to be
4536 created by the caller to this function).
4538 For the case of dr_explicit_realign_optimized:
4539 The caller to this function uses the phi-result (msq) to create the
4540 realignment code inside the loop, and sets up the missing phi argument,
4541 as follows:
4542 loop:
4543 msq = phi (msq_init, lsq)
4544 lsq = *(floor(p')); # load in loop
4545 result = realign_load (msq, lsq, realignment_token);
4547 For the case of dr_explicit_realign:
4548 loop:
4549 msq = *(floor(p)); # load in loop
4550 p' = p + (VS-1);
4551 lsq = *(floor(p')); # load in loop
4552 result = realign_load (msq, lsq, realignment_token);
4554 Input:
4555 STMT - (scalar) load stmt to be vectorized. This load accesses
4556 a memory location that may be unaligned.
4557 BSI - place where new code is to be inserted.
4558 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4559 is used.
4561 Output:
4562 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4563 target hook, if defined.
4564 Return value - the result of the loop-header phi node. */
4566 static tree
4567 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
4568 tree *realignment_token,
4569 enum dr_alignment_support alignment_support_scheme,
4570 tree init_addr,
4571 struct loop **at_loop)
4573 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4574 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4575 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4576 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4577 edge pe;
4578 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4579 tree vec_dest;
4580 tree inc;
4581 tree ptr;
4582 tree data_ref;
4583 tree new_stmt;
4584 basic_block new_bb;
4585 tree msq_init = NULL_TREE;
4586 tree new_temp;
4587 tree phi_stmt;
4588 tree msq = NULL_TREE;
4589 tree stmts = NULL_TREE;
4590 bool inv_p;
4591 bool compute_in_loop = false;
4592 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4593 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
4594 struct loop *loop_for_initial_load;
4596 gcc_assert (alignment_support_scheme == dr_explicit_realign
4597 || alignment_support_scheme == dr_explicit_realign_optimized);
4599 /* We need to generate three things:
4600 1. the misalignment computation
4601 2. the extra vector load (for the optimized realignment scheme).
4602 3. the phi node for the two vectors from which the realignment is
4603 done (for the optimized realignment scheme). */
4606 /* 1. Determine where to generate the misalignment computation.
4608 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4609 calculation will be generated by this function, outside the loop (in the
4610 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4611 caller, inside the loop.
4613 Background: If the misalignment remains fixed throughout the iterations of
4614 the loop, then both realignment schemes are applicable, and also the
4615 misalignment computation can be done outside LOOP. This is because we are
4616 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4617 are a multiple of VS (the Vector Size), and therefore the misalignment in
4618 different vectorized LOOP iterations is always the same.
4619 The problem arises only if the memory access is in an inner-loop nested
4620 inside LOOP, which is now being vectorized using outer-loop vectorization.
4621 This is the only case when the misalignment of the memory access may not
4622 remain fixed throughout the iterations of the inner-loop (as explained in
4623 detail in vect_supportable_dr_alignment). In this case, not only is the
4624 optimized realignment scheme not applicable, but also the misalignment
4625 computation (and the generation of the realignment token that is passed to
4626 REALIGN_LOAD) has to be done inside the loop.
4628 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4629 or not, which in turn determines if the misalignment is computed inside
4630 the inner-loop, or outside LOOP. */
4632 if (init_addr != NULL_TREE)
4634 compute_in_loop = true;
4635 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4639 /* 2. Determine where to generate the extra vector load.
4641 For the optimized realignment scheme, instead of generating two vector
4642 loads in each iteration, we generate a single extra vector load in the
4643 preheader of the loop, and in each iteration reuse the result of the
4644 vector load from the previous iteration. In case the memory access is in
4645 an inner-loop nested inside LOOP, which is now being vectorized using
4646 outer-loop vectorization, we need to determine whether this initial vector
4647 load should be generated at the preheader of the inner-loop, or can be
4648 generated at the preheader of LOOP. If the memory access has no evolution
4649 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4650 to be generated inside LOOP (in the preheader of the inner-loop). */
4652 if (nested_in_vect_loop)
4654 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4655 bool invariant_in_outerloop =
4656 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4657 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4659 else
4660 loop_for_initial_load = loop;
4661 if (at_loop)
4662 *at_loop = loop_for_initial_load;
4664 /* 3. For the case of the optimized realignment, create the first vector
4665 load at the loop preheader. */
4667 if (alignment_support_scheme == dr_explicit_realign_optimized)
4669 /* Create msq_init = *(floor(p1)) in the loop preheader */
4671 gcc_assert (!compute_in_loop);
4672 pe = loop_preheader_edge (loop_for_initial_load);
4673 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4674 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
4675 &init_addr, &inc, true, NULL_TREE, &inv_p);
4676 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
4677 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
4678 new_temp = make_ssa_name (vec_dest, new_stmt);
4679 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4680 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4681 gcc_assert (!new_bb);
4682 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
4685 /* 4. Create realignment token using a target builtin, if available.
4686 It is done either inside the containing loop, or before LOOP (as
4687 determined above). */
4689 if (targetm.vectorize.builtin_mask_for_load)
4691 tree builtin_decl;
4693 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4694 if (compute_in_loop)
4695 gcc_assert (init_addr); /* already computed by the caller. */
4696 else
4698 /* Generate the INIT_ADDR computation outside LOOP. */
4699 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4700 NULL_TREE, loop);
4701 pe = loop_preheader_edge (loop);
4702 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
4703 gcc_assert (!new_bb);
4706 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4707 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
4708 vec_dest = vect_create_destination_var (scalar_dest,
4709 TREE_TYPE (new_stmt));
4710 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
4711 new_temp = make_ssa_name (vec_dest, new_stmt);
4712 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4714 if (compute_in_loop)
4715 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
4716 else
4718 /* Generate the misalignment computation outside LOOP. */
4719 pe = loop_preheader_edge (loop);
4720 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
4721 gcc_assert (!new_bb);
4724 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
4726 /* The result of the CALL_EXPR to this builtin is determined from
4727 the value of the parameter and no global variables are touched
4728 which makes the builtin a "const" function. Requiring the
4729 builtin to have the "const" attribute makes it unnecessary
4730 to call mark_call_clobbered. */
4731 gcc_assert (TREE_READONLY (builtin_decl));
4734 if (alignment_support_scheme == dr_explicit_realign)
4735 return msq;
4737 gcc_assert (!compute_in_loop);
4738 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4741 /* 5. Create msq = phi <msq_init, lsq> in loop */
4743 pe = loop_preheader_edge (containing_loop);
4744 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4745 msq = make_ssa_name (vec_dest, NULL_TREE);
4746 phi_stmt = create_phi_node (msq, containing_loop->header);
4747 SSA_NAME_DEF_STMT (msq) = phi_stmt;
4748 add_phi_arg (phi_stmt, msq_init, pe);
4750 return msq;
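
/* Illustrative picture of the realignment this sets up (the combining
   stmt itself is generated by the caller): two aligned loads that
   straddle the unaligned address p are merged using the token:

       msq = *(floor (p));             aligned load, lower part
       lsq = *(floor (p + VS - 1));    aligned load, upper part
       result = realign_load (msq, lsq, realignment_token);

   so RESULT holds the VS elements starting at the unaligned address.  */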
4754 /* Function vect_strided_load_supported.
4756 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
4757 and FALSE otherwise. */
4759 static bool
4760 vect_strided_load_supported (tree vectype)
4762 optab perm_even_optab, perm_odd_optab;
4763 int mode;
4765 mode = (int) TYPE_MODE (vectype);
4767 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
4768 if (!perm_even_optab)
4770 if (vect_print_dump_info (REPORT_DETAILS))
4771 fprintf (vect_dump, "no optab for perm_even.");
4772 return false;
4775 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
4777 if (vect_print_dump_info (REPORT_DETAILS))
4778 fprintf (vect_dump, "perm_even op not supported by target.");
4779 return false;
4782 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
4783 if (!perm_odd_optab)
4785 if (vect_print_dump_info (REPORT_DETAILS))
4786 fprintf (vect_dump, "no optab for perm_odd.");
4787 return false;
4790 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
4792 if (vect_print_dump_info (REPORT_DETAILS))
4793 fprintf (vect_dump, "perm_odd op not supported by target.");
4794 return false;
4796 return true;
4800 /* Function vect_permute_load_chain.
4802 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4803 a power of 2, generate extract_even/odd stmts to reorder the input data
4804 correctly. Return the final references for loads in RESULT_CHAIN.
4806 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4807 The input is 4 vectors each containing 8 elements. We assign a number to each
4808 element, the input sequence is:
4810 1st vec: 0 1 2 3 4 5 6 7
4811 2nd vec: 8 9 10 11 12 13 14 15
4812 3rd vec: 16 17 18 19 20 21 22 23
4813 4th vec: 24 25 26 27 28 29 30 31
4815 The output sequence should be:
4817 1st vec: 0 4 8 12 16 20 24 28
4818 2nd vec: 1 5 9 13 17 21 25 29
4819 3rd vec: 2 6 10 14 18 22 26 30
4820 4th vec: 3 7 11 15 19 23 27 31
4822 i.e., the first output vector should contain the first elements of each
4823 interleaving group, etc.
4825 We use extract_even/odd instructions to create such output. The input of each
4826 extract_even/odd operation is two vectors
4827 1st vec 2nd vec
4828 0 1 2 3 4 5 6 7
4830 and the output is the vector of extracted even/odd elements. The output of
4831 extract_even will be: 0 2 4 6
4832 and of extract_odd: 1 3 5 7
4835 The permutation is done in log LENGTH stages. In each stage extract_even and
4836 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
4837 order. In our example,
4839 E1: extract_even (1st vec, 2nd vec)
4840 E2: extract_odd (1st vec, 2nd vec)
4841 E3: extract_even (3rd vec, 4th vec)
4842 E4: extract_odd (3rd vec, 4th vec)
4844 The output for the first stage will be:
4846 E1: 0 2 4 6 8 10 12 14
4847 E2: 1 3 5 7 9 11 13 15
4848 E3: 16 18 20 22 24 26 28 30
4849 E4: 17 19 21 23 25 27 29 31
4851 In order to proceed and create the correct sequence for the next stage (or
4852 for the correct output, if the second stage is the last one, as in our
4853 example), we first put the output of extract_even operation and then the
4854 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4855 The input for the second stage is:
4857 1st vec (E1): 0 2 4 6 8 10 12 14
4858 2nd vec (E3): 16 18 20 22 24 26 28 30
4859 3rd vec (E2): 1 3 5 7 9 11 13 15
4860 4th vec (E4): 17 19 21 23 25 27 29 31
4862 The output of the second stage:
4864 E1: 0 4 8 12 16 20 24 28
4865 E2: 2 6 10 14 18 22 26 30
4866 E3: 1 5 9 13 17 21 25 29
4867 E4: 3 7 11 15 19 23 27 31
4869 And RESULT_CHAIN after reordering:
4871 1st vec (E1): 0 4 8 12 16 20 24 28
4872 2nd vec (E3): 1 5 9 13 17 21 25 29
4873 3rd vec (E2): 2 6 10 14 18 22 26 30
4874 4th vec (E4): 3 7 11 15 19 23 27 31. */
4876 static bool
4877 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
4878 unsigned int length,
4879 tree stmt,
4880 block_stmt_iterator *bsi,
4881 VEC(tree,heap) **result_chain)
4883 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
4884 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4885 tree tmp;
4886 int i;
4887 unsigned int j;
4889 /* Check that the operation is supported. */
4890 if (!vect_strided_load_supported (vectype))
4891 return false;
4893 *result_chain = VEC_copy (tree, heap, dr_chain);
4894 for (i = 0; i < exact_log2 (length); i++)
4896 for (j = 0; j < length; j +=2)
4898 first_vect = VEC_index (tree, dr_chain, j);
4899 second_vect = VEC_index (tree, dr_chain, j+1);
4901 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4902 perm_dest = create_tmp_var (vectype, "vect_perm_even");
4903 DECL_GIMPLE_REG_P (perm_dest) = 1;
4904 add_referenced_var (perm_dest);
4906 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
4907 first_vect, second_vect);
4908 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4910 data_ref = make_ssa_name (perm_dest, perm_stmt);
4911 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4912 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4913 mark_symbols_for_renaming (perm_stmt);
4915 VEC_replace (tree, *result_chain, j/2, data_ref);
4917 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4918 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
4919 DECL_GIMPLE_REG_P (perm_dest) = 1;
4920 add_referenced_var (perm_dest);
4922 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
4923 first_vect, second_vect);
4924 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4925 data_ref = make_ssa_name (perm_dest, perm_stmt);
4926 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
4927 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4928 mark_symbols_for_renaming (perm_stmt);
4930 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
4932 dr_chain = VEC_copy (tree, heap, *result_chain);
4934 return true;
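
/* A scalar model of the permutation performed above (illustrative
   only; the real code emits VEC_EXTRACT_{EVEN,ODD}_EXPR stmts):

       for (stage = 0; stage < log2 (length); stage++)
         {
           for (j = 0; j < length; j += 2)
             {
               out[j/2]            = extract_even (in[j], in[j+1]);
               out[j/2 + length/2] = extract_odd  (in[j], in[j+1]);
             }
           copy out[] to in[] for the next stage;
         }

   After log2 (LENGTH) stages RESULT_CHAIN holds one vector per scalar
   access, in the order of the interleaving chain.  */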
4938 /* Function vect_transform_strided_load.
4940 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4941 to perform their permutation and ascribe the resulting vectorized statements
4942 to the scalar statements. */
4945 static bool
4946 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
4947 block_stmt_iterator *bsi)
4949 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4950 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4951 tree next_stmt, new_stmt;
4952 VEC(tree,heap) *result_chain = NULL;
4953 unsigned int i, gap_count;
4954 tree tmp_data_ref;
4956 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4957 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4958 vectors, that are ready for vector computation. */
4959 result_chain = VEC_alloc (tree, heap, size);
4960 /* Permute. */
4961 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
4962 return false;
4964 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4965 Since we scan the chain starting from its first node, their order
4966 corresponds to the order of data-refs in RESULT_CHAIN. */
4967 next_stmt = first_stmt;
4968 gap_count = 1;
4969 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
4971 if (!next_stmt)
4972 break;
4974 /* Skip the gaps. Loads created for the gaps will be removed later by
4975 the dead code elimination pass.
4976 DR_GROUP_GAP is the number of steps in elements from the previous
4977 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
4978 correspond to the gaps. */
4980 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
4982 gap_count++;
4983 continue;
4986 while (next_stmt)
4988 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4989 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4990 copies, and we put the new vector statement in the first available
4991 RELATED_STMT. */
4992 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4993 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4994 else
4996 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4997 tree rel_stmt = STMT_VINFO_RELATED_STMT (
4998 vinfo_for_stmt (prev_stmt));
4999 while (rel_stmt)
5001 prev_stmt = rel_stmt;
5002 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5004 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5006 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5007 gap_count = 1;
5008 /* If NEXT_STMT accesses the same DR as the previous statement,
5009 put the same TMP_DATA_REF as its vectorized statement; otherwise
5010 get the next data-ref from RESULT_CHAIN. */
5011 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5012 break;
5015 return true;
5019 /* vectorizable_load.
5021 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5022 can be vectorized.
5023 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5024 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5025 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5027 bool
5028 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5030 tree scalar_dest;
5031 tree vec_dest = NULL;
5032 tree data_ref = NULL;
5033 tree op;
5034 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5035 stmt_vec_info prev_stmt_info;
5036 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5037 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5038 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5039 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5040 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5041 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5042 tree new_temp;
5043 int mode;
5044 tree new_stmt = NULL_TREE;
5045 tree dummy;
5046 enum dr_alignment_support alignment_support_scheme;
5047 tree dataref_ptr = NULL_TREE;
5048 tree ptr_incr;
5049 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5050 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5051 int i, j, group_size;
5052 tree msq = NULL_TREE, lsq;
5053 tree offset = NULL_TREE;
5054 tree realignment_token = NULL_TREE;
5055 tree phi = NULL_TREE;
5056 VEC(tree,heap) *dr_chain = NULL;
5057 bool strided_load = false;
5058 tree first_stmt;
5059 tree scalar_type;
5060 bool inv_p;
5061 bool compute_in_loop = false;
5062 struct loop *at_loop;
5064 gcc_assert (ncopies >= 1);
5066 /* FORNOW. This restriction should be relaxed. */
5067 if (nested_in_vect_loop && ncopies > 1)
5069 if (vect_print_dump_info (REPORT_DETAILS))
5070 fprintf (vect_dump, "multiple types in nested loop.");
5071 return false;
5074 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5075 return false;
5077 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5078 return false;
5080 /* FORNOW: not yet supported. */
5081 if (STMT_VINFO_LIVE_P (stmt_info))
5083 if (vect_print_dump_info (REPORT_DETAILS))
5084 fprintf (vect_dump, "value used after loop.");
5085 return false;
5088 /* Is vectorizable load? */
5089 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5090 return false;
5092 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5093 if (TREE_CODE (scalar_dest) != SSA_NAME)
5094 return false;
5096 op = GIMPLE_STMT_OPERAND (stmt, 1);
5097 if (TREE_CODE (op) != ARRAY_REF
5098 && TREE_CODE (op) != INDIRECT_REF
5099 && !DR_GROUP_FIRST_DR (stmt_info))
5100 return false;
5102 if (!STMT_VINFO_DATA_REF (stmt_info))
5103 return false;
5105 scalar_type = TREE_TYPE (DR_REF (dr));
5106 mode = (int) TYPE_MODE (vectype);
5108 /* FORNOW. In some cases we can vectorize even if the data-type is not
5109 supported (e.g. data copies). */
5110 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5112 if (vect_print_dump_info (REPORT_DETAILS))
5113 fprintf (vect_dump, "Aligned load, but unsupported type.");
5114 return false;
5117 /* Check if the load is a part of an interleaving chain. */
5118 if (DR_GROUP_FIRST_DR (stmt_info))
5120 strided_load = true;
5121 /* FORNOW */
5122 gcc_assert (! nested_in_vect_loop);
5124 /* Check if interleaving is supported. */
5125 if (!vect_strided_load_supported (vectype))
5126 return false;
5129 if (!vec_stmt) /* transformation not required. */
5131 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5132 vect_model_load_cost (stmt_info, ncopies);
5133 return true;
5136 if (vect_print_dump_info (REPORT_DETAILS))
5137 fprintf (vect_dump, "transform load.");
5139 /** Transform. **/
5141 if (strided_load)
5143 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5144 /* Check if the chain of loads is already vectorized. */
5145 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5147 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5148 return true;
5150 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5151 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5152 dr_chain = VEC_alloc (tree, heap, group_size);
5154 else
5156 first_stmt = stmt;
5157 first_dr = dr;
5158 group_size = 1;
5161 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5162 gcc_assert (alignment_support_scheme);
5164 /* In case the vectorization factor (VF) is bigger than the number
5165 of elements that we can fit in a vectype (nunits), we have to generate
5166 more than one vector stmt - i.e., we need to "unroll" the
5167 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5168 from one copy of the vector stmt to the next, in the field
5169 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5170 stages to find the correct vector defs to be used when vectorizing
5171 stmts that use the defs of the current stmt. The example below illustrates
5172 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
5173 4 vectorized stmts):
5175 before vectorization:
5176 RELATED_STMT VEC_STMT
5177 S1: x = memref - -
5178 S2: z = x + 1 - -
5180 step 1: vectorize stmt S1:
5181 We first create the vector stmt VS1_0, and, as usual, record a
5182 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5183 Next, we create the vector stmt VS1_1, and record a pointer to
5184 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5185 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5186 stmts and pointers:
5187 RELATED_STMT VEC_STMT
5188 VS1_0: vx0 = memref0 VS1_1 -
5189 VS1_1: vx1 = memref1 VS1_2 -
5190 VS1_2: vx2 = memref2 VS1_3 -
5191 VS1_3: vx3 = memref3 - -
5192 S1: x = load - VS1_0
5193 S2: z = x + 1 - -
5195 See the documentation of vect_get_vec_def_for_stmt_copy for how the
5196 information we recorded in the RELATED_STMT field is used to vectorize
5197 stmt S2. */
5199 /* In case of interleaving (non-unit strided access):
5201 S1: x2 = &base + 2
5202 S2: x0 = &base
5203 S3: x1 = &base + 1
5204 S4: x3 = &base + 3
5206 Vectorized loads are created in the order of memory accesses
5207 starting from the access of the first stmt of the chain:
5209 VS1: vx0 = &base
5210 VS2: vx1 = &base + vec_size*1
5211 VS3: vx2 = &base + vec_size*2
5212 VS4: vx3 = &base + vec_size*3
5214 Then permutation statements are generated:
5216 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5217 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5220 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5221 (the order of the data-refs in the output of vect_permute_load_chain
5222 corresponds to the order of scalar stmts in the interleaving chain - see
5223 the documentation of vect_permute_load_chain()).
5224 The generation of permutation stmts and recording them in
5225 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5227 In case of both multiple types and interleaving, the vector loads and
5228 permutation stmts above are created for every copy. The result vector stmts
5229 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5230 STMT_VINFO_RELATED_STMT for the next copies. */
5232 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5233 on a target that supports unaligned accesses (dr_unaligned_supported)
5234 we generate the following code:
5235 p = initial_addr;
5236 indx = 0;
5237 loop {
5238 p = p + indx * vectype_size;
5239 vec_dest = *(p);
5240 indx = indx + 1;
}
5243 Otherwise, the data reference is potentially unaligned on a target that
5244 does not support unaligned accesses (dr_explicit_realign_optimized) -
5245 then generate the following code, in which the data in each iteration is
5246 obtained by two vector loads, one from the previous iteration, and one
5247 from the current iteration:
5248 p1 = initial_addr;
5249 msq_init = *(floor(p1))
5250 p2 = initial_addr + VS - 1;
5251 realignment_token = call target_builtin;
5252 indx = 0;
5253 loop {
5254 p2 = p2 + indx * vectype_size
5255 lsq = *(floor(p2))
5256 vec_dest = realign_load (msq, lsq, realignment_token)
5257 indx = indx + 1;
5258 msq = lsq;
5259 } */
5261 /* If the misalignment remains the same throughout the execution of the
5262 loop, we can create the init_addr and permutation mask at the loop
5263 preheader. Otherwise, they need to be created inside the loop.
5264 This can only occur when vectorizing memory accesses in the inner-loop
5265 nested within an outer-loop that is being vectorized. */
5267 if (nested_in_vect_loop_p (loop, stmt)
5268 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5270 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5271 compute_in_loop = true;
5274 if ((alignment_support_scheme == dr_explicit_realign_optimized
5275 || alignment_support_scheme == dr_explicit_realign)
5276 && !compute_in_loop)
5278 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5279 alignment_support_scheme, NULL_TREE,
5280 &at_loop);
5281 if (alignment_support_scheme == dr_explicit_realign_optimized)
5283 phi = SSA_NAME_DEF_STMT (msq);
5284 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5287 else
5288 at_loop = loop;
5290 prev_stmt_info = NULL;
5291 for (j = 0; j < ncopies; j++)
5293 /* 1. Create the vector pointer update chain. */
5294 if (j == 0)
5295 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5296 at_loop, offset,
5297 &dummy, &ptr_incr, false,
5298 NULL_TREE, &inv_p);
5299 else
5300 dataref_ptr =
5301 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5303 for (i = 0; i < group_size; i++)
5305 /* 2. Create the vector-load in the loop. */
5306 switch (alignment_support_scheme)
5308 case dr_aligned:
5309 gcc_assert (aligned_access_p (first_dr));
5310 data_ref = build_fold_indirect_ref (dataref_ptr);
5311 break;
5312 case dr_unaligned_supported:
5314 int mis = DR_MISALIGNMENT (first_dr);
5315 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5317 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
5318 data_ref =
5319 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5320 break;
5322 case dr_explicit_realign:
5324 tree ptr, bump;
5325 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5327 if (compute_in_loop)
5328 msq = vect_setup_realignment (first_stmt, bsi,
5329 &realignment_token,
5330 dr_explicit_realign,
5331 dataref_ptr, NULL);
5333 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5334 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5335 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5336 new_temp = make_ssa_name (vec_dest, new_stmt);
5337 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5338 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5339 copy_virtual_operands (new_stmt, stmt);
5340 mark_symbols_for_renaming (new_stmt);
5341 msq = new_temp;
5343 bump = size_binop (MULT_EXPR, vs_minus_1,
5344 TYPE_SIZE_UNIT (scalar_type));
5345 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5346 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5347 break;
5349 case dr_explicit_realign_optimized:
5350 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5351 break;
5352 default:
5353 gcc_unreachable ();
5355 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5356 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5357 new_temp = make_ssa_name (vec_dest, new_stmt);
5358 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5359 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5360 mark_symbols_for_renaming (new_stmt);
5362 /* 3. Handle explicit realignment if necessary/supported. Create in
5363 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5364 if (alignment_support_scheme == dr_explicit_realign_optimized
5365 || alignment_support_scheme == dr_explicit_realign)
5367 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5368 if (!realignment_token)
5369 realignment_token = dataref_ptr;
5370 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5371 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5372 realignment_token);
5373 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5374 new_temp = make_ssa_name (vec_dest, new_stmt);
5375 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5376 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5378 if (alignment_support_scheme == dr_explicit_realign_optimized)
5380 if (i == group_size - 1 && j == ncopies - 1)
5381 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5382 msq = lsq;
5386 /* 4. Handle invariant-load. */
5387 if (inv_p)
5389 gcc_assert (!strided_load);
5390 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5391 if (j == 0)
5393 int k;
5394 tree t = NULL_TREE;
5395 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5397 /* CHECKME: bitpos depends on endianness? */
5398 bitpos = bitsize_zero_node;
5399 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5400 bitsize, bitpos);
5401 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5402 TYPE_UNSIGNED (scalar_type);
5403 vec_dest =
5404 vect_create_destination_var (scalar_dest, NULL_TREE);
5405 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5406 new_temp = make_ssa_name (vec_dest, new_stmt);
5407 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5408 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5410 for (k = nunits - 1; k >= 0; --k)
5411 t = tree_cons (NULL_TREE, new_temp, t);
5412 /* FIXME: use build_constructor directly. */
5413 vec_inv = build_constructor_from_list (vectype, t);
5414 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5415 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5417 else
5418 gcc_unreachable (); /* FORNOW. */
5421 if (strided_load)
5422 VEC_quick_push (tree, dr_chain, new_temp);
5423 if (i < group_size - 1)
5424 dataref_ptr =
5425 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5428 if (strided_load)
5430 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5431 return false;
5432 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5433 dr_chain = VEC_alloc (tree, heap, group_size);
5435 else
5437 if (j == 0)
5438 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5439 else
5440 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5441 prev_stmt_info = vinfo_for_stmt (new_stmt);
5445 return true;
5449 /* Function vectorizable_live_operation.
5451 STMT computes a value that is used outside the loop. Check if
5452 it can be supported. */
5454 bool
5455 vectorizable_live_operation (tree stmt,
5456 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5457 tree *vec_stmt ATTRIBUTE_UNUSED)
5459 tree operation;
5460 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5461 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5462 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 int i;
5464 int op_type;
5465 tree op;
5466 tree def, def_stmt;
5467 enum vect_def_type dt;
5469 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5471 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5472 return false;
5474 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5475 return false;
5477 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5478 return false;
5480 /* FORNOW. CHECKME. */
5481 if (nested_in_vect_loop_p (loop, stmt))
5482 return false;
5484 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5485 op_type = TREE_OPERAND_LENGTH (operation);
5487 /* FORNOW: support only if all uses are invariant. This means
5488 that the scalar operations can remain in place, unvectorized.
5489 The original last scalar value that they compute will be used. */
5491 for (i = 0; i < op_type; i++)
5493 op = TREE_OPERAND (operation, i);
5494 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5496 if (vect_print_dump_info (REPORT_DETAILS))
5497 fprintf (vect_dump, "use not simple.");
5498 return false;
5501 if (dt != vect_invariant_def && dt != vect_constant_def)
5502 return false;
5505 /* No transformation is required for the cases we currently support. */
5506 return true;
5510 /* Function vect_is_simple_cond.
5512 Input:
5513 LOOP - the loop that is being vectorized.
5514 COND - Condition that is checked for simple use.
5516 Returns whether COND can be vectorized. Checks whether the
5517 condition operands are supportable using vect_is_simple_use. */
5519 static bool
5520 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
5522 tree lhs, rhs;
5523 tree def;
5524 enum vect_def_type dt;
5526 if (!COMPARISON_CLASS_P (cond))
5527 return false;
5529 lhs = TREE_OPERAND (cond, 0);
5530 rhs = TREE_OPERAND (cond, 1);
5532 if (TREE_CODE (lhs) == SSA_NAME)
5534 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
5535 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
5536 return false;
5538 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
5539 && TREE_CODE (lhs) != FIXED_CST)
5540 return false;
5542 if (TREE_CODE (rhs) == SSA_NAME)
5544 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
5545 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
5546 return false;
5548 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
5549 && TREE_CODE (rhs) != FIXED_CST)
5550 return false;
5552 return true;
5555 /* vectorizable_condition.
5557 Check if STMT is a conditional modify expression that can be vectorized.
5558 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5559 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
5560 at BSI.
5562 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
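/* Illustrative sketch (scalar names invented for the example): a stmt
   a = (b < c) ? x : y
   is vectorized by the code below into
   va = VEC_COND_EXPR <vb < vc, vx, vy>
   where vb, vc, vx and vy are the vector defs obtained for b, c, x and y. */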
5564 bool
5565 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
5567 tree scalar_dest = NULL_TREE;
5568 tree vec_dest = NULL_TREE;
5569 tree op = NULL_TREE;
5570 tree cond_expr, then_clause, else_clause;
5571 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5572 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5573 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
5574 tree vec_compare, vec_cond_expr;
5575 tree new_temp;
5576 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5577 enum machine_mode vec_mode;
5578 tree def;
5579 enum vect_def_type dt;
5580 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5581 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5583 gcc_assert (ncopies >= 1);
5584 if (ncopies > 1)
5585 return false; /* FORNOW */
5587 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5588 return false;
5590 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5591 return false;
5593 /* FORNOW: not yet supported. */
5594 if (STMT_VINFO_LIVE_P (stmt_info))
5596 if (vect_print_dump_info (REPORT_DETAILS))
5597 fprintf (vect_dump, "value used after loop.");
5598 return false;
5601 /* Is vectorizable conditional operation? */
5602 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5603 return false;
5605 op = GIMPLE_STMT_OPERAND (stmt, 1);
5607 if (TREE_CODE (op) != COND_EXPR)
5608 return false;
5610 cond_expr = TREE_OPERAND (op, 0);
5611 then_clause = TREE_OPERAND (op, 1);
5612 else_clause = TREE_OPERAND (op, 2);
5614 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
5615 return false;
5617 /* We do not handle two different vector types for the condition
5618 and the values. */
5619 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
5620 return false;
5622 if (TREE_CODE (then_clause) == SSA_NAME)
5624 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
5625 if (!vect_is_simple_use (then_clause, loop_vinfo,
5626 &then_def_stmt, &def, &dt))
5627 return false;
5629 else if (TREE_CODE (then_clause) != INTEGER_CST
5630 && TREE_CODE (then_clause) != REAL_CST
5631 && TREE_CODE (then_clause) != FIXED_CST)
5632 return false;
5634 if (TREE_CODE (else_clause) == SSA_NAME)
5636 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
5637 if (!vect_is_simple_use (else_clause, loop_vinfo,
5638 &else_def_stmt, &def, &dt))
5639 return false;
5641 else if (TREE_CODE (else_clause) != INTEGER_CST
5642 && TREE_CODE (else_clause) != REAL_CST
5643 && TREE_CODE (else_clause) != FIXED_CST)
5644 return false;
5647 vec_mode = TYPE_MODE (vectype);
5649 if (!vec_stmt)
5651 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
5652 return expand_vec_cond_expr_p (op, vec_mode);
5655 /* Transform */
5657 /* Handle def. */
5658 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5659 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5661 /* Handle cond expr. */
5662 vec_cond_lhs =
5663 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
5664 vec_cond_rhs =
5665 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
5666 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
5667 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
5669 /* Arguments are ready. Create the new vector stmt. */
5670 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
5671 vec_cond_lhs, vec_cond_rhs);
5672 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
5673 vec_compare, vec_then_clause, vec_else_clause);
5675 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
5676 new_temp = make_ssa_name (vec_dest, *vec_stmt);
5677 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
5678 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
5680 return true;
5683 /* Function vect_transform_stmt.
5685 Create a vectorized stmt to replace STMT, and insert it at BSI. */
5687 bool
5688 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
5690 bool is_store = false;
5691 tree vec_stmt = NULL_TREE;
5692 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5693 tree orig_stmt_in_pattern;
5694 bool done;
5696 switch (STMT_VINFO_TYPE (stmt_info))
5698 case type_demotion_vec_info_type:
5699 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
5700 gcc_assert (done);
5701 break;
5703 case type_promotion_vec_info_type:
5704 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
5705 gcc_assert (done);
5706 break;
5708 case type_conversion_vec_info_type:
5709 done = vectorizable_conversion (stmt, bsi, &vec_stmt);
5710 gcc_assert (done);
5711 break;
5713 case induc_vec_info_type:
5714 done = vectorizable_induction (stmt, bsi, &vec_stmt);
5715 gcc_assert (done);
5716 break;
5718 case op_vec_info_type:
5719 done = vectorizable_operation (stmt, bsi, &vec_stmt);
5720 gcc_assert (done);
5721 break;
5723 case assignment_vec_info_type:
5724 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
5725 gcc_assert (done);
5726 break;
5728 case load_vec_info_type:
5729 done = vectorizable_load (stmt, bsi, &vec_stmt);
5730 gcc_assert (done);
5731 break;
5733 case store_vec_info_type:
5734 done = vectorizable_store (stmt, bsi, &vec_stmt);
5735 gcc_assert (done);
5736 if (DR_GROUP_FIRST_DR (stmt_info))
5738 /* In case of interleaving, the whole chain is vectorized when the
5739 last store in the chain is reached. Store stmts before the last
5740 one are skipped, and their stmt_vec_info shouldn't be freed
5741 meanwhile. */
5742 *strided_store = true;
5743 if (STMT_VINFO_VEC_STMT (stmt_info))
5744 is_store = true;
5746 else
5747 is_store = true;
5748 break;
5750 case condition_vec_info_type:
5751 done = vectorizable_condition (stmt, bsi, &vec_stmt);
5752 gcc_assert (done);
5753 break;
5755 case call_vec_info_type:
5756 done = vectorizable_call (stmt, bsi, &vec_stmt);
5757 break;
5759 case reduc_vec_info_type:
5760 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
5761 gcc_assert (done);
5762 break;
5764 default:
5765 if (!STMT_VINFO_LIVE_P (stmt_info))
5767 if (vect_print_dump_info (REPORT_DETAILS))
5768 fprintf (vect_dump, "stmt not supported.");
5769 gcc_unreachable ();
5773 if (STMT_VINFO_LIVE_P (stmt_info)
5774 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
5776 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
5777 gcc_assert (done);
5780 if (vec_stmt)
5782 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
5783 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
5784 if (orig_stmt_in_pattern)
5786 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
5787 /* STMT was inserted by the vectorizer to replace a computation idiom.
5788 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
5789 computed this idiom. We need to record a pointer to VEC_STMT in
5790 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
5791 documentation of vect_pattern_recog. */
5792 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
5794 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5795 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
5800 return is_store;
5804 /* This function builds, on the loop preheader, the statement
5805 ni_name = number of iterations the loop executes. */
5807 static tree
5808 vect_build_loop_niters (loop_vec_info loop_vinfo)
5810 tree ni_name, stmt, var;
5811 edge pe;
5812 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5813 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5815 var = create_tmp_var (TREE_TYPE (ni), "niters");
5816 add_referenced_var (var);
5817 ni_name = force_gimple_operand (ni, &stmt, false, var);
5819 pe = loop_preheader_edge (loop);
5820 if (stmt)
5822 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5823 gcc_assert (!new_bb);
5826 return ni_name;
5830 /* This function generates the following statements:
5832 ni_name = number of iterations the loop executes
5833 ratio = ni_name / vf
5834 ratio_mult_vf_name = ratio * vf
5836 and places them at the loop preheader edge. */
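/* A worked instance with illustrative numbers: for vf = 4 (log_vf = 2) and
   ni_name = 10, the code below computes ratio = 10 >> 2 = 2 and
   ratio_mult_vf_name = 2 << 2 = 8, so the vectorized loop executes 2
   iterations and the epilog loop handles the remaining 10 - 8 = 2
   scalar iterations. */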
5838 static void
5839 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5840 tree *ni_name_ptr,
5841 tree *ratio_mult_vf_name_ptr,
5842 tree *ratio_name_ptr)
5845 edge pe;
5846 basic_block new_bb;
5847 tree stmt, ni_name;
5848 tree var;
5849 tree ratio_name;
5850 tree ratio_mult_vf_name;
5851 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5852 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
5853 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5854 tree log_vf;
5856 pe = loop_preheader_edge (loop);
5858 /* Generate a temporary variable that contains
5859 the number of iterations the loop executes. */
5861 ni_name = vect_build_loop_niters (loop_vinfo);
5862 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
5864 /* Create: ratio = ni >> log2(vf) */
5866 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
5867 if (!is_gimple_val (ratio_name))
5869 var = create_tmp_var (TREE_TYPE (ni), "bnd");
5870 add_referenced_var (var);
5872 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
5873 pe = loop_preheader_edge (loop);
5874 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5875 gcc_assert (!new_bb);
5878 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5880 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5881 ratio_name, log_vf);
5882 if (!is_gimple_val (ratio_mult_vf_name))
5884 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
5885 add_referenced_var (var);
5887 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
5888 true, var);
5889 pe = loop_preheader_edge (loop);
5890 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
5891 gcc_assert (!new_bb);
5894 *ni_name_ptr = ni_name;
5895 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5896 *ratio_name_ptr = ratio_name;
5898 return;
5902 /* Function vect_update_ivs_after_vectorizer.
5904 "Advance" the induction variables of LOOP to the value they should take
5905 after the execution of LOOP. This is currently necessary because the
5906 vectorizer does not handle induction variables that are used after the
5907 loop. Such a situation occurs when the last iterations of LOOP are
5908 peeled, because:
5909 1. We introduced new uses after LOOP for IVs that were not originally used
5910 after LOOP: the IVs of LOOP are now used by an epilog loop.
5911 2. LOOP is going to be vectorized; this means that it will iterate N/VF
5912 times, whereas the loop IVs should be bumped N times.
5914 Input:
5915 - LOOP - a loop that is going to be vectorized. The last few iterations
5916 of LOOP were peeled.
5917 - NITERS - the number of iterations that LOOP executes (before it is
5918 vectorized). i.e, the number of times the ivs should be bumped.
5919 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
5920 coming out from LOOP on which there are uses of the LOOP ivs
5921 (this is the path from LOOP->exit to epilog_loop->preheader).
5923 The new definitions of the ivs are placed in LOOP->exit.
5924 The phi args associated with the edge UPDATE_E in the bb
5925 UPDATE_E->dest are updated accordingly.
5927 Assumption 1: Like the rest of the vectorizer, this function assumes
5928 a single loop exit that has a single predecessor.
5930 Assumption 2: The phi nodes in the LOOP header and in update_bb are
5931 organized in the same order.
5933 Assumption 3: The access function of the ivs is simple enough (see
5934 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
5936 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
5937 coming out of LOOP on which the ivs of LOOP are used (this is the path
5938 that leads to the epilog loop; other paths skip the epilog loop). This
5939 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
5940 needs to have its phis updated.
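/* A small illustration (values invented for the example): for an IV with
   initial value 0 and step 4, and NITERS = n, the code below computes
   ni = 0 + n * 4 at the loop exit and uses it as the phi argument on
   UPDATE_E, so the epilog loop sees the IV as if the scalar loop had
   already run all n iterations. */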
5943 static void
5944 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
5945 edge update_e)
5947 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5948 basic_block exit_bb = single_exit (loop)->dest;
5949 tree phi, phi1;
5950 basic_block update_bb = update_e->dest;
5952 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
5954 /* Make sure there exists a single-predecessor exit bb: */
5955 gcc_assert (single_pred_p (exit_bb));
5957 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
5958 phi && phi1;
5959 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
5961 tree access_fn = NULL;
5962 tree evolution_part;
5963 tree init_expr;
5964 tree step_expr;
5965 tree var, ni, ni_name;
5966 block_stmt_iterator last_bsi;
5968 if (vect_print_dump_info (REPORT_DETAILS))
5970 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
5971 print_generic_expr (vect_dump, phi, TDF_SLIM);
5974 /* Skip virtual phis. */
5975 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
5977 if (vect_print_dump_info (REPORT_DETAILS))
5978 fprintf (vect_dump, "virtual phi. skip.");
5979 continue;
5982 /* Skip reduction phis. */
5983 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
5985 if (vect_print_dump_info (REPORT_DETAILS))
5986 fprintf (vect_dump, "reduc phi. skip.");
5987 continue;
5990 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
5991 gcc_assert (access_fn);
5992 evolution_part =
5993 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
5994 gcc_assert (evolution_part != NULL_TREE);
5996 /* FORNOW: We do not support IVs whose evolution function is a polynomial
5997 of degree >= 2 or exponential. */
5998 gcc_assert (!tree_is_chrec (evolution_part));
6000 step_expr = evolution_part;
6001 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6002 loop->num));
6004 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6005 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6006 init_expr,
6007 fold_convert (sizetype,
6008 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6009 niters, step_expr)));
6010 else
6011 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6012 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6013 fold_convert (TREE_TYPE (init_expr),
6014 niters),
6015 step_expr),
6016 init_expr);
6020 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6021 add_referenced_var (var);
6023 last_bsi = bsi_last (exit_bb);
6024 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6025 true, BSI_SAME_STMT);
6027 /* Fix phi expressions in the successor bb. */
6028 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6033 /* Function vect_do_peeling_for_loop_bound
6035 Peel the last iterations of the loop represented by LOOP_VINFO.
6036 The peeled iterations form a new epilog loop. Given that the loop now
6037 iterates NITERS times, the new epilog loop iterates
6038 NITERS % VECTORIZATION_FACTOR times.
6040 The original loop will later be made to iterate
6041 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
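/* E.g. (illustrative numbers): with NITERS = 103 and a vectorization
   factor of 4, the vectorized loop executes 103 / 4 = 25 iterations
   (covering 100 scalar iterations) and the epilog loop executes the
   remaining 103 % 4 = 3 iterations in scalar form. */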
6043 static void
6044 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6046 tree ni_name, ratio_mult_vf_name;
6047 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6048 struct loop *new_loop;
6049 edge update_e;
6050 basic_block preheader;
6051 int loop_num;
6052 unsigned int th;
6053 int min_scalar_loop_bound;
6054 int min_profitable_iters;
6056 if (vect_print_dump_info (REPORT_DETAILS))
6057 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6059 initialize_original_copy_tables ();
6061 /* Generate the following variables on the preheader of the original loop:
6063 ni_name = number of iterations the original loop executes
6064 ratio = ni_name / vf
6065 ratio_mult_vf_name = ratio * vf */
6066 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6067 &ratio_mult_vf_name, ratio);
6069 loop_num = loop->num;
6071 /* Analyze the cost to set a threshold for the vectorized loop. */
6072 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6073 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
6074 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6076 /* Use the cost model only if it is more conservative than the user-specified
6077 threshold. */
6079 th = (unsigned) min_scalar_loop_bound;
6080 if (min_profitable_iters
6081 && (!min_scalar_loop_bound
6082 || min_profitable_iters > min_scalar_loop_bound))
6083 th = (unsigned) min_profitable_iters;
6085 if (min_profitable_iters
6086 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6087 && vect_print_dump_info (REPORT_DETAILS))
6088 fprintf (vect_dump, "vectorization may not be profitable.");
6090 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6091 ratio_mult_vf_name, ni_name, false,
6092 th);
6093 gcc_assert (new_loop);
6094 gcc_assert (loop_num == loop->num);
6095 #ifdef ENABLE_CHECKING
6096 slpeel_verify_cfg_after_peeling (loop, new_loop);
6097 #endif
6099 /* A guard that controls whether the new_loop is to be executed or skipped
6100 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6101 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6102 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6103 is on the path where the LOOP IVs are used and need to be updated. */
6105 preheader = loop_preheader_edge (new_loop)->src;
6106 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6107 update_e = EDGE_PRED (preheader, 0);
6108 else
6109 update_e = EDGE_PRED (preheader, 1);
6111 /* Update IVs of original loop as if they were advanced
6112 by ratio_mult_vf_name steps. */
6113 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6115 /* After peeling we have to reset the scalar evolution analyzer. */
6116 scev_reset ();
6118 free_original_copy_tables ();
6122 /* Function vect_gen_niters_for_prolog_loop
6124 Set the number of iterations for the loop represented by LOOP_VINFO
6125 to the minimum between LOOP_NITERS (the original iteration count of the loop)
6126 and the misalignment of DR - the data reference recorded in
6127 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
6128 this loop, the data reference DR will refer to an aligned location.
6130 The following computation is generated:
6132 If the misalignment of DR is known at compile time:
6133 addr_mis = int mis = DR_MISALIGNMENT (dr);
6134 Else, compute address misalignment in bytes:
6135 addr_mis = addr & (vectype_size - 1)
6137 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6139 (elem_size = element type size; an element is the scalar element
6140 whose type is the inner type of the vectype)
6142 For interleaving,
6144 prolog_niters = min ( LOOP_NITERS ,
6145 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6146 where group_size is the size of the interleaved group.
6148 The above formulas assume that VF == number of elements in the vector. This
6149 may not hold when there are multiple types in the loop.
6150 In this case, for some data-references in the loop the VF does not represent
6151 the number of elements that fit in the vector. Therefore, instead of VF we
6152 use TYPE_VECTOR_SUBPARTS. */
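/* A worked instance of the first formula above (illustrative numbers):
   for VF = 4, a 4-byte element type and a known misalignment of 8 bytes,
   addr_mis/elem_size = 8/4 = 2, so
   prolog_niters = min (LOOP_NITERS, (4 - 2) & 3) = min (LOOP_NITERS, 2):
   two scalar iterations are peeled before DR becomes aligned. */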
6154 static tree
6155 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6157 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6158 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6159 tree var, stmt;
6160 tree iters, iters_name;
6161 edge pe;
6162 basic_block new_bb;
6163 tree dr_stmt = DR_STMT (dr);
6164 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6165 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6166 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6167 tree niters_type = TREE_TYPE (loop_niters);
6168 int group_size = 1;
6169 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6170 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6172 if (DR_GROUP_FIRST_DR (stmt_info))
6174 /* For interleaved access element size must be multiplied by the size of
6175 the interleaved group. */
6176 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6177 DR_GROUP_FIRST_DR (stmt_info)));
6178 element_size *= group_size;
6181 pe = loop_preheader_edge (loop);
6183 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6185 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6186 int elem_misalign = byte_misalign / element_size;
6188 if (vect_print_dump_info (REPORT_DETAILS))
6189 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6190 iters = build_int_cst (niters_type,
6191 (nelements - elem_misalign)&(nelements/group_size-1));
6193 else
6195 tree new_stmts = NULL_TREE;
6196 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6197 &new_stmts, NULL_TREE, loop);
6198 tree ptr_type = TREE_TYPE (start_addr);
6199 tree size = TYPE_SIZE (ptr_type);
6200 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6201 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6202 tree elem_size_log =
6203 build_int_cst (type, exact_log2 (vectype_align/nelements));
6204 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6205 tree nelements_tree = build_int_cst (type, nelements);
6206 tree byte_misalign;
6207 tree elem_misalign;
6209 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6210 gcc_assert (!new_bb);
6212 /* Create: byte_misalign = addr & (vectype_size - 1) */
6213 byte_misalign =
6214 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6216 /* Create: elem_misalign = byte_misalign / element_size */
6217 elem_misalign =
6218 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6220 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6221 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6222 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6223 iters = fold_convert (niters_type, iters);
6226 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6227 /* If the loop bound is known at compile time we already verified that it is
6228 greater than vf; since the misalignment ('iters') is at most vf, there's
6229 no need to generate the MIN_EXPR in this case. */
6230 if (TREE_CODE (loop_niters) != INTEGER_CST)
6231 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6233 if (vect_print_dump_info (REPORT_DETAILS))
6235 fprintf (vect_dump, "niters for prolog loop: ");
6236 print_generic_expr (vect_dump, iters, TDF_SLIM);
6239 var = create_tmp_var (niters_type, "prolog_loop_niters");
6240 add_referenced_var (var);
6241 iters_name = force_gimple_operand (iters, &stmt, false, var);
6243 /* Insert stmt on loop preheader edge. */
6244 if (stmt)
6246 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6247 gcc_assert (!new_bb);
6250 return iters_name;
6254 /* Function vect_update_init_of_dr
6256 NITERS iterations were peeled from LOOP. DR represents a data reference
6257 in LOOP. This function updates the information recorded in DR to
6258 account for the fact that the first NITERS iterations had already been
6259 executed. Specifically, it updates the OFFSET field of DR. */
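/* E.g. (illustrative numbers): if DR_STEP is 4 bytes and NITERS = 3
   iterations were peeled, the code below adds 3 * 4 = 12 bytes to
   DR_OFFSET, so that DR describes the first access performed by the
   remaining loop. */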
6261 static void
6262 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6264 tree offset = DR_OFFSET (dr);
6266 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6267 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6268 DR_OFFSET (dr) = offset;
6272 /* Function vect_update_inits_of_drs
6274 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6275 This function updates the information recorded for the data references in
6276 the loop to account for the fact that the first NITERS iterations had
6277 already been executed. Specifically, it updates the initial_condition of
6278 the access_function of all the data_references in the loop. */
6280 static void
6281 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6283 unsigned int i;
6284 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6285 struct data_reference *dr;
6287 if (vect_print_dump_info (REPORT_DETAILS))
6288 fprintf (vect_dump, "=== vect_update_inits_of_dr ===");
6290 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6291 vect_update_init_of_dr (dr, niters);
6295 /* Function vect_do_peeling_for_alignment
6297 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6298 'niters' is set to the misalignment of one of the data references in the
6299 loop, thereby forcing it to refer to an aligned location at the beginning
6300 of the execution of this loop. The data reference for which we are
6301 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
6303 static void
6304 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6306 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6307 tree niters_of_prolog_loop, ni_name;
6308 tree n_iters;
6309 struct loop *new_loop;
6311 if (vect_print_dump_info (REPORT_DETAILS))
6312 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6314 initialize_original_copy_tables ();
6316 ni_name = vect_build_loop_niters (loop_vinfo);
6317 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6319 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
6320 new_loop =
6321 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6322 niters_of_prolog_loop, ni_name, true, 0);
6323 gcc_assert (new_loop);
6324 #ifdef ENABLE_CHECKING
6325 slpeel_verify_cfg_after_peeling (new_loop, loop);
6326 #endif
6328 /* Update the number of times the loop executes. */
6329 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6330 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6331 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6333 /* Update the init conditions of the access functions of all data refs. */
6334 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6336 /* After peeling we have to reset the scalar evolution analyzer. */
6337 scev_reset ();
6339 free_original_copy_tables ();
6343 /* Function vect_create_cond_for_align_checks.
6345 Create a conditional expression that represents the alignment checks for
6346 all data references (array element references) whose alignment must be
6347 checked at runtime.
6349 Input:
6350 LOOP_VINFO - two fields of the loop information are used.
6351 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6352 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6354 Output:
6355 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6356 expression.
6357 The returned value is the conditional expression to be used in the if
6358 statement that controls which version of the loop gets executed at runtime.
6360 The algorithm makes two assumptions:
6361 1) The number of bytes "n" in a vector is a power of 2.
6362 2) An address "a" is aligned if a%n is zero; this
6363 test can be done as a&(n-1) == 0. For example, for 16
6364 byte vectors the test is a&0xf == 0. */
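/* Shape of the generated sequence for two data references and 16-byte
   vectors (the temporary names follow the sprintf patterns used below):
     addr2int0 = (int) addr_of_first_vector_of_dr_0
     addr2int1 = (int) addr_of_first_vector_of_dr_1
     orptrs1 = addr2int0 | addr2int1
     andmask = orptrs1 & 0xf
   and the returned condition is andmask == 0. */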
6366 static tree
6367 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6368 tree *cond_expr_stmt_list)
6370 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6371 VEC(tree,heap) *may_misalign_stmts
6372 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6373 tree ref_stmt, tmp;
6374 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6375 tree mask_cst;
6376 unsigned int i;
6377 tree psize;
6378 tree int_ptrsize_type;
6379 char tmp_name[20];
6380 tree or_tmp_name = NULL_TREE;
6381 tree and_tmp, and_tmp_name, and_stmt;
6382 tree ptrsize_zero;
6384 /* Check that mask is one less than a power of 2, i.e., mask is
6385 all zeros followed by all ones. */
6386 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6388 /* CHECKME: what is the best integer or unsigned type to use to hold a
6389 cast from a pointer value? */
6390 psize = TYPE_SIZE (ptr_type_node);
6391 int_ptrsize_type
6392 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6394 /* Create the expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6395 of the first vector of the i'th data reference. */
6397 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6399 tree new_stmt_list = NULL_TREE;
6400 tree addr_base;
6401 tree addr_tmp, addr_tmp_name, addr_stmt;
6402 tree or_tmp, new_or_tmp_name, or_stmt;
6404 /* create: addr_tmp = (int)(address_of_first_vector) */
6405 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6406 &new_stmt_list, NULL_TREE, loop);
6408 if (new_stmt_list != NULL_TREE)
6409 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6411 sprintf (tmp_name, "%s%d", "addr2int", i);
6412 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6413 add_referenced_var (addr_tmp);
6414 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6415 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6416 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6417 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6418 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6420 /* The addresses are ORed together. */
6422 if (or_tmp_name != NULL_TREE)
6424 /* create: or_tmp = or_tmp | addr_tmp */
6425 sprintf (tmp_name, "%s%d", "orptrs", i);
6426 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6427 add_referenced_var (or_tmp);
6428 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6429 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6430 or_tmp_name, addr_tmp_name);
6431 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6432 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6433 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6434 or_tmp_name = new_or_tmp_name;
6436 else
6437 or_tmp_name = addr_tmp_name;
6439 } /* end for i */
6441 mask_cst = build_int_cst (int_ptrsize_type, mask);
6443 /* create: and_tmp = or_tmp & mask */
6444 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6445 add_referenced_var (and_tmp);
6446 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6448 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6449 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6450 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6451 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
6453 /* Make and_tmp the left operand of the conditional test against zero.
6454 If and_tmp has a nonzero bit, then some address is unaligned. */
6455 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
6456 return build2 (EQ_EXPR, boolean_type_node,
6457 and_tmp_name, ptrsize_zero);
6460 /* Function vect_vfa_segment_size.
6462 Create an expression that computes the size of the segment
6463 that will be accessed for a data reference. The function takes into
6464 account that realignment loads may access one more vector.
6466 Input:
6467 DR: The data reference.
6468 VECT_FACTOR: vectorization factor.
6470 Return an expression whose value is the size of the segment that will be
6471 accessed by DR. */
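/* E.g. (illustrative numbers): for DR_STEP = 4 bytes, VECT_FACTOR = 4 and
   a 16-byte vector type, the segment is 4 * 4 = 16 bytes, or
   4 * 4 + 16 = 32 bytes when a realignment load may touch one extra
   vector (the dr_explicit_realign_optimized case below). */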
6473 static tree
6474 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
6476 tree segment_length;
6478 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
6480 tree vector_size =
6481 build_int_cst (integer_type_node,
6482 GET_MODE_SIZE (TYPE_MODE (STMT_VINFO_VECTYPE
6483 (vinfo_for_stmt (DR_STMT (dr))))));
6485 segment_length =
6486 fold_convert (sizetype,
6487 fold_build2 (PLUS_EXPR, integer_type_node,
6488 fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
6489 vect_factor),
6490 vector_size));
6492 else
6494 segment_length =
6495 fold_convert (sizetype,
6496 fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr),
6497 vect_factor));
6500 return segment_length;
6503 /* Function vect_create_cond_for_alias_checks.
6505 Create a conditional expression that represents the run-time checks for
6506 overlapping of address ranges represented by a list of data reference
6507 relations passed as input.
6509 Input:
6510 COND_EXPR - input conditional expression. New conditions will be chained
6511 with a logical AND operation.
6512 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
6513 to be checked.
6515 Output:
6516 COND_EXPR - conditional expression.
6517 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6518 expression.
6519 The returned value is the conditional expression to be used in the if
6520 statement that controls which version of the loop gets executed at runtime.
6523 static void
6524 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
6525 tree * cond_expr,
6526 tree * cond_expr_stmt_list)
6528 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6529 VEC (ddr_p, heap) * may_alias_ddrs =
6530 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
6531 tree vect_factor =
6532 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
6534 ddr_p ddr;
6535 unsigned int i;
6536 tree part_cond_expr;
6538 /* Create expression
6539 ((store_ptr_0 + store_segment_length_0) < load_ptr_0)
6540 || (load_ptr_0 + load_segment_length_0) < store_ptr_0))
&& ... &&
6544 ((store_ptr_n + store_segment_length_n) < load_ptr_n)
6545 || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */
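/* In other words, for each pair: if the store segment ends before the
   load segment begins, or vice versa, the two address ranges cannot
   overlap and the vectorized loop may be executed. */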
6547 if (VEC_empty (ddr_p, may_alias_ddrs))
6548 return;
6550 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
6552 tree stmt_a = DR_STMT (DDR_A (ddr));
6553 tree stmt_b = DR_STMT (DDR_B (ddr));
6555 tree addr_base_a =
6556 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
6557 NULL_TREE, loop);
6558 tree addr_base_b =
6559 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
6560 NULL_TREE, loop);
6562 tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor);
6563 tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor);
6565 if (vect_print_dump_info (REPORT_DR_DETAILS))
6567 fprintf (vect_dump,
6568 "create runtime check for data references ");
6569 print_generic_expr (vect_dump, DR_REF (DDR_A (ddr)), TDF_SLIM);
6570 fprintf (vect_dump, " and ");
6571 print_generic_expr (vect_dump, DR_REF (DDR_B (ddr)), TDF_SLIM);
6575 part_cond_expr =
6576 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
6577 fold_build2 (LT_EXPR, boolean_type_node,
6578 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
6579 addr_base_a,
6580 segment_length_a),
6581 addr_base_b),
6582 fold_build2 (LT_EXPR, boolean_type_node,
6583 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
6584 addr_base_b,
6585 segment_length_b),
6586 addr_base_a));
6588 if (*cond_expr)
6589 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
6590 *cond_expr, part_cond_expr);
6591 else
6592 *cond_expr = part_cond_expr;
6594 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6595 fprintf (vect_dump, "created %u versioning for alias checks.\n",
6596 VEC_length (ddr_p, may_alias_ddrs));
6600 /* Function vect_transform_loop.
6602 The analysis phase has determined that the loop is vectorizable.
6603 Vectorize the loop - create vectorized stmts to replace the scalar
6604 stmts in the loop, and update the loop exit condition. */
6606 void
6607 vect_transform_loop (loop_vec_info loop_vinfo)
6609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6610 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6611 int nbbs = loop->num_nodes;
6612 block_stmt_iterator si, next_si;
6613 int i;
6614 tree ratio = NULL;
6615 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6616 bool strided_store;
6618 if (vect_print_dump_info (REPORT_DETAILS))
6619 fprintf (vect_dump, "=== vect_transform_loop ===");
6621 /* If the loop has data references that may or may not be aligned and/or
6622 has data reference relations whose independence was not proven, then
6623 two versions of the loop need to be generated, one which is vectorized
6624 and one which isn't. A test is then generated to control which of the
6625 loops is executed. The test checks for the alignment of all of the
6626 data references that may or may not be aligned. An additional
6627 sequence of runtime tests is generated for each pair of DDRs whose
6628 independence was not proven. The vectorized version of the loop is
6629 executed only if both alias and alignment tests are passed. */
6631 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6632 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6634 struct loop *nloop;
6635 tree cond_expr = NULL_TREE;
6636 tree cond_expr_stmt_list = NULL_TREE;
6637 basic_block condition_bb;
6638 block_stmt_iterator cond_exp_bsi;
6639 basic_block merge_bb;
6640 basic_block new_exit_bb;
6641 edge new_exit_e, e;
6642 tree orig_phi, new_phi, arg;
6643 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
6644 tree gimplify_stmt_list;
6646 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
6647 cond_expr =
6648 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr_stmt_list);
6650 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6651 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
6652 &cond_expr_stmt_list);
6654 cond_expr =
6655 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
6656 cond_expr =
6657 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
6658 NULL_TREE);
6659 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
6661 initialize_original_copy_tables ();
6662 nloop = loop_version (loop, cond_expr, &condition_bb,
6663 prob, prob, REG_BR_PROB_BASE - prob, true);
6664 free_original_copy_tables();
6666 /** Loop versioning violates an assumption we try to maintain during
6667 vectorization - that the loop exit block has a single predecessor.
6668 After versioning, the exit block of both loop versions is the same
6669 basic block (i.e. it has two predecessors). To simplify the
6670 following transformations in the vectorizer, we fix this situation
6671 here by adding a new (empty) block on the exit-edge of the loop,
6672 with the proper loop-exit phis to maintain loop-closed-form. **/
6674 merge_bb = single_exit (loop)->dest;
6675 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
6676 new_exit_bb = split_edge (single_exit (loop));
6677 new_exit_e = single_exit (loop);
6678 e = EDGE_SUCC (new_exit_bb, 0);
6680 for (orig_phi = phi_nodes (merge_bb); orig_phi;
6681 orig_phi = PHI_CHAIN (orig_phi))
6683 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
6684 new_exit_bb);
6685 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
6686 add_phi_arg (new_phi, arg, new_exit_e);
6687 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
6690 /** end loop-exit-fixes after versioning **/
6692 update_ssa (TODO_update_ssa);
6693 cond_exp_bsi = bsi_last (condition_bb);
6694 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
6697 /* CHECKME: we wouldn't need this if we called update_ssa once
6698 for all loops. */
6699 bitmap_zero (vect_memsyms_to_rename);
6701 /* Peel the loop if there are data refs with unknown alignment.
6702 Only one data ref with unknown alignment is allowed. */
6704 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6705 vect_do_peeling_for_alignment (loop_vinfo);
6707 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6708 compile time constant), or it is a constant that is not divisible by the
6709 vectorization factor, then an epilog loop needs to be created.
6710 We therefore duplicate the loop: the original loop will be vectorized,
6711 and will compute the first (n/VF) iterations. The second copy of the loop
6712 will remain scalar and will compute the remaining (n%VF) iterations.
6713 (VF is the vectorization factor). */
6715 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6716 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
6717 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
6718 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
6719 else
6720 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6721 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6723 /* 1) Make sure the loop header has exactly two entries
6724 2) Make sure we have a preheader basic block. */
6726 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6728 split_edge (loop_preheader_edge (loop));
6730 /* FORNOW: the vectorizer supports only loops whose body consists
6731 of one basic block (header + empty latch). When the vectorizer
6732 supports more involved loop forms, the order in which the BBs are
6733 traversed will need to be reconsidered. */
6735 for (i = 0; i < nbbs; i++)
6737 basic_block bb = bbs[i];
6738 stmt_vec_info stmt_info;
6739 tree phi;
6741 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
6743 if (vect_print_dump_info (REPORT_DETAILS))
6745 fprintf (vect_dump, "------>vectorizing phi: ");
6746 print_generic_expr (vect_dump, phi, TDF_SLIM);
6748 stmt_info = vinfo_for_stmt (phi);
6749 if (!stmt_info)
6750 continue;
6751 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6752 && !STMT_VINFO_LIVE_P (stmt_info))
6753 continue;
6755 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6756 != (unsigned HOST_WIDE_INT) vectorization_factor)
6757 && vect_print_dump_info (REPORT_DETAILS))
6758 fprintf (vect_dump, "multiple-types.");
6760 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6762 if (vect_print_dump_info (REPORT_DETAILS))
6763 fprintf (vect_dump, "transform phi.");
6764 vect_transform_stmt (phi, NULL, NULL);
6768 for (si = bsi_start (bb); !bsi_end_p (si);)
6770 tree stmt = bsi_stmt (si);
6771 bool is_store;
6773 if (vect_print_dump_info (REPORT_DETAILS))
6775 fprintf (vect_dump, "------>vectorizing statement: ");
6776 print_generic_expr (vect_dump, stmt, TDF_SLIM);
6779 stmt_info = vinfo_for_stmt (stmt);
6781 /* vector stmts created in the outer-loop during vectorization of
6782 stmts in an inner-loop may not have a stmt_info, and do not
6783 need to be vectorized. */
6784 if (!stmt_info)
6786 bsi_next (&si);
6787 continue;
6790 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6791 && !STMT_VINFO_LIVE_P (stmt_info))
6793 bsi_next (&si);
6794 continue;
6797 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
6798 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6799 != (unsigned HOST_WIDE_INT) vectorization_factor)
6800 && vect_print_dump_info (REPORT_DETAILS))
6801 fprintf (vect_dump, "multiple-types.");
6803 /* -------- vectorize statement ------------ */
6804 if (vect_print_dump_info (REPORT_DETAILS))
6805 fprintf (vect_dump, "transform statement.");
6807 strided_store = false;
6808 is_store = vect_transform_stmt (stmt, &si, &strided_store);
6809 if (is_store)
6811 stmt_ann_t ann;
6812 if (DR_GROUP_FIRST_DR (stmt_info))
6814 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6815 interleaving chain was completed - free all the stores in
6816 the chain. */
6817 tree next = DR_GROUP_FIRST_DR (stmt_info);
6818 tree tmp;
6819 stmt_vec_info next_stmt_info;
6821 while (next)
6823 next_si = bsi_for_stmt (next);
6824 next_stmt_info = vinfo_for_stmt (next);
6825 /* Free the attached stmt_vec_info and remove the stmt. */
6826 ann = stmt_ann (next);
6827 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
6828 free (next_stmt_info);
6829 set_stmt_info (ann, NULL);
6830 bsi_remove (&next_si, true);
6831 next = tmp;
6833 bsi_remove (&si, true);
6834 continue;
6836 else
6838 /* Free the attached stmt_vec_info and remove the stmt. */
6839 ann = stmt_ann (stmt);
6840 free (stmt_info);
6841 set_stmt_info (ann, NULL);
6842 bsi_remove (&si, true);
6843 continue;
6846 bsi_next (&si);
6847 } /* stmts in BB */
6848 } /* BBs in loop */
6850 slpeel_make_loop_iterate_ntimes (loop, ratio);
6852 mark_set_for_renaming (vect_memsyms_to_rename);
6854 /* The memory tags and pointers in vectorized statements need to
6855 have their SSA forms updated. FIXME, why can't this be delayed
6856 until all the loops have been transformed? */
6857 update_ssa (TODO_update_ssa);
6859 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6860 fprintf (vect_dump, "LOOP VECTORIZED.");
6861 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
6862 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");