/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "ggc.h"
27 #include "tree.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "basic-block.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "timevar.h"
35 #include "cfgloop.h"
36 #include "expr.h"
37 #include "optabs.h"
38 #include "params.h"
39 #include "recog.h"
40 #include "tree-data-ref.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "langhooks.h"
45 #include "tree-pass.h"
46 #include "toplev.h"
47 #include "real.h"
/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree);
static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree);
static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility functions dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);
/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme prepends
   the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
   VAR_KIND) to the name of the new variable, using NAME as a suffix if
   provided.  */
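
/* Illustrative example (not from the original sources): with VAR_KIND ==
   vect_pointer_var and NAME == "a", the variable created below is named
   "vect_pa"; dumps typically show it with a uniquifying suffix appended
   by the temporary-variable machinery (e.g. "vect_pa.15").  */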
static tree
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
{
  const char *prefix;
  tree new_vect_var;

  switch (var_kind)
    {
    case vect_simple_var:
      prefix = "vect_";
      break;
    case vect_scalar_var:
      prefix = "stmp_";
      break;
    case vect_pointer_var:
      prefix = "vect_p";
      break;
    default:
      gcc_unreachable ();
    }

  if (name)
    new_vect_var = create_tmp_var (type, concat (prefix, name, NULL));
  else
    new_vect_var = create_tmp_var (type, prefix);

  /* Mark vector typed variable as a gimple register variable.  */
  if (TREE_CODE (type) == VECTOR_TYPE)
    DECL_GIMPLE_REG_P (new_vect_var) = true;

  return new_vect_var;
}
/* Function vect_create_addr_base_for_vector_ref.

   Create an expression that computes the address of the first memory location
   that will be accessed for a data reference.

   Input:
   STMT: The statement containing the data reference.
   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.

   Output:
   1. Return an SSA_NAME whose value is the address of the memory location of
      the first vector of the data reference.
   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
      these statement(s) which define the returned SSA_NAME.

   FORNOW: We are only handling array accesses with step 1.  */
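
/* Illustrative example (not from the original sources): for a data
   reference a[i] with DR_BASE_ADDRESS == &a and DR_OFFSET == DR_INIT == 0,
   and with OFFSET omitted, the function appends to NEW_STMT_LIST roughly

     base_off.0 = 0;
     vect_pa.1 = (vectype *) (&a + base_off.0);

   and returns the SSA name holding the converted address.  */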
static tree
vect_create_addr_base_for_vector_ref (tree stmt,
                                      tree *new_stmt_list,
                                      tree offset)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
  tree base_name = build_fold_indirect_ref (data_ref_base);
  tree vec_stmt;
  tree addr_base, addr_expr;
  tree dest, new_stmt;
  tree base_offset = unshare_expr (DR_OFFSET (dr));
  tree init = unshare_expr (DR_INIT (dr));
  tree vect_ptr_type, addr_expr2;

  /* Create base_offset.  */
  base_offset = size_binop (PLUS_EXPR, base_offset, init);
  dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
  add_referenced_var (dest);
  base_offset = force_gimple_operand (base_offset, &new_stmt, false, dest);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (offset)
    {
      tree tmp = create_tmp_var (TREE_TYPE (base_offset), "offset");
      tree step;

      /* For interleaved access we divide STEP by the size of the
         interleaving group.  */
      if (DR_GROUP_SIZE (stmt_info))
        step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr),
                            build_int_cst (TREE_TYPE (offset),
                                           DR_GROUP_SIZE (stmt_info)));
      else
        step = DR_STEP (dr);

      add_referenced_var (tmp);
      offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
      base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
                                 base_offset, offset);
      base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
      append_to_statement_list_force (new_stmt, new_stmt_list);
    }

  /* base + base_offset.  */
  addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base,
                           base_offset);

  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));

  /* addr_expr = addr_base.  */
  addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                     get_name (base_name));
  add_referenced_var (addr_expr);
  vec_stmt = fold_convert (vect_ptr_type, addr_base);
  addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                      get_name (base_name));
  add_referenced_var (addr_expr2);
  vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
  append_to_statement_list_force (new_stmt, new_stmt_list);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }
  return vec_stmt;
}
/* Function vect_create_data_ref_ptr.

   Create a new pointer to vector type (vp), that points to the first location
   accessed in the loop by STMT, along with the def-use update chain to
   appropriately advance the pointer through the loop iterations.  Also set
   aliasing information for the pointer.  This vector pointer is used by the
   callers to this function to create a memory reference expression for vector
   load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_MODIFY_STMT <name, data-ref> or
         GIMPLE_MODIFY_STMT <data-ref, name>.
   2. BSI: block_stmt_iterator where new stmts can be added.
   3. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
        pointing to the initial address.
   5. TYPE: if not NULL indicates the required type of the data-ref.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

        v8hi *vp;
        vp = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Return the pointer.  */
static tree
vect_create_data_ref_ptr (tree stmt,
                          block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
                          tree offset, tree *initial_address, tree *ptr_incr,
                          bool only_init, tree type)
{
  tree base_name;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vect_ptr_type;
  tree vect_ptr;
  tree tag;
  tree new_temp;
  tree vec_stmt;
  tree new_stmt_list = NULL_TREE;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vect_ptr_init;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      tree data_ref_base = base_name;
      fprintf (vect_dump, "create vector-pointer variable to type: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
      if (TREE_CODE (data_ref_base) == VAR_DECL)
        fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == ARRAY_REF)
        fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
      else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
        fprintf (vect_dump, " vectorizing a record based array ref: ");
      else if (TREE_CODE (data_ref_base) == SSA_NAME)
        fprintf (vect_dump, " vectorizing a pointer ref: ");
      print_generic_expr (vect_dump, base_name, TDF_SLIM);
    }

  /** (1) Create the new vector-pointer variable:  **/

  if (type)
    vect_ptr_type = build_pointer_type (type);
  else
    vect_ptr_type = build_pointer_type (vectype);
  vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
                                    get_name (base_name));
  add_referenced_var (vect_ptr);

  /** (2) Add aliasing information to the new vector-pointer:
          (The points-to info (DR_PTR_INFO) may be defined later.)  **/

  tag = DR_MEMTAG (dr);
  gcc_assert (tag);

  /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
     tag must be created with tag added to its may alias list.  */
  if (!MTAG_P (tag))
    new_type_alias (vect_ptr, tag, DR_REF (dr));
  else
    set_symbol_mem_tag (vect_ptr, tag);

  var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);

  /** (3) Calculate the initial address of the vector-pointer, and set
          the vector-pointer to point to it before the loop:  **/

  /* Create: (&(base[init_val+offset]) in the loop preheader.  */
  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
                                                   offset);
  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
  gcc_assert (!new_bb);
  *initial_address = new_temp;

  /* Create: p = (vectype *) initial_base  */
  vec_stmt = fold_convert (vect_ptr_type, new_temp);
  vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vect_ptr, vec_stmt);
  vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
  GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
  new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
  gcc_assert (!new_bb);

  /** (4) Handle the updating of the vector-pointer inside the loop:  **/

  if (only_init) /* No update in loop is required.  */
    {
      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
      return vect_ptr_init;
    }
  else
    {
      block_stmt_iterator incr_bsi;
      bool insert_after;
      tree indx_before_incr, indx_after_incr;
      tree incr;

      standard_iv_increment_position (loop, &incr_bsi, &insert_after);
      create_iv (vect_ptr_init,
                 fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)),
                 NULL_TREE, loop, &incr_bsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
      incr = bsi_stmt (incr_bsi);
      set_stmt_info (stmt_ann (incr),
                     new_stmt_vec_info (incr, loop_vinfo));

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
        {
          duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
          duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
        }
      merge_alias_info (vect_ptr_init, indx_before_incr);
      merge_alias_info (vect_ptr_init, indx_after_incr);
      if (ptr_incr)
        *ptr_incr = incr;

      return indx_before_incr;
    }
}
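
/* Illustrative usage (not from the original sources): a caller such as a
   vectorizable load would do roughly

     dataref_ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE,
                                             &dummy, &ptr_incr, false,
                                             NULL_TREE);
     data_ref = build_fold_indirect_ref (dataref_ptr);

   and then bump DATAREF_PTR with bump_vector_ptr (below) for each
   additional vector access generated in the same iteration.  */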
/* Function bump_vector_ptr.

   Increment a pointer (to a vector type) by vector-size.  Connect the new
   increment stmt to the existing def-use update-chain of the pointer.

   The pointer def-use update-chain before this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
        PTR_INCR:       p_2 = DATAREF_PTR + step

   The pointer def-use update-chain after this function:
                        DATAREF_PTR = phi (p_0, p_2)
                        ....
                        NEW_DATAREF_PTR = DATAREF_PTR + vector_size
                        ....
        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step

   Input:
   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
                 in the loop.
   PTR_INCR - the stmt that updates the pointer in each iteration of the loop.
              The increment amount across iterations is also expected to be
              vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.

   Output: Return NEW_DATAREF_PTR as illustrated above.  */
static tree
bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
                 tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree vptr_type = TREE_TYPE (dataref_ptr);
  tree ptr_var = SSA_NAME_VAR (dataref_ptr);
  tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype));
  tree incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  incr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, ptr_var,
                      build2 (PLUS_EXPR, vptr_type, dataref_ptr, update));
  new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
  GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
  vect_finish_stmt_generation (stmt, incr_stmt, bsi);

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
        SET_USE (use_p, new_dataref_ptr);
      else
        gcc_assert (tree_int_cst_compare (use, update) == 0);
    }

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
    duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
  merge_alias_info (new_dataref_ptr, dataref_ptr);

  return new_dataref_ptr;
}
/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */
static tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *new_name;
  tree type;
  enum vect_var_kind kind;

  kind = vectype ? vect_simple_var : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  new_name = get_name (scalar_dest);
  if (!new_name)
    new_name = "var_";
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  add_referenced_var (vec_dest);

  return vec_dest;
}
/* Function vect_init_vector.

   Insert a new stmt (INIT_STMT) that initializes a new vector variable with
   the vector elements of VECTOR_VAR.  Return the DEF of INIT_STMT.  It will be
   used in the vectorization of STMT.  */
static tree
vect_init_vector (tree stmt, tree vector_var, tree vector_type)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree new_var;
  tree init_stmt;
  tree vec_oprnd;
  edge pe;
  tree new_temp;
  basic_block new_bb;

  new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
  add_referenced_var (new_var);

  init_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, new_var, vector_var);
  new_temp = make_ssa_name (new_var, init_stmt);
  GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;

  pe = loop_preheader_edge (loop);
  new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
  gcc_assert (!new_bb);

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "created new init_stmt: ");
      print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
    }

  vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
  return vec_oprnd;
}
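
/* Illustrative example (not from the original sources): for a V4SI
   constant operand {3,3,3,3}, vect_init_vector emits on the preheader
   edge roughly

     vect_cst_.N = { 3, 3, 3, 3 };

   and returns the SSA name of vect_cst_.N, so the initialization executes
   once before the loop rather than in every iteration.  */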
/* Function vect_get_vec_def_for_operand.

   OP is an operand in STMT.  This function returns a (vector) def that will be
   used in the vectorized stmt for STMT.

   In the case that OP is an SSA_NAME which is defined in the loop, then
   STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.

   In case OP is an invariant or constant, a new stmt that creates a vector def
   needs to be introduced.  */
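
/* Illustrative example (not from the original sources): when vectorizing
   S: a = b + 3 with a V4SI vectype, the constant operand 3 is turned into
   a preheader initialization vect_cst_.N = {3,3,3,3} (case 1 below),
   whereas the operand b, defined in the loop by a stmt that has already
   been vectorized into VS, simply yields the def of VS (case 3 below).  */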
static tree
vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
{
  tree vec_oprnd;
  tree vec_stmt;
  tree def_stmt;
  stmt_vec_info def_stmt_info = NULL;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vec_inv;
  tree vec_cst;
  tree t = NULL_TREE;
  tree def;
  int i;
  enum vect_def_type dt;
  bool is_simple_use;
  tree vector_type;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
      print_generic_expr (vect_dump, op, TDF_SLIM);
    }

  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      if (def)
        {
          fprintf (vect_dump, "def = ");
          print_generic_expr (vect_dump, def, TDF_SLIM);
        }
      if (def_stmt)
        {
          fprintf (vect_dump, " def_stmt = ");
          print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
        }
    }

  switch (dt)
    {
    /* Case 1: operand is a constant.  */
    case vect_constant_def:
      {
        if (scalar_def)
          *scalar_def = op;

        /* Create 'vect_cst_ = {cst,cst,...,cst}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, op, t);

        vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
        vec_cst = build_vector (vector_type, t);

        return vect_init_vector (stmt, vec_cst, vector_type);
      }

    /* Case 2: operand is defined outside the loop - loop invariant.  */
    case vect_invariant_def:
      {
        if (scalar_def)
          *scalar_def = def;

        /* Create 'vec_inv = {inv,inv,..,inv}'  */
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Create vector_inv.");

        for (i = nunits - 1; i >= 0; --i)
          t = tree_cons (NULL_TREE, def, t);

        /* FIXME: use build_constructor directly.  */
        vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
        vec_inv = build_constructor_from_list (vector_type, t);
        return vect_init_vector (stmt, vec_inv, vector_type);
      }

    /* Case 3: operand is defined inside the loop.  */
    case vect_loop_def:
      {
        if (scalar_def)
          *scalar_def = def_stmt;

        /* Get the def from the vectorized stmt.  */
        def_stmt_info = vinfo_for_stmt (def_stmt);
        vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
        gcc_assert (vec_stmt);
        vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
        return vec_oprnd;
      }

    /* Case 4: operand is defined by a loop header phi - reduction.  */
    case vect_reduction_def:
      {
        gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);

        /* Get the def before the loop.  */
        op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
        return get_initial_def_for_reduction (stmt, op, scalar_def);
      }

    /* Case 5: operand is defined by loop-header phi - induction.  */
    case vect_induction_def:
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "induction - unsupported.");
        internal_error ("no support for induction"); /* FORNOW */
      }

    default:
      gcc_unreachable ();
    }
}
/* Function vect_get_vec_def_for_stmt_copy.

   Return a vector-def for an operand.  This function is used when the
   vectorized stmt to be created (by the caller to this function) is a "copy"
   created in case the vectorized result cannot fit in one vector, and several
   copies of the vector-stmt are required.  In this case the vector-def is
   retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
   of the stmt that defines VEC_OPRND.
   DT is the type of the vector def VEC_OPRND.

   Context:
        In case the vectorization factor (VF) is bigger than the number
   of elements that can fit in a vectype (nunits), we have to generate
   more than one vector stmt to vectorize the scalar stmt.  This situation
   arises when there are multiple data-types operated upon in the loop; the
   smallest data-type determines the VF, and as a result, when vectorizing
   stmts operating on wider types we need to create 'VF/nunits' "copies" of the
   vector stmt (each computing a vector of 'nunits' results, and together
   computing 'VF' results in each iteration).  This function is called when
   vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
   which VF=16 and nunits=4, so the number of copies required is 4):

   scalar stmt:         vectorized into:          STMT_VINFO_RELATED_STMT

   S1: x = load         VS1.0: vx.0 = memref0     VS1.1
                        VS1.1: vx.1 = memref1     VS1.2
                        VS1.2: vx.2 = memref2     VS1.3
                        VS1.3: vx.3 = memref3

   S2: z = x + ...      VSnew.0: vz0 = vx.0 + ... VSnew.1
                        VSnew.1: vz1 = vx.1 + ... VSnew.2
                        VSnew.2: vz2 = vx.2 + ... VSnew.3
                        VSnew.3: vz3 = vx.3 + ...

   The vectorization of S1 is explained in vectorizable_load.
   The vectorization of S2:
        To create the first vector-stmt out of the 4 copies - VSnew.0 -
   the function 'vect_get_vec_def_for_operand' is called to
   get the relevant vector-def for each operand of S2.  For operand x it
   returns the vector-def 'vx.0'.

        To create the remaining copies of the vector-stmt (VSnew.j), this
   function is called to get the relevant vector-def for each operand.  It is
   obtained from the respective VS1.j stmt, which is recorded in the
   STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.

        For example, to obtain the vector-def 'vx.1' in order to create the
   vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
   Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
   STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
   and return its def ('vx.1').
   Overall, to create the above sequence this function will be called 3 times:
        vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
        vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
        vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2);  */
static tree
vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
{
  tree vec_stmt_for_operand;
  stmt_vec_info def_stmt_info;

  if (dt == vect_invariant_def || dt == vect_constant_def)
    {
      /* Do nothing; can reuse the same def.  */
      return vec_oprnd;
    }

  vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
  def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
  gcc_assert (def_stmt_info);
  vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
  gcc_assert (vec_stmt_for_operand);
  vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);

  return vec_oprnd;
}
/* Function vect_finish_stmt_generation.

   Insert a new stmt.  */

static void
vect_finish_stmt_generation (tree stmt, tree vec_stmt,
                             block_stmt_iterator *bsi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
  set_stmt_info (get_stmt_ann (vec_stmt),
                 new_stmt_vec_info (vec_stmt, loop_vinfo));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "add new stmt: ");
      print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
    }

  /* Make sure bsi points to the stmt that is being vectorized.  */
  gcc_assert (stmt == bsi_stmt (*bsi));

#ifdef USE_MAPPED_LOCATION
  SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
#else
  SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
#endif
}
#define ADJUST_IN_EPILOG 1

/* Function get_initial_def_for_reduction.

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable.

   Output:
   SCALAR_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for "ADJUST_IN_EPILOG" - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs.  This vector will be used as the initial value of the
        vector of partial results.

   Option1 ("ADJUST_IN_EPILOG"): Initialize the vector as follows:
     add:         [0,0,...,0,0]
     mult:        [1,1,...,1,1]
     min/max:     [init_val,init_val,..,init_val,init_val]
     bit and/or:  [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add:         [0,0,...,0,init_val]
     mult:        [1,1,...,1,init_val]
     min/max:     [init_val,init_val,...,init_val]
     bit and/or:  [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

     s = init_val;
     for (i=0;i<n;i++)
       s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW: We use the "ADJUST_IN_EPILOG" scheme.
   TODO: Use some cost-model to estimate which scheme is more profitable.  */
static tree
get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
  int nunits = GET_MODE_NUNITS (TYPE_MODE (vectype));
  int nelements;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree type = TREE_TYPE (init_val);
  tree def;
  tree vec, t = NULL_TREE;
  bool need_epilog_adjust;
  int i;
  tree vector_type;

  gcc_assert (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case PLUS_EXPR:
      if (INTEGRAL_TYPE_P (type))
        def = build_int_cst (type, 0);
      else
        def = build_real (type, dconst0);

#ifdef ADJUST_IN_EPILOG
      /* All the 'nunits' elements are set to 0.  The final result will be
         adjusted by 'init_val' at the loop epilog.  */
      nelements = nunits;
      need_epilog_adjust = true;
#else
      /* 'nunits - 1' elements are set to 0; the last element is set to
         'init_val'.  No further adjustments at the epilog are needed.  */
      nelements = nunits - 1;
      need_epilog_adjust = false;
#endif
      break;

    case MIN_EXPR:
    case MAX_EXPR:
      def = init_val;
      nelements = nunits;
      need_epilog_adjust = false;
      break;

    default:
      gcc_unreachable ();
    }

  for (i = nelements - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, def, t);

  if (nelements == nunits - 1)
    {
      /* Set the last element of the vector.  */
      t = tree_cons (NULL_TREE, init_val, t);
      nelements += 1;
    }
  gcc_assert (nelements == nunits);

  vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
  if (TREE_CODE (init_val) == INTEGER_CST || TREE_CODE (init_val) == REAL_CST)
    vec = build_vector (vector_type, t);
  else
    vec = build_constructor_from_list (vector_type, t);

  if (!need_epilog_adjust)
    *scalar_def = NULL_TREE;
  else
    *scalar_def = init_val;

  return vect_init_vector (stmt, vec, vector_type);
}
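
/* Illustrative example (not from the original sources): for the loop
   's = 5; for (i=0;i<n;i++) s += a[i];' vectorized with V4SI, the
   ADJUST_IN_EPILOG scheme above returns the vector [0,0,0,0] and sets
   *SCALAR_DEF to 5, so the epilog code later adds 5 to the reduced
   scalar result.  */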
/* Function vect_create_epilog_for_reduction.

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   VECT_DEF is a vector of partial results.
   REDUC_CODE is the tree-code for the epilog reduction.
   STMT is the scalar reduction stmt that is being vectorized.
   REDUCTION_PHI is the phi-node that carries the reduction computation.

   This function:
   1. Creates the reduction def-use cycle: sets the arguments for
      REDUCTION_PHI:
      The loop-entry argument is the vectorized initial-value of the reduction.
      The loop-latch argument is VECT_DEF - the vector of partial sums.
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
      by applying the operation specified by REDUC_CODE if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <null, null>            # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
          VECT_DEF = vector_stmt                # vectorized form of STMT
          s_loop = scalar_stmt                  # (scalar) STMT
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
static void
vect_create_epilog_for_reduction (tree vect_def, tree stmt,
                                  enum tree_code reduc_code, tree reduction_phi)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  tree new_phi;
  block_stmt_iterator exit_bsi;
  tree vec_dest;
  tree new_temp;
  tree new_name;
  tree epilog_stmt;
  tree new_scalar_dest, exit_phi;
  tree bitsize, bitpos, bytesize;
  enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
  tree scalar_initial_def;
  tree vec_initial_def;
  tree orig_name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool extract_scalar_result;
  tree reduction_op;
  tree orig_stmt;
  tree use_stmt;
  tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
  int op_type;

  op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  mode = TYPE_MODE (vectype);

  /*** 1. Create the reduction def-use cycle  ***/

  /* 1.1 set the loop-entry arg of the reduction-phi:  */
  /* For the case of reduction, vect_get_vec_def_for_operand returns
     the scalar def before the loop, that defines the initial value
     of the reduction variable.  */
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                  &scalar_initial_def);
  add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));

  /* 1.2 set the loop-latch arg for the reduction-phi:  */
  add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform reduction: created def-use cycle:");
      print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
    }

  /*** 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
          The reduction epilog code consists of:
          step 1: compute the scalar result in a vector (v_out2)
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
          step 3: adjust the scalar result (s_out3) if needed.

          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop.  In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

          ***/

  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
         v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
  SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
  exit_bsi = bsi_start (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }
  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
  scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);
  bytesize = TYPE_SIZE_UNIT (scalar_type);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code < NUM_TREE_CODES)
    {
      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
                        build1 (reduc_code, vectype, PHI_RESULT (new_phi)));
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = 0;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype);
          if (optab->handlers[mode].insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
               {
                 Create:  va' = vec_shift <va, offset>
                 Create:  va = vop <va, va'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (shift_code, vectype,
                                            new_temp, bitpos));
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    vec_dest,
                                    build2 (code, vectype,
                                            new_name, new_temp));
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                        bitsize_zero_node);
          BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
          epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
          bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);

              epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                                    new_scalar_dest,
                                    build2 (code, scalar_type, new_name,
                                            new_temp));
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
              bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4 Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                             TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                            new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable.  (When such adjustment is not needed, then
         'scalar_initial_def' is zero).

         Create:
         s_out4 = scalar_expr <s_out3, scalar_initial_def>  */

  if (scalar_initial_def)
    {
      epilog_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
                      new_scalar_dest,
                      build2 (code, scalar_type, new_temp, scalar_initial_def));
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
      bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
    }

  /* 2.6 Replace uses of s_out0 with uses of s_out3/s_out4.  */

  /* Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  exit_phi = NULL;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          break;
        }
    }
  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (exit_phi);
  /* Replace the uses:  */
  orig_name = PHI_RESULT (exit_phi);
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
      SET_USE (use_p, new_temp);
}
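
/* Illustrative example (not from the original sources): for an add
   reduction over a 128-bit V4SI vector v, scheme 2 above emits roughly

     v' = vec_shift <v, 64>     # shift by half the vector
     v  = v + v'
     v' = vec_shift <v, 32>     # shift by one element
     v  = v + v'
     s  = extract_field <v, 0>

   after which element 0 of v holds the sum of all four elements.  */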
/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
      get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
bool
vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree op;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree operation;
  enum tree_code code, orig_code, epilog_reduc_code = 0;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def, def_stmt;
  enum vect_def_type dt;
  tree new_phi;
  tree scalar_type;
  bool is_simple_use;
  tree orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  stmt_vec_info prev_stmt_info;
  tree reduc_def;
  tree new_stmt = NULL_TREE;
  int j;

  gcc_assert (ncopies >= 1);

  /* 1. Is vectorizable reduction?  */

  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  if (!STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation.  The first operands are defined
        inside the loop body.  The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);

  operation = GIMPLE_STMT_OPERAND (stmt, 1);
  code = TREE_CODE (operation);
  op_type = TREE_CODE_LENGTH (code);
  if (op_type != binary_op && op_type != ternary_op)
    return false;
  scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
  scalar_type = TREE_TYPE (scalar_dest);

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  */
  for (i = 0; i < op_type-1; i++)
    {
      op = TREE_OPERAND (operation, i);
      is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
      gcc_assert (is_simple_use);
      gcc_assert (dt == vect_loop_def || dt == vect_invariant_def
                  || dt == vect_constant_def);
    }

  op = TREE_OPERAND (operation, i);
  is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def);
  gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
    return false;

  /* 4. Supportable by target?  */

  /* 4.1. Check support for the operation in the loop.  */
  optab = optab_for_tree_code (code, vectype);
  if (!optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab.");
      return false;
    }
  vec_mode = TYPE_MODE (vectype);
  if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "op not supported by target.");
      if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
          || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        return false;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "proceeding using word mode.");
    }

  /* Worthwhile without SIMD support?  */
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
         < vect_min_worthwhile_factor (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not worthwhile without SIMD support.");
      return false;
    }

  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; The original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing.  I.e, in the example
             above we want to use 'widen_sum' in the loop, but 'plus' in the
             epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */
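
  /* Illustrative example (not from the original sources): for the widen_sum
     pattern above, with shorts accumulated into an int on a target with
     128-bit vectors, the loop check queried widen_sum_optab in V8HImode,
     while the epilog check below queries the optab for the reduction code
     returned by reduction_code_for_scalar_code (PLUS_EXPR) in V4SImode,
     the mode of the int accumulator's vectype.  */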
  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the same vectype and tree-code as used for
         the vector code inside the loop can be used for the epilog code.  */
      orig_code = code;
    }

  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    return false;
  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
  if (!reduc_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for reduction.");
      epilog_reduc_code = NUM_TREE_CODES;
    }
  if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduc op not supported by target.");
      epilog_reduc_code = NUM_TREE_CODES;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* Create the destination vector.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* Create the reduction-phi that defines the reduction-operand.  */
  new_phi = create_phi_node (vec_dest, loop->header);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  prev_stmt_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      /* Handle uses.  */
      if (j == 0)
        {
          op = TREE_OPERAND (operation, 0);
          loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
          if (op_type == ternary_op)
            {
              op = TREE_OPERAND (operation, 1);
              loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
            }

          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);

          /* Get the vector def for the reduction variable from the vectorized
             reduction operation generated in the previous iteration (j-1).  */
          reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        expr = build2 (code, vectype, loop_vec_def0, reduc_def);
      else
        expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                       reduc_def);
      new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
      vect_finish_stmt_generation (stmt, new_stmt, bsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
      prev_stmt_info = vinfo_for_stmt (new_stmt);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
  return true;
}
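
/* Illustrative example (not from the original sources): with VF == 8 and a
   V4SI vectype, ncopies == 2 in vectorizable_reduction above, so two
   chained vector stmts are generated per iteration; the second takes the
   def of the first as its reduction operand, and only the final def
   (new_temp) is handed to vect_create_epilog_for_reduction.  */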
/* Checks if CALL can be vectorized in type VECTYPE.  Returns
   a function declaration if the target has a vectorized version
   of the function, or NULL_TREE if the function cannot be vectorized.  */

tree
vectorizable_function (tree call, tree vectype_out, tree vectype_in)
{
  tree fndecl = get_callee_fndecl (call);
  enum built_in_function code;

  /* We only handle functions that do not read or clobber memory -- i.e.
     const or novops ones.  */
  if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
    return NULL_TREE;

  if (!fndecl
      || TREE_CODE (fndecl) != FUNCTION_DECL
      || !DECL_BUILT_IN (fndecl))
    return NULL_TREE;

  code = DECL_FUNCTION_CODE (fndecl);
  return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
                                                        vectype_in);
}
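
/* Illustrative example (not from the original sources): for a call to the
   sqrtf built-in with float operands and a V4SF vectype, a target's hook
   implementation may return the decl of a vector square-root built-in
   taking V4SF to V4SF; vectorizable_call below then substitutes that decl
   into the vectorized call.  */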
/* Function vectorizable_call.

   Check if STMT performs a function call that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
1613 bool
1614 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1616 tree vec_dest;
1617 tree scalar_dest;
1618 tree operation;
1619 tree args, type;
1620 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
1621 tree vectype_out, vectype_in;
1622 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1623 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
1624 enum vect_def_type dt[2];
1625 int ncopies, j, nargs;
1627 /* Is STMT a vectorizable call? */
1628 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1629 return false;
1631 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1632 return false;
1634 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1635 if (TREE_CODE (operation) != CALL_EXPR)
1636 return false;
1638 /* Process function arguments. */
1639 rhs_type = NULL_TREE;
1640 for (args = TREE_OPERAND (operation, 1), nargs = 0;
1641 args; args = TREE_CHAIN (args), ++nargs)
1643 tree op = TREE_VALUE (args);
1645 /* Bail out if the function has more than two arguments; we
1646 do not have interesting builtin functions to vectorize with
1647 more than two arguments. */
1648 if (nargs >= 2)
1649 return false;
1651 /* We can only handle calls with arguments of the same type. */
1652 if (rhs_type
1653 && rhs_type != TREE_TYPE (op))
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "argument types differ.");
1657 return false;
1659 rhs_type = TREE_TYPE (op);
1661 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
1663 if (vect_print_dump_info (REPORT_DETAILS))
1664 fprintf (vect_dump, "use not simple.");
1665 return false;
1669 /* A call with no arguments is not vectorizable either. */
1670 if (nargs == 0)
1671 return false;
1673 vectype_in = get_vectype_for_scalar_type (rhs_type);
1675 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
1676 vectype_out = get_vectype_for_scalar_type (lhs_type);
1678 /* Only handle the case of vectors with the same number of elements.
1679 FIXME: We need a way to handle for example the SSE2 cvtpd2dq
1680 instruction which converts V2DFmode to V4SImode but only
1681 using the lower half of the V4SImode result. */
1682 if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
1683 return false;
1685 /* For now, we only vectorize functions if a target specific builtin
1686 is available. TODO -- in some cases, it might be profitable to
1687 insert the calls for pieces of the vector, in order to be able
1688 to vectorize other operations in the loop. */
1689 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
1690 if (fndecl == NULL_TREE)
1692 if (vect_print_dump_info (REPORT_DETAILS))
1693 fprintf (vect_dump, "function is not vectorizable.");
1695 return false;
1698 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
1700 if (!vec_stmt) /* transformation not required. */
1702 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
1703 return true;
1706 /** Transform. **/
1708 if (vect_print_dump_info (REPORT_DETAILS))
1709 fprintf (vect_dump, "transform operation.");
1711 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1712 / TYPE_VECTOR_SUBPARTS (vectype_out));
1713 gcc_assert (ncopies >= 1);
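/* Illustrative numbers: with a vectorization factor of 8 and a V4SF
   result (TYPE_VECTOR_SUBPARTS == 4), ncopies is 8/4 = 2, i.e. the
   loop below emits two vector calls per scalar call. */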
1715 /* Handle def. */
1716 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1717 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
1719 prev_stmt_info = NULL;
1720 for (j = 0; j < ncopies; ++j)
1722 tree new_stmt, vargs;
1723 tree vec_oprnd[2];
1724 int n;
1726 /* Build argument list for the vectorized call. */
1727 vargs = NULL_TREE;
1728 for (args = TREE_OPERAND (operation, 1), n = 0;
1729 args; args = TREE_CHAIN (args), ++n)
1731 tree op = TREE_VALUE (args);
1733 if (j == 0)
1734 vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
1735 else
1736 vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
1738 vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
1740 vargs = nreverse (vargs);
1742 rhs = build_function_call_expr (fndecl, vargs);
1743 new_stmt = build2 (GIMPLE_MODIFY_STMT, NULL_TREE, vec_dest, rhs);
1744 new_temp = make_ssa_name (vec_dest, new_stmt);
1745 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
1747 vect_finish_stmt_generation (stmt, new_stmt, bsi);
1749 if (j == 0)
1750 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
1751 else
1752 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
1753 prev_stmt_info = vinfo_for_stmt (new_stmt);
1756 /* The call in STMT might prevent it from being removed in DCE. We,
1757 however, cannot remove it here, due to the way the SSA name it defines
1758 is mapped to the new definition. So just replace the rhs of the
1759 statement with something harmless. */
1760 type = TREE_TYPE (scalar_dest);
1761 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
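/* E.g. (illustrative): a scalar stmt s = sinf (x) is left in place as
   s = 0.0f once its vector replacements have been emitted; DCE can then
   remove it if s has no remaining scalar uses. */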
1763 return true;
1767 /* Function vectorizable_assignment.
1769 Check if STMT performs an assignment (copy) that can be vectorized.
1770 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1771 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1772 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1774 bool
1775 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1777 tree vec_dest;
1778 tree scalar_dest;
1779 tree op;
1780 tree vec_oprnd;
1781 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1782 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1783 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1784 tree new_temp;
1785 tree def, def_stmt;
1786 enum vect_def_type dt;
1787 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1788 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
1790 gcc_assert (ncopies >= 1);
1791 if (ncopies > 1)
1792 return false; /* FORNOW */
1794 /* Is vectorizable assignment? */
1795 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1796 return false;
1798 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1800 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1801 return false;
1803 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1804 if (TREE_CODE (scalar_dest) != SSA_NAME)
1805 return false;
1807 op = GIMPLE_STMT_OPERAND (stmt, 1);
1808 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
1810 if (vect_print_dump_info (REPORT_DETAILS))
1811 fprintf (vect_dump, "use not simple.");
1812 return false;
1815 if (!vec_stmt) /* transformation not required. */
1817 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
1818 return true;
1821 /** Transform. **/
1822 if (vect_print_dump_info (REPORT_DETAILS))
1823 fprintf (vect_dump, "transform assignment.");
1825 /* Handle def. */
1826 vec_dest = vect_create_destination_var (scalar_dest, vectype);
1828 /* Handle use. */
1829 op = GIMPLE_STMT_OPERAND (stmt, 1);
1830 vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
1832 /* Arguments are ready. Create the new vector stmt. */
1833 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, vec_oprnd);
1834 new_temp = make_ssa_name (vec_dest, *vec_stmt);
1835 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
1836 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
1838 return true;
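/* E.g. (illustrative): a scalar copy y_1 = x_2 in the loop becomes a
   single vector copy vy = vx; the multiple-copies case (ncopies > 1)
   bails out early above (FORNOW). */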
1842 /* Function vect_min_worthwhile_factor.
1844 For a loop where we could vectorize the operation indicated by CODE,
1845 return the minimum vectorization factor that makes it worthwhile
1846 to use generic vectors. */
1847 static int
1848 vect_min_worthwhile_factor (enum tree_code code)
1850 switch (code)
1852 case PLUS_EXPR:
1853 case MINUS_EXPR:
1854 case NEGATE_EXPR:
1855 return 4;
1857 case BIT_AND_EXPR:
1858 case BIT_IOR_EXPR:
1859 case BIT_XOR_EXPR:
1860 case BIT_NOT_EXPR:
1861 return 2;
1863 default:
1864 return INT_MAX;
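/* Illustrative reading of the table above: a PLUS_EXPR emulated in
   word-mode packs only a few elements per word, so a vectorization
   factor below 4 is not considered a win; the cheaper bitwise ops
   already pay off at factor 2; any other code is never considered
   worthwhile without real SIMD support (INT_MAX). */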
1869 /* Function vectorizable_operation.
1871 Check if STMT performs a binary or unary operation that can be vectorized.
1872 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
1873 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
1874 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
1876 bool
1877 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
1879 tree vec_dest;
1880 tree scalar_dest;
1881 tree operation;
1882 tree op0, op1 = NULL;
1883 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
1884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1886 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1887 enum tree_code code;
1888 enum machine_mode vec_mode;
1889 tree new_temp;
1890 int op_type;
1891 optab optab;
1892 int icode;
1893 enum machine_mode optab_op2_mode;
1894 tree def, def_stmt;
1895 enum vect_def_type dt0, dt1;
1896 tree new_stmt;
1897 stmt_vec_info prev_stmt_info;
1898 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
1899 int nunits_out;
1900 tree vectype_out;
1901 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
1902 int j;
1904 gcc_assert (ncopies >= 1);
1906 /* Is STMT a vectorizable binary/unary operation? */
1907 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1908 return false;
1910 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
1912 if (STMT_VINFO_LIVE_P (stmt_info))
1914 /* FORNOW: not yet supported. */
1915 if (vect_print_dump_info (REPORT_DETAILS))
1916 fprintf (vect_dump, "value used after loop.");
1917 return false;
1920 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
1921 return false;
1923 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
1924 return false;
1926 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
1927 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
1928 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
1929 if (nunits_out != nunits_in)
1930 return false;
1932 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1933 code = TREE_CODE (operation);
1934 optab = optab_for_tree_code (code, vectype);
1936 /* Support only unary or binary operations. */
1937 op_type = TREE_CODE_LENGTH (code);
1938 if (op_type != unary_op && op_type != binary_op)
1940 if (vect_print_dump_info (REPORT_DETAILS))
1941 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
1942 return false;
1945 op0 = TREE_OPERAND (operation, 0);
1946 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
1948 if (vect_print_dump_info (REPORT_DETAILS))
1949 fprintf (vect_dump, "use not simple.");
1950 return false;
1953 if (op_type == binary_op)
1955 op1 = TREE_OPERAND (operation, 1);
1956 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
1958 if (vect_print_dump_info (REPORT_DETAILS))
1959 fprintf (vect_dump, "use not simple.");
1960 return false;
1964 /* Supportable by target? */
1965 if (!optab)
1967 if (vect_print_dump_info (REPORT_DETAILS))
1968 fprintf (vect_dump, "no optab.");
1969 return false;
1971 vec_mode = TYPE_MODE (vectype);
1972 icode = (int) optab->handlers[(int) vec_mode].insn_code;
1973 if (icode == CODE_FOR_nothing)
1975 if (vect_print_dump_info (REPORT_DETAILS))
1976 fprintf (vect_dump, "op not supported by target.");
1977 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
1978 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1979 < vect_min_worthwhile_factor (code))
1980 return false;
1981 if (vect_print_dump_info (REPORT_DETAILS))
1982 fprintf (vect_dump, "proceeding using word mode.");
1985 /* Worthwhile without SIMD support? */
1986 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1987 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1988 < vect_min_worthwhile_factor (code))
1990 if (vect_print_dump_info (REPORT_DETAILS))
1991 fprintf (vect_dump, "not worthwhile without SIMD support.");
1992 return false;
1995 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
1997 /* FORNOW: not yet supported. */
1998 if (!VECTOR_MODE_P (vec_mode))
1999 return false;
2001 /* Invariant argument is needed for a vector shift
2002 by a scalar shift operand. */
2003 optab_op2_mode = insn_data[icode].operand[2].mode;
2004 if (! (VECTOR_MODE_P (optab_op2_mode)
2005 || dt1 == vect_constant_def
2006 || dt1 == vect_invariant_def))
2008 if (vect_print_dump_info (REPORT_DETAILS))
2009 fprintf (vect_dump, "operand mode requires invariant argument.");
2010 return false;
2014 if (!vec_stmt) /* transformation not required. */
2016 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
2017 return true;
2020 /** Transform. **/
2022 if (vect_print_dump_info (REPORT_DETAILS))
2023 fprintf (vect_dump, "transform binary/unary operation.");
2025 /* Handle def. */
2026 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2028 /* In case the vectorization factor (VF) is bigger than the number
2029 of elements that we can fit in a vectype (nunits), we have to generate
2030 more than one vector stmt - i.e - we need to "unroll" the
2031 vector stmt by a factor VF/nunits. In doing so, we record a pointer
2032 from one copy of the vector stmt to the next, in the field
2033 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
2034 stages to find the correct vector defs to be used when vectorizing
2035 stmts that use the defs of the current stmt. The example below illustrates
2036 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
2037 4 vectorized stmts):
2039 before vectorization:
2040 RELATED_STMT VEC_STMT
2041 S1: x = memref - -
2042 S2: z = x + 1 - -
2044 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
2045 there):
2046 RELATED_STMT VEC_STMT
2047 VS1_0: vx0 = memref0 VS1_1 -
2048 VS1_1: vx1 = memref1 VS1_2 -
2049 VS1_2: vx2 = memref2 VS1_3 -
2050 VS1_3: vx3 = memref3 - -
2051 S1: x = load - VS1_0
2052 S2: z = x + 1 - -
2054 step 2: vectorize stmt S2 (done here):
2055 To vectorize stmt S2 we first need to find the relevant vector
2056 def for the first operand 'x'. This is, as usual, obtained from
2057 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
2058 that defines 'x' (S1). This way we find the stmt VS1_0, and the
2059 relevant vector def 'vx0'. Having found 'vx0' we can generate
2060 the vector stmt VS2_0, and as usual, record it in the
2061 STMT_VINFO_VEC_STMT of stmt S2.
2062 When creating the second copy (VS2_1), we obtain the relevant vector
2063 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
2064 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
2065 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
2066 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
2067 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
2068 chain of stmts and pointers:
2069 RELATED_STMT VEC_STMT
2070 VS1_0: vx0 = memref0 VS1_1 -
2071 VS1_1: vx1 = memref1 VS1_2 -
2072 VS1_2: vx2 = memref2 VS1_3 -
2073 VS1_3: vx3 = memref3 - -
2074 S1: x = load - VS1_0
2075 VS2_0: vz0 = vx0 + v1 VS2_1 -
2076 VS2_1: vz1 = vx1 + v1 VS2_2 -
2077 VS2_2: vz2 = vx2 + v1 VS2_3 -
2078 VS2_3: vz3 = vx3 + v1 - -
2079 S2: z = x + 1 - VS2_0 */
2081 prev_stmt_info = NULL;
2082 for (j = 0; j < ncopies; j++)
2084 /* Handle uses. */
2085 if (j == 0)
2087 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2088 if (op_type == binary_op)
2090 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
2092 /* Vector shl and shr insn patterns can be defined with
2093 scalar operand 2 (shift operand). In this case, use
2094 constant or loop invariant op1 directly, without
2095 extending it to vector mode first. */
2096 optab_op2_mode = insn_data[icode].operand[2].mode;
2097 if (!VECTOR_MODE_P (optab_op2_mode))
2099 if (vect_print_dump_info (REPORT_DETAILS))
2100 fprintf (vect_dump, "operand 1 using scalar mode.");
2101 vec_oprnd1 = op1;
2104 if (!vec_oprnd1)
2105 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2108 else
2110 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2111 if (op_type == binary_op)
2112 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2115 /* Arguments are ready. Create the new vector stmt. */
2117 if (op_type == binary_op)
2118 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2119 build2 (code, vectype, vec_oprnd0, vec_oprnd1));
2120 else
2121 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
2122 build1 (code, vectype, vec_oprnd0));
2123 new_temp = make_ssa_name (vec_dest, new_stmt);
2124 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2125 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2127 if (j == 0)
2128 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2129 else
2130 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2131 prev_stmt_info = vinfo_for_stmt (new_stmt);
2134 return true;
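/* E.g. (illustrative): when vectorizing z = x << 3 on a target whose
   vector shift insn takes a scalar shift amount (operand 2 of the insn
   is not of vector mode), the constant 3 is used directly as VEC_OPRND1
   above, instead of first being broadcast into a vector. */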
2138 /* Function vectorizable_type_demotion
2140 Check if STMT performs a binary or unary operation that involves
2141 type demotion, and if it can be vectorized.
2142 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2143 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2144 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2146 bool
2147 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
2148 tree *vec_stmt)
2150 tree vec_dest;
2151 tree scalar_dest;
2152 tree operation;
2153 tree op0;
2154 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2155 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2156 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2157 enum tree_code code;
2158 tree new_temp;
2159 tree def, def_stmt;
2160 enum vect_def_type dt0;
2161 tree new_stmt;
2162 stmt_vec_info prev_stmt_info;
2163 int nunits_in;
2164 int nunits_out;
2165 tree vectype_out;
2166 int ncopies;
2167 int j;
2168 tree expr;
2169 tree vectype_in;
2170 tree scalar_type;
2171 optab optab;
2172 enum machine_mode vec_mode;
2174 /* Is STMT a vectorizable type-demotion operation? */
2176 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2177 return false;
2179 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2181 if (STMT_VINFO_LIVE_P (stmt_info))
2183 /* FORNOW: not yet supported. */
2184 if (vect_print_dump_info (REPORT_DETAILS))
2185 fprintf (vect_dump, "value used after loop.");
2186 return false;
2189 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2190 return false;
2192 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2193 return false;
2195 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2196 code = TREE_CODE (operation);
2197 if (code != NOP_EXPR && code != CONVERT_EXPR)
2198 return false;
2200 op0 = TREE_OPERAND (operation, 0);
2201 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2202 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2204 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2205 scalar_type = TREE_TYPE (scalar_dest);
2206 vectype_out = get_vectype_for_scalar_type (scalar_type);
2207 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2208 if (nunits_in != nunits_out / 2) /* FORNOW */
2209 return false;
2211 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
2212 gcc_assert (ncopies >= 1);
2214 if (! INTEGRAL_TYPE_P (scalar_type)
2215 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2216 return false;
2218 /* Check the operands of the operation. */
2219 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2221 if (vect_print_dump_info (REPORT_DETAILS))
2222 fprintf (vect_dump, "use not simple.");
2223 return false;
2226 /* Supportable by target? */
2227 code = VEC_PACK_MOD_EXPR;
2228 optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in);
2229 if (!optab)
2230 return false;
2232 vec_mode = TYPE_MODE (vectype_in);
2233 if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2234 return false;
2236 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2238 if (!vec_stmt) /* transformation not required. */
2240 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
2241 return true;
2244 /** Transform. **/
2246 if (vect_print_dump_info (REPORT_DETAILS))
2247 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
2248 ncopies);
2250 /* Handle def. */
2251 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2253 /* In case the vectorization factor (VF) is bigger than the number
2254 of elements that we can fit in a vectype (nunits), we have to generate
2255 more than one vector stmt - i.e - we need to "unroll" the
2256 vector stmt by a factor VF/nunits. */
2257 prev_stmt_info = NULL;
2258 for (j = 0; j < ncopies; j++)
2260 /* Handle uses. */
2261 if (j == 0)
2263 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2264 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2265 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0);
2267 else
2269 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
2270 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2273 /* Arguments are ready. Create the new vector stmt. */
2274 expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1);
2275 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2276 new_temp = make_ssa_name (vec_dest, new_stmt);
2277 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2278 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2280 if (j == 0)
2281 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2282 else
2283 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2285 prev_stmt_info = vinfo_for_stmt (new_stmt);
2288 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2289 return true;
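/* E.g. (illustrative): demoting int to short with V4SI/V8HI vectors.
   Each VEC_PACK_MOD_EXPR above consumes two V4SI defs (VEC_OPRND0 and
   VEC_OPRND1) and produces one V8HI def, i.e. every copy packs two
   input vectors into a single narrower output vector. */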
2293 /* Function vect_gen_widened_results_half
2295 Create a vector stmt whose code, type, number of arguments, and result
2296 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
2297 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
2298 In the case that CODE is a CALL_EXPR, this means that a call to DECL
2299 needs to be created (DECL is a function-decl of a target-builtin).
2300 STMT is the original scalar stmt that we are vectorizing. */
2302 static tree
2303 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
2304 tree vec_oprnd0, tree vec_oprnd1, int op_type,
2305 tree vec_dest, block_stmt_iterator *bsi,
2306 tree stmt)
2308 tree vec_params;
2309 tree expr;
2310 tree new_stmt;
2311 tree new_temp;
2312 tree sym;
2313 ssa_op_iter iter;
2315 /* Generate half of the widened result: */
2316 if (code == CALL_EXPR)
2318 /* Target specific support */
2319 vec_params = build_tree_list (NULL_TREE, vec_oprnd0);
2320 if (op_type == binary_op)
2321 vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params);
2322 expr = build_function_call_expr (decl, vec_params);
2324 else
2326 /* Generic support */
2327 gcc_assert (op_type == TREE_CODE_LENGTH (code));
2328 if (op_type == binary_op)
2329 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
2330 else
2331 expr = build1 (code, vectype, vec_oprnd0);
2333 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, expr);
2334 new_temp = make_ssa_name (vec_dest, new_stmt);
2335 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2336 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2338 if (code == CALL_EXPR)
2340 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
2342 if (TREE_CODE (sym) == SSA_NAME)
2343 sym = SSA_NAME_VAR (sym);
2344 mark_sym_for_renaming (sym);
2348 return new_stmt;
2352 /* Function vectorizable_type_promotion
2354 Check if STMT performs a binary or unary operation that involves
2355 type promotion, and if it can be vectorized.
2356 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2357 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2358 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2360 bool
2361 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
2362 tree *vec_stmt)
2364 tree vec_dest;
2365 tree scalar_dest;
2366 tree operation;
2367 tree op0, op1 = NULL;
2368 tree vec_oprnd0 = NULL, vec_oprnd1 = NULL;
2369 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2370 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2371 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
2372 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
2373 int op_type;
2374 tree def, def_stmt;
2375 enum vect_def_type dt0, dt1;
2376 tree new_stmt;
2377 stmt_vec_info prev_stmt_info;
2378 int nunits_in;
2379 int nunits_out;
2380 tree vectype_out;
2381 int ncopies;
2382 int j;
2383 tree vectype_in;
2385 /* Is STMT a vectorizable type-promotion operation? */
2387 if (!STMT_VINFO_RELEVANT_P (stmt_info))
2388 return false;
2390 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
2392 if (STMT_VINFO_LIVE_P (stmt_info))
2394 /* FORNOW: not yet supported. */
2395 if (vect_print_dump_info (REPORT_DETAILS))
2396 fprintf (vect_dump, "value used after loop.");
2397 return false;
2400 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2401 return false;
2403 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
2404 return false;
2406 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2407 code = TREE_CODE (operation);
2408 if (code != NOP_EXPR && code != WIDEN_MULT_EXPR)
2409 return false;
2411 op0 = TREE_OPERAND (operation, 0);
2412 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
2413 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
2414 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
2415 gcc_assert (ncopies >= 1);
2417 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2418 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
2419 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
2420 if (nunits_out != nunits_in / 2) /* FORNOW */
2421 return false;
2423 if (! INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
2424 || !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2425 return false;
2427 /* Check the operands of the operation. */
2428 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
2430 if (vect_print_dump_info (REPORT_DETAILS))
2431 fprintf (vect_dump, "use not simple.");
2432 return false;
2435 op_type = TREE_CODE_LENGTH (code);
2436 if (op_type == binary_op)
2438 op1 = TREE_OPERAND (operation, 1);
2439 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1))
2441 if (vect_print_dump_info (REPORT_DETAILS))
2442 fprintf (vect_dump, "use not simple.");
2443 return false;
2447 /* Supportable by target? */
2448 if (!supportable_widening_operation (code, stmt, vectype_in,
2449 &decl1, &decl2, &code1, &code2))
2450 return false;
2452 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
2454 if (!vec_stmt) /* transformation not required. */
2456 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
2457 return true;
2460 /** Transform. **/
2462 if (vect_print_dump_info (REPORT_DETAILS))
2463 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
2464 ncopies);
2466 /* Handle def. */
2467 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
2469 /* In case the vectorization factor (VF) is bigger than the number
2470 of elements that we can fit in a vectype (nunits), we have to generate
2471 more than one vector stmt - i.e - we need to "unroll" the
2472 vector stmt by a factor VF/nunits. */
2474 prev_stmt_info = NULL;
2475 for (j = 0; j < ncopies; j++)
2477 /* Handle uses. */
2478 if (j == 0)
2480 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
2481 if (op_type == binary_op)
2482 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
2484 else
2486 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
2487 if (op_type == binary_op)
2488 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1);
2491 /* Arguments are ready. Create the new vector stmt. We are creating
2492 two vector defs because the widened result does not fit in one vector.
2493 The vectorized stmt can be expressed as a call to a target builtin,
2494 or by using a tree-code. */
2495 /* Generate first half of the widened result: */
2496 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
2497 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2498 if (j == 0)
2499 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
2500 else
2501 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2502 prev_stmt_info = vinfo_for_stmt (new_stmt);
2504 /* Generate second half of the widened result: */
2505 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
2506 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
2507 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2508 prev_stmt_info = vinfo_for_stmt (new_stmt);
2512 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
2513 return true;
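/* E.g. (illustrative): promoting short to int with V8HI/V4SI vectors.
   Each V8HI input def is widened into two V4SI defs, produced by the
   two vect_gen_widened_results_half calls above -- CODE1/DECL1 for one
   half of the elements and CODE2/DECL2 for the other, as chosen by
   supportable_widening_operation. */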
2517 /* Function vect_strided_store_supported.
2519 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
2520 and FALSE otherwise. */
2522 static bool
2523 vect_strided_store_supported (tree vectype)
2525 optab interleave_high_optab, interleave_low_optab;
2526 int mode;
2528 mode = (int) TYPE_MODE (vectype);
2530 /* Check that the operation is supported. */
2531 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
2532 vectype);
2533 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
2534 vectype);
2535 if (!interleave_high_optab || !interleave_low_optab)
2537 if (vect_print_dump_info (REPORT_DETAILS))
2538 fprintf (vect_dump, "no optab for interleave.");
2539 return false;
2542 if (interleave_high_optab->handlers[(int) mode].insn_code
2543 == CODE_FOR_nothing
2544 || interleave_low_optab->handlers[(int) mode].insn_code
2545 == CODE_FOR_nothing)
2547 if (vect_print_dump_info (REPORT_DETAILS))
2548 fprintf (vect_dump, "interleave op not supported by target.");
2549 return false;
2551 return true;
2555 /* Function vect_permute_store_chain.
2557 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
2558 a power of 2, generate interleave_high/low stmts to reorder the data
2559 correctly for the stores. Return the final references for stores in
2560 RESULT_CHAIN.
2562 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
2563 The input is 4 vectors each containing 8 elements. We assign a number to each
2564 element, the input sequence is:
2566 1st vec: 0 1 2 3 4 5 6 7
2567 2nd vec: 8 9 10 11 12 13 14 15
2568 3rd vec: 16 17 18 19 20 21 22 23
2569 4th vec: 24 25 26 27 28 29 30 31
2571 The output sequence should be:
2573 1st vec: 0 8 16 24 1 9 17 25
2574 2nd vec: 2 10 18 26 3 11 19 27
2575 3rd vec: 4 12 20 28 5 13 21 29
2576 4th vec: 6 14 22 30 7 15 23 31
2578 i.e., we interleave the contents of the four vectors in their order.
2580 We use interleave_high/low instructions to create such output. The input of
2581 each interleave_high/low operation is two vectors:
2582 1st vec 2nd vec
2583 0 1 2 3 4 5 6 7
2584 the even elements of the result vector are obtained left-to-right from the
2585 high/low elements of the first vector. The odd elements of the result are
2586 obtained left-to-right from the high/low elements of the second vector.
2587 The output of interleave_high will be: 0 4 1 5
2588 and of interleave_low: 2 6 3 7
2591 The permutation is done in log LENGTH stages. In each stage interleave_high
2592 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
2593 where the first argument is taken from the first half of DR_CHAIN and the
2594 second argument from its second half.
2595 In our example,
2597 I1: interleave_high (1st vec, 3rd vec)
2598 I2: interleave_low (1st vec, 3rd vec)
2599 I3: interleave_high (2nd vec, 4th vec)
2600 I4: interleave_low (2nd vec, 4th vec)
2602 The output for the first stage is:
2604 I1: 0 16 1 17 2 18 3 19
2605 I2: 4 20 5 21 6 22 7 23
2606 I3: 8 24 9 25 10 26 11 27
2607 I4: 12 28 13 29 14 30 15 31
2609 The output of the second stage, i.e. the final result is:
2611 I1: 0 8 16 24 1 9 17 25
2612 I2: 2 10 18 26 3 11 19 27
2613 I3: 4 12 20 28 5 13 21 29
2614 I4: 6 14 22 30 7 15 23 31. */
2616 static bool
2617 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
2618 unsigned int length,
2619 tree stmt,
2620 block_stmt_iterator *bsi,
2621 VEC(tree,heap) **result_chain)
2623 tree perm_dest, perm_stmt, vect1, vect2, high, low;
2624 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2625 tree scalar_dest;
2626 int i;
2627 unsigned int j;
2628 VEC(tree,heap) *first, *second;
2630 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2631 first = VEC_alloc (tree, heap, length/2);
2632 second = VEC_alloc (tree, heap, length/2);
2634 /* Check that the operation is supported. */
2635 if (!vect_strided_store_supported (vectype))
2636 return false;
2638 *result_chain = VEC_copy (tree, heap, dr_chain);
2640 for (i = 0; i < exact_log2 (length); i++)
2642 for (j = 0; j < length/2; j++)
2644 vect1 = VEC_index (tree, dr_chain, j);
2645 vect2 = VEC_index (tree, dr_chain, j+length/2);
2647 /* Create interleaving stmt:
2648 in the case of big endian:
2649 high = interleave_high (vect1, vect2)
2650 and in the case of little endian:
2651 high = interleave_low (vect1, vect2). */
2652 perm_dest = create_tmp_var (vectype, "vect_inter_high");
2653 DECL_GIMPLE_REG_P (perm_dest) = 1;
2654 add_referenced_var (perm_dest);
2655 if (BYTES_BIG_ENDIAN)
2656 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2657 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2658 vect1, vect2));
2659 else
2660 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2661 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
2662 vect1, vect2));
2663 high = make_ssa_name (perm_dest, perm_stmt);
2664 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
2665 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2666 VEC_replace (tree, *result_chain, 2*j, high);
2668 /* Create interleaving stmt:
2669 in the case of big endian:
2670 low = interleave_low (vect1, vect2)
2671 and in the case of little endian:
2672 low = interleave_high (vect1, vect2). */
2673 perm_dest = create_tmp_var (vectype, "vect_inter_low");
2674 DECL_GIMPLE_REG_P (perm_dest) = 1;
2675 add_referenced_var (perm_dest);
2676 if (BYTES_BIG_ENDIAN)
2677 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2678 build2 (VEC_INTERLEAVE_LOW_EXPR, vectype,
2679 vect1, vect2));
2680 else
2681 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
2682 build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype,
2683 vect1, vect2));
2684 low = make_ssa_name (perm_dest, perm_stmt);
2685 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
2686 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
2687 VEC_replace (tree, *result_chain, 2*j+1, low);
2689 dr_chain = VEC_copy (tree, heap, *result_chain);
2691 return true;
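/* Note (illustrative): on a little-endian target the two stmts above
   swap roles -- the value named HIGH is built with
   VEC_INTERLEAVE_LOW_EXPR and the value named LOW with
   VEC_INTERLEAVE_HIGH_EXPR -- so that the element order of the stored
   vectors matches the memory layout. */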
2695 /* Function vectorizable_store.
2697 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
2698 can be vectorized.
2699 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2700 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2701 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
2703 bool
2704 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2706 tree scalar_dest;
2707 tree data_ref;
2708 tree op;
2709 tree vec_oprnd = NULL_TREE;
2710 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2711 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
2712 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2713 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2714 enum machine_mode vec_mode;
2715 tree dummy;
2716 enum dr_alignment_support alignment_support_scheme;
2717 ssa_op_iter iter;
2718 def_operand_p def_p;
2719 tree def, def_stmt;
2720 enum vect_def_type dt;
2721 stmt_vec_info prev_stmt_info = NULL;
2722 tree dataref_ptr = NULL_TREE;
2723 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2724 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2725 int j;
2726 tree next_stmt, first_stmt;
2727 bool strided_store = false;
2728 unsigned int group_size, i;
2729 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
2730 gcc_assert (ncopies >= 1);
2732 /* Is vectorizable store? */
2734 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
2735 return false;
2737 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2738 if (TREE_CODE (scalar_dest) != ARRAY_REF
2739 && TREE_CODE (scalar_dest) != INDIRECT_REF
2740 && !DR_GROUP_FIRST_DR (stmt_info))
2741 return false;
2743 op = GIMPLE_STMT_OPERAND (stmt, 1);
2744 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
2746 if (vect_print_dump_info (REPORT_DETAILS))
2747 fprintf (vect_dump, "use not simple.");
2748 return false;
2751 vec_mode = TYPE_MODE (vectype);
2752 /* FORNOW. In some cases we can vectorize even if the data-type is not
2753 supported (e.g. array initialization with 0). */
2754 if (mov_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
2755 return false;
2757 if (!STMT_VINFO_DATA_REF (stmt_info))
2758 return false;
2760 if (DR_GROUP_FIRST_DR (stmt_info))
2762 strided_store = true;
2763 if (!vect_strided_store_supported (vectype))
2764 return false;
2767 if (!vec_stmt) /* transformation not required. */
2769 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
2770 return true;
2773 /** Transform. **/
2775 if (vect_print_dump_info (REPORT_DETAILS))
2776 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
2778 if (strided_store)
2780 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
2781 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2782 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
2784 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
2786 /* We vectorize all the stmts of the interleaving group when we
2787 reach the last stmt in the group. */
2788 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
2789 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
2791 *vec_stmt = NULL_TREE;
2792 return true;
2795 else
2797 first_stmt = stmt;
2798 first_dr = dr;
2799 group_size = 1;
2802 dr_chain = VEC_alloc (tree, heap, group_size);
2803 oprnds = VEC_alloc (tree, heap, group_size);
2805 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
2806 gcc_assert (alignment_support_scheme);
2807 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
2809 /* In case the vectorization factor (VF) is bigger than the number
2810 of elements that we can fit in a vectype (nunits), we have to generate
2811 more than one vector stmt - i.e - we need to "unroll" the
2812 vector stmt by a factor VF/nunits. For more details see documentation in
2813 vect_get_vec_def_for_copy_stmt. */
2815 /* In case of interleaving (non-unit strided access):
2817 S1: &base + 2 = x2
2818 S2: &base = x0
2819 S3: &base + 1 = x1
2820 S4: &base + 3 = x3
2822 We create vectorized stores starting from base address (the access of the
2823 first stmt in the chain (S2 in the above example), when the last store stmt
2824 of the chain (S4) is reached:
2826 VS1: &base = vx2
2827 VS2: &base + vec_size*1 = vx0
2828 VS3: &base + vec_size*2 = vx1
2829 VS4: &base + vec_size*3 = vx3
2831 Then permutation statements are generated:
2833 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
2834 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
2837 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
2838 (the order of the data-refs in the output of vect_permute_store_chain
2839 corresponds to the order of scalar stmts in the interleaving chain - see
2840 the documentation of vect_permute_store_chain()).
2842 In case of both multiple types and interleaving, the above vector
2843 stores and permutation stmts are created for every copy. The result
2844 vector stmts are put in STMT_VINFO_VEC_STMT for the first copy and in
2845 the corresponding STMT_VINFO_RELATED_STMT for the next copies. */
2848 prev_stmt_info = NULL;
2849 for (j = 0; j < ncopies; j++)
2851 tree new_stmt;
2852 tree ptr_incr;
2854 if (j == 0)
2856 /* For interleaved stores we collect vectorized defs for all the
2857 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
2858 as an input to vect_permute_store_chain(), and OPRNDS as an input
2859 to vect_get_vec_def_for_stmt_copy() for the next copy.
2860 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2861 OPRNDS are of size 1. */
2862 next_stmt = first_stmt;
2863 for (i = 0; i < group_size; i++)
2865 /* Since gaps are not supported for interleaved stores, GROUP_SIZE
2866 is the exact number of stmts in the chain. Therefore, NEXT_STMT
2867 can't be NULL_TREE. In case that there is no interleaving,
2868 GROUP_SIZE is 1, and only one iteration of the loop will be
2869 executed. */
2870 gcc_assert (next_stmt);
2871 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
2872 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
2873 VEC_quick_push (tree, dr_chain, vec_oprnd);
2874 VEC_quick_push (tree, oprnds, vec_oprnd);
2875 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2877 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE,
2878 &dummy, &ptr_incr, false,
2879 TREE_TYPE (vec_oprnd));
2881 else
2883 /* For interleaved stores we created vectorized defs for all the
2884 defs stored in OPRNDS in the previous iteration (previous copy).
2885 DR_CHAIN is then used as an input to vect_permute_store_chain(),
2886 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
2887 next copy.
2888 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
2889 OPRNDS are of size 1. */
2890 for (i = 0; i < group_size; i++)
2892 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt,
2893 VEC_index (tree, oprnds, i));
2894 VEC_replace (tree, dr_chain, i, vec_oprnd);
2895 VEC_replace (tree, oprnds, i, vec_oprnd);
2897 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2900 if (strided_store)
2902 result_chain = VEC_alloc (tree, heap, group_size);
2903 /* Permute. */
2904 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
2905 &result_chain))
2906 return false;
2909 next_stmt = first_stmt;
2910 for (i = 0; i < group_size; i++)
2912 /* For strided stores vectorized defs are interleaved in
2913 vect_permute_store_chain(). */
2914 if (strided_store)
2915 vec_oprnd = VEC_index (tree, result_chain, i);
2917 data_ref = build_fold_indirect_ref (dataref_ptr);
2918 /* Arguments are ready. Create the new vector stmt. */
2919 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, data_ref,
2920 vec_oprnd);
2921 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2923 /* Set the VDEFs for the vector pointer. If this virtual def
2924 has a use outside the loop and a loop peel is performed
2925 then the def may be renamed by the peel. Mark it for
2926 renaming so the later use will also be renamed. */
2927 copy_virtual_operands (new_stmt, next_stmt);
2928 if (j == 0)
2930 /* The original store is deleted so the same SSA_NAMEs
2931 can be used. */
2932 FOR_EACH_SSA_TREE_OPERAND (def, next_stmt, iter, SSA_OP_VDEF)
2934 SSA_NAME_DEF_STMT (def) = new_stmt;
2935 mark_sym_for_renaming (SSA_NAME_VAR (def));
2938 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2940 else
2942 /* Create new names for all the definitions created by COPY and
2943 add replacement mappings for each new name. */
2944 FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VDEF)
2946 create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p);
2947 mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p)));
2950 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
2953 prev_stmt_info = vinfo_for_stmt (new_stmt);
2954 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
2955 if (!next_stmt)
2956 break;
2957 /* Bump the vector pointer. */
2958 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
2962 return true;
2966 /* Function vect_setup_realignment
2968 This function is called when vectorizing an unaligned load using
2969 the dr_unaligned_software_pipeline scheme.
2970 This function generates the following code at the loop prolog:
2972 p = initial_addr;
2973 msq_init = *(floor(p)); # prolog load
2974 realignment_token = call target_builtin;
2975 loop:
2976 msq = phi (msq_init, ---)
2978 The code above sets up a new (vector) pointer, pointing to the first
2979 location accessed by STMT, and a "floor-aligned" load using that pointer.
2980 It also generates code to compute the "realignment-token" (if the relevant
2981 target hook was defined), and creates a phi-node at the loop-header bb
2982 whose arguments are the result of the prolog-load (created by this
2983 function) and the result of a load that takes place in the loop (to be
2984 created by the caller to this function).
2985 The caller to this function uses the phi-result (msq) to create the
2986 realignment code inside the loop, and sets up the missing phi argument,
2987 as follows:
2989 loop:
2990 msq = phi (msq_init, lsq)
2991 lsq = *(floor(p')); # load in loop
2992 result = realign_load (msq, lsq, realignment_token);
2994 Input:
2995 STMT - (scalar) load stmt to be vectorized. This load accesses
2996 a memory location that may be unaligned.
2997 BSI - place where new code is to be inserted.
2999 Output:
3000 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
3001 target hook, if defined.
3002 Return value - the result of the loop-header phi node. */
3004 static tree
3005 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
3006 tree *realignment_token)
3008 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3009 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3010 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3012 edge pe = loop_preheader_edge (loop);
3013 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3014 tree vec_dest;
3015 tree init_addr;
3016 tree inc;
3017 tree ptr;
3018 tree data_ref;
3019 tree new_stmt;
3020 basic_block new_bb;
3021 tree msq_init;
3022 tree new_temp;
3023 tree phi_stmt;
3024 tree msq;
3026 /* 1. Create msq_init = *(floor(p1)) in the loop preheader */
3027 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3028 ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true,
3029 NULL_TREE);
3030 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
3031 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest, data_ref);
3032 new_temp = make_ssa_name (vec_dest, new_stmt);
3033 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3034 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3035 gcc_assert (!new_bb);
3036 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
3037 copy_virtual_operands (new_stmt, stmt);
3038 update_vuses_to_preheader (new_stmt, loop);
3040 /* 2. Create permutation mask, if required, in loop preheader. */
3041 if (targetm.vectorize.builtin_mask_for_load)
3043 tree builtin_decl;
3044 tree params = build_tree_list (NULL_TREE, init_addr);
3046 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
3047 new_stmt = build_function_call_expr (builtin_decl, params);
3048 vec_dest = vect_create_destination_var (scalar_dest,
3049 TREE_TYPE (new_stmt));
3050 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3051 new_stmt);
3052 new_temp = make_ssa_name (vec_dest, new_stmt);
3053 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3054 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
3055 gcc_assert (!new_bb);
3056 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
3058 /* The result of the CALL_EXPR to this builtin is determined from
3059 the value of the parameter and no global variables are touched
3060 which makes the builtin a "const" function. Requiring the
3061 builtin to have the "const" attribute makes it unnecessary
3062 to call mark_call_clobbered. */
3063 gcc_assert (TREE_READONLY (builtin_decl));
3066 /* 3. Create msq = phi <msq_init, lsq> in loop */
3067 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3068 msq = make_ssa_name (vec_dest, NULL_TREE);
3069 phi_stmt = create_phi_node (msq, loop->header);
3070 SSA_NAME_DEF_STMT (msq) = phi_stmt;
3071 add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop));
3073 return msq;
3077 /* Function vect_strided_load_supported.
3079 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
3080 and FALSE otherwise. */
3082 static bool
3083 vect_strided_load_supported (tree vectype)
3085 optab perm_even_optab, perm_odd_optab;
3086 int mode;
3088 mode = (int) TYPE_MODE (vectype);
3090 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
3091 if (!perm_even_optab)
3093 if (vect_print_dump_info (REPORT_DETAILS))
3094 fprintf (vect_dump, "no optab for perm_even.");
3095 return false;
3098 if (perm_even_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3100 if (vect_print_dump_info (REPORT_DETAILS))
3101 fprintf (vect_dump, "perm_even op not supported by target.");
3102 return false;
3105 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
3106 if (!perm_odd_optab)
3108 if (vect_print_dump_info (REPORT_DETAILS))
3109 fprintf (vect_dump, "no optab for perm_odd.");
3110 return false;
3113 if (perm_odd_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3115 if (vect_print_dump_info (REPORT_DETAILS))
3116 fprintf (vect_dump, "perm_odd op not supported by target.");
3117 return false;
3119 return true;
3123 /* Function vect_permute_load_chain.
3125 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
3126 a power of 2, generate extract_even/odd stmts to reorder the input data
3127 correctly. Return the final references for loads in RESULT_CHAIN.
3129 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3130 The input is 4 vectors each containing 8 elements. We assign a number to each
3131 element, the input sequence is:
3133 1st vec: 0 1 2 3 4 5 6 7
3134 2nd vec: 8 9 10 11 12 13 14 15
3135 3rd vec: 16 17 18 19 20 21 22 23
3136 4th vec: 24 25 26 27 28 29 30 31
3138 The output sequence should be:
3140 1st vec: 0 4 8 12 16 20 24 28
3141 2nd vec: 1 5 9 13 17 21 25 29
3142 3rd vec: 2 6 10 14 18 22 26 30
3143 4th vec: 3 7 11 15 19 23 27 31
3145 i.e., the first output vector should contain the first elements of each
3146 interleaving group, etc.
3148 We use extract_even/odd instructions to create such output. The input of each
3149 extract_even/odd operation is two vectors
3150 1st vec 2nd vec
3151 0 1 2 3 4 5 6 7
3153 and the output is the vector of extracted even/odd elements. The output of
3154 extract_even will be: 0 2 4 6
3155 and of extract_odd: 1 3 5 7
3158 The permutation is done in log LENGTH stages. In each stage extract_even and
3159 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
3160 order. In our example,
3162 E1: extract_even (1st vec, 2nd vec)
3163 E2: extract_odd (1st vec, 2nd vec)
3164 E3: extract_even (3rd vec, 4th vec)
3165 E4: extract_odd (3rd vec, 4th vec)
3167 The output for the first stage will be:
3169 E1: 0 2 4 6 8 10 12 14
3170 E2: 1 3 5 7 9 11 13 15
3171 E3: 16 18 20 22 24 26 28 30
3172 E4: 17 19 21 23 25 27 29 31
3174 In order to proceed and create the correct sequence for the next stage (or
3175 for the correct output, if the second stage is the last one, as in our
3176 example), we first put the output of extract_even operation and then the
3177 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
3178 The input for the second stage is:
3180 1st vec (E1): 0 2 4 6 8 10 12 14
3181 2nd vec (E3): 16 18 20 22 24 26 28 30
3182 3rd vec (E2): 1 3 5 7 9 11 13 15
3183 4th vec (E4): 17 19 21 23 25 27 29 31
3185 The output of the second stage:
3187 E1: 0 4 8 12 16 20 24 28
3188 E2: 2 6 10 14 18 22 26 30
3189 E3: 1 5 9 13 17 21 25 29
3190 E4: 3 7 11 15 19 23 27 31
3192 And RESULT_CHAIN after reordering:
3194 1st vec (E1): 0 4 8 12 16 20 24 28
3195 2nd vec (E3): 1 5 9 13 17 21 25 29
3196 3rd vec (E2): 2 6 10 14 18 22 26 30
3197 4th vec (E4): 3 7 11 15 19 23 27 31. */
3199 static bool
3200 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
3201 unsigned int length,
3202 tree stmt,
3203 block_stmt_iterator *bsi,
3204 VEC(tree,heap) **result_chain)
3206 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
3207 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3208 int i;
3209 unsigned int j;
3211 /* Check that the operation is supported. */
3212 if (!vect_strided_load_supported (vectype))
3213 return false;
3215 *result_chain = VEC_copy (tree, heap, dr_chain);
3216 for (i = 0; i < exact_log2 (length); i++)
3218 for (j = 0; j < length; j += 2)
3220 first_vect = VEC_index (tree, dr_chain, j);
3221 second_vect = VEC_index (tree, dr_chain, j+1);
3223 /* data_ref = permute_even (first_data_ref, second_data_ref); */
3224 perm_dest = create_tmp_var (vectype, "vect_perm_even");
3225 DECL_GIMPLE_REG_P (perm_dest) = 1;
3226 add_referenced_var (perm_dest);
3228 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3229 build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
3230 first_vect, second_vect));
3232 data_ref = make_ssa_name (perm_dest, perm_stmt);
3233 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3234 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3235 mark_symbols_for_renaming (perm_stmt);
3237 VEC_replace (tree, *result_chain, j/2, data_ref);
3239 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
3240 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
3241 DECL_GIMPLE_REG_P (perm_dest) = 1;
3242 add_referenced_var (perm_dest);
3244 perm_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, perm_dest,
3245 build2 (VEC_EXTRACT_ODD_EXPR, vectype,
3246 first_vect, second_vect));
3247 data_ref = make_ssa_name (perm_dest, perm_stmt);
3248 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
3249 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
3250 mark_symbols_for_renaming (perm_stmt);
3252 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
3254 dr_chain = VEC_copy (tree, heap, *result_chain);
3256 return true;
3260 /* Function vect_transform_strided_load.
3262 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
3263 to perform their permutation and ascribe the result vectorized statements to
3264 the scalar statements. */
3267 static bool
3268 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
3269 block_stmt_iterator *bsi)
3271 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3272 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3273 tree next_stmt, new_stmt;
3274 VEC(tree,heap) *result_chain = NULL;
3275 unsigned int i, gap_count;
3276 tree tmp_data_ref;
3278 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
3279 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
3280 permuted vectors, which are ready for vector computation. */
3281 result_chain = VEC_alloc (tree, heap, size);
3282 /* Permute. */
3283 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
3284 return false;
3286 /* Put a permuted data-ref in the VECTORIZED_STMT field.
3287 Since we scan the chain starting from its first node, their order
3288 corresponds to the order of data-refs in RESULT_CHAIN. */
3289 next_stmt = first_stmt;
3290 gap_count = 1;
3291 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
3293 if (!next_stmt)
3294 break;
3296 /* Skip the gaps. Loads created for the gaps will be removed by dead
3297 code elimination pass later.
3298 DR_GROUP_GAP is the number of steps in elements from the previous
3299 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
3300 correspond to the gaps. */
3302 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
3304 gap_count++;
3305 continue;
3308 while (next_stmt)
3310 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
3311 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
3312 copies, and we put the new vector statement in the first available
3313 RELATED_STMT. */
3314 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
3315 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
3316 else
3318 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
3319 tree rel_stmt = STMT_VINFO_RELATED_STMT (
3320 vinfo_for_stmt (prev_stmt));
3321 while (rel_stmt)
3323 prev_stmt = rel_stmt;
3324 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
3326 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
3328 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
3329 gap_count = 1;
3330 /* If NEXT_STMT accesses the same DR as the previous statement,
3331 put the same TMP_DATA_REF as its vectorized statement; otherwise
3332 get the next data-ref from RESULT_CHAIN. */
3333 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
3334 break;
3337 return true;
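/* E.g. (illustrative): if a chain loads elements 0 and 2 of every group
   of four (so DR_GROUP_GAP of the second access is 2), the permuted
   vector that corresponds to the unused element 1 is skipped by the
   GAP_COUNT test above and ascribed to no scalar stmt; the load created
   for it is removed later by dead code elimination. */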
3341 /* vectorizable_load.
3343 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
3344 can be vectorized.
3345 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3346 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3347 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3349 bool
3350 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3352 tree scalar_dest;
3353 tree vec_dest = NULL;
3354 tree data_ref = NULL;
3355 tree op;
3356 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3357 stmt_vec_info prev_stmt_info;
3358 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3359 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3360 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
3361 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3362 tree new_temp;
3363 int mode;
3364 tree new_stmt = NULL_TREE;
3365 tree dummy;
3366 enum dr_alignment_support alignment_support_scheme;
3367 tree dataref_ptr = NULL_TREE;
3368 tree ptr_incr;
3369 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3370 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3371 int i, j, group_size;
3372 tree msq = NULL_TREE, lsq;
3373 tree offset = NULL_TREE;
3374 tree realignment_token = NULL_TREE;
3375 tree phi_stmt = NULL_TREE;
3376 VEC(tree,heap) *dr_chain = NULL;
3377 bool strided_load = false;
3378 tree first_stmt;
3380 /* Is vectorizable load? */
3381 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3382 return false;
3384 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3386 if (STMT_VINFO_LIVE_P (stmt_info))
3388 /* FORNOW: not yet supported. */
3389 if (vect_print_dump_info (REPORT_DETAILS))
3390 fprintf (vect_dump, "value used after loop.");
3391 return false;
3394 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3395 return false;
3397 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3398 if (TREE_CODE (scalar_dest) != SSA_NAME)
3399 return false;
3401 op = GIMPLE_STMT_OPERAND (stmt, 1);
3402 if (TREE_CODE (op) != ARRAY_REF
3403 && TREE_CODE (op) != INDIRECT_REF
3404 && !DR_GROUP_FIRST_DR (stmt_info))
3405 return false;
3407 if (!STMT_VINFO_DATA_REF (stmt_info))
3408 return false;
3410 mode = (int) TYPE_MODE (vectype);
3412 /* FORNOW. In some cases we can vectorize even if the data-type is not
3413 supported (e.g. data copies). */
3414 if (mov_optab->handlers[mode].insn_code == CODE_FOR_nothing)
3416 if (vect_print_dump_info (REPORT_DETAILS))
3417 fprintf (vect_dump, "Aligned load, but unsupported type.");
3418 return false;
3421 /* Check if the load is a part of an interleaving chain. */
3422 if (DR_GROUP_FIRST_DR (stmt_info))
3424 strided_load = true;
3426 /* Check if interleaving is supported. */
3427 if (!vect_strided_load_supported (vectype))
3428 return false;
3431 if (!vec_stmt) /* transformation not required. */
3433 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
3434 return true;
3437 /** Transform. **/
3439 if (vect_print_dump_info (REPORT_DETAILS))
3440 fprintf (vect_dump, "transform load.");
3442 if (strided_load)
3444 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
3445 /* Check if the chain of loads is already vectorized. */
3446 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
3448 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3449 return true;
3451 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
3452 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
3453 dr_chain = VEC_alloc (tree, heap, group_size);
3455 else
3457 first_stmt = stmt;
3458 first_dr = dr;
3459 group_size = 1;
3462 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
3463 gcc_assert (alignment_support_scheme);
3466 /* In case the vectorization factor (VF) is bigger than the number
3467 of elements that we can fit in a vectype (nunits), we have to generate
3468 more than one vector stmt - i.e., we need to "unroll" the
3469 vector stmt by a factor VF/nunits. In doing so, we record a pointer
3470 from one copy of the vector stmt to the next, in the field
3471 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
3472 stages to find the correct vector defs to be used when vectorizing
3473 stmts that use the defs of the current stmt. The example below illustrates
3474 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
3475 4 vectorized stmts):
3477 before vectorization:
3478 RELATED_STMT VEC_STMT
3479 S1: x = memref - -
3480 S2: z = x + 1 - -
3482 step 1: vectorize stmt S1:
3483 We first create the vector stmt VS1_0, and, as usual, record a
3484 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
3485 Next, we create the vector stmt VS1_1, and record a pointer to
3486 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
3487 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
3488 stmts and pointers:
3489 RELATED_STMT VEC_STMT
3490 VS1_0: vx0 = memref0 VS1_1 -
3491 VS1_1: vx1 = memref1 VS1_2 -
3492 VS1_2: vx2 = memref2 VS1_3 -
3493 VS1_3: vx3 = memref3 - -
3494 S1: x = load - VS1_0
3495 S2: z = x + 1 - -
3497 See in documentation in vect_get_vec_def_for_stmt_copy for how the
3498 information we recorded in RELATED_STMT field is used to vectorize
3499 stmt S2. */
3501 /* In case of interleaving (non-unit strided access):
3503 S1: x2 = &base + 2
3504 S2: x0 = &base
3505 S3: x1 = &base + 1
3506 S4: x3 = &base + 3
3508 Vectorized loads are created in the order of memory accesses
3509 starting from the access of the first stmt of the chain:
3511 VS1: vx0 = &base
3512 VS2: vx1 = &base + vec_size*1
3513 VS3: vx2 = &base + vec_size*2
3514 VS4: vx3 = &base + vec_size*3
3516 Then permutation statements are generated:
3518 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
3519 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
3522 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
3523 (the order of the data-refs in the output of vect_permute_load_chain
3524 corresponds to the order of scalar stmts in the interleaving chain - see
3525 the documentation of vect_permute_load_chain()).
3526 The generation of permutation stmts and recording them in
3527 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
3529 In case of both multiple types and interleaving, the vector loads and
3530 permutation stmts above are created for every copy. The result vector stmts
3531 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
3532 STMT_VINFO_RELATED_STMT for the next copies. */
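/* An illustrative example (group_size == 2, nunits == 4; the values are
   hypothetical): for interleaved data  a0 b0 a1 b1 a2 b2 a3 b3  the two
   vector loads yield
	vx0 = {a0,b0,a1,b1}   vx1 = {a2,b2,a3,b3}
   and the permutation stmts recover one vector per scalar stmt of the
   chain:
	VEC_EXTRACT_EVEN_EXPR <vx0,vx1> = {a0,a1,a2,a3}
	VEC_EXTRACT_ODD_EXPR  <vx0,vx1> = {b0,b1,b2,b3}  */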
3534 /* If the data reference is aligned (dr_aligned) or potentially unaligned
3535 on a target that supports unaligned accesses (dr_unaligned_supported)
3536 we generate the following code:
3537 p = initial_addr;
3538 indx = 0;
3539 loop {
3540 p = p + indx * vectype_size;
3541 vec_dest = *(p);
3542 indx = indx + 1;
 }
3545 Otherwise, the data reference is potentially unaligned on a target that
3546 does not support unaligned accesses (dr_unaligned_software_pipeline) -
3547 then generate the following code, in which the data in each iteration is
3548 obtained by two vector loads, one from the previous iteration, and one
3549 from the current iteration:
3550 p1 = initial_addr;
3551 msq_init = *(floor(p1))
3552 p2 = initial_addr + VS - 1;
3553 realignment_token = call target_builtin;
3554 indx = 0;
3555 loop {
3556 p2 = p2 + indx * vectype_size
3557 lsq = *(floor(p2))
3558 vec_dest = realign_load (msq, lsq, realignment_token)
3559 indx = indx + 1;
3560 msq = lsq;
3561 } */
3563 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3565 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token);
3566 phi_stmt = SSA_NAME_DEF_STMT (msq);
3567 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
3570 prev_stmt_info = NULL;
3571 for (j = 0; j < ncopies; j++)
3573 /* 1. Create the vector pointer update chain. */
3574 if (j == 0)
3575 dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy,
3576 &ptr_incr, false, NULL_TREE);
3577 else
3578 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3580 for (i = 0; i < group_size; i++)
3582 /* 2. Create the vector-load in the loop. */
3583 switch (alignment_support_scheme)
3585 case dr_aligned:
3586 gcc_assert (aligned_access_p (first_dr));
3587 data_ref = build_fold_indirect_ref (dataref_ptr);
3588 break;
3589 case dr_unaligned_supported:
3591 int mis = DR_MISALIGNMENT (first_dr);
3592 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
3594 gcc_assert (!aligned_access_p (first_dr));
3595 tmis = size_binop (MULT_EXPR, tmis, size_int (BITS_PER_UNIT));
3596 data_ref =
3597 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
3598 break;
3600 case dr_unaligned_software_pipeline:
3601 gcc_assert (!aligned_access_p (first_dr));
3602 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
3603 break;
3604 default:
3605 gcc_unreachable ();
3607 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3608 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3609 data_ref);
3610 new_temp = make_ssa_name (vec_dest, new_stmt);
3611 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3612 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3613 copy_virtual_operands (new_stmt, stmt);
3614 mark_symbols_for_renaming (new_stmt);
3616 /* 3. Handle explicit realignment if necessary/supported. */
3617 if (alignment_support_scheme == dr_unaligned_software_pipeline)
3619 /* Create in loop:
3620 <vec_dest = realign_load (msq, lsq, realignment_token)> */
3621 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
3622 if (!realignment_token)
3623 realignment_token = dataref_ptr;
3624 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3625 new_stmt =
3626 build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token);
3627 new_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3628 new_stmt);
3629 new_temp = make_ssa_name (vec_dest, new_stmt);
3630 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3631 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3632 if (i == group_size - 1 && j == ncopies - 1)
3633 add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop));
3634 msq = lsq;
3636 if (strided_load)
3637 VEC_quick_push (tree, dr_chain, new_temp);
3638 if (i < group_size - 1)
3639 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt);
3642 if (strided_load)
3644 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
3645 return false;
3646 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3647 dr_chain = VEC_alloc (tree, heap, group_size);
3649 else
3651 if (j == 0)
3652 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3653 else
3654 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3655 prev_stmt_info = vinfo_for_stmt (new_stmt);
3659 return true;
3663 /* Function vectorizable_live_operation.
3665 STMT computes a value that is used outside the loop. Check if
3666 it can be supported. */
3668 bool
3669 vectorizable_live_operation (tree stmt,
3670 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3671 tree *vec_stmt ATTRIBUTE_UNUSED)
3673 tree operation;
3674 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3675 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3676 int i;
3677 enum tree_code code;
3678 int op_type;
3679 tree op;
3680 tree def, def_stmt;
3681 enum vect_def_type dt;
3683 if (!STMT_VINFO_LIVE_P (stmt_info))
3684 return false;
3686 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3687 return false;
3689 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3690 return false;
3692 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3693 code = TREE_CODE (operation);
3695 op_type = TREE_CODE_LENGTH (code);
3697 /* FORNOW: support only if all uses are invariant. This means
3698 that the scalar operations can remain in place, unvectorized.
3699 The original last scalar value that they compute will be used. */
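/* For example (an illustrative stmt, not from any testcase): in
	s_1 = n_0 + 5
   where n_0 is defined outside the loop and s_1 is used after it, both
   operands are invariant (n_0 is a vect_invariant_def and 5 a
   vect_constant_def), so the scalar stmt can remain in place and its
   last computed value be used after the loop.  */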
3701 for (i = 0; i < op_type; i++)
3703 op = TREE_OPERAND (operation, i);
3704 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
3706 if (vect_print_dump_info (REPORT_DETAILS))
3707 fprintf (vect_dump, "use not simple.");
3708 return false;
3711 if (dt != vect_invariant_def && dt != vect_constant_def)
3712 return false;
3715 /* No transformation is required for the cases we currently support. */
3716 return true;
3720 /* Function vect_is_simple_cond.
3722 Input:
3723 LOOP - the loop that is being vectorized.
3724 COND - Condition that is checked for simple use.
3726 Returns whether COND can be vectorized. Checks whether the
3727 condition operands are supportable using vect_is_simple_use. */
3729 static bool
3730 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
3732 tree lhs, rhs;
3733 tree def;
3734 enum vect_def_type dt;
3736 if (!COMPARISON_CLASS_P (cond))
3737 return false;
3739 lhs = TREE_OPERAND (cond, 0);
3740 rhs = TREE_OPERAND (cond, 1);
3742 if (TREE_CODE (lhs) == SSA_NAME)
3744 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
3745 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
3746 return false;
3748 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST)
3749 return false;
3751 if (TREE_CODE (rhs) == SSA_NAME)
3753 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
3754 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
3755 return false;
3757 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST)
3758 return false;
3760 return true;
3763 /* vectorizable_condition.
3765 Check if STMT is conditional modify expression that can be vectorized.
3766 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3767 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
3768 at BSI.
3770 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
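/* For example (an illustrative scalar stmt, not from any testcase):
	c = (a < b) ? x : y
   is vectorized into
	vc = VEC_COND_EXPR <va < vb, vx, vy>
   where va, vb, vx, vy are the vector defs of the scalar operands.  */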
3772 bool
3773 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3775 tree scalar_dest = NULL_TREE;
3776 tree vec_dest = NULL_TREE;
3777 tree op = NULL_TREE;
3778 tree cond_expr, then_clause, else_clause;
3779 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3781 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
3782 tree vec_compare, vec_cond_expr;
3783 tree new_temp;
3784 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3785 enum machine_mode vec_mode;
3786 tree def;
3787 enum vect_def_type dt;
3788 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3789 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3791 gcc_assert (ncopies >= 1);
3792 if (ncopies > 1)
3793 return false; /* FORNOW */
3795 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3796 return false;
3798 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def);
3800 if (STMT_VINFO_LIVE_P (stmt_info))
3802 /* FORNOW: not yet supported. */
3803 if (vect_print_dump_info (REPORT_DETAILS))
3804 fprintf (vect_dump, "value used after loop.");
3805 return false;
3808 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3809 return false;
3811 op = GIMPLE_STMT_OPERAND (stmt, 1);
3813 if (TREE_CODE (op) != COND_EXPR)
3814 return false;
3816 cond_expr = TREE_OPERAND (op, 0);
3817 then_clause = TREE_OPERAND (op, 1);
3818 else_clause = TREE_OPERAND (op, 2);
3820 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
3821 return false;
3823 /* We do not handle two different vector types for the condition
3824 and the values. */
3825 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
3826 return false;
3828 if (TREE_CODE (then_clause) == SSA_NAME)
3830 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
3831 if (!vect_is_simple_use (then_clause, loop_vinfo,
3832 &then_def_stmt, &def, &dt))
3833 return false;
3835 else if (TREE_CODE (then_clause) != INTEGER_CST
3836 && TREE_CODE (then_clause) != REAL_CST)
3837 return false;
3839 if (TREE_CODE (else_clause) == SSA_NAME)
3841 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
3842 if (!vect_is_simple_use (else_clause, loop_vinfo,
3843 &else_def_stmt, &def, &dt))
3844 return false;
3846 else if (TREE_CODE (else_clause) != INTEGER_CST
3847 && TREE_CODE (else_clause) != REAL_CST)
3848 return false;
3851 vec_mode = TYPE_MODE (vectype);
3853 if (!vec_stmt)
3855 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
3856 return expand_vec_cond_expr_p (op, vec_mode);
3859 /* Transform */
3861 /* Handle def. */
3862 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3863 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3865 /* Handle cond expr. */
3866 vec_cond_lhs =
3867 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
3868 vec_cond_rhs =
3869 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
3870 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
3871 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
3873 /* Arguments are ready. Create the new vector stmt. */
3874 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
3875 vec_cond_lhs, vec_cond_rhs);
3876 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
3877 vec_compare, vec_then_clause, vec_else_clause);
3879 *vec_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node, vec_dest,
3880 vec_cond_expr);
3881 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3882 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3883 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3885 return true;
3888 /* Function vect_transform_stmt.
3890 Create a vectorized stmt to replace STMT, and insert it at BSI. */
3892 bool
3893 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
3895 bool is_store = false;
3896 tree vec_stmt = NULL_TREE;
3897 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3898 tree orig_stmt_in_pattern;
3899 bool done;
3901 if (STMT_VINFO_RELEVANT_P (stmt_info))
3903 switch (STMT_VINFO_TYPE (stmt_info))
3905 case type_demotion_vec_info_type:
3906 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
3907 gcc_assert (done);
3908 break;
3910 case type_promotion_vec_info_type:
3911 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
3912 gcc_assert (done);
3913 break;
3915 case op_vec_info_type:
3916 done = vectorizable_operation (stmt, bsi, &vec_stmt);
3917 gcc_assert (done);
3918 break;
3920 case assignment_vec_info_type:
3921 done = vectorizable_assignment (stmt, bsi, &vec_stmt);
3922 gcc_assert (done);
3923 break;
3925 case load_vec_info_type:
3926 done = vectorizable_load (stmt, bsi, &vec_stmt);
3927 gcc_assert (done);
3928 break;
3930 case store_vec_info_type:
3931 done = vectorizable_store (stmt, bsi, &vec_stmt);
3932 gcc_assert (done);
3933 if (DR_GROUP_FIRST_DR (stmt_info))
3935 /* In case of interleaving, the whole chain is vectorized when the
3936 last store in the chain is reached. Store stmts before the last
3937 one are skipped, and their stmt_vec_info shouldn't be freed
3938 meanwhile. */
3939 *strided_store = true;
3940 if (STMT_VINFO_VEC_STMT (stmt_info))
3941 is_store = true;
3943 else
3944 is_store = true;
3945 break;
3947 case condition_vec_info_type:
3948 done = vectorizable_condition (stmt, bsi, &vec_stmt);
3949 gcc_assert (done);
3950 break;
3952 case call_vec_info_type:
3953 done = vectorizable_call (stmt, bsi, &vec_stmt);
3954 break;
3956 default:
3957 if (vect_print_dump_info (REPORT_DETAILS))
3958 fprintf (vect_dump, "stmt not supported.");
3959 gcc_unreachable ();
3962 gcc_assert (vec_stmt || *strided_store);
3963 if (vec_stmt)
3965 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
3966 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
3967 if (orig_stmt_in_pattern)
3969 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
3970 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
3972 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3974 /* STMT was inserted by the vectorizer to replace a
3975 computation idiom. ORIG_STMT_IN_PATTERN is a stmt in the
3976 original sequence that computed this idiom. We need to
3977 record a pointer to VEC_STMT in the stmt_info of
3978 ORIG_STMT_IN_PATTERN. See more details in the
3979 documentation of vect_pattern_recog. */
3981 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
3987 if (STMT_VINFO_LIVE_P (stmt_info))
3989 switch (STMT_VINFO_TYPE (stmt_info))
3991 case reduc_vec_info_type:
3992 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
3993 gcc_assert (done);
3994 break;
3996 default:
3997 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
3998 gcc_assert (done);
4002 return is_store;
4006 /* This function builds, on the loop preheader, an expression
4007 ni_name = number of iterations the loop executes. */
4009 static tree
4010 vect_build_loop_niters (loop_vec_info loop_vinfo)
4012 tree ni_name, stmt, var;
4013 edge pe;
4014 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4015 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
4017 var = create_tmp_var (TREE_TYPE (ni), "niters");
4018 add_referenced_var (var);
4019 ni_name = force_gimple_operand (ni, &stmt, false, var);
4021 pe = loop_preheader_edge (loop);
4022 if (stmt)
4024 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4025 gcc_assert (!new_bb);
4028 return ni_name;
4032 /* This function generates the following statements:
4034 ni_name = number of iterations loop executes
4035 ratio = ni_name / vf
4036 ratio_mult_vf_name = ratio * vf
4038 and places them at the loop preheader edge. */
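/* For example (hypothetical values): with vf == 4 and ni_name == n this
   emits
	ratio = n >> 2
	ratio_mult_vf_name = ratio << 2
   i.e. ratio_mult_vf_name is n rounded down to a multiple of vf.  */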
4040 static void
4041 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
4042 tree *ni_name_ptr,
4043 tree *ratio_mult_vf_name_ptr,
4044 tree *ratio_name_ptr)
4047 edge pe;
4048 basic_block new_bb;
4049 tree stmt, ni_name;
4050 tree var;
4051 tree ratio_name;
4052 tree ratio_mult_vf_name;
4053 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4054 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
4055 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4056 tree log_vf;
4058 pe = loop_preheader_edge (loop);
4060 /* Generate temporary variable that contains
4061 number of iterations loop executes. */
4063 ni_name = vect_build_loop_niters (loop_vinfo);
4064 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
4066 /* Create: ratio = ni >> log2(vf) */
4068 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
4069 if (!is_gimple_val (ratio_name))
4071 var = create_tmp_var (TREE_TYPE (ni), "bnd");
4072 add_referenced_var (var);
4074 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
4075 pe = loop_preheader_edge (loop);
4076 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4077 gcc_assert (!new_bb);
4080 /* Create: ratio_mult_vf = ratio << log2 (vf). */
4082 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
4083 ratio_name, log_vf);
4084 if (!is_gimple_val (ratio_mult_vf_name))
4086 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
4087 add_referenced_var (var);
4089 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
4090 true, var);
4091 pe = loop_preheader_edge (loop);
4092 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4093 gcc_assert (!new_bb);
4096 *ni_name_ptr = ni_name;
4097 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
4098 *ratio_name_ptr = ratio_name;
4100 return;
4104 /* Function update_vuses_to_preheader.
4106 Input:
4107 STMT - a statement with potential VUSEs.
4108 LOOP - the loop whose preheader will contain STMT.
4110 It's possible to vectorize a loop even though an SSA_NAME from a VUSE
4111 appears to be defined in a VDEF in another statement in a loop.
4112 One such case is when the VUSE is at the dereference of a __restricted__
4113 pointer in a load and the VDEF is at the dereference of a different
4114 __restricted__ pointer in a store. Vectorization may result in
4115 copy_virtual_operands being called to copy the problematic VUSE to a new
4116 statement that is being inserted in the loop preheader. This procedure
4117 is called to change the SSA_NAME in the new statement's VUSE from the
4118 SSA_NAME updated in the loop to the related SSA_NAME available on the
4119 path entering the loop.
4121 When this function is called, we have the following situation:
4123 # vuse <name1>
4124 S1: vload
4125 do {
4126 # name1 = phi < name0 , name2>
4128 # vuse <name1>
4129 S2: vload
4131 # name2 = vdef <name1>
4132 S3: vstore
4134 }while...
4136 Stmt S1 was created in the loop preheader block as part of misaligned-load
4137 handling. This function fixes the name of the vuse of S1 from 'name1' to
4138 'name0'. */
4140 static void
4141 update_vuses_to_preheader (tree stmt, struct loop *loop)
4143 basic_block header_bb = loop->header;
4144 edge preheader_e = loop_preheader_edge (loop);
4145 ssa_op_iter iter;
4146 use_operand_p use_p;
4148 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VUSE)
4150 tree ssa_name = USE_FROM_PTR (use_p);
4151 tree def_stmt = SSA_NAME_DEF_STMT (ssa_name);
4152 tree name_var = SSA_NAME_VAR (ssa_name);
4153 basic_block bb = bb_for_stmt (def_stmt);
4155 /* For a use before any definitions, def_stmt is a NOP_EXPR. */
4156 if (!IS_EMPTY_STMT (def_stmt)
4157 && flow_bb_inside_loop_p (loop, bb))
4159 /* If the block containing the statement defining the SSA_NAME
4160 is in the loop then it's necessary to find the definition
4161 outside the loop using the PHI nodes of the header. */
4162 tree phi;
4163 bool updated = false;
4165 for (phi = phi_nodes (header_bb); phi; phi = TREE_CHAIN (phi))
4167 if (SSA_NAME_VAR (PHI_RESULT (phi)) == name_var)
4169 SET_USE (use_p, PHI_ARG_DEF (phi, preheader_e->dest_idx));
4170 updated = true;
4171 break;
4174 gcc_assert (updated);
4180 /* Function vect_update_ivs_after_vectorizer.
4182 "Advance" the induction variables of LOOP to the value they should take
4183 after the execution of LOOP. This is currently necessary because the
4184 vectorizer does not handle induction variables that are used after the
4185 loop. Such a situation occurs when the last iterations of LOOP are
4186 peeled, because:
4187 1. We introduced new uses after LOOP for IVs that were not originally used
4188 after LOOP: the IVs of LOOP are now used by an epilog loop.
4189 2. LOOP is going to be vectorized; this means that it will iterate N/VF
4190 times, whereas the loop IVs should be bumped N times.
4192 Input:
4193 - LOOP - a loop that is going to be vectorized. The last few iterations
4194 of LOOP were peeled.
4195 - NITERS - the number of iterations that LOOP executes (before it is
4196 vectorized). i.e, the number of times the ivs should be bumped.
4197 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
4198 coming out from LOOP on which there are uses of the LOOP ivs
4199 (this is the path from LOOP->exit to epilog_loop->preheader).
4201 The new definitions of the ivs are placed in LOOP->exit.
4202 The phi args associated with the edge UPDATE_E in the bb
4203 UPDATE_E->dest are updated accordingly.
4205 Assumption 1: Like the rest of the vectorizer, this function assumes
4206 a single loop exit that has a single predecessor.
4208 Assumption 2: The phi nodes in the LOOP header and in update_bb are
4209 organized in the same order.
4211 Assumption 3: The access function of the ivs is simple enough (see
4212 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
4214 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
4215 coming out of LOOP on which the ivs of LOOP are used (this is the path
4216 that leads to the epilog loop; other paths skip the epilog loop). This
4217 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
4218 needs to have its phis updated. */
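/* For example (an illustrative iv): for a phi whose access function is
   {init, +, step}, the value after NITERS iterations is
	ni = init + niters * step
   and the computed NI_NAME replaces the corresponding phi arg on
   UPDATE_E.  */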
4221 static void
4222 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
4223 edge update_e)
4225 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4226 basic_block exit_bb = single_exit (loop)->dest;
4227 tree phi, phi1;
4228 basic_block update_bb = update_e->dest;
4230 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
4232 /* Make sure there exists a single-predecessor exit bb: */
4233 gcc_assert (single_pred_p (exit_bb));
4235 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
4236 phi && phi1;
4237 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
4239 tree access_fn = NULL;
4240 tree evolution_part;
4241 tree init_expr;
4242 tree step_expr;
4243 tree var, stmt, ni, ni_name;
4244 block_stmt_iterator last_bsi;
4246 if (vect_print_dump_info (REPORT_DETAILS))
4248 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
4249 print_generic_expr (vect_dump, phi, TDF_SLIM);
4252 /* Skip virtual phis. */
4253 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
4255 if (vect_print_dump_info (REPORT_DETAILS))
4256 fprintf (vect_dump, "virtual phi. skip.");
4257 continue;
4260 /* Skip reduction phis. */
4261 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
4263 if (vect_print_dump_info (REPORT_DETAILS))
4264 fprintf (vect_dump, "reduc phi. skip.");
4265 continue;
4268 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
4269 gcc_assert (access_fn);
4270 evolution_part =
4271 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
4272 gcc_assert (evolution_part != NULL_TREE);
4274 /* FORNOW: We do not support IVs whose evolution function is a polynomial
4275 of degree >= 2 or exponential. */
4276 gcc_assert (!tree_is_chrec (evolution_part));
4278 step_expr = evolution_part;
4279 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
4280 loop->num));
4282 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
4283 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
4284 fold_convert (TREE_TYPE (init_expr),
4285 niters),
4286 step_expr),
4287 init_expr);
4289 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
4290 add_referenced_var (var);
4292 ni_name = force_gimple_operand (ni, &stmt, false, var);
4294 /* Insert stmt into exit_bb. */
4295 last_bsi = bsi_last (exit_bb);
4296 if (stmt)
4297 bsi_insert_before (&last_bsi, stmt, BSI_SAME_STMT);
4299 /* Fix phi expressions in the successor bb. */
4300 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
4305 /* Function vect_do_peeling_for_loop_bound
4307 Peel the last iterations of the loop represented by LOOP_VINFO.
4308 The peeled iterations form a new epilog loop. Given that the loop now
4309 iterates NITERS times, the new epilog loop iterates
4310 NITERS % VECTORIZATION_FACTOR times.
4312 The original loop will later be made to iterate
4313 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
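/* For example (a hypothetical count): if the loop originally executes
   n == 103 iterations and VF == 4, the vectorized loop executes
   103 / 4 == 25 iterations and the epilog loop executes the remaining
   103 % 4 == 3 scalar iterations.  */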
4315 static void
4316 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
4318 tree ni_name, ratio_mult_vf_name;
4319 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4320 struct loop *new_loop;
4321 edge update_e;
4322 basic_block preheader;
4323 int loop_num;
4324 unsigned int th;
4326 if (vect_print_dump_info (REPORT_DETAILS))
4327 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
4329 initialize_original_copy_tables ();
4331 /* Generate the following variables on the preheader of original loop:
4333 ni_name = number of iterations the original loop executes
4334 ratio = ni_name / vf
4335 ratio_mult_vf_name = ratio * vf */
4336 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
4337 &ratio_mult_vf_name, ratio);
4339 loop_num = loop->num;
4340 /* Threshold for vectorized loop. */
4341 th = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) *
4342 LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4343 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
4344 ratio_mult_vf_name, ni_name, false, th);
4345 gcc_assert (new_loop);
4346 gcc_assert (loop_num == loop->num);
4347 #ifdef ENABLE_CHECKING
4348 slpeel_verify_cfg_after_peeling (loop, new_loop);
4349 #endif
4351 /* A guard that controls whether the new_loop is to be executed or skipped
4352 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
4353 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
4354 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
4355 is on the path where the LOOP IVs are used and need to be updated. */
4357 preheader = loop_preheader_edge (new_loop)->src;
4358 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
4359 update_e = EDGE_PRED (preheader, 0);
4360 else
4361 update_e = EDGE_PRED (preheader, 1);
4363 /* Update IVs of original loop as if they were advanced
4364 by ratio_mult_vf_name steps. */
4365 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
4367 /* After peeling we have to reset scalar evolution analyzer. */
4368 scev_reset ();
4370 free_original_copy_tables ();
4374 /* Function vect_gen_niters_for_prolog_loop
4376 Set the number of iterations for the loop represented by LOOP_VINFO
4377 to the minimum between LOOP_NITERS (the original iteration count of the loop)
4378 and the misalignment of DR - the data reference recorded in
4379 LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
4380 this loop, the data reference DR will refer to an aligned location.
4382 The following computation is generated:
4384 If the misalignment of DR is known at compile time:
4385 addr_mis = DR_MISALIGNMENT (dr);
4386 Else, compute address misalignment in bytes:
4387 addr_mis = addr & (vectype_size - 1)
4389 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
4391 (elem_size = element type size; an element is the scalar element
4392 whose type is the inner type of the vectype)
4394 For interleaving,
4396 prolog_niters = min ( LOOP_NITERS ,
4397 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
4398 where group_size is the size of the interleaved group. */
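/* A worked example (hypothetical numbers): with VF == 4, elem_size == 4
   bytes, group_size == 1 and addr_mis == 8 bytes:
	prolog_niters = min (LOOP_NITERS, (4 - 8/4) & (4-1))
		      = min (LOOP_NITERS, 2)
   i.e. two scalar iterations are peeled to reach an aligned address.  */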
4401 static tree
4402 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
4404 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
4405 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4406 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4407 tree var, stmt;
4408 tree iters, iters_name;
4409 edge pe;
4410 basic_block new_bb;
4411 tree dr_stmt = DR_STMT (dr);
4412 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
4413 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4414 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
4415 tree niters_type = TREE_TYPE (loop_niters);
4416 int group_size = 1;
4417 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
4419 if (DR_GROUP_FIRST_DR (stmt_info))
4421 /* For interleaved access element size must be multiplied by the size of
4422 the interleaved group. */
4423 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
4424 DR_GROUP_FIRST_DR (stmt_info)));
4425 element_size *= group_size;
4428 pe = loop_preheader_edge (loop);
4430 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
4432 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
4433 int elem_misalign = byte_misalign / element_size;
4435 if (vect_print_dump_info (REPORT_DETAILS))
4436 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
4437 iters = build_int_cst (niters_type,
4438 (vf - elem_misalign)&(vf/group_size-1));
4440 else
4442 tree new_stmts = NULL_TREE;
4443 tree start_addr =
4444 vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE);
4445 tree ptr_type = TREE_TYPE (start_addr);
4446 tree size = TYPE_SIZE (ptr_type);
4447 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
4448 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
4449 tree elem_size_log =
4450 build_int_cst (type, exact_log2 (vectype_align/vf));
4451 tree vf_minus_1 = build_int_cst (type, vf - 1);
4452 tree vf_tree = build_int_cst (type, vf);
4453 tree byte_misalign;
4454 tree elem_misalign;
4456 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
4457 gcc_assert (!new_bb);
4459 /* Create: byte_misalign = addr & (vectype_size - 1) */
4460 byte_misalign =
4461 fold_build2 (BIT_AND_EXPR, type, start_addr, vectype_size_minus_1);
4463 /* Create: elem_misalign = byte_misalign / element_size */
4464 elem_misalign =
4465 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
4467 /* Create: (niters_type) (VF - elem_misalign)&(VF - 1) */
4468 iters = fold_build2 (MINUS_EXPR, type, vf_tree, elem_misalign);
4469 iters = fold_build2 (BIT_AND_EXPR, type, iters, vf_minus_1);
4470 iters = fold_convert (niters_type, iters);
4473 /* Create: prolog_loop_niters = min (iters, loop_niters) */
4474 /* If the loop bound is known at compile time we already verified that it is
4475 greater than vf; since the misalignment ('iters') is at most vf, there's
4476 no need to generate the MIN_EXPR in this case. */
4477 if (TREE_CODE (loop_niters) != INTEGER_CST)
4478 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
4480 if (vect_print_dump_info (REPORT_DETAILS))
4482 fprintf (vect_dump, "niters for prolog loop: ");
4483 print_generic_expr (vect_dump, iters, TDF_SLIM);
4486 var = create_tmp_var (niters_type, "prolog_loop_niters");
4487 add_referenced_var (var);
4488 iters_name = force_gimple_operand (iters, &stmt, false, var);
4490 /* Insert stmt on loop preheader edge. */
4491 if (stmt)
4493 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
4494 gcc_assert (!new_bb);
4497 return iters_name;
4501 /* Function vect_update_init_of_dr
4503 NITERS iterations were peeled from LOOP. DR represents a data reference
4504 in LOOP. This function updates the information recorded in DR to
4505 account for the fact that the first NITERS iterations had already been
4506 executed. Specifically, it updates the OFFSET field of DR. */
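/* For example (illustrative): if NITERS iterations were peeled and DR
   has DR_STEP 4 (bytes per iteration), DR_OFFSET grows by NITERS * 4,
   so DR describes the first access executed by the remaining loop.  */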
4508 static void
4509 vect_update_init_of_dr (struct data_reference *dr, tree niters)
4511 tree offset = DR_OFFSET (dr);
4513 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
4514 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
4515 DR_OFFSET (dr) = offset;
4519 /* Function vect_update_inits_of_drs
4521 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
4522 This function updates the information recorded for the data references in
4523 the loop to account for the fact that the first NITERS iterations had
4524 already been executed. Specifically, it updates the initial_condition of the
4525 access_function of all the data_references in the loop. */
4527 static void
4528 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
4530 unsigned int i;
4531 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
4532 struct data_reference *dr;
4534 if (vect_dump && (dump_flags & TDF_DETAILS))
4535 fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
4537 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
4538 vect_update_init_of_dr (dr, niters);
4542 /* Function vect_do_peeling_for_alignment
4544 Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
4545 'niters' is set to the misalignment of one of the data references in the
4546 loop, thereby forcing it to refer to an aligned location at the beginning
4547 of the execution of this loop. The data reference for which we are
4548 peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
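/* For example (hypothetical): if the loop executes 100 iterations and
   the unaligned data ref reaches an aligned address after 3 iterations,
   the prolog loop runs 3 scalar iterations and LOOP_VINFO_NITERS of the
   main loop is updated to 97.  */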
4550 static void
4551 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
4553 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4554 tree niters_of_prolog_loop, ni_name;
4555 tree n_iters;
4556 struct loop *new_loop;
4558 if (vect_print_dump_info (REPORT_DETAILS))
4559 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
4561 initialize_original_copy_tables ();
4563 ni_name = vect_build_loop_niters (loop_vinfo);
4564 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
4566 /* Peel the prolog loop and iterate it niters_of_prolog_loop times. */
4567 new_loop =
4568 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
4569 niters_of_prolog_loop, ni_name, true, 0);
4570 gcc_assert (new_loop);
4571 #ifdef ENABLE_CHECKING
4572 slpeel_verify_cfg_after_peeling (new_loop, loop);
4573 #endif
4575 /* Update number of times loop executes. */
4576 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
4577 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
4578 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
4580 /* Update the init conditions of the access functions of all data refs. */
4581 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
4583 /* After peeling we have to reset scalar evolution analyzer. */
4584 scev_reset ();
4586 free_original_copy_tables ();
4590 /* Function vect_create_cond_for_align_checks.
4592 Create a conditional expression that represents the alignment checks for
4593 all of data references (array element references) whose alignment must be
4594 checked at runtime.
4596 Input:
4597 LOOP_VINFO - two fields of the loop information are used.
4598 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4599 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4601 Output:
4602 COND_EXPR_STMT_LIST - statements needed to construct the conditional
4603 expression.
4604 The returned value is the conditional expression to be used in the if
4605 statement that controls which version of the loop gets executed at runtime.
4607 The algorithm makes two assumptions:
4608 1) The number of bytes "n" in a vector is a power of 2.
4609 2) An address "a" is aligned if a%n is zero and that this
4610 test can be done as a&(n-1) == 0. For example, for 16
4611 byte vectors the test is a&0xf == 0. */
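/* For two data refs the generated sequence looks like (an illustrative
   sketch; the variable names follow the templates used below):
	addr2int0 = (int) &a[indx];
	addr2int1 = (int) &b[indx];
	orptrs1 = addr2int0 | addr2int1;
	andmask = orptrs1 & mask;
   and the returned condition is  andmask == 0.  */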
4613 static tree
4614 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4615 tree *cond_expr_stmt_list)
4617 VEC(tree,heap) *may_misalign_stmts
4618 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4619 tree ref_stmt;
4620 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4621 tree mask_cst;
4622 unsigned int i;
4623 tree psize;
4624 tree int_ptrsize_type;
4625 char tmp_name[20];
4626 tree or_tmp_name = NULL_TREE;
4627 tree and_tmp, and_tmp_name, and_stmt;
4628 tree ptrsize_zero;
4630 /* Check that mask is one less than a power of 2, i.e., mask is
4631 all zeros followed by all ones. */
4632 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
4634 /* CHECKME: what is the best integer or unsigned type to use to hold a
4635 cast from a pointer value? */
4636 psize = TYPE_SIZE (ptr_type_node);
4637 int_ptrsize_type
4638 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
4640 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
4641 of the first vector of the i'th data reference. */
4643 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
4645 tree new_stmt_list = NULL_TREE;
4646 tree addr_base;
4647 tree addr_tmp, addr_tmp_name, addr_stmt;
4648 tree or_tmp, new_or_tmp_name, or_stmt;
4650 /* create: addr_tmp = (int)(address_of_first_vector) */
4651 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
4652 &new_stmt_list,
4653 NULL_TREE);
4655 if (new_stmt_list != NULL_TREE)
4656 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
4658 sprintf (tmp_name, "%s%d", "addr2int", i);
4659 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4660 add_referenced_var (addr_tmp);
4661 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
4662 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
4663 addr_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4664 addr_tmp_name, addr_stmt);
4665 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
4666 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
4668 /* The addresses are ORed together. */
4670 if (or_tmp_name != NULL_TREE)
4672 /* create: or_tmp = or_tmp | addr_tmp */
4673 sprintf (tmp_name, "%s%d", "orptrs", i);
4674 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
4675 add_referenced_var (or_tmp);
4676 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
4677 or_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4678 new_or_tmp_name,
4679 build2 (BIT_IOR_EXPR, int_ptrsize_type,
4680 or_tmp_name,
4681 addr_tmp_name));
4682 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
4683 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
4684 or_tmp_name = new_or_tmp_name;
4686 else
4687 or_tmp_name = addr_tmp_name;
4689 } /* end for i */
4691 mask_cst = build_int_cst (int_ptrsize_type, mask);
4693 /* create: and_tmp = or_tmp & mask */
4694 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
4695 add_referenced_var (and_tmp);
4696 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
4698 and_stmt = build2 (GIMPLE_MODIFY_STMT, void_type_node,
4699 and_tmp_name,
4700 build2 (BIT_AND_EXPR, int_ptrsize_type,
4701 or_tmp_name, mask_cst));
4702 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
4703 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
4705 /* Make and_tmp the left operand of the conditional test against zero.
4706 If and_tmp has a nonzero bit then some address is unaligned. */
4707 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4708 return build2 (EQ_EXPR, boolean_type_node,
4709 and_tmp_name, ptrsize_zero);
4713 /* Function vect_transform_loop.
4715 The analysis phase has determined that the loop is vectorizable.
4716 Vectorize the loop - created vectorized stmts to replace the scalar
4717 stmts in the loop, and update the loop exit condition. */
4719 void
4720 vect_transform_loop (loop_vec_info loop_vinfo)
4722 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4723 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4724 int nbbs = loop->num_nodes;
4725 block_stmt_iterator si;
4726 int i;
4727 tree ratio = NULL;
4728 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4729 bool strided_store;
4731 if (vect_print_dump_info (REPORT_DETAILS))
4732 fprintf (vect_dump, "=== vect_transform_loop ===");
4734 /* If the loop has data references that may or may not be aligned then
4735 two versions of the loop need to be generated, one which is vectorized
4736 and one which isn't. A test is then generated to control which of the
4737 loops is executed. The test checks for the alignment of all of the
4738 data references that may or may not be aligned. */
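/* Schematically (an illustrative sketch of the versioned CFG):
	if (cond_expr)   <-- all alignment checks pass
	  vectorized loop
	else
	  scalar loop
   where COND_EXPR is built by vect_create_cond_for_align_checks.  */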
4740 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
4742 struct loop *nloop;
4743 tree cond_expr;
4744 tree cond_expr_stmt_list = NULL_TREE;
4745 basic_block condition_bb;
4746 block_stmt_iterator cond_exp_bsi;
4747 basic_block merge_bb;
4748 basic_block new_exit_bb;
4749 edge new_exit_e, e;
4750 tree orig_phi, new_phi, arg;
4751 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
4753 cond_expr = vect_create_cond_for_align_checks (loop_vinfo,
4754 &cond_expr_stmt_list);
4755 initialize_original_copy_tables ();
4756 nloop = loop_version (loop, cond_expr, &condition_bb,
4757 prob, prob, REG_BR_PROB_BASE - prob, true);
4758 free_original_copy_tables();
4760 /** Loop versioning violates an assumption we try to maintain during
4761 vectorization - that the loop exit block has a single predecessor.
4762 After versioning, the exit block of both loop versions is the same
4763 basic block (i.e. it has two predecessors). Just in order to simplify
4764 following transformations in the vectorizer, we fix this situation
4765 here by adding a new (empty) block on the exit-edge of the loop,
4766 with the proper loop-exit phis to maintain loop-closed-form. **/
4768 merge_bb = single_exit (loop)->dest;
4769 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
4770 new_exit_bb = split_edge (single_exit (loop));
4771 new_exit_e = single_exit (loop);
4772 e = EDGE_SUCC (new_exit_bb, 0);
4774 for (orig_phi = phi_nodes (merge_bb); orig_phi;
4775 orig_phi = PHI_CHAIN (orig_phi))
4777 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
4778 new_exit_bb);
4779 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4780 add_phi_arg (new_phi, arg, new_exit_e);
4781 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
4784 /** end loop-exit-fixes after versioning **/
4786 update_ssa (TODO_update_ssa);
4787 cond_exp_bsi = bsi_last (condition_bb);
4788 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
4791 /* CHECKME: we wouldn't need this if we called update_ssa once
4792 for all loops. */
4793 bitmap_zero (vect_memsyms_to_rename);
4795 /* Peel the loop if there are data refs with unknown alignment.
4796 Only one data ref with unknown alignment is allowed. */
4798 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4799 vect_do_peeling_for_alignment (loop_vinfo);
4801 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4802 compile time constant), or it is a constant that doesn't divide by the
4803 vectorization factor, then an epilog loop needs to be created.
4804 We therefore duplicate the loop: the original loop will be vectorized,
4805 and will compute the first (n/VF) iterations. The second copy of the loop
4806 will remain scalar and will compute the remaining (n%VF) iterations.
4807 (VF is the vectorization factor). */
4809 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4810 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4811 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
4812 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
4813 else
4814 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4815 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4817 /* 1) Make sure the loop header has exactly two entries
4818 2) Make sure we have a preheader basic block. */
4820 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4822 split_edge (loop_preheader_edge (loop));
4824 /* FORNOW: the vectorizer supports only loops whose body consists
4825 of one basic block (header + empty latch). When the vectorizer
4826 supports more involved loop forms, the order in which the BBs are
4827 traversed will need to be reconsidered. */
4829 for (i = 0; i < nbbs; i++)
4831 basic_block bb = bbs[i];
4833 for (si = bsi_start (bb); !bsi_end_p (si);)
4835 tree stmt = bsi_stmt (si);
4836 stmt_vec_info stmt_info;
4837 bool is_store;
4839 if (vect_print_dump_info (REPORT_DETAILS))
4841 fprintf (vect_dump, "------>vectorizing statement: ");
4842 print_generic_expr (vect_dump, stmt, TDF_SLIM);
4844 stmt_info = vinfo_for_stmt (stmt);
4845 gcc_assert (stmt_info);
4846 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4847 && !STMT_VINFO_LIVE_P (stmt_info))
4849 bsi_next (&si);
4850 continue;
4853 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4854 != (unsigned HOST_WIDE_INT) vectorization_factor)
4855 && vect_print_dump_info (REPORT_DETAILS))
4856 fprintf (vect_dump, "multiple-types.");
4858 /* -------- vectorize statement ------------ */
4859 if (vect_print_dump_info (REPORT_DETAILS))
4860 fprintf (vect_dump, "transform statement.");
4862 strided_store = false;
4863 is_store = vect_transform_stmt (stmt, &si, &strided_store);
4864 if (is_store)
4866 stmt_ann_t ann;
4867 if (DR_GROUP_FIRST_DR (stmt_info))
4869 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4870 interleaving chain was completed - free all the stores in
4871 the chain. */
4872 tree next = DR_GROUP_FIRST_DR (stmt_info);
4873 tree tmp;
4874 stmt_vec_info next_stmt_info;
4876 while (next)
4878 next_stmt_info = vinfo_for_stmt (next);
4879 /* Free the attached stmt_vec_info and remove the stmt. */
4880 ann = stmt_ann (next);
4881 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
4882 free (next_stmt_info);
4883 set_stmt_info (ann, NULL);
4884 next = tmp;
4886 bsi_remove (&si, true);
4887 continue;
4889 else
4891 /* Free the attached stmt_vec_info and remove the stmt. */
4892 ann = stmt_ann (stmt);
4893 free (stmt_info);
4894 set_stmt_info (ann, NULL);
4895 bsi_remove (&si, true);
4896 continue;
4899 else
4901 if (strided_store)
4903 /* This is a case of skipped interleaved store. We don't free
4904 its stmt_vec_info. */
4905 bsi_remove (&si, true);
4906 continue;
4909 bsi_next (&si);
4910 } /* stmts in BB */
4911 } /* BBs in loop */
4913 slpeel_make_loop_iterate_ntimes (loop, ratio);
4915 mark_set_for_renaming (vect_memsyms_to_rename);
4917 /* The memory tags and pointers in vectorized statements need to
4918 have their SSA forms updated. FIXME, why can't this be delayed
4919 until all the loops have been transformed? */
4920 update_ssa (TODO_update_ssa);
4922 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
4923 fprintf (vect_dump, "LOOP VECTORIZED.");